In [1]:
#Packages used
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import cross_val_predict 
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from skll.metrics import kappa
import matplotlib.pyplot as plt

In [2]:
#Macros
NAME_DICT_SIZE=10


In [3]:
breed_labels=pd.read_csv("./breed_labels.csv")
color_labels=pd.read_csv("./color_labels.csv")
state_labels=pd.read_csv("./state_labels.csv")
train=pd.read_csv("./train.csv")
Name=train["Name"]

In [4]:
train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14993 entries, 0 to 14992
Data columns (total 24 columns):
Type             14993 non-null int64
Name             13736 non-null object
Age              14993 non-null int64
Breed1           14993 non-null int64
Breed2           14993 non-null int64
Gender           14993 non-null int64
Color1           14993 non-null int64
Color2           14993 non-null int64
Color3           14993 non-null int64
MaturitySize     14993 non-null int64
FurLength        14993 non-null int64
Vaccinated       14993 non-null int64
Dewormed         14993 non-null int64
Sterilized       14993 non-null int64
Health           14993 non-null int64
Quantity         14993 non-null int64
Fee              14993 non-null int64
State            14993 non-null int64
RescuerID        14993 non-null object
VideoAmt         14993 non-null int64
Description      14981 non-null object
PetID            14993 non-null object
PhotoAmt         14993 non-null float64
AdoptionSpe

#Creating polynomial features for cat
attr=["Age","Fee","VideoAmt", "Quantity"]
tem_cats=train_cats[attr]
from sklearn.preprocessing import PolynomialFeatures
poly_features=PolynomialFeatures(degree=2, include_bias=False)
x_poly_cats=poly_features.fit_transform(tem_cats)
x_poly_cats=np.transpose(x_poly_cats)

scaler=StandardScaler()
x_poly_cats=scaler.fit_transform(x_poly_cats)
print(x_poly_cats, x_poly_cats.shape)


poly_df=pd.DataFrame({
                        "x0":x_poly_cats[0],
                        "x1":x_poly_cats[1],
                        "x2":x_poly_cats[2],
                        "x3":x_poly_cats[3],
                        "x0^2":x_poly_cats[4],
                        "x0x1":x_poly_cats[5],
                        "x0x2":x_poly_cats[6],
                        "x0x3":x_poly_cats[7],
                        "x1^2":x_poly_cats[8],
                        "x1x2":x_poly_cats[9],
                        "x1x3":x_poly_cats[10],
                        "x2^2":x_poly_cats[11],
                        "x2x3":x_poly_cats[12],
                        "x3^2":x_poly_cats[13],
                        "PetID":train_cats["PetID"]
                        
})
train_cats=train_cats.drop(attr, axis=1)
train_cats=pd.concat([poly_df, train_cats],axis=1)
train_cats=train_cats.drop("PetID", axis=1)

In [5]:
class Preprocess:
    def __init__(self, size=10):
        self.NAME_DICT_SIZE=size
        self.DESCRIPTION_DICT=["home", "dog", "adoption", "cat", "puppy","very", "please", "good","adopt","contact","puppies", "loving","call","interested", "old","love", "found", "care", "friendly", "rescued","house", "playful", "healthy","female", "new","cute","little","help", "forever", "adoptied", "adorable","male","hope","breed","food","pet", "sweet","needs","mother","vaccination","abandoned","pups","pup","month","vaccinated","lovely"]
        self.STATE_DICT=[41336, 41325, 41367, 41401, 41415, 41324, 41332, 41335, 41330, 41380, 41327, 41345, 41342, 41326, 41361]
        self.name_dict=[]
        
    def descriptionToOneHot(self, d):
        d=str(d)
        d=d.lower()
        new_d=""
        for i in d:
            if(i.isalpha() or i==" "):
                new_d+=i
        split=new_d.split(" ")
        onehot=[]
        for i in self.DESCRIPTION_DICT:
            if i in split:
                onehot.append(1)
            else:
                onehot.append(0)
        return onehot


    #function to create dictionary for name attributes
    def makeDictName(self, Name):
        size=self.NAME_DICT_SIZE
        dic=dict(Name.value_counts()[:size])
        name_list=list(dic.keys())
        return name_list
    
    def nameToOneHot(self, n):
        l=[]
        flag=True
        for i in range(len(self.name_dict)):
            if(n==self.name_dict[i]):
                l.append(1)
                flag=False
            else:
                l.append(0)
        if(flag==False):
            l.append(0)
        else:
            l.append(1)
        return l

    #function to convert a state id to one hot vector
    def stateToOneHot(self, s):
        l=[]
        s=int(s)
        for i in self.STATE_DICT:
            if(i==s):
                l.append(1)
            else:
                l.append(0)
        return l        

    #function to convert breed labels to one hot vector
    def breedToOneHot(self, b):
        b=int(b)
        l=[]
        for i in range(308):
            if(i==b):
                l.append(1)
            else:
                l.append(0)
        return l

    #function to convert gender to one hot vector
    def genderToOneHot(self, g):
        g=int(g)
        l=[]
        for i in range(1,4):
            if(i==g):
                l.append(1)
            else:
                l.append(0)
        return l


    #function to convert color1, color2 and color3 to one hot vector
    def colorToOneHot(self, c):
        c=int(c)
        l=[]
        for i in range(8):
            if(i==c):
                l.append(1)
            else:
                l.append(0)
        return l    


    #function to convert maturity to one hot vector
    def maturityToOneHot(self, m):
        m=int(m)
        l=[]
        for i in range(5):
            if(i == m):
                l.append(1)
            else:
                l.append(0)
        return l

    #function to convert furLength to one hot vector
    def furLengthToOneHot(self, f):
        f=int(f)
        l=[]
        for i in range(4):
            if(i==f):
                l.append(1)
            else:
                l.append(0)
        return l


    #Function to convert vaccinated to one hot vector
    def vaccinatedToOneHot(self, v):
        v=int(v)
        l=[]
        for i in range(1,4):
            if(i==v):
                l.append(1)
            else:
                l.append(0)
        return l

    #Function to convert dewormed to one hot vector
    def dewormedToOneHot(self, d):
        d=int(d)
        l=[]
        for i in range(1,4):
            if(i==d):
                l.append(1)
            else:
                l.append(0)
        return l

    #Function to convert sterilized to one hot vector
    def sterilizedToOneHot(self, s):
        s=int(s)
        l=[]
        for i in range(1,4):
            if(i==s):
                l.append(1)
            else:
                l.append(0)
        return l

    #Function to convert Health to one hot vector
    def healthToOneHot(self, h):
        h=int(h)
        l=[]
        for i in range(4):
            if(i==h):
                l.append(1)
            else:
                l.append(0)
        return l

    def typeToOneHot(self, t):
        t=int(t)
        if(t==1):
            return [1, 0]
        elif(t==2):
            return [0, 1]
        else:
            print("something is terribly wrong in typeToOneHot")
        return [0, 0]
    
    def transformType(self, typ):
        t=list(typ)
        l=[]
        for i in t:
            l.append(self.typeToOneHot(i))
        return l
    
    def transformName(self, n):
        temp=list(n)
        l=[]
        for i in temp:
            l.append(self.nameToOneHot(i))
        return l
     
    def transformAge(self, age):
        temp=np.array(age)
        return temp
    
    def transformFee(self, fee):
        temp=np.array(fee)
        return temp
    
    def transformVideoAmt(self, vid):
        temp=np.array(vid)
        return temp
    
    def transformQuantity(self, quan):
        temp=np.array(quan)
        return temp
    
    def transformMisc(self, Data):
        temp=[]
        temp.append(np.array(Data["x0^2"]))
        temp.append(np.array(Data["x0x1"]))
        temp.append(np.array(Data["x0x2"]))
        temp.append(np.array(Data["x0x3"]))
        temp.append(np.array(Data["x1^2"]))
        temp.append(np.array(Data["x1x2"]))
        temp.append(np.array(Data["x1x3"]))
        temp.append(np.array(Data["x2^2"]))
        temp.append(np.array(Data["x2x3"]))
        temp.append(np.array(Data["x3^2"]))
        return temp
    
    def transformDesc(self, des):
        temp=np.array(des)
        l=[]
        for i in temp:
            l.append(self.descriptionToOneHot(i))
        return np.array(l)
    
    def transformState(self, st):
        temp=np.array(st)
        l=[]
        for i in temp:
            l.append(self.stateToOneHot(i))
        return np.array(l)
    
    def transformBreed(self, breed):
        temp=np.array(breed)
        l=[]
        for i in temp:
            l.append(self.breedToOneHot(i))
        return np.array(l)
    
    def transformColor(self, col):
        temp=np.array(col)
        l=[]
        for i in temp:
            l.append(self.colorToOneHot(i))
        return np.array(l)
    
    def transformGender(self, gen):
        temp=np.array(gen)
        l=[]
        for i in temp:
            l.append(self.genderToOneHot(i))
        return np.array(l)
    
    def transformMaturity(self, mat):
        temp=np.array(mat)
        l=[]
        for i in temp:
            l.append(self.maturityToOneHot(i))
        return np.array(l)
    
    def transformFurLength(self, fur):
        temp=np.array(fur)
        l=[]
        for i in temp:
            l.append(self.furLengthToOneHot(i))
        return np.array(l)
    
    def transformVaccinated(self, vac):
        temp=np.array(vac)
        l=[]
        for i in temp:
            l.append(self.vaccinatedToOneHot(i))
        return np.array(l)
    
    def transformDewormed(self, dewormed):
        temp=np.array(dewormed)
        l=[]
        for i in temp:
            l.append(self.dewormedToOneHot(i))
        return np.array(l)
    
    def transformSterilized(self, ster):
        temp=np.array(ster)
        l=[]
        for i in temp:
            l.append(self.sterilizedToOneHot(i))
        return np.array(l)
    
    def transformHealth(self, health):
        temp=np.array(health)
        l=[]
        for i in temp:
            l.append(self.healthToOneHot(i))
        return np.array(l)
    
    def combineData(self, l, lf):
        lf=np.array(lf)
        lf=np.transpose(lf)
        for i in l:
            lf=np.concatenate((lf,i), axis=1)    # LOOKS LIKE IT WILL WORK BUT TEST IT OUT
        return np.array(lf)
    
    def addPhotos(self, fdata, extra):
        out=[]
        final_labels=[]
        length=len(fdata)
        for i in range(length):
            tmp=fdata[i]
            for j in range(int(extra[1][i])):
                im=np.reshape(plt.imread("./resized_32x32_images/"+extra[0][i]+"-"+str(j+1)+".jpg"), [3072])
                out.append(np.concatenate((tmp, im)))
                final_labels.append(int(extra[2][i]))
        return np.array(out), np.array(final_labels)
    
    def fit(self, data):
        self.name_dict=self.makeDictName(data["Name"])
        attr=["Age", "Fee", "VideoAmt", "Quantity"]
        temp=data[attr]
        poly_features=PolynomialFeatures(degree=2, include_bias=False)
        x_poly=poly_features.fit_transform(temp)
        x_poly=np.transpose(x_poly)
        scaler=StandardScaler()
        x_poly=scaler.fit_transform(x_poly)
        poly_df=pd.DataFrame({
                                "Age":x_poly[0],
                                "Fee":x_poly[1],
                                "VideoAmt":x_poly[2],
                                "Quantity":x_poly[3],
                                "x0^2":x_poly[4],
                                "x0x1":x_poly[5],
                                "x0x2":x_poly[6],
                                "x0x3":x_poly[7],
                                "x1^2":x_poly[8],
                                "x1x2":x_poly[9],
                                "x1x3":x_poly[10],
                                "x2^2":x_poly[11],
                                "x2x3":x_poly[12],
                                "x3^2":x_poly[13],
                                "PetID":data["PetID"]

        })
        data=data.drop(attr,axis=1)
        pid=list(data["PetID"])
        pamt=list(data["PhotoAmt"])
        flabel=list(data["AdoptionSpeed"])
        ex=[pid, pamt, flabel]
        data=pd.concat([poly_df, data],axis=1)
        data=data.drop("PetID", axis=1)
        return data, ex
        
    def transform(self, data):
        l=[]
        lf=[]
        age=self.transformAge(data["Age"])
        lf.append(age)
        fee=self.transformFee(data["Fee"])
        lf.append(fee)
        videoamt=self.transformVideoAmt(data["VideoAmt"])
        lf.append(videoamt)
        quantity=self.transformQuantity(data["Quantity"])
        lf.append(quantity)
        misc=self.transformMisc(data)
        for i in misc:
            lf.append(i)
        tpe=self.transformType(data["Type"])
        l.append(tpe)
        desc=self.transformDesc(data["Description"])
        l.append(desc)
        state=self.transformState(data["State"])
        l.append(state)
        name=self.transformName(data["Name"])
        l.append(name)
        breed1=self.transformBreed(data["Breed1"])
        l.append(breed1)
        breed2=self.transformBreed(data["Breed2"])
        l.append(breed2)
        gender=self.transformGender(data["Gender"])
        l.append(gender)
        color1=self.transformColor(data["Color1"])
        l.append(color1)
        color2=self.transformColor(data["Color2"])
        l.append(color2)
        color3=self.transformColor(data["Color3"])
        l.append(color3)
        maturity=self.transformMaturity(data["MaturitySize"])
        l.append(maturity)
        fur=self.transformFurLength(data["FurLength"])
        l.append(fur)
        vac=self.transformVaccinated(data["Vaccinated"])
        l.append(vac)
        dewormed=self.transformDewormed(data["Dewormed"])
        l.append(dewormed)
        ster=self.transformSterilized(data["Sterilized"])
        l.append(ster)
        health=self.transformHealth(data["Health"])
        l.append(health)
        final_data=self.combineData(l, lf)
        return final_data
    
    def fit_transform(self, data):
        data, ex =self.fit(data)
        final_data=self.transform(data)
        return self.addPhotos(final_data, ex)
        


train_cats=train[train["Type"]==2]
train_dogs=train[train["Type"]==1]

In [6]:
pre=Preprocess()
prepared_data, labels=pre.fit_transform(train)


In [19]:
prepared_data.shape

(58311, 3825)

In [22]:
labels.shape

(58311,)

In [21]:
clf=LogisticRegression(multi_class='multinomial', solver="sag")
#clf.fit(prepared_data, labels)


In [14]:
nn= MLPClassifier(hidden_layer_sizes=(500),early_stopping=True)

In [15]:
nn_pred=cross_val_predict(nn, prepared_data, labels,cv=10)

In [16]:
print("kappa= ",kappa(labels, nn_pred))

kappa on dog=  0.002104082290123821
