In [1]:
import matplotlib.pyplot as plt
import matplotlib.lines as lines
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import root_mean_squared_error

%matplotlib inline

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score

# Notebook-wide plotting defaults: larger figures and larger text.
plt.rcParams.update({
    'figure.figsize': (12.0, 8.0),
    'font.size': 14,
})

In [2]:
# Read the input dataset; the relative path is resolved against the
# notebook's working directory.
data = pd.read_csv(filepath_or_buffer="ready_drop.csv")

In [3]:
# Display the freshly loaded frame (pandas shows only the first and last rows).
data

Unnamed: 0,AnimalType,PrimaryColor,SecondaryColor,PrimaryBreed,Sex,Age,IntakeCondition,IntakeType,IntakeSubtype,Jurisdiction,Adopted
0,DOG,GOLD,GOLD,GOLDEN RETR,NEUTERED,5475.0,MED R,STRAY,OTC,SAN JOSE,0.0
1,DOG,TAN,WHITE,CHIHUAHUA SH,NEUTERED,6570.0,MED R,FOSTER,FOSTER,SAN JOSE,0.0
2,DOG,TAN,WHITE,CHIHUAHUA SH,NEUTERED,6570.0,MED R,OWNER SUR,OTC,SAN JOSE,0.0
3,CAT,GRAY,GRAY,DOMESTIC SH,SPAYED,5840.0,MED R,STRAY,MEDVET,SAN JOSE,0.0
4,DOG,RED,SABLE,SHIBA INU,SPAYED,6205.0,MED R,STRAY,MEDVET,SAN JOSE,0.0
...,...,...,...,...,...,...,...,...,...,...,...
12068,CAT,BUFF,BUFF,DOMESTIC SH,MALE,35.0,NURSING,STRAY,OTC,SAN JOSE,0.0
12069,DOG,BLACK,WHITE,ALASK MALAMUTE,MALE,1095.0,HEALTHY,STRAY,FIELD,SAN JOSE,0.0
12070,CAT,ORANGE,ORANGE,DOMESTIC SH,SPAYED,150.0,MED M,STRAY,OTC,SAN JOSE,0.0
12071,CAT,ORANGE,ORANGE,DOMESTIC SH,NEUTERED,150.0,HEALTHY,FOSTER,FOSTER,SAN JOSE,1.0


In [5]:
# One-hot encode AnimalType: one boolean indicator column per distinct value.
a_type = data['AnimalType'].pipe(pd.get_dummies)

In [6]:
# Inspect the indicator columns produced for AnimalType.
a_type

Unnamed: 0,BIRD,CAT,DOG,OTHER
0,False,False,True,False
1,False,False,True,False
2,False,False,True,False
3,False,True,False,False
4,False,False,True,False
...,...,...,...,...
12068,False,True,False,False
12069,False,False,True,False
12070,False,True,False,False
12071,False,True,False,False


In [7]:
# Append the AnimalType indicator columns to the right-hand side of the frame.
# Re-running this cell appends them again, so run it exactly once per kernel.
data = pd.concat((data, a_type), axis='columns')

In [8]:
# Check that the AnimalType indicator columns were appended.
data

Unnamed: 0,AnimalType,PrimaryColor,SecondaryColor,PrimaryBreed,Sex,Age,IntakeCondition,IntakeType,IntakeSubtype,Jurisdiction,Adopted,BIRD,CAT,DOG,OTHER
0,DOG,GOLD,GOLD,GOLDEN RETR,NEUTERED,5475.0,MED R,STRAY,OTC,SAN JOSE,0.0,False,False,True,False
1,DOG,TAN,WHITE,CHIHUAHUA SH,NEUTERED,6570.0,MED R,FOSTER,FOSTER,SAN JOSE,0.0,False,False,True,False
2,DOG,TAN,WHITE,CHIHUAHUA SH,NEUTERED,6570.0,MED R,OWNER SUR,OTC,SAN JOSE,0.0,False,False,True,False
3,CAT,GRAY,GRAY,DOMESTIC SH,SPAYED,5840.0,MED R,STRAY,MEDVET,SAN JOSE,0.0,False,True,False,False
4,DOG,RED,SABLE,SHIBA INU,SPAYED,6205.0,MED R,STRAY,MEDVET,SAN JOSE,0.0,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12068,CAT,BUFF,BUFF,DOMESTIC SH,MALE,35.0,NURSING,STRAY,OTC,SAN JOSE,0.0,False,True,False,False
12069,DOG,BLACK,WHITE,ALASK MALAMUTE,MALE,1095.0,HEALTHY,STRAY,FIELD,SAN JOSE,0.0,False,False,True,False
12070,CAT,ORANGE,ORANGE,DOMESTIC SH,SPAYED,150.0,MED M,STRAY,OTC,SAN JOSE,0.0,False,True,False,False
12071,CAT,ORANGE,ORANGE,DOMESTIC SH,NEUTERED,150.0,HEALTHY,FOSTER,FOSTER,SAN JOSE,1.0,False,True,False,False


In [10]:
# The raw text column is now redundant with its indicator columns; remove it.
data = data.drop(columns=['AnimalType'])

In [11]:
# Confirm that the AnimalType text column is gone.
data

Unnamed: 0,PrimaryColor,SecondaryColor,PrimaryBreed,Sex,Age,IntakeCondition,IntakeType,IntakeSubtype,Jurisdiction,Adopted,BIRD,CAT,DOG,OTHER
0,GOLD,GOLD,GOLDEN RETR,NEUTERED,5475.0,MED R,STRAY,OTC,SAN JOSE,0.0,False,False,True,False
1,TAN,WHITE,CHIHUAHUA SH,NEUTERED,6570.0,MED R,FOSTER,FOSTER,SAN JOSE,0.0,False,False,True,False
2,TAN,WHITE,CHIHUAHUA SH,NEUTERED,6570.0,MED R,OWNER SUR,OTC,SAN JOSE,0.0,False,False,True,False
3,GRAY,GRAY,DOMESTIC SH,SPAYED,5840.0,MED R,STRAY,MEDVET,SAN JOSE,0.0,False,True,False,False
4,RED,SABLE,SHIBA INU,SPAYED,6205.0,MED R,STRAY,MEDVET,SAN JOSE,0.0,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12068,BUFF,BUFF,DOMESTIC SH,MALE,35.0,NURSING,STRAY,OTC,SAN JOSE,0.0,False,True,False,False
12069,BLACK,WHITE,ALASK MALAMUTE,MALE,1095.0,HEALTHY,STRAY,FIELD,SAN JOSE,0.0,False,False,True,False
12070,ORANGE,ORANGE,DOMESTIC SH,SPAYED,150.0,MED M,STRAY,OTC,SAN JOSE,0.0,False,True,False,False
12071,ORANGE,ORANGE,DOMESTIC SH,NEUTERED,150.0,HEALTHY,FOSTER,FOSTER,SAN JOSE,1.0,False,True,False,False


In [12]:
# One-hot encode PrimaryColor: build one indicator column per colour value,
# append them to the frame, then discard the original text column.
color = pd.get_dummies(data['PrimaryColor'])
data = pd.concat((data, color), axis='columns').drop(columns=['PrimaryColor'])

In [13]:
# One-hot encode SecondaryColor.
# The prefix is required: PrimaryColor and SecondaryColor share category values
# (e.g. GOLD, GRAY, ORANGE), so unprefixed dummies would repeat column labels
# already added by the PrimaryColor encoding. Duplicate labels make
# `data['GOLD']` ambiguous and get silently renamed when the CSV is read back.
color_s = pd.get_dummies(data['SecondaryColor'], prefix='SecondaryColor')
data = pd.concat([data, color_s], axis=1).drop(columns=['SecondaryColor'])

In [15]:
# One-hot encode PrimaryBreed and replace the text column with its indicators.
breed = pd.get_dummies(data['PrimaryBreed'])
data = pd.concat((data, breed), axis='columns')
data = data.drop(columns=['PrimaryBreed'])

In [16]:
# One-hot encode Sex (values such as NEUTERED, SPAYED, MALE) and drop the
# original text column.
sex = pd.get_dummies(data['Sex'])
data = pd.concat((data, sex), axis='columns').drop(columns=['Sex'])

In [None]:
# Remaining categorical columns to encode: IntakeCondition, IntakeType, IntakeSubtype, Jurisdiction

In [17]:
# One-hot encode IntakeCondition, then remove the source text column.
cond = pd.get_dummies(data['IntakeCondition'])
data = pd.concat((data, cond), axis='columns')
data = data.drop(columns=['IntakeCondition'])

In [18]:
# One-hot encode IntakeType (e.g. STRAY, FOSTER, OWNER SUR) and swap the
# indicator columns in for the original text column.
i_type = pd.get_dummies(data['IntakeType'])
data = pd.concat((data, i_type), axis='columns').drop(columns=['IntakeType'])

In [19]:
# One-hot encode IntakeSubtype.
# The prefix is required: IntakeType and IntakeSubtype share category values
# (FOSTER appears in both), so unprefixed dummies would repeat a column label
# already added by the IntakeType encoding and leave the frame, and the
# exported CSV, with ambiguous duplicate column names.
i_stype = pd.get_dummies(data['IntakeSubtype'], prefix='IntakeSubtype')
data = pd.concat([data, i_stype], axis=1).drop(columns=['IntakeSubtype'])

In [20]:
# One-hot encode Jurisdiction (one indicator per city/area) and drop the
# original text column.
j = pd.get_dummies(data['Jurisdiction'])
data = pd.concat((data, j), axis='columns')
data = data.drop(columns=['Jurisdiction'])

In [21]:
# Review the fully encoded frame: Age, Adopted and the indicator columns.
data

Unnamed: 0,Age,Adopted,BIRD,CAT,DOG,OTHER,BEIGE,BLACK,BLUE,BRINDLE-BN,...,MILPITAS,MONTE SERENO,MORGAN HILL,MOUNTAIN VIEW,OUT OF COUNTY,PALO ALTO,SAN JOSE,SANTA CLARA,SARATOGA,SUNNYVALE
0,5475.0,0.0,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
1,6570.0,0.0,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2,6570.0,0.0,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3,5840.0,0.0,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
4,6205.0,0.0,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12068,35.0,0.0,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
12069,1095.0,0.0,False,False,True,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
12070,150.0,0.0,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
12071,150.0,1.0,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [22]:
# Cast the Adopted target from float (0.0 / 1.0) to boolean.
# NOTE(review): astype(bool) turns NaN into True; this assumes Adopted has no
# missing values at this point. Confirm against the upstream cleaning step.
data['Adopted'] = data['Adopted'].astype(bool)

In [23]:
# Summarise row count, column count, dtypes and memory use after encoding.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12073 entries, 0 to 12072
Columns: 320 entries, Age to SUNNYVALE
dtypes: bool(319), float64(1)
memory usage: 3.8 MB


In [25]:
# Persist the encoded frame. index=False leaves the row index out of the file,
# so reading it back does not produce an extra unnamed index column.
data.to_csv(path_or_buf='data_encoded.csv', index=False)