Importing Important packages and Reading Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from numpy.random import randint

In [2]:
df1 = pd.read_csv('train.csv')
df1.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category
0,ANSL_69903,2016-07-10 00:00:00,2016-09-21 16:25:00,2.0,Brown Tabby,0.8,7.78,13,9,0.0,1
1,ANSL_66892,2013-11-21 00:00:00,2018-12-27 17:47:00,1.0,White,0.72,14.19,13,9,0.0,2
2,ANSL_69750,2014-09-28 00:00:00,2016-10-19 08:24:00,,Brown,0.15,40.9,15,4,2.0,4
3,ANSL_71623,2016-12-31 00:00:00,2019-01-25 18:30:00,1.0,White,0.62,17.82,0,1,0.0,2
4,ANSL_57969,2017-09-28 00:00:00,2017-11-19 09:38:00,2.0,Black,0.5,11.06,18,4,0.0,1


In [3]:
df1.columns

Index(['pet_id', 'issue_date', 'listing_date', 'condition', 'color_type',
       'length(m)', 'height(cm)', 'X1', 'X2', 'breed_category',
       'pet_category'],
      dtype='object')

In [4]:
print((df1['breed_category']).unique(),df1['pet_category'].unique(),df1['condition'].unique())

[0. 2. 1.] [1 2 4 0] [ 2.  1. nan  0.]


In [5]:
df2 = df1.copy()
df3 = df2.drop('pet_id',axis = 'columns')
df3.head()

Unnamed: 0,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category
0,2016-07-10 00:00:00,2016-09-21 16:25:00,2.0,Brown Tabby,0.8,7.78,13,9,0.0,1
1,2013-11-21 00:00:00,2018-12-27 17:47:00,1.0,White,0.72,14.19,13,9,0.0,2
2,2014-09-28 00:00:00,2016-10-19 08:24:00,,Brown,0.15,40.9,15,4,2.0,4
3,2016-12-31 00:00:00,2019-01-25 18:30:00,1.0,White,0.62,17.82,0,1,0.0,2
4,2017-09-28 00:00:00,2017-11-19 09:38:00,2.0,Black,0.5,11.06,18,4,0.0,1


Handling Date 

In [6]:
df3['issue_date'] = pd.to_datetime(df3['issue_date'])
df3['listing_date'] = pd.to_datetime(df3['listing_date'])

In [7]:
df3['time'] = df3['listing_date']-df3['issue_date']
df3.head()

Unnamed: 0,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category,time
0,2016-07-10,2016-09-21 16:25:00,2.0,Brown Tabby,0.8,7.78,13,9,0.0,1,73 days 16:25:00
1,2013-11-21,2018-12-27 17:47:00,1.0,White,0.72,14.19,13,9,0.0,2,1862 days 17:47:00
2,2014-09-28,2016-10-19 08:24:00,,Brown,0.15,40.9,15,4,2.0,4,752 days 08:24:00
3,2016-12-31,2019-01-25 18:30:00,1.0,White,0.62,17.82,0,1,0.0,2,755 days 18:30:00
4,2017-09-28,2017-11-19 09:38:00,2.0,Black,0.5,11.06,18,4,0.0,1,52 days 09:38:00


In [8]:
df4 = df3.drop(['issue_date','listing_date'],axis='columns')
df4.head()

Unnamed: 0,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category,time
0,2.0,Brown Tabby,0.8,7.78,13,9,0.0,1,73 days 16:25:00
1,1.0,White,0.72,14.19,13,9,0.0,2,1862 days 17:47:00
2,,Brown,0.15,40.9,15,4,2.0,4,752 days 08:24:00
3,1.0,White,0.62,17.82,0,1,0.0,2,755 days 18:30:00
4,2.0,Black,0.5,11.06,18,4,0.0,1,52 days 09:38:00


In [9]:
df4['time'] = df4['time'].apply(lambda x : str(x))

In [10]:
df4['time'] = df4['time'].apply(lambda x : int(x.split(' ')[0]))
df4.head()

Unnamed: 0,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category,time
0,2.0,Brown Tabby,0.8,7.78,13,9,0.0,1,73
1,1.0,White,0.72,14.19,13,9,0.0,2,1862
2,,Brown,0.15,40.9,15,4,2.0,4,752
3,1.0,White,0.62,17.82,0,1,0.0,2,755
4,2.0,Black,0.5,11.06,18,4,0.0,1,52


In [11]:
df5 = df4.copy()
df5.head(3)

Unnamed: 0,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category,time
0,2.0,Brown Tabby,0.8,7.78,13,9,0.0,1,73
1,1.0,White,0.72,14.19,13,9,0.0,2,1862
2,,Brown,0.15,40.9,15,4,2.0,4,752


Handing color_type using ONE HOT ENCODING

In [12]:
cname = df5['color_type'].unique()
cname

array(['Brown Tabby', 'White', 'Brown', 'Black', 'Red', 'Tricolor',
       'Calico', 'Brown Brindle', 'Blue', 'Tortie', 'Gray', 'Fawn',
       'Buff', 'Torbie', 'Orange Tabby', 'Sable', 'Tan', 'Blue Tick',
       'Orange', 'Blue Tabby', 'Chocolate', 'Black Tabby',
       'Black Brindle', 'Cream Tabby', 'Gold', 'Calico Point',
       'Blue Merle', 'Red Merle', 'Yellow', 'Lynx Point', 'Blue Cream',
       'Flame Point', 'Cream', 'Yellow Brindle', 'Blue Tiger',
       'Seal Point', 'Red Tick', 'Brown Merle', 'Black Smoke',
       'Gray Tabby', 'Green', 'Tortie Point', 'Blue Smoke', 'Apricot',
       'Blue Point', 'Silver Tabby', 'Lilac Point', 'Silver',
       'Brown Tiger', 'Liver', 'Agouti', 'Chocolate Point', 'Liver Tick',
       'Pink', 'Black Tiger', 'Silver Lynx Point'], dtype=object)

In [13]:
df_dumm = pd.get_dummies(df5['color_type'],columns=cname)
df_dumm.head()

Unnamed: 0,Agouti,Apricot,Black,Black Brindle,Black Smoke,Black Tabby,Black Tiger,Blue,Blue Cream,Blue Merle,...,Silver Lynx Point,Silver Tabby,Tan,Torbie,Tortie,Tortie Point,Tricolor,White,Yellow,Yellow Brindle
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
df7 = df5.join(df_dumm)
df7.head()

Unnamed: 0,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category,time,Agouti,...,Silver Lynx Point,Silver Tabby,Tan,Torbie,Tortie,Tortie Point,Tricolor,White,Yellow,Yellow Brindle
0,2.0,Brown Tabby,0.8,7.78,13,9,0.0,1,73,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,White,0.72,14.19,13,9,0.0,2,1862,0,...,0,0,0,0,0,0,0,1,0,0
2,,Brown,0.15,40.9,15,4,2.0,4,752,0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,White,0.62,17.82,0,1,0.0,2,755,0,...,0,0,0,0,0,0,0,1,0,0
4,2.0,Black,0.5,11.06,18,4,0.0,1,52,0,...,0,0,0,0,0,0,0,0,0,0


Making the order of columns same as test dataset

In [15]:
df7 = df7.drop('color_type',axis = 'columns')
df7.head()

Unnamed: 0,condition,length(m),height(cm),X1,X2,breed_category,pet_category,time,Agouti,Apricot,...,Silver Lynx Point,Silver Tabby,Tan,Torbie,Tortie,Tortie Point,Tricolor,White,Yellow,Yellow Brindle
0,2.0,0.8,7.78,13,9,0.0,1,73,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,0.72,14.19,13,9,0.0,2,1862,0,0,...,0,0,0,0,0,0,0,1,0,0
2,,0.15,40.9,15,4,2.0,4,752,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,0.62,17.82,0,1,0.0,2,755,0,0,...,0,0,0,0,0,0,0,1,0,0
4,2.0,0.5,11.06,18,4,0.0,1,52,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
bltg = df7['Black Tiger']
brtg = df7['Brown Tiger']

In [17]:
df7 = df7.drop(['Black Tiger','Brown Tiger'],axis='columns')
df7.head()

Unnamed: 0,condition,length(m),height(cm),X1,X2,breed_category,pet_category,time,Agouti,Apricot,...,Silver Lynx Point,Silver Tabby,Tan,Torbie,Tortie,Tortie Point,Tricolor,White,Yellow,Yellow Brindle
0,2.0,0.8,7.78,13,9,0.0,1,73,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,0.72,14.19,13,9,0.0,2,1862,0,0,...,0,0,0,0,0,0,0,1,0,0
2,,0.15,40.9,15,4,2.0,4,752,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,0.62,17.82,0,1,0.0,2,755,0,0,...,0,0,0,0,0,0,0,1,0,0
4,2.0,0.5,11.06,18,4,0.0,1,52,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
df7 = df7.join(bltg)
df7.head()

Unnamed: 0,condition,length(m),height(cm),X1,X2,breed_category,pet_category,time,Agouti,Apricot,...,Silver Tabby,Tan,Torbie,Tortie,Tortie Point,Tricolor,White,Yellow,Yellow Brindle,Black Tiger
0,2.0,0.8,7.78,13,9,0.0,1,73,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,0.72,14.19,13,9,0.0,2,1862,0,0,...,0,0,0,0,0,0,1,0,0,0
2,,0.15,40.9,15,4,2.0,4,752,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,0.62,17.82,0,1,0.0,2,755,0,0,...,0,0,0,0,0,0,1,0,0,0
4,2.0,0.5,11.06,18,4,0.0,1,52,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
df7 = df7.join(brtg)
df7.head()

Unnamed: 0,condition,length(m),height(cm),X1,X2,breed_category,pet_category,time,Agouti,Apricot,...,Tan,Torbie,Tortie,Tortie Point,Tricolor,White,Yellow,Yellow Brindle,Black Tiger,Brown Tiger
0,2.0,0.8,7.78,13,9,0.0,1,73,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,0.72,14.19,13,9,0.0,2,1862,0,0,...,0,0,0,0,0,1,0,0,0,0
2,,0.15,40.9,15,4,2.0,4,752,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,0.62,17.82,0,1,0.0,2,755,0,0,...,0,0,0,0,0,1,0,0,0,0
4,2.0,0.5,11.06,18,4,0.0,1,52,0,0,...,0,0,0,0,0,0,0,0,0,0


Scaling

In [20]:
df7['height(cm)'] = df7['height(cm)'].apply(lambda x: x/100)
df7.head()

Unnamed: 0,condition,length(m),height(cm),X1,X2,breed_category,pet_category,time,Agouti,Apricot,...,Tan,Torbie,Tortie,Tortie Point,Tricolor,White,Yellow,Yellow Brindle,Black Tiger,Brown Tiger
0,2.0,0.8,0.0778,13,9,0.0,1,73,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,0.72,0.1419,13,9,0.0,2,1862,0,0,...,0,0,0,0,0,1,0,0,0,0
2,,0.15,0.409,15,4,2.0,4,752,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,0.62,0.1782,0,1,0.0,2,755,0,0,...,0,0,0,0,0,1,0,0,0,0
4,2.0,0.5,0.1106,18,4,0.0,1,52,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
df7.rename(columns = {'height(cm)':'height(m)'},inplace=True)
df7.head()

Unnamed: 0,condition,length(m),height(m),X1,X2,breed_category,pet_category,time,Agouti,Apricot,...,Tan,Torbie,Tortie,Tortie Point,Tricolor,White,Yellow,Yellow Brindle,Black Tiger,Brown Tiger
0,2.0,0.8,0.0778,13,9,0.0,1,73,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,0.72,0.1419,13,9,0.0,2,1862,0,0,...,0,0,0,0,0,1,0,0,0,0
2,,0.15,0.409,15,4,2.0,4,752,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,0.62,0.1782,0,1,0.0,2,755,0,0,...,0,0,0,0,0,1,0,0,0,0
4,2.0,0.5,0.1106,18,4,0.0,1,52,0,0,...,0,0,0,0,0,0,0,0,0,0


Seperating X and y

In [22]:
X = df7.drop(['breed_category','pet_category'],axis='columns')
y = df7[['breed_category','pet_category']]
y1 = df7['breed_category']
y2 = df7['pet_category']
X.head()

Unnamed: 0,condition,length(m),height(m),X1,X2,time,Agouti,Apricot,Black,Black Brindle,...,Tan,Torbie,Tortie,Tortie Point,Tricolor,White,Yellow,Yellow Brindle,Black Tiger,Brown Tiger
0,2.0,0.8,0.0778,13,9,73,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,0.72,0.1419,13,9,1862,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,,0.15,0.409,15,4,752,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,0.62,0.1782,0,1,755,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,2.0,0.5,0.1106,18,4,52,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


Importing Some Packages used for training

In [23]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [24]:
clf = xgb.XGBClassifier()

Seeing how the train.csv performs on xgboost using cross_val_score

In [25]:
cross_val_score(clf,X,y1,cv=5).mean()

0.8995428195860832

In [26]:
cross_val_score(clf,X,y2,cv=5).mean()

0.9023044125966886

In [27]:
model1 = clf.fit(X,y1)

In [28]:
model2 = clf.fit(X,y2)

Importing Test dataset

In [29]:
df_test = pd.read_csv('test2.csv')
df_test.head(3)

Unnamed: 0.1,Unnamed: 0,condition,length(m),height(m),X1,X2,time,Agouti,Apricot,Black,...,Tan,Torbie,Tortie,Tortie Point,Tricolor,White,Yellow,Yellow Brindle,Black Tiger,Brown Tiger
0,0,0.0,0.87,0.4273,0,7,4404,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1.0,0.06,0.0671,0,1,174,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,1.0,0.24,0.4121,0,7,1999,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [30]:
df_test = df_test.drop('Unnamed: 0',axis = 'columns')
df_test.head()

Unnamed: 0,condition,length(m),height(m),X1,X2,time,Agouti,Apricot,Black,Black Brindle,...,Tan,Torbie,Tortie,Tortie Point,Tricolor,White,Yellow,Yellow Brindle,Black Tiger,Brown Tiger
0,0.0,0.87,0.4273,0,7,4404,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,0.06,0.0671,0,1,174,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,0.24,0.4121,0,7,1999,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,0.29,0.0846,7,1,1148,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1.0,0.71,0.3092,0,7,463,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Predicting and converting results into dataframe

In [31]:
pred_final1 = model1.predict(df_test)
pred_final1

array([2, 1, 2, ..., 2, 4, 2])

In [32]:
pred_final2 = model2.predict(df_test)
pred_final2

array([2, 1, 2, ..., 2, 4, 2])

In [33]:
pred_final1 = np.array(pred_final1)
pred_final2 = np.array(pred_final2)

In [34]:
df_after_pred1 = pd.DataFrame(data=pred_final1[:],columns=['breed_category'])
df_after_pred1

Unnamed: 0,breed_category
0,2
1,1
2,2
3,2
4,2
...,...
8067,2
8068,2
8069,2
8070,4


In [35]:
df_after_pred2 = pd.DataFrame(data=pred_final2[:],columns=['pet_category'])
df_after_pred2

Unnamed: 0,pet_category
0,2
1,1
2,2
3,2
4,2
...,...
8067,2
8068,2
8069,2
8070,4


In [36]:
df_after_pred = df_after_pred1.join(df_after_pred2)
df_after_pred

Unnamed: 0,breed_category,pet_category
0,2,2
1,1,1
2,2,2
3,2,2
4,2,2
...,...,...
8067,2,2
8068,2,2
8069,2,2
8070,4,4


Importing test.csv for ped_id

In [37]:
dff = pd.read_csv('test.csv')
dff.head(3)

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2
0,ANSL_75005,2005-08-17 00:00:00,2017-09-07 15:35:00,0.0,Black,0.87,42.73,0,7
1,ANSL_76663,2018-11-15 00:00:00,2019-05-08 17:24:00,1.0,Orange Tabby,0.06,6.71,0,1
2,ANSL_58259,2012-10-11 00:00:00,2018-04-02 16:51:00,1.0,Black,0.24,41.21,0,7


In [38]:
new = dff[['pet_id']].copy()
new.head()

Unnamed: 0,pet_id
0,ANSL_75005
1,ANSL_76663
2,ANSL_58259
3,ANSL_67171
4,ANSL_72871


In [39]:
dff_finalist = new.join(df_after_pred)
dff_finalist

Unnamed: 0,pet_id,breed_category,pet_category
0,ANSL_75005,2,2
1,ANSL_76663,1,1
2,ANSL_58259,2,2
3,ANSL_67171,2,2
4,ANSL_72871,2,2
...,...,...,...
8067,ANSL_66809,2,2
8068,ANSL_59041,2,2
8069,ANSL_60034,2,2
8070,ANSL_58066,4,4


Saving the answers

In [40]:
dff_finalist.to_csv('answers3.csv',index=False)

In [41]:
ddff1 = pd.read_csv('answers3.csv')
ddff1.head()

Unnamed: 0,pet_id,breed_category,pet_category
0,ANSL_75005,2,2
1,ANSL_76663,1,1
2,ANSL_58259,2,2
3,ANSL_67171,2,2
4,ANSL_72871,2,2
