In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Load the pre-processed train/test tables from the working directory.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_updated.csv', 'test_updated.csv')
)

In [3]:
# Collect the distinct pet ids of each split (value_counts ignores missing ids)
# and show the ids that occur in both; an empty set means the splits are disjoint.
list1, list2 = (
    list(frame['pet_id'].value_counts().sort_values().index)
    for frame in (train, test)
)
set(list1).intersection(list2)

set()

In [4]:
# Keep only the first occurrence of every fully duplicated row.
train = train[~train.duplicated(keep='first')]

In [5]:
# Training rows whose issue date is later than their listing date.
issued_after_listing = train['issue_date_dt'].gt(train['listing_date_dt'])
train.loc[issued_after_listing]

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),X1,X2,breed_category,pet_category,issue_date_dt,listing_date_dt,date_diff,conditionnan,height(m)
1504,ANSL_52243,2018-01-17 00:00:00,2018-01-14 15:13:00,2.0,Orange Tabby,0.72,13,9,0.0,1,2018-01-17,2018-01-14,-3,0,0.4319
5301,ANSL_63737,2016-11-18 00:00:00,2016-09-03 17:01:00,0.0,Black,0.88,0,1,1.0,1,2016-11-18,2016-09-03,-76,0,0.2782


In [6]:
# Test rows whose issue date is later than their listing date.
issued_after_listing = test['issue_date_dt'].gt(test['listing_date_dt'])
test.loc[issued_after_listing]

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),X1,X2,issue_date_dt,listing_date_dt,date_diff,conditionnan,height(m)


Two training rows have an issue date later than their listing date, which looks like a data anomaly; the test set has no such rows. We simply drop those two training rows.

In [7]:
# Drop only the anomalous rows (issue date later than listing date).
# `<=` rather than `<` so that rows issued and listed on the same date,
# which are not anomalous, are kept.
train = train[train['issue_date_dt'] <= train['listing_date_dt']]

In [8]:
# Sanity check: no training row should have issue date > listing date any more.
still_anomalous = train['issue_date_dt'].gt(train['listing_date_dt'])
train.loc[still_anomalous]

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),X1,X2,breed_category,pet_category,issue_date_dt,listing_date_dt,date_diff,conditionnan,height(m)


In [9]:
# The raw timestamp columns are no longer needed once the derived
# *_dt / date_diff columns exist, so remove them from both splits.
raw_date_cols = ['issue_date', 'listing_date']
train = train.drop(columns=raw_date_cols)
test = test.drop(columns=raw_date_cols)

Next, we handle the categorical features, starting with `color_type`.

In [10]:
# Number of distinct (non-missing) colour types in the training split.
train['color_type'].nunique()

56

In [11]:
# Number of distinct (non-missing) colour types in the test split.
test['color_type'].nunique()

54

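There are 56 distinct color types in the training set (54 in the test set). One-hot encoding would add far too many sparse columns, so instead we encode each color by its rank when colors are ordered by the mean of a combined pet/breed label.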

In [12]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()

# Build one combined label per row by concatenating the string forms of the
# two target columns, then replace it with the encoder's integer code.
pet_part = train['pet_category'].astype(str)
breed_part = train['breed_category'].astype(str)
train['pet+breed'] = label_encoder.fit_transform(pet_part + breed_part)

In [13]:
# Frequency of each combined pet/breed code, most common first.
train['pet+breed'].value_counts(sort=True, ascending=False)

6     5692
7     4869
4     3405
3     3194
11     783
5      583
9       83
10      75
8       60
2       51
0       30
1        7
Name: pet+breed, dtype: int64

In [14]:
# Order the colour types by the mean combined pet/breed code observed in train,
# then use each colour's position in that ordering as its integer encoding.
ranked_colors = train.groupby('color_type')['pet+breed'].mean().sort_values().index
color_type_hot = {color: rank for rank, color in enumerate(ranked_colors)}

# Apply the train-derived mapping to both splits.
# NOTE(review): a test colour that never occurs in train is not in the mapping
# and becomes NaN here - confirm whether that can happen and how it should be filled.
for frame in (train, test):
    frame['color_type_encode'] = frame['color_type'].map(color_type_hot)

In [15]:
# The combined pet/breed code was only needed to rank the colour types.
train = train.drop(columns=['pet+breed'])

# Take real copies instead of aliases: the scaling cells below assign into
# df / df_test column by column, and with plain `df = train` that would
# silently overwrite `train` / `test` as well.
df = train.copy()
df_test = test.copy()

In [16]:
# Min-max scale the numeric feature columns of both splits.
# The two target columns are left untouched; object (string) columns are skipped.
target_cols = ('pet_category', 'breed_category')
num_features = [
    feature for feature in df.columns
    if df[feature].dtypes != 'O' and feature not in target_cols
]

# Fit the scaling statistics on the training frame ONLY. Scaling the test frame
# with its own min/max would map the same raw value (e.g. the same
# color_type_encode rank) to different scaled values in train and test.
train_min = df[num_features].min()
train_range = df[num_features].max() - train_min
# A constant column has zero range; divide by 1 so it scales to 0 instead of
# turning into NaN through 0/0.
train_range = train_range.replace(0, 1)

df[num_features] = (df[num_features] - train_min) / train_range
df.to_csv('C:/Pet-Adoption/train_norm.csv', index=False)

# Reuse the train statistics for the test frame. Test values outside the range
# seen in train will legitimately fall outside [0, 1].
test_features = [feature for feature in num_features if feature in df_test.columns]
df_test[test_features] = (
    (df_test[test_features] - train_min[test_features]) / train_range[test_features]
)

In [19]:
# Persist the scaled test frame without writing the index as a column.
df_test.to_csv(path_or_buf='C:/Pet-Adoption/test_norm.csv', index=False)

In [20]:
# Pairwise correlation of the numeric columns. The frame still holds string
# columns (pet_id, color_type); selecting numeric dtypes explicitly keeps this
# working on pandas >= 2.0, where DataFrame.corr() no longer drops
# non-numeric columns silently.
df.select_dtypes(include='number').corr()

Unnamed: 0,condition,length(m),X1,X2,breed_category,pet_category,date_diff,conditionnan,height(m),color_type_encode
condition,1.0,-0.006032,0.457002,0.359059,0.103486,0.19606,-0.159945,0.60982,-0.006145,0.026807
length(m),-0.006032,1.0,-0.002883,-0.011156,0.007224,-0.003892,0.001862,0.004066,-0.004514,-0.001197
X1,0.457002,-0.002883,1.0,0.584346,0.240836,-0.032579,-0.102617,0.345107,-0.003875,-0.132773
X2,0.359059,-0.011156,0.584346,1.0,0.052634,-0.032107,0.118708,0.106482,-0.008296,-0.026934
breed_category,0.103486,0.007224,0.240836,0.052634,1.0,0.209232,-0.102793,0.648145,0.011708,0.060408
pet_category,0.19606,-0.003892,-0.032579,-0.032107,0.209232,1.0,0.093253,0.360549,0.002041,0.606243
date_diff,-0.159945,0.001862,-0.102617,0.118708,-0.102793,0.093253,1.0,-0.132064,-0.000466,0.128795
conditionnan,0.60982,0.004066,0.345107,0.106482,0.648145,0.360549,-0.132064,1.0,0.003528,0.109123
height(m),-0.006145,-0.004514,-0.003875,-0.008296,0.011708,0.002041,-0.000466,0.003528,1.0,0.001314
color_type_encode,0.026807,-0.001197,-0.132773,-0.026934,0.060408,0.606243,0.128795,0.109123,0.001314,1.0
