In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Paths of the two input CSVs (relative paths, resolved against the
# notebook's working directory).
train_file, test_file = 'data/train.csv', 'data/test.csv'

# Read each CSV into its own DataFrame.
df_train, df_test = [pd.read_csv(csv_path) for csv_path in (train_file, test_file)]

In [3]:
# Feature Engineering
class Transformer:
    """Row-wise feature encoder that turns categorical columns into integers.

    An instance is called once per row (for example via
    ``DataFrame.apply(axis=1)``). It reads ``AnimalType``,
    ``SexuponOutcome``, ``AgeuponOutcome``, ``Breed`` and ``Color`` (plus
    ``OutcomeType`` in training mode) and writes ``is_dog``, ``sex``,
    ``age``, ``breed`` and ``color`` (plus ``outcome_type`` in training
    mode) back onto the row.

    In training mode each previously unseen category is assigned the next
    free integer code. Outside training mode the vocabularies are frozen:
    a category that was never seen during training is encoded as
    ``UNKNOWN`` (-1) and is not added to the vocabulary.
    """

    # Multipliers converting "<n> <unit>" age strings into days
    # (a month is counted as 30 days, a year as 365).
    age_units = {'year': 365, 'years': 365, 'month': 30, 'months': 30,
                 'week': 7, 'weeks': 7, 'day': 1, 'days': 1}
    # Code written for categories first encountered outside training mode.
    UNKNOWN = -1

    def __init__(self):
        # Vocabularies and counters live on the instance so that separate
        # Transformer objects never share (and silently extend) each
        # other's category tables.
        self.outcome_counter = 0
        self.id_counter = 0  # not read by this class; kept for compatibility
        self.sex_counter = 0
        self.breed_counter = 0
        self.color_counter = 0
        self.outcome_types = {}
        self.sex_types = {}
        self.breeds = {}
        self.colors = {}
        self.train_mode = True

    def train(self, is_train):
        """Switch between training mode (True) and inference mode (False)."""
        self.train_mode = is_train

    def _encode(self, mapping, key, counter_name):
        """Return the integer code of ``key`` in ``mapping``.

        A new code is taken from the counter attribute named
        ``counter_name`` only in training mode; otherwise an unknown key
        yields ``UNKNOWN`` and ``mapping`` is left untouched.
        """
        if key in mapping:
            return mapping[key]
        if not self.train_mode:
            return self.UNKNOWN
        code = getattr(self, counter_name)
        mapping[key] = code
        setattr(self, counter_name, code + 1)
        return code

    def __call__(self, x, *args, **kwargs):
        """Encode one row in place and return it.

        ``x`` must support ``x[key]`` reads and writes (a pandas Series row
        or a plain dict). Extra positional/keyword arguments are accepted
        and ignored.
        """
        # Outcome: only read in training mode, so rows without an
        # OutcomeType column can be processed in inference mode.
        if self.train_mode:
            x['outcome_type'] = self._encode(
                self.outcome_types, x['OutcomeType'], 'outcome_counter')

        # is_dog
        x['is_dog'] = 1 if x['AnimalType'] == 'Dog' else 0

        # sex
        x['sex'] = self._encode(self.sex_types, x['SexuponOutcome'], 'sex_counter')

        # age: "<number> <unit>" strings become days; anything that is not
        # a string (e.g. a float NaN for a missing value) becomes -1.
        raw_age = x['AgeuponOutcome']
        if isinstance(raw_age, str):
            num, unit = raw_age.split(' ')
            x['age'] = int(num) * self.age_units[unit]
        else:
            x['age'] = -1

        # breed
        x['breed'] = self._encode(self.breeds, x['Breed'], 'breed_counter')

        # color
        # NOTE(review): the separator is '/ ' (slash followed by a space);
        # a value without that exact sequence is used unchanged as the key.
        # Confirm whether a bare '/' was intended.
        key = ':'.join(x['Color'].split('/ '))
        x['color'] = self._encode(self.colors, key, 'color_counter')
        return x
    
def transform_df(df, trans, train=True):
    """Apply ``trans`` to every row of ``df`` and drop the raw columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. It is not modified; a new frame is returned.
    trans : callable
        Called once per row (``df.apply(trans, axis=1)``); expected to
        return the row with the engineered columns added.
    train : bool, default True
        Selects which raw columns are dropped afterwards. With ``True``
        the ``OutcomeType`` / ``OutcomeSubtype`` columns are first cast to
        ``str`` and the training-only columns are dropped as well.

    Returns
    -------
    pandas.DataFrame
        The transformed rows without the raw input columns.
    """
    if train:
        cols = ['AnimalID', 'Name', 'DateTime', 'OutcomeType',
                'OutcomeSubtype', 'AnimalType', 'SexuponOutcome',
                'Breed', 'Color', 'AgeuponOutcome']
        # astype returns a new frame, so the caller's DataFrame keeps its
        # original dtypes instead of being overwritten in place.
        df = df.astype({'OutcomeType': 'str', 'OutcomeSubtype': 'str'})
    else:
        cols = ['Name', 'DateTime', 'AnimalType', 'SexuponOutcome',
                'Breed', 'Color', 'AgeuponOutcome']

    transformed = df.apply(trans, axis=1)
    return transformed.drop(cols, axis=1)
    
# One transformer instance is reused for both the training and the test
# frame, so both are encoded with the same category-to-integer tables.
trans = Transformer()

In [4]:
%%time
# Training mode: the transformer also reads OutcomeType and writes the
# encoded outcome_type column used later as the label.
trans.train(True)
df_train_trans = transform_df(df_train, trans)

CPU times: user 1min 22s, sys: 144 ms, total: 1min 22s
Wall time: 1min 22s


In [5]:
%%time
# Leave training mode so the transformer skips the OutcomeType encoding;
# train=False makes transform_df drop the test-frame column set instead.
trans.train(False)
df_test_trans = transform_df(df_test, trans, train=False)

CPU times: user 30 s, sys: 30.5 ms, total: 30.1 s
Wall time: 30.1 s


In [6]:
# Preview the first rows of the transformed training frame.
df_train_trans.head()

Unnamed: 0,outcome_type,is_dog,sex,age,breed,color
0,0,1,0,365,0,0
1,1,0,1,365,1,1
2,2,1,0,730,2,2
3,3,0,2,21,1,3
4,3,1,0,730,3,4


In [7]:
# Preview the first rows of the transformed test frame.
df_test_trans.head()

Unnamed: 0,ID,is_dog,sex,age,breed,color
0,1,1,3,300,18,8
1,2,1,1,730,104,5
2,3,0,0,365,1,7
3,4,1,2,120,327,28
4,5,1,0,730,66,9


In [8]:
# Training data: the encoded outcome is the label, every other column is
# a feature.
x_train = df_train_trans.drop(['outcome_type'], axis=1)
y_train = df_train_trans.loc[:, 'outcome_type']

# Test data: the ID column is set aside for later and removed from the
# feature matrix.
x_test = df_test_trans.drop(['ID'], axis=1)
ids = df_test_trans.loc[:, 'ID']

In [9]:
import xgboost as xgb



In [10]:
# Wrap the feature frames in XGBoost DMatrix objects; only the training
# matrix carries labels.
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)

# Multi-class objective with one class per distinct outcome seen by the
# transformer during training.
# NOTE(review): newer XGBoost releases replace 'silent' with 'verbosity';
# confirm against the installed version before changing it.
param = {'max_depth': 6, 'eta': 0.01, 'silent': 1,
         'objective': 'multi:softprob', 'num_class': trans.outcome_counter}
num_round = 500
watch_list = [(dtrain, 'train')]
# The watch list was previously built but never handed to xgb.train, so no
# training metric was reported. Pass it as `evals` and log every 50 rounds
# to keep the cell output short. This does not change the fitted model.
bst = xgb.train(param, dtrain, num_round, evals=watch_list, verbose_eval=50)

In [11]:
# Arrange the booster's test predictions as one row per test sample and
# one column per outcome class.
predicts = np.reshape(bst.predict(dtest), (len(x_test), trans.outcome_counter))
predicts

array([[ 0.18788256,  0.07614568,  0.04348592,  0.6645422 ,  0.02794362],
       [ 0.29707035,  0.02786294,  0.49521691,  0.16688395,  0.0129658 ],
       [ 0.1203969 ,  0.03549758,  0.39177766,  0.43809819,  0.01422967],
       ..., 
       [ 0.01193555,  0.0867198 ,  0.01419437,  0.87308359,  0.01406675],
       [ 0.42363667,  0.0481517 ,  0.34261122,  0.17168956,  0.01391087],
       [ 0.3646192 ,  0.24943037,  0.03564737,  0.33713725,  0.0131658 ]], dtype=float32)

In [20]:
# (outcome name, class index) pairs ordered alphabetically by outcome name,
# followed by just the class indices in that same order.
labels = [(name, trans.outcome_types[name]) for name in sorted(trans.outcome_types)]
label_indices = [class_idx for _, class_idx in labels]
print(labels)
label_indices

[('Adoption', 2), ('Died', 4), ('Euthanasia', 1), ('Return_to_owner', 0), ('Transfer', 3)]


[2, 4, 1, 0, 3]

In [31]:
# Derive the submission layout from `labels` instead of repeating the
# outcome names by hand, so the header can never disagree with the order in
# which the probability columns are picked.
class_names = [label for label, _ in labels]
class_order = [idx for _, idx in labels]
assert predicts.shape[1] == len(labels), 'expected one probability column per outcome'

# First column holds the IDs, the remaining columns hold the class
# probabilities reordered to follow `labels`. Everything is float64, the
# same dtype a np.zeros buffer would have, so the written CSV is unchanged.
ans = np.column_stack([np.asarray(ids, dtype=np.float64),
                       predicts[:, class_order].astype(np.float64)])
df_ans = pd.DataFrame(ans, columns=['ID'] + class_names)
df_ans['ID'] = df_ans['ID'].astype(int)
df_ans.to_csv('data/submit.csv', index=False)
# Show a bounded preview rather than the whole frame.
df_ans.head(10)

Unnamed: 0,ID,Adoption,Died,Euthanasia,Return_to_owner,Transfer
0,1,0.043486,0.027944,0.076146,0.187883,0.664542
1,2,0.495217,0.012966,0.027863,0.297070,0.166884
2,3,0.391778,0.014230,0.035498,0.120397,0.438098
3,4,0.041037,0.009198,0.037468,0.092755,0.819541
4,5,0.436589,0.014557,0.034655,0.321403,0.192796
5,6,0.364850,0.013882,0.052771,0.375473,0.193024
6,7,0.363221,0.016640,0.287968,0.151836,0.180335
7,8,0.692918,0.012656,0.013037,0.031531,0.249858
8,9,0.590468,0.010145,0.013780,0.188102,0.197504
9,10,0.546669,0.012186,0.061225,0.223468,0.156452
