In [None]:
%matplotlib inline

import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import label_binarize, normalize, LabelEncoder
from sklearn.metrics import log_loss
from sklearn.decomposition import PCA

from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import SGDClassifier

from xgboost import XGBClassifier
import xgboost as xgb

In [None]:
# Read the training set (path is relative to the notebook's working directory)
# and print a summary of its columns, dtypes and non-null counts.
df = pd.read_csv(filepath_or_buffer='../input/train.csv')
df.info()

In [None]:
# Show the first five rows to get a feel for the raw columns.
df.head(n=5)

The age of the animals is likely to be important, so we need to clean it up a bit.
From the first rows we can see that the age is in the format `<number> <period>`.

Let's see which periods are present.

In [None]:
# The second whitespace-separated token of each age string is the period word;
# list the distinct values that occur.
age_tokens = df['AgeuponOutcome'].str.split(expand=True)
age_tokens[1].unique()

So let's map each of these to a "multiplier" and calculate the age of the animals in days.

In [None]:
# Number of days represented by one unit of each period word
# (months and years are approximated as 30 and 365 days).
days_per_unit = {
    'day': 1, 'days': 1,
    'week': 7, 'weeks': 7,
    'month': 30, 'months': 30,
    'year': 365, 'years': 365,
}

# Split "<count> <unit>" into two columns: column 0 is the count, column 1 the unit.
age_parts = df['AgeuponOutcome'].str.split(expand=True)
unit_count = age_parts[0].astype('float32')
unit_days = age_parts[1].map(days_per_unit)
del age_parts  # the intermediate split is not needed any more

df['age_in_days'] = unit_count * unit_days

In [None]:
# SexuponOutcome is split on whitespace; print the distinct values of the
# first token and then of the second token.
sex_tokens = df['SexuponOutcome'].str.split(expand=True)
for position in (0, 1):
    print(sex_tokens[position].unique())

In [None]:
# First token of SexuponOutcome -> coarse condition; Neutered and Spayed are
# merged into a single "operated" value.
condition_by_status = {
    'Neutered': 'operated',
    'Spayed': 'operated',
    'Intact': 'intact',
    'Unknown': 'unknown',
}

sex_tokens = df['SexuponOutcome'].str.split(expand=True)
df['condition'] = sex_tokens[0].map(condition_by_status)
df['sex'] = sex_tokens[1]  # second token is kept as-is

# Overall distribution of the target.
sns.countplot(data=df, x='OutcomeType')

So, it looks like most of the animals are being adopted or transferred, with some returning to their owner.
Only a small part of them are euthanized or die (thank god).

But how are these outcomes related to other characteristics of the animal? Are dogs and cats treated alike?

In [None]:
# Outcome counts, one bar per animal type within each outcome.
sns.countplot(data=df, x='OutcomeType', hue='AnimalType')

Dogs are preferred (as one would expect). The vast majority of the animals returned to their owner are dogs.
Dogs also have a nice lead in adoption rate.
 
What about the sex?

In [None]:
# Outcome counts broken down by the animal's sex.
sns.countplot(data=df, x='OutcomeType', hue='sex')

So, in general, the sex of the animal doesn't appear to have much influence on its final destiny.

In [None]:
# Outcome counts broken down by condition (operated / intact / unknown).
sns.countplot(data=df, x='OutcomeType', hue='condition')

That's not the case for the condition of the animal. Most of the adopted or returned animals are operated.

Now let's see how age influences this; for this we will use a boxplot.

In [None]:
# Age distribution per outcome; showfliers=False hides the outlier points.
sns.boxplot(data=df, x='OutcomeType', y='age_in_days', showfliers=False)

Contrary to my initial expectations, young animals are the ones that die most, while old animals return to their owner (their owners miss them?) or are euthanized (putting them out of their misery).

Adoptions are mainly about young animals.

Let's see how the name of the animals influences their odds.

In [None]:
# Flag rows whose Name is present (non-null), then compare outcomes for
# named vs. unnamed animals.
df['has_name'] = df['Name'].notnull()
sns.countplot(data=df, x='OutcomeType', hue='has_name')

So, if the animal has a name, people are more inclined to get it back or even adopt it (I get the case of return_to_owner, but... can't new owners give their pets a name?).

In [None]:
# A breed string is treated as "pure" when it does not contain "mix"
# (case-insensitive).
df['pure_breed'] = ['mix' not in breed.lower() for breed in df['Breed']]
sns.countplot(data=df, x='OutcomeType', hue='pure_breed')

In [None]:
# Parse the timestamp column and derive calendar features from it,
# each stored as a categorical column.
df['DateTime'] = pd.to_datetime(df['DateTime'])
outcome_time = df['DateTime'].dt
for part in ('month', 'weekday', 'hour'):
    df[part] = getattr(outcome_time, part).astype('category')

sns.countplot(data=df, x='OutcomeType', hue='month')

In [None]:
# Outcome counts broken down by day of the week.
sns.countplot(data=df, x='OutcomeType', hue='weekday')

In [None]:
# Outcome counts broken down by hour of the day.
sns.countplot(data=df, x='OutcomeType', hue='hour')

In [None]:
# Color strings are "/"-separated; keep the first two parts as separate
# columns and print the distinct values of each.
color_parts = df['Color'].str.split('/', expand=True)
for position, column in enumerate(['color_1', 'color_2']):
    df[column] = color_parts[position]
    print(df[column].unique())

In [None]:
# Frequency of each breed, with and without the " Mix" suffix.
breed_counts = df.Breed.value_counts()
base_breed_counts = df.Breed.str.replace(' Mix', '').value_counts()

print(breed_counts[:10])
print('---------')
print(base_breed_counts[:10])
print()
print()
print(len(breed_counts))
print(len(base_breed_counts))

# value_counts() is indexed by breed name while df is indexed by row, so
# assigning pd.cut(value_counts) directly to a df column aligns on nothing and
# produces an all-NaN feature. Map every row to the frequency of its own breed
# first and bin that instead. The five equal-width bins depend only on the
# smallest and largest count, so the bin edges are the same as when cutting the
# per-breed counts.
df['breed_rarity'] = pd.cut(
    df.Breed.map(breed_counts),
    5,
    include_lowest=True,
    labels=['super rare', 'rare', 'comon', 'very comon', 'very very common'],
)

In [None]:
# Candidate predictors; non-numeric ones are one-hot encoded by get_dummies.
feature_columns = [
    'age_in_days', 'condition', 'sex', 'AnimalType', 'has_name',
    'color_1', 'color_2', 'pure_breed', 'breed_rarity',
    'hour', 'weekday', 'month',
]

y = df['OutcomeType']
x = pd.get_dummies(df[feature_columns])

# Per-column non-null counts make missing values easy to spot.
x.info(verbose=True, memory_usage='deep', null_counts=True)

Age in days has some null values. Let's fix it.

In [None]:
# Columns to one-hot encode; age_in_days stays numeric and OutcomeType is the target.
categorical_columns = [
    'condition', 'sex', 'AnimalType', 'has_name', 'color_1', 'color_2',
    'pure_breed', 'breed_rarity', 'hour', 'weekday', 'month',
]

df2 = df[['OutcomeType', 'age_in_days'] + categorical_columns]
df2 = pd.get_dummies(df2, columns=categorical_columns)
# Drop the rows that still contain a missing value after encoding.
df2 = df2.dropna(axis=0)

x = df2.drop('OutcomeType', axis=1)
# Encode the outcome labels as integers.
y = LabelEncoder().fit_transform(df2['OutcomeType'])

print(x.shape)
print(y.shape)

In [None]:
# Baseline random forest with default hyper-parameters.
# A fixed seed (the same 42 used by the tuning cells) makes the reported score
# and the feature importances reproducible across runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(x, y)

# NOTE: this is the log loss on the very rows the model was fitted on, so it
# is an optimistic figure and says nothing about generalisation.
y_pred = rf.predict_proba(x)
print(log_loss(y, y_pred))

In [None]:
# Rank (importance, feature name) pairs from most to least important once,
# then print the head and the tail of the ranking.
ranked_features = sorted(zip(rf.feature_importances_, x.columns), reverse=True)

print('Top features')
for importance, name in ranked_features[:10]:
    print('%.3f %s' % (importance, name))
print()
print('Botton features')
for importance, name in ranked_features[-20:]:
    print('%.5f %s' % (importance, name))

So age in days is really important... What if we bin it?
 
Also, breed_rarity didn't help at all, so let's remove it.

In [None]:
# Replace the numeric age by 10 equal-width bins.
df['age_bined'] = pd.cut(df.age_in_days, 10, include_lowest=True)

categorical_columns = [
    'age_bined', 'condition', 'sex', 'AnimalType', 'has_name',
    'color_1', 'color_2', 'pure_breed', 'hour', 'weekday', 'month',
]
df2 = df[['OutcomeType'] + categorical_columns]

# Rows with a missing age must be dropped BEFORE one-hot encoding: get_dummies
# turns a missing bin into an all-zero row, so a dropna afterwards would keep
# those rows and this model would be scored on a different set of rows than the
# age_in_days model it is being compared with.
df2 = df2.dropna(axis=0, subset=['age_bined'])
df2 = pd.get_dummies(df2, columns=categorical_columns)
df2 = df2.dropna(axis=0)

y = LabelEncoder().fit_transform(df2['OutcomeType'])
x = df2.drop('OutcomeType', axis=1)

print(x.shape)
print(y.shape)

# Fixed seed so that the score can be compared between feature sets.
rf = RandomForestClassifier(n_estimators=30, max_depth=None, random_state=42)
rf.fit(x, y)
# Log loss on the training rows themselves (optimistic).
y_pred = rf.predict_proba(x)
print(log_loss(y, y_pred))

In [None]:
# Sort (importance, feature name) pairs in descending order a single time and
# report the ten strongest and the ten weakest features.
importance_ranking = sorted(zip(rf.feature_importances_, x.columns), reverse=True)
strongest, weakest = importance_ranking[:10], importance_ranking[-10:]

print('Top features')
for weight, column in strongest:
    print('%.3f %s' % (weight, column))
print()
print('Botton features')
for weight, column in weakest:
    print('%.5f %s' % (weight, column))

So... binning the age didn't help a bit... Let's keep the age_in_days.
 
We can try using the breed information to augment the dataset, but it's too large (lots of different values).
We can use it, but first we have to reduce the number of features (columns) using PCA.

In [None]:
# One-hot encode the breed and compress it with PCA, keeping as many
# components as needed to explain 90% of the variance.
breed_dummies = pd.get_dummies(df['Breed'])
breed_components = PCA(n_components=0.90).fit_transform(breed_dummies)
print(breed_components.shape)

# Wrap the components in a frame with columns breed_0, breed_1, ...
component_names = ['breed_%d' % i for i in range(breed_components.shape[1])]
breed = pd.DataFrame(breed_components, columns=component_names)

In [None]:
# Same features as the age_in_days model (without breed_rarity), plus the
# PCA-compressed breed components.
categorical_columns = [
    'condition', 'sex', 'AnimalType', 'has_name', 'color_1', 'color_2',
    'pure_breed', 'hour', 'weekday', 'month',
]
df2 = df[['age_in_days', 'OutcomeType'] + categorical_columns]
df2 = pd.get_dummies(df2, columns=categorical_columns)
# concat aligns on the index: this relies on df and breed sharing the same
# row labels (both are built from the full, unfiltered df).
df2 = pd.concat((df2, breed), axis=1)

# Drop rows with a missing value (e.g. an unknown age).
df2 = df2.dropna(axis=0)
y = LabelEncoder().fit_transform(df2['OutcomeType'])
x = df2.drop('OutcomeType', axis=1)

# Fixed seed so that the score can be compared with the previous feature sets.
rf = RandomForestClassifier(n_estimators=30, max_depth=None, random_state=42)
rf.fit(x, y)
# Log loss on the training rows themselves (optimistic).
y_pred = rf.predict_proba(x)
print(log_loss(y, y_pred))

In [None]:
# Importance ranking, highest first; printed as the top ten followed by the
# bottom ten.
by_importance = sorted(zip(rf.feature_importances_, x.columns), reverse=True)

for heading, rows, template in (
    ('Top features', by_importance[:10], '{:.3f} {}'),
    ('Botton features', by_importance[-10:], '{:.5f} {}'),
):
    if rows is not by_importance[:10] and heading == 'Botton features':
        print()  # blank line between the two sections
    print(heading)
    for score, feat in rows:
        print(template.format(score, feat))

While not among the top 10 features, the breed of the animal evidently helped a bit.
 
Let's see how our model behaves on unseen data. To do so, we will use cross-validation.

In [None]:
# 3-fold cross-validated log loss of a default random forest.
# The scorer is called 'neg_log_loss' (the old 'log_loss' name was removed from
# scikit-learn); it returns the NEGATED log loss, so values closer to 0 are better.
# The seed keeps the forest itself reproducible.
cross_val_score(RandomForestClassifier(random_state=42), x, y, cv=3, scoring='neg_log_loss')

Eek! This is bad! The predictions are really BAD. But on the training set all was good... This indicates a problem of overfitting: the model fits the training data really well but can't generalize to unseen data.
When dealing with trees, one way to overcome this is to control the max_depth of the tree. Let's see how this influences the predictions.

In [None]:
# Hold out part of the data once; the seed keeps the split (and therefore the
# curves in this and the following cells) reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=42)

# Train/test log loss as a function of the maximum tree depth.
depth = list(range(1, 30, 2))
test_score = []
train_score = []
for d in depth:
    rfc = RandomForestClassifier(max_depth=d, random_state=42)
    rfc.fit(X_train, y_train)
    train_score.append(log_loss(y_train, rfc.predict_proba(X_train)))
    test_score.append(log_loss(y_test, rfc.predict_proba(X_test)))

# x/y are passed by keyword: recent seaborn versions no longer accept them
# positionally. Red = training set, blue = held-out set.
ax = sns.pointplot(x=depth, y=train_score, color='red')
ax = sns.pointplot(x=depth, y=test_score, color='blue', ax=ax)
ax.set(xlabel='max_depth', ylabel='log loss', title='train (red) vs. test (blue)')

That's the problem! The depth plays an important role here. Until the depth is 9-10 all is good and it helps lower the training scores (a bit), but when we go above 13 the test results start increasing REALLY fast. So let's keep the depth in check with a value of 10.

As our data is also very unbalanced (very few animals die), we might take a look at the min_samples_leaf parameter of RandomForests as well (this controls how many items must be at a leaf).

In [None]:
# Train/test log loss as a function of min_samples_leaf, with the depth fixed
# at 10. Uses the X_train/X_test split created in the max_depth cell.
min_samples_leaf = list(range(1, 50, 2))
test_score = []
train_score = []
for m in min_samples_leaf:
    rfc = RandomForestClassifier(max_depth=10, min_samples_leaf=m, random_state=42)
    rfc.fit(X_train, y_train)
    train_score.append(log_loss(y_train, rfc.predict_proba(X_train)))
    test_score.append(log_loss(y_test, rfc.predict_proba(X_test)))

# x/y are passed by keyword: recent seaborn versions no longer accept them
# positionally. Red = training set, blue = held-out set.
ax = sns.pointplot(x=min_samples_leaf, y=train_score, color='red')
ax = sns.pointplot(x=min_samples_leaf, y=test_score, color='blue', ax=ax)
ax.set(xlabel='min_samples_leaf', ylabel='log loss', title='train (red) vs. test (blue)')

In [None]:
# Train/test log loss as a function of the number of trees.
# Uses the X_train/X_test split created in the max_depth cell.
# NOTE(review): the narrative settles on max_depth=10 but this sweep uses 15 —
# confirm that this is intentional.
estimators = list(range(10, 150, 10))
test_score = []
train_score = []
for e in estimators:
    rfc = RandomForestClassifier(max_depth=15, min_samples_leaf=1, random_state=42, n_estimators=e)
    rfc.fit(X_train, y_train)
    train_score.append(log_loss(y_train, rfc.predict_proba(X_train)))
    test_score.append(log_loss(y_test, rfc.predict_proba(X_test)))

# x/y are passed by keyword: recent seaborn versions no longer accept them
# positionally. Red = training set, blue = held-out set.
ax = sns.pointplot(x=estimators, y=train_score, color='red')
ax = sns.pointplot(x=estimators, y=test_score, color='blue', ax=ax)
ax.set(xlabel='n_estimators', ylabel='log loss', title='train (red) vs. test (blue)')

So, above 40 trees we don't seem to see much improvement. But maybe, now that we have more trees, some of the other parameters can be changed. While we could go back and check max_depth and min_samples_leaf with the new values, there is a better way: grid search with cross-validation.

In [None]:
# Small grid: three evenly spaced values for each numeric parameter.
rfc_params = {
    'n_estimators': np.linspace(10, 60, 3, dtype='int'),
    'max_depth': np.linspace(2, 30, 3, dtype='int'),
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': np.linspace(1, 15, 3, dtype='int'),
}

# 'neg_log_loss' replaces the removed 'log_loss' scorer name. The score is the
# NEGATED log loss, so the search maximises it and best_score_ is <= 0.
# The seed makes every candidate forest reproducible, so that differences
# between grid points come from the parameters and not from random noise.
grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    rfc_params,
    verbose=0,
    scoring='neg_log_loss',
    n_jobs=-1,
)
grid.fit(x, y)
print(grid.best_score_)
print(grid.best_params_)

In [None]:
# Multi-class boosted trees returning per-class probabilities.
# num_class is derived from the encoded labels instead of being hard-coded, so
# it stays correct if the set of outcome classes changes.
xgb_params = dict(objective='multi:softprob',
                  max_depth=10,
                  learning_rate=0.1,
                  num_class=len(set(y)))
data = xgb.DMatrix(x, label=y)

# Cross-validated multi-class log loss over 30 boosting rounds; the returned
# evaluation history is displayed as the cell output.
xgb.cv(xgb_params, data, num_boost_round=30, metrics='mlogloss', verbose_eval=True)