In [8]:
%matplotlib inline

import pandas as pd
import seaborn as sns
import numpy as np

from matplotlib import pyplot as plt

from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier, BaggingClassifier, VotingClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.preprocessing import label_binarize, normalize, LabelEncoder
from sklearn.metrics import log_loss, classification_report
from sklearn.decomposition import PCA

from sklearn.cross_validation import cross_val_score, KFold, StratifiedKFold, train_test_split
from sklearn.feature_selection import RFECV
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV


In [9]:
# Read the training data; the path is relative to the notebook's working directory.
train_csv_path = '../input/train.csv'
df = pd.read_csv(train_csv_path)
# Show the first rows as a quick check of the parsed columns.
df.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [10]:
# --- Name feature -----------------------------------------------------------
# True when a name is recorded for the animal. The cast uses the portable
# 'bool' dtype name: 'bool8' is a deprecated NumPy alias that newer NumPy
# releases no longer accept.
df['has_name'] = pd.notnull(df['Name']).astype('bool')
# --- Calendar features taken from the outcome timestamp ---------------------
df['DateTime'] = pd.to_datetime(df['DateTime'])
df['year'] = df.DateTime.dt.year.astype('category')
df['month'] = df.DateTime.dt.month.astype('category')
# pandas numbers weekdays from Monday=0 to Sunday=6, so 5 and 6 are the weekend.
df['weekday'] = df.DateTime.dt.weekday.astype('category')
df['week_end'] = df.weekday.isin([5,6])
df['hour'] = df.DateTime.dt.hour.astype('category')
def hour_period_map(x):
    """Return the part-of-day label for an hour value.

    Hours 7-12 map to 'morning', 13-18 to 'afternoon' and 19-23 to 'night'.
    Every other value (including hours 0-6) maps to 'dawn'.
    """
    buckets = (
        (range(7, 13), 'morning'),
        (range(13, 19), 'afternoon'),
        (range(19, 24), 'night'),
    )
    for hours, label in buckets:
        if x in hours:
            return label
    # Anything not matched above, i.e. [0,1,2,3,4,5,6] for valid hours.
    return 'dawn'
    
# Part-of-day label for each outcome hour.
df['period'] = df.hour.apply(hour_period_map)
#####
df['is_dog'] = (df['AnimalType'] == 'Dog')
####
# SexuponOutcome is split on whitespace: the first token is mapped to an
# operated/intact/unknown condition, the second is compared against 'Male'.
foo = df['SexuponOutcome'].str.split(expand=True)
df['condition'] = foo[0].map({'Neutered': 'operated', 'Spayed':'operated', 'Intact': 'intact', 'Unknown': 'unknown'})
# NOTE: rows without a second token compare unequal to 'Male' and therefore
# end up as False here.
df['is_male'] = (foo[1] == 'Male')
#####
# AgeuponOutcome is split into a number and a unit; the unit is converted to
# an approximate number of days (30-day months, 365-day years).
time_multiplier = dict(year = 365, years=365, weeks = 7, month = 30, months=30, days = 1, week = 7, day= 1)
foo = df['AgeuponOutcome'].str.split(expand=True)
age = foo[0].astype('float32')
period = foo[1]
period = period.map(time_multiplier)
df['age_in_days'] = age * period
df['age_in_years'] = (df['age_in_days'] / 365).round()
df['age_in_months'] = (df['age_in_days'] / 30).round()
#####
# A breed counts as pure when its description does not contain "mix".
# 'bool' replaces the deprecated 'bool8' dtype alias.
df['pure_breed'] = df.Breed.apply(lambda x : 'mix' not in x.lower()).astype('bool')
####
# Color may hold two colours separated by '/'; keep the first word of each.
cores = df.Color.str.split('/', expand=True)

# .iloc[:, 0] selects the first column of the expanded split; it replaces the
# .ix indexer, which has been removed from pandas.
df['cores_1'] = cores[0].str.split(expand=True).iloc[:, 0]
df['cores_2'] = cores[1].str.split(expand=True).iloc[:, 0]
df['bi_color'] =  df.Color.str.contains('/').astype('bool')
####
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26729 entries, 0 to 26728
Data columns (total 27 columns):
AnimalID          26729 non-null object
Name              19038 non-null object
DateTime          26729 non-null datetime64[ns]
OutcomeType       26729 non-null object
OutcomeSubtype    13117 non-null object
AnimalType        26729 non-null object
SexuponOutcome    26728 non-null object
AgeuponOutcome    26711 non-null object
Breed             26729 non-null object
Color             26729 non-null object
has_name          26729 non-null bool
year              26729 non-null category
month             26729 non-null category
weekday           26729 non-null category
week_end          26729 non-null bool
hour              26729 non-null category
period            26729 non-null object
is_dog            26729 non-null bool
condition         26728 non-null object
is_male           26729 non-null bool
age_in_days       26711 non-null float64
age_in_years      26711 non-null float64
a

In [11]:
#foo = df.groupby(['OutcomeType','cores_1'])['AnimalID'].count()
#foo = foo.unstack(level=0)

#foo.divide(np.sum(foo, axis=1), axis=0)
#foo = ' '.join(df.Breed.ravel()).replace('/', ' ').split()
#hair =  pd.value_counts([f for f in foo if 'hair' in f.lower()])


In [12]:
# Columns kept for modelling, with the target as the last entry.
# 'week_end', 'period', 'age_in_days' and 'age_in_months' are left out.
model_cols = [
    'has_name', 'year', 'month', 'weekday', 'hour', 'is_dog', 'condition',
    'is_male', 'age_in_years', 'pure_breed', 'cores_1', 'cores_2', 'bi_color',
    'OutcomeType',
]
# Categorical columns expanded into one indicator column per level.
onehot_cols = ['year', 'month', 'weekday', 'hour', 'condition', 'cores_1', 'cores_2']

df2 = pd.get_dummies(df[model_cols], columns=onehot_cols)
# Discard every row that still holds a missing value after the encoding.
df2 = df2.dropna(axis=0)

# Split the frame into the target vector and the feature matrix.
y = df2['OutcomeType']
x = df2.drop('OutcomeType', axis=1)
#x_norm = normalize(x, 'l2')

In [13]:
# Hyper-parameter grid for the random forest; alternative ranges are kept in
# the trailing comments.
rfc_params = {
    'n_estimators' : [50], # np.linspace(100, 700, 3, dtype='int'),
    'max_depth': np.linspace(15, 30, 5, dtype='int'),
    'min_samples_leaf': [1,2,3], # np.linspace(1, 3, 2, dtype='int'),   
    #'class_weight' : [None, 'balanced'],
}
# The forest is seeded so that score differences between candidates reflect the
# hyper-parameters rather than run-to-run randomness, and so that re-running
# the cell reproduces the same ranking.
rf_grid = GridSearchCV(RandomForestClassifier(random_state=42), rfc_params, verbose=3, scoring='log_loss', n_jobs=4)
rf_grid.fit(x, y)
print(rf_grid.best_score_)
print(rf_grid.best_params_)

Fitting 3 folds for each of 15 candidates, totalling 45 fits
[CV] min_samples_leaf=1, n_estimators=50, max_depth=15 ...............
[CV] min_samples_leaf=1, n_estimators=50, max_depth=15 ...............
[CV] min_samples_leaf=1, n_estimators=50, max_depth=15 ...............
[CV] min_samples_leaf=2, n_estimators=50, max_depth=15 ...............
[CV]  min_samples_leaf=1, n_estimators=50, max_depth=15, score=-0.826133 -   2.6s
[CV] min_samples_leaf=2, n_estimators=50, max_depth=15 ...............


[Parallel(n_jobs=4)]: Done   1 jobs       | elapsed:    2.6s


[CV]  min_samples_leaf=1, n_estimators=50, max_depth=15, score=-0.820980 -   2.8s
[CV]  min_samples_leaf=2, n_estimators=50, max_depth=15, score=-0.832302 -   2.6s
[CV] min_samples_leaf=2, n_estimators=50, max_depth=15 ...............
[CV] min_samples_leaf=3, n_estimators=50, max_depth=15 ...............
[CV]  min_samples_leaf=1, n_estimators=50, max_depth=15, score=-0.813493 -   2.9s
[CV] min_samples_leaf=3, n_estimators=50, max_depth=15 ...............
[CV]  min_samples_leaf=2, n_estimators=50, max_depth=15, score=-0.819005 -   2.5s
[CV] min_samples_leaf=3, n_estimators=50, max_depth=15 ...............
[CV]  min_samples_leaf=2, n_estimators=50, max_depth=15, score=-0.824517 -   2.9s
[CV]  min_samples_leaf=3, n_estimators=50, max_depth=15, score=-0.834168 -   2.5s
[CV] min_samples_leaf=1, n_estimators=50, max_depth=18 ...............
[CV] min_samples_leaf=1, n_estimators=50, max_depth=18 ...............
[CV]  min_samples_leaf=3, n_estimators=50, max_depth=15, score=-0.831539 -   2.6s


[Parallel(n_jobs=4)]: Done  32 jobs       | elapsed:   26.8s


[CV]  min_samples_leaf=2, n_estimators=50, max_depth=26, score=-0.809888 -   3.2s
[CV] min_samples_leaf=1, n_estimators=50, max_depth=30 ...............
[CV]  min_samples_leaf=3, n_estimators=50, max_depth=26, score=-0.826535 -   3.1s
[CV] min_samples_leaf=1, n_estimators=50, max_depth=30 ...............
[CV]  min_samples_leaf=3, n_estimators=50, max_depth=26, score=-0.816385 -   3.1s
[CV] min_samples_leaf=1, n_estimators=50, max_depth=30 ...............
[CV]  min_samples_leaf=3, n_estimators=50, max_depth=26, score=-0.813794 -   3.3s
[CV] min_samples_leaf=2, n_estimators=50, max_depth=30 ...............
[CV]  min_samples_leaf=1, n_estimators=50, max_depth=30, score=-0.943921 -   4.1s
[CV] min_samples_leaf=2, n_estimators=50, max_depth=30 ...............
[CV]  min_samples_leaf=1, n_estimators=50, max_depth=30, score=-0.945700 -   3.8s
[CV] min_samples_leaf=2, n_estimators=50, max_depth=30 ...............
[CV]  min_samples_leaf=1, n_estimators=50, max_depth=30, score=-0.953527 -   4.0s


[Parallel(n_jobs=4)]: Done  39 out of  45 | elapsed:   33.0s remaining:    5.1s


[CV]  min_samples_leaf=2, n_estimators=50, max_depth=30, score=-0.822863 -   3.4s
[CV] min_samples_leaf=3, n_estimators=50, max_depth=30 ...............
[CV]  min_samples_leaf=2, n_estimators=50, max_depth=30, score=-0.822546 -   3.4s
[CV] min_samples_leaf=3, n_estimators=50, max_depth=30 ...............
[CV]  min_samples_leaf=2, n_estimators=50, max_depth=30, score=-0.810669 -   3.4s
[CV]  min_samples_leaf=3, n_estimators=50, max_depth=30, score=-0.831143 -   3.1s
[CV]  min_samples_leaf=3, n_estimators=50, max_depth=30, score=-0.819814 -   2.7s
[CV]  min_samples_leaf=3, n_estimators=50, max_depth=30, score=-0.816479 -   1.9s


[Parallel(n_jobs=4)]: Done  45 out of  45 | elapsed:   37.7s finished


-0.810125806577
{'min_samples_leaf': 2, 'n_estimators': 50, 'max_depth': 22}


In [15]:
# Hyper-parameter grid for gradient boosting; the alternative n_estimators
# range is kept in the trailing comment.
grad_params = {
    'n_estimators' : [10], # np.linspace(10, 40, 3, dtype='int'),
    'max_depth': np.linspace(4, 8, 3, dtype='int'),    
    'subsample' : np.linspace(0.6, 1.0, 3),
    'learning_rate' : np.linspace(0.1, 0.3, 3),
}
# subsample < 1.0 draws a random subset of rows per boosting stage, so the
# estimator is seeded to make candidate scores reproducible and comparable.
grad_grid = GridSearchCV(GradientBoostingClassifier(random_state=42), grad_params, verbose=3, scoring='log_loss', n_jobs=4)
grad_grid.fit(x, y)
print(grad_grid.best_score_)
print(grad_grid.best_params_)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] n_estimators=10, subsample=0.6, learning_rate=0.1, max_depth=4 ..
[CV] n_estimators=10, subsample=0.6, learning_rate=0.1, max_depth=4 ..
[CV] n_estimators=10, subsample=0.6, learning_rate=0.1, max_depth=4 ..
[CV] n_estimators=10, subsample=0.8, learning_rate=0.1, max_depth=4 ..
[CV]  n_estimators=10, subsample=0.6, learning_rate=0.1, max_depth=4, score=-0.960435 -   8.2s
[CV] n_estimators=10, subsample=0.8, learning_rate=0.1, max_depth=4 ..
[CV]  n_estimators=10, subsample=0.6, learning_rate=0.1, max_depth=4, score=-0.958243 -   8.2s
[CV] n_estimators=10, subsample=0.8, learning_rate=0.1, max_depth=4 ..


[Parallel(n_jobs=4)]: Done   1 jobs       | elapsed:    8.3s


[CV]  n_estimators=10, subsample=0.6, learning_rate=0.1, max_depth=4, score=-0.956274 -   8.3s
[CV] n_estimators=10, subsample=1.0, learning_rate=0.1, max_depth=4 ..
[CV]  n_estimators=10, subsample=0.8, learning_rate=0.1, max_depth=4, score=-0.962637 -   9.8s
[CV] n_estimators=10, subsample=1.0, learning_rate=0.1, max_depth=4 ..
[CV]  n_estimators=10, subsample=0.8, learning_rate=0.1, max_depth=4, score=-0.958735 -   9.0s
[CV] n_estimators=10, subsample=1.0, learning_rate=0.1, max_depth=4 ..
[CV]  n_estimators=10, subsample=1.0, learning_rate=0.1, max_depth=4, score=-0.963287 -   9.3s
[CV] n_estimators=10, subsample=0.6, learning_rate=0.1, max_depth=6 ..
[CV]  n_estimators=10, subsample=0.8, learning_rate=0.1, max_depth=4, score=-0.955485 -   9.5s
[CV] n_estimators=10, subsample=0.6, learning_rate=0.1, max_depth=6 ..
[CV]  n_estimators=10, subsample=1.0, learning_rate=0.1, max_depth=4, score=-0.959254 -   9.2s
[CV] n_estimators=10, subsample=0.6, learning_rate=0.1, max_depth=6 ..
[CV]

[Parallel(n_jobs=4)]: Done  32 jobs       | elapsed:  2.4min


[CV]  n_estimators=10, subsample=0.8, learning_rate=0.2, max_depth=4, score=-0.848452 -   9.6s
[CV] n_estimators=10, subsample=1.0, learning_rate=0.2, max_depth=4 ..
[CV]  n_estimators=10, subsample=1.0, learning_rate=0.1, max_depth=8, score=-0.918481 -  34.6s
[CV] n_estimators=10, subsample=0.6, learning_rate=0.2, max_depth=6 ..
[CV]  n_estimators=10, subsample=1.0, learning_rate=0.2, max_depth=4, score=-0.858427 -   9.4s
[CV] n_estimators=10, subsample=0.6, learning_rate=0.2, max_depth=6 ..
[CV]  n_estimators=10, subsample=1.0, learning_rate=0.2, max_depth=4, score=-0.855656 -  10.0s
[CV] n_estimators=10, subsample=0.6, learning_rate=0.2, max_depth=6 ..
[CV]  n_estimators=10, subsample=1.0, learning_rate=0.2, max_depth=4, score=-0.848619 -   9.5s
[CV] n_estimators=10, subsample=0.8, learning_rate=0.2, max_depth=6 ..
[CV]  n_estimators=10, subsample=0.6, learning_rate=0.2, max_depth=6, score=-0.844474 -  15.1s
[CV] n_estimators=10, subsample=0.8, learning_rate=0.2, max_depth=6 ..
[CV]

KeyboardInterrupt: 

In [None]:
# Hyper-parameter grid for the bagging ensemble: fraction of rows and of
# features drawn per base estimator, ensemble size, and whether rows/features
# are drawn with replacement.
bag_params = {
    'max_samples' : np.linspace(0.6, 1.0, 3),
    'max_features' : np.linspace(0.6, 1.0, 3),
    'n_estimators' : np.linspace(10, 40, 3, dtype='int'),    
    'bootstrap': [True, False],
    'bootstrap_features' : [True, False]
}
# Bagging resamples rows and features at random, so the estimator is seeded to
# keep candidate scores reproducible across runs.
bag_grid = GridSearchCV(BaggingClassifier(random_state=42), bag_params, verbose=3, scoring='log_loss', n_jobs=4)
bag_grid.fit(x, y)
print(bag_grid.best_score_)
print(bag_grid.best_params_)

In [None]:
# Soft-voting ensemble: the class probabilities of the member models are
# combined and scored with 5-fold cross-validated log loss.
bag = BaggingClassifier(max_samples=0.8, max_features=0.7, n_estimators=40)
#sgd = SGDClassifier(loss='log')
grad = GradientBoostingClassifier(n_estimators=20, max_depth=4, verbose=0, subsample=0.9)
rfc = RandomForestClassifier(max_depth=20, n_estimators=300)
# The SGD member stays disabled together with its definition above; listing it
# while `sgd` is undefined made this cell fail with a NameError.
votecf = VotingClassifier([('rf1', rfc),
                           ('bag', bag),
                           ('grad', grad),
                           #('sgd', sgd),
                          ], voting='soft')
cross_val_score(votecf, x, y, cv = 5, scoring='log_loss', n_jobs=4)

In [None]:
# Fit a shallow forest and list the most and least important features.
rfc = RandomForestClassifier(max_depth=10, n_estimators=50)
rfc.fit(x, y)
# Rank (importance, feature name) pairs once, highest importance first, and
# reuse the ranking for both listings instead of recomputing it.
ranked = sorted(zip(rfc.feature_importances_, x.columns), reverse=True)
print('Top features')
for score, feat in ranked[:10]:
    print('{:.3f} {}'.format(score,feat))
print()    
print('Bottom features')
# The last 20 entries of the descending ranking, i.e. the lowest importances.
for score, feat in ranked[-20:]:
    print('{:.5f} {}'.format(score,feat))

In [None]:
#x_norm = normalize(x)
#svc = SVC(probability=True)
#cross_val_score(svc, x, y, cv=5, verbose=3, scoring='log_loss')

In [None]:
#pca = PCA(0.999)
#x_pca = pca.fit_transform(x)
#x_pca.shape