In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, Imputer, OneHotEncoder

In [2]:
# Load the raw CSVs and immediately discard the columns that are not used
# downstream. "OutcomeSubtype" is only dropped from the training frame.
train = pd.read_csv("train.csv").drop(["Name", "OutcomeSubtype"], axis=1)
test = pd.read_csv("test.csv").drop(["Name"], axis=1)
train.head()

Unnamed: 0,AnimalID,DateTime,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,2014-02-12 18:22:00,Return_to_owner,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,2013-10-13 12:44:00,Euthanasia,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,2015-01-31 12:28:00,Adoption,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,2014-07-11 19:09:00,Transfer,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,2013-11-15 12:52:00,Transfer,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [3]:
# Split up information in SexuponOutcome

def split_sexuponoutcome(dataset):
    """Split each SexuponOutcome entry into sterilization status and sex.

    Every entry is converted to a string and split on whitespace. Entries
    that consist of exactly two words (e.g. "Neutered Male") yield those two
    words; anything else (including missing values) yields "Unknown" for
    both parts.

    Returns a tuple ``(sterilization_statuses, sexes)`` of two lists with one
    element per row of ``dataset.SexuponOutcome``.
    """
    sterilization_statuses = []
    sexes = []
    for raw_value in dataset.SexuponOutcome:
        words = str(raw_value).split()
        if len(words) == 2:
            status, sex = words
        else:
            status = sex = "Unknown"
        sterilization_statuses.append(status)
        sexes.append(sex)
    return sterilization_statuses, sexes

# Derive the two new columns for both frames, then remove the combined
# source column they were extracted from.
sterile_train, sex_train = split_sexuponoutcome(train)
sterile_test, sex_test = split_sexuponoutcome(test)

train['Sex'] = pd.Series(sex_train)
train['SterilizationStatus'] = pd.Series(sterile_train)
test['Sex'] = pd.Series(sex_test)
test['SterilizationStatus'] = pd.Series(sterile_test)

train = train.drop('SexuponOutcome', axis=1)
test = test.drop('SexuponOutcome', axis=1)
train.head()

Unnamed: 0,AnimalID,DateTime,OutcomeType,AnimalType,AgeuponOutcome,Breed,Color,Sex,SterilizationStatus
0,A671945,2014-02-12 18:22:00,Return_to_owner,Dog,1 year,Shetland Sheepdog Mix,Brown/White,Male,Neutered
1,A656520,2013-10-13 12:44:00,Euthanasia,Cat,1 year,Domestic Shorthair Mix,Cream Tabby,Female,Spayed
2,A686464,2015-01-31 12:28:00,Adoption,Dog,2 years,Pit Bull Mix,Blue/White,Male,Neutered
3,A683430,2014-07-11 19:09:00,Transfer,Cat,3 weeks,Domestic Shorthair Mix,Blue Cream,Male,Intact
4,A667013,2013-11-15 12:52:00,Transfer,Dog,2 years,Lhasa Apso/Miniature Poodle,Tan,Male,Neutered


In [4]:
# Convert AgeuponOutcome to days

def convert_ages_to_days(dataset):
    """Convert the AgeuponOutcome strings (e.g. "2 years") to a number of days.

    Each entry is converted to a string and split on whitespace. Entries that
    do not consist of exactly two tokens (such as missing values) become
    ``np.nan``. Otherwise the first token is parsed as an integer and
    multiplied by a fixed number of days for the unit: 365 per year, 30 per
    month, 7 per week, 1 per day.

    Returns a list with one element per row of ``dataset.AgeuponOutcome``.

    Raises ``Exception`` when a two-token entry has an unrecognised unit.
    """
    days_per_unit = {
        'year': 365, 'years': 365,
        'month': 30, 'months': 30,
        'week': 7, 'weeks': 7,
        'day': 1, 'days': 1,
    }
    ages_in_days = []
    for raw_age in dataset.AgeuponOutcome:
        tokens = str(raw_age).split()
        if len(tokens) != 2:
            ages_in_days.append(np.nan)
            continue
        # The count is parsed before the unit is checked, so a non-integer
        # count fails with ValueError regardless of the unit.
        value, unit = int(tokens[0]), tokens[1]
        if unit not in days_per_unit:
            raise Exception('Data is in inconsistent format.', 'value:', value, 'unit:', unit)
        ages_in_days.append(days_per_unit[unit] * value)
    return ages_in_days

ages_in_days_train = convert_ages_to_days(train)
ages_in_days_test = convert_ages_to_days(test)
## TODO: train random forest or nearest neighbors to predict age and impute NaN values
# Fill missing ages with the median age of the training set. The training
# statistic is applied to BOTH frames so that train and test go through
# identical preprocessing; filling the test set with its own median would
# make the imputed value depend on which rows happen to be scored.
train_age_median = np.nanmedian(ages_in_days_train)
ages_in_days_train = [train_age_median if np.isnan(a) else a for a in ages_in_days_train]
ages_in_days_test = [train_age_median if np.isnan(a) else a for a in ages_in_days_test]
# pd.Series(list) carries a 0..n-1 index, so these assignments line up row by
# row only while the frames keep the default index they got from read_csv.
train['Age'] = pd.Series(ages_in_days_train)
test['Age'] = pd.Series(ages_in_days_test)
train = train.drop('AgeuponOutcome', axis=1)
test = test.drop('AgeuponOutcome', axis=1)
train.head()

Unnamed: 0,AnimalID,DateTime,OutcomeType,AnimalType,Breed,Color,Sex,SterilizationStatus,Age
0,A671945,2014-02-12 18:22:00,Return_to_owner,Dog,Shetland Sheepdog Mix,Brown/White,Male,Neutered,365
1,A656520,2013-10-13 12:44:00,Euthanasia,Cat,Domestic Shorthair Mix,Cream Tabby,Female,Spayed,365
2,A686464,2015-01-31 12:28:00,Adoption,Dog,Pit Bull Mix,Blue/White,Male,Neutered,730
3,A683430,2014-07-11 19:09:00,Transfer,Cat,Domestic Shorthair Mix,Blue Cream,Male,Intact,21
4,A667013,2013-11-15 12:52:00,Transfer,Dog,Lhasa Apso/Miniature Poodle,Tan,Male,Neutered,730


In [5]:
# Split up components of Datetime

def format_datetime(dataset):
    """Break each DateTime entry into its calendar and clock components.

    Every value of ``dataset.DateTime`` is parsed with ``pd.Timestamp``.

    Returns a tuple of five lists ``(years, months, days, hours, minutes)``,
    each holding plain ``int`` values, one per row.
    """
    years, months, days, hours, minutes = [], [], [], [], []
    for raw_time in dataset.DateTime:
        stamp = pd.Timestamp(raw_time)
        years.append(int(stamp.year))
        months.append(int(stamp.month))
        days.append(int(stamp.day))
        hours.append(int(stamp.hour))
        minutes.append(int(stamp.minute))
    return years, months, days, hours, minutes

# Expand DateTime into one integer column per component for both frames,
# then drop the original column.
years_train, months_train, days_train, hours_train, minutes_train = format_datetime(train)
years_test, months_test, days_test, hours_test, minutes_test = format_datetime(test)

train['Year'] = years_train
train['Month'] = months_train
train['Day'] = days_train
train['Hour'] = hours_train
train['Minute'] = minutes_train

test['Year'] = years_test
test['Month'] = months_test
test['Day'] = days_test
test['Hour'] = hours_test
test['Minute'] = minutes_test

train = train.drop('DateTime', axis=1)
test = test.drop('DateTime', axis=1)
train.head()

Unnamed: 0,AnimalID,OutcomeType,AnimalType,Breed,Color,Sex,SterilizationStatus,Age,Year,Month,Day,Hour,Minute
0,A671945,Return_to_owner,Dog,Shetland Sheepdog Mix,Brown/White,Male,Neutered,365,2014,2,12,18,22
1,A656520,Euthanasia,Cat,Domestic Shorthair Mix,Cream Tabby,Female,Spayed,365,2013,10,13,12,44
2,A686464,Adoption,Dog,Pit Bull Mix,Blue/White,Male,Neutered,730,2015,1,31,12,28
3,A683430,Transfer,Cat,Domestic Shorthair Mix,Blue Cream,Male,Intact,21,2014,7,11,19,9
4,A667013,Transfer,Dog,Lhasa Apso/Miniature Poodle,Tan,Male,Neutered,730,2013,11,15,12,52


In [6]:
# Whether or not an animal is a "mix"

def mix_encoder(dataset):
    """Flag which animals are described as a mix in their Breed string.

    A row is flagged with 1 when the whitespace-separated words of its Breed
    contain the standalone word "mix" or "Mix", and with 0 otherwise. Words
    that merely start with "Mix" (e.g. "Mixed") do not count.

    Each value is converted with ``str()`` before splitting, as the other
    helpers in this notebook do, so a missing (NaN) breed yields 0 instead of
    raising ``AttributeError``.

    Returns a list of 0/1 integers, one per row of ``dataset.Breed``.
    """
    is_mix = []
    for breed in dataset.Breed:
        words = str(breed).split()
        is_mix.append(1 if 'mix' in words or 'Mix' in words else 0)
    return is_mix

# Add the 0/1 "Mix" indicator to both frames; Breed itself is kept.
train_mix, test_mix = mix_encoder(train), mix_encoder(test)
train['Mix'], test['Mix'] = pd.Series(train_mix), pd.Series(test_mix)
train.head()

Unnamed: 0,AnimalID,OutcomeType,AnimalType,Breed,Color,Sex,SterilizationStatus,Age,Year,Month,Day,Hour,Minute,Mix
0,A671945,Return_to_owner,Dog,Shetland Sheepdog Mix,Brown/White,Male,Neutered,365,2014,2,12,18,22,1
1,A656520,Euthanasia,Cat,Domestic Shorthair Mix,Cream Tabby,Female,Spayed,365,2013,10,13,12,44,1
2,A686464,Adoption,Dog,Pit Bull Mix,Blue/White,Male,Neutered,730,2015,1,31,12,28,1
3,A683430,Transfer,Cat,Domestic Shorthair Mix,Blue Cream,Male,Intact,21,2014,7,11,19,9,1
4,A667013,Transfer,Dog,Lhasa Apso/Miniature Poodle,Tan,Male,Neutered,730,2013,11,15,12,52,0


In [7]:
# TODO: Do something more clever with breed and color other than enumerating them...
# Idea: Wikipedia and other sites have lists of most popular dog breeds by 
# # of registrations: https://en.wikipedia.org/wiki/List_of_most_popular_dog_breeds

# Encode the target as integers. LabelEncoder numbers the classes in sorted
# order (Adoption=0, Died=1, Euthanasia=2, Return_to_owner=3, Transfer=4).
train['OutcomeType'] = LabelEncoder().fit_transform(train['OutcomeType'])

# Each encoder below is fit ONCE and then applied to both frames. Fitting a
# separate encoder per frame lets the same category string map to different
# numbers in train and test, which makes test-set predictions meaningless for
# that feature.

for var in ['AnimalType', 'Sex']:
    # Fit on the training data only, then reuse the fitted binarizer for test.
    binarizer = LabelBinarizer().fit(train[var])
    # transform() returns a 2-D indicator array: one column for a two-class
    # feature, one column per class otherwise. Select the first column
    # explicitly so a single 1-D column is stored. NOTE(review): the displayed
    # codes (Dog -> 1, Male -> 0, Female -> 1) are consistent with that first
    # column; confirm this is the intended encoding for 'Sex'.
    train[var] = binarizer.transform(train[var])[:, 0]
    test[var] = binarizer.transform(test[var])[:, 0]

for var in ['SterilizationStatus', 'Breed', 'Color']:
    # Fit on the union of train and test values so that every category seen
    # in either frame has exactly one code and transform() never meets an
    # unseen label.
    encoder = LabelEncoder().fit(pd.concat([train[var], test[var]]))
    train[var] = encoder.transform(train[var])
    test[var] = encoder.transform(test[var])

train.head()

Unnamed: 0,AnimalID,OutcomeType,AnimalType,Breed,Color,Sex,SterilizationStatus,Age,Year,Month,Day,Hour,Minute,Mix
0,A671945,3,1,1221,130,0,1,365,2014,2,12,18,22,1
1,A656520,2,0,640,167,1,2,365,2013,10,13,12,44,1
2,A686464,0,1,1066,86,0,1,730,2015,1,31,12,28,1
3,A683430,4,0,640,42,0,0,21,2014,7,11,19,9,1
4,A667013,4,1,914,274,0,1,730,2013,11,15,12,52,0


In [8]:
# Hyperparameter values to try for the decision tree; GridSearchCV evaluates
# every combination of the three lists.
param_grid = {
    'max_depth': [4, 5, 6, 7, 8, 9, 10, 12],
    'min_samples_split': [2, 5, 10, 15, 20, 25],
    'min_samples_leaf': [1, 2, 5, 8, 12, 15, 20],
}
clf = GridSearchCV(
    DecisionTreeClassifier(),
    param_grid=param_grid,
    scoring='log_loss',
    n_jobs=-1,
)
# The first two columns of the frame (AnimalID and OutcomeType in the table
# displayed above) are skipped; everything after them is used as a feature.
clf.fit(train[train.columns[2:]], train.OutcomeType)

GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'min_samples_split': [2, 5, 10, 15, 20, 25], 'max_depth': [4, 5, 6, 7, 8, 9, 10, 12], 'min_samples_leaf': [1, 2, 5, 8, 12, 15, 20]},
       pre_dispatch='2*n_jobs', refit=True, scoring='log_loss', verbose=0)

In [9]:
# Show the hyperparameter combination selected by the grid search.
clf.best_params_

{'max_depth': 5, 'min_samples_leaf': 5, 'min_samples_split': 25}

In [10]:
# Show the score achieved by the selected combination under the 'log_loss'
# scorer the grid search was configured with.
clf.best_score_

-0.90703987147627374

In [11]:
# predict on test set and write submission file

# The first column of `test` (the ID) is skipped; the remaining columns are
# passed to the model as features.
class_probabilites = clf.predict_proba(test[test.columns[1:]])

# One CSV line per animal: its ID followed by the five class probabilities,
# in the same order as the header.
header = 'ID,Adoption,Died,Euthanasia,Return_to_owner,Transfer'
rows = [
    str(animal_id) + ',' + ','.join(str(p) for p in probabilities)
    # zip pairs IDs and predictions by position, so this does not rely on
    # `test` having a 0..n-1 index.
    for animal_id, probabilities in zip(test.ID, class_probabilites)
]
# Join once instead of growing a string inside a loop; every line, including
# the last, ends with a newline.
submission = '\n'.join([header] + rows) + '\n'

# The context manager closes the file even if the write fails.
with open("submission.csv", "w") as f:
    f.write(submission)

Note that it is technically okay to predict a probability of 0 for some classes, even though we are being evaluated on log loss: Kaggle replaces probabilities of 0 with $10^{-15}$. But when GridSearchCV scores a model with a given parameter set using log loss, it does not replace any zero-probability class predictions with $10^{-15}$ - which means that if we predict a class with probability 0 and it turns out to be the correct one, our loss will be $-\log(0) = +\infty$ and there's no way we can recover! This very likely explains why GridSearchCV chose such a relatively large `min_samples_split` parameter: so that, with high probability, none of the classes has a 0 percent chance of occurring in any of the leaves.