In [1]:
import pandas as pd
import numpy as np
import operator
from scipy.stats import entropy
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, OneHotEncoder

In [2]:
train = pd.read_csv("train.csv")
train = train.drop(["Name", "OutcomeSubtype"], axis=1)
test = pd.read_csv("test.csv")
test = test.drop(["Name"], axis=1)
train.head()

Unnamed: 0,AnimalID,DateTime,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,2014-02-12 18:22:00,Return_to_owner,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,2013-10-13 12:44:00,Euthanasia,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,2015-01-31 12:28:00,Adoption,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,2014-07-11 19:09:00,Transfer,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,2013-11-15 12:52:00,Transfer,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [3]:
# Split up information in SexuponOutcome

def split_sexuponoutcome(dataset):
    sex_split = [str(s).split() for s in dataset.SexuponOutcome]
    sterilizationStatus = [s[0] if len(s) == 2 else "Unknown" for s in sex_split]
    sex = [s[1] if len(s) == 2 else "Unknown" for s in sex_split]
    return sterilizationStatus, sex

sterile_train, sex_train = split_sexuponoutcome(train)
sterile_test, sex_test = split_sexuponoutcome(test)

sterile_train = ["Sterilized" if s != "Intact" else "Intact" for s in sterile_train]
sterile_test = ["Sterilized" if s != "Intact" else "Intact" for s in sterile_test]
train['Sex'], train['SterilizationStatus'] = pd.Series(sex_train), pd.Series(sterile_train)
test['Sex'], test['SterilizationStatus'] = pd.Series(sex_test), pd.Series(sterile_test)
train = train.drop('SexuponOutcome', axis=1)
test = test.drop('SexuponOutcome', axis=1)
train.head()

Unnamed: 0,AnimalID,DateTime,OutcomeType,AnimalType,AgeuponOutcome,Breed,Color,Sex,SterilizationStatus
0,A671945,2014-02-12 18:22:00,Return_to_owner,Dog,1 year,Shetland Sheepdog Mix,Brown/White,Male,Sterilized
1,A656520,2013-10-13 12:44:00,Euthanasia,Cat,1 year,Domestic Shorthair Mix,Cream Tabby,Female,Sterilized
2,A686464,2015-01-31 12:28:00,Adoption,Dog,2 years,Pit Bull Mix,Blue/White,Male,Sterilized
3,A683430,2014-07-11 19:09:00,Transfer,Cat,3 weeks,Domestic Shorthair Mix,Blue Cream,Male,Intact
4,A667013,2013-11-15 12:52:00,Transfer,Dog,2 years,Lhasa Apso/Miniature Poodle,Tan,Male,Sterilized


In [4]:
'''
Convert AgeuponOutcome to log(days).
We use the log transform since a 365 day-old (1 year-old) animal is going 
to have more similar outcomes to a 1095 day-old (3 year-old) animal
than to a 7 day-old animal.
Check out an un-logged distribution here: 
https://www.kaggle.com/apapiu/shelter-animal-outcomes/visualizing-breeds-and-ages-by-outcome/discussion
and a logged distribution here: 
https://www.kaggle.com/andraszsom/shelter-animal-outcomes/age-gender-and-breed-vs-outcome/discussion
'''


def convert_ages_to_days(dataset):
    ages = [str(a).split() for a in dataset.AgeuponOutcome]
    ages_in_days = []
    for a in ages:
        if len(a) != 2:
            ages_in_days.append(np.nan)
        else:
            value, unit = int(a[0]), a[1]
            if unit == 'year' or unit == 'years':
                ages_in_days.append(365 * value)
            elif unit == 'month' or unit == 'months':
                ages_in_days.append(30 * value)
            elif unit == 'week' or unit == 'weeks':
                ages_in_days.append(7 * value)
            elif unit == 'day' or unit == 'days':
                ages_in_days.append(value)
            else:
                raise Exception('Data is in inconsistent format.', 'value:', value, 'unit:', unit)
    return ages_in_days

ages_in_days_train = convert_ages_to_days(train)
ages_in_days_test = convert_ages_to_days(test)
## TODO: train random forest or neareast neighbors to predict age and impute NaN values
train_age_mean = np.nanmedian(ages_in_days_train)
test_age_mean = np.nanmedian(ages_in_days_test)
ages_in_days_train = [a if not np.isnan(a) else train_age_mean for a in ages_in_days_train]
ages_in_days_test = [a if not np.isnan(a) else test_age_mean for a in ages_in_days_test]
train['LogAge'] = pd.Series(ages_in_days_train)
test['LogAge'] = pd.Series(ages_in_days_test)

train = train.drop('AgeuponOutcome', axis=1)
test = test.drop('AgeuponOutcome', axis=1)

train.head()

Unnamed: 0,AnimalID,DateTime,OutcomeType,AnimalType,Breed,Color,Sex,SterilizationStatus,LogAge
0,A671945,2014-02-12 18:22:00,Return_to_owner,Dog,Shetland Sheepdog Mix,Brown/White,Male,Sterilized,365
1,A656520,2013-10-13 12:44:00,Euthanasia,Cat,Domestic Shorthair Mix,Cream Tabby,Female,Sterilized,365
2,A686464,2015-01-31 12:28:00,Adoption,Dog,Pit Bull Mix,Blue/White,Male,Sterilized,730
3,A683430,2014-07-11 19:09:00,Transfer,Cat,Domestic Shorthair Mix,Blue Cream,Male,Intact,21
4,A667013,2013-11-15 12:52:00,Transfer,Dog,Lhasa Apso/Miniature Poodle,Tan,Male,Sterilized,730


In [5]:
# Split up components of Datetime

def format_datetime(dataset):
    times = [pd.Timestamp(t) for t in dataset.DateTime]
    years = [int(t.year) for t in times]
    months = [int(t.month) for t in times]
    days = [int(t.day) for t in times]
    hours = [float(t.hour + t.minute/60.0) for t in times]
    weekdays = [t.weekday() for t in times]
    return years, months, days, hours, weekdays

years_train, months_train, days_train, hours_train, weekday_train = format_datetime(train)
years_test, months_test, days_test, hours_test, weekday_test = format_datetime(test)
train['Year'], train['Month'], train['Day'], train['Hour'], train['Weekday'] \
    = years_train, months_train, days_train, hours_train, weekday_train
test['Year'], test['Month'], test['Day'], test['Hour'], test['Weekday'] \
    = years_test, months_test, days_test, hours_test, weekday_test
train = train.drop('DateTime', axis=1)
test = test.drop('DateTime', axis=1)
train.head()

Unnamed: 0,AnimalID,OutcomeType,AnimalType,Breed,Color,Sex,SterilizationStatus,LogAge,Year,Month,Day,Hour,Weekday
0,A671945,Return_to_owner,Dog,Shetland Sheepdog Mix,Brown/White,Male,Sterilized,365,2014,2,12,18.366667,2
1,A656520,Euthanasia,Cat,Domestic Shorthair Mix,Cream Tabby,Female,Sterilized,365,2013,10,13,12.733333,6
2,A686464,Adoption,Dog,Pit Bull Mix,Blue/White,Male,Sterilized,730,2015,1,31,12.466667,5
3,A683430,Transfer,Cat,Domestic Shorthair Mix,Blue Cream,Male,Intact,21,2014,7,11,19.15,4
4,A667013,Transfer,Dog,Lhasa Apso/Miniature Poodle,Tan,Male,Sterilized,730,2013,11,15,12.866667,4


In [6]:
# Whether or not an animal is a "mix"

def mix_encoder(dataset):
    is_mix = []
    for b in dataset.Breed:
        if '/' in b or 'Mix' in b:
            is_mix.append(1)
        else:
            is_mix.append(0)
    return is_mix

train_mix = mix_encoder(train)
test_mix = mix_encoder(test)
train['Mix'] = pd.Series(train_mix)
test['Mix'] = pd.Series(test_mix)
train.head()

Unnamed: 0,AnimalID,OutcomeType,AnimalType,Breed,Color,Sex,SterilizationStatus,LogAge,Year,Month,Day,Hour,Weekday,Mix
0,A671945,Return_to_owner,Dog,Shetland Sheepdog Mix,Brown/White,Male,Sterilized,365,2014,2,12,18.366667,2,1
1,A656520,Euthanasia,Cat,Domestic Shorthair Mix,Cream Tabby,Female,Sterilized,365,2013,10,13,12.733333,6,1
2,A686464,Adoption,Dog,Pit Bull Mix,Blue/White,Male,Sterilized,730,2015,1,31,12.466667,5,1
3,A683430,Transfer,Cat,Domestic Shorthair Mix,Blue Cream,Male,Intact,21,2014,7,11,19.15,4,1
4,A667013,Transfer,Dog,Lhasa Apso/Miniature Poodle,Tan,Male,Sterilized,730,2013,11,15,12.866667,4,1


In [7]:
# Parse the Breed feature to extract primary and/or secondary breeds

def parse_breed(data):
    breed_split = [s.split('/') for s in data.Breed]
    bag_of_breeds = []
    for l in breed_split:
        primary_secondary_breed = []
        for breed in l:
            if "Mix" in breed:
                breed = ' '.join(breed.split()[:-1])
            primary_secondary_breed.append(breed)
        if len(primary_secondary_breed) == 1:
            if "Mix" in l[0]:
                primary_secondary_breed.append("Unknown")
            else:
                primary_secondary_breed.append("None")
        if len(primary_secondary_breed) > 2: # handles weird edge case "Plott Hound/Black/Tan Hound"
            primary_secondary_breed = [primary_secondary_breed[0], "Unknown"]
        bag_of_breeds.append(primary_secondary_breed)
    return bag_of_breeds


train_breeds = parse_breed(train)
test_breeds = parse_breed(test)
train['PrimaryBreed'] = [b[0] for b in train_breeds]
train['SecondaryBreed'] = [b[1] for b in train_breeds]
test['PrimaryBreed'] = [b[0] for b in test_breeds]
test['SecondaryBreed'] = [b[1] for b in test_breeds]

train = train.drop("Breed", axis=1)
test = test.drop("Breed", axis=1)

train.head()

Unnamed: 0,AnimalID,OutcomeType,AnimalType,Color,Sex,SterilizationStatus,LogAge,Year,Month,Day,Hour,Weekday,Mix,PrimaryBreed,SecondaryBreed
0,A671945,Return_to_owner,Dog,Brown/White,Male,Sterilized,365,2014,2,12,18.366667,2,1,Shetland Sheepdog,Unknown
1,A656520,Euthanasia,Cat,Cream Tabby,Female,Sterilized,365,2013,10,13,12.733333,6,1,Domestic Shorthair,Unknown
2,A686464,Adoption,Dog,Blue/White,Male,Sterilized,730,2015,1,31,12.466667,5,1,Pit Bull,Unknown
3,A683430,Transfer,Cat,Blue Cream,Male,Intact,21,2014,7,11,19.15,4,1,Domestic Shorthair,Unknown
4,A667013,Transfer,Dog,Tan,Male,Sterilized,730,2013,11,15,12.866667,4,1,Lhasa Apso,Miniature Poodle


Now that we have Breed in a nice format, let's do something fancy. We are going to calculate the cross entropy of each breed's outcome distribution q with respect to the overall outcome distribution p (the prior).

A common way of calculating cross entropy is: 

H(p, q) = -sum(p * log(q))

This function is minimized when q = p. Hence when q = p, this specific breed has an outcome distribution that's identical to the prior outcome distribution - so knowing that an animal is this breed gives us no more of an advantage than if we were to just guess based on the prior.

In this way, we can interpret the cross entropy as a way to calculate how much "information" over the prior a specific breed gives us about the outcome.

In [8]:
# First the "histogram" calculations

p = train.OutcomeType.value_counts() / len(train.OutcomeType)

breeds = train.PrimaryBreed.unique()
breed_groups = train.groupby("PrimaryBreed")
breeds_and_outcomes = breed_groups.OutcomeType.value_counts()
q = {b : breeds_and_outcomes[b] / sum(breeds_and_outcomes[b].values) for b in breeds}

# If some outcome never happens to a breed, we give it a 0.01 probability
# of occuring. This is the easy way of dealing with the fact that we must 
# take the log of the values of q.
for b in q.keys():
    for outcome in train.OutcomeType.unique():
        if not outcome in q[b].keys():
            q[b][outcome] = 1e-2
            
breed_entropy = {b: entropy(p, q[b].values) for b in q.keys()}
sorted_entropy = [i for i in reversed(sorted(breed_entropy.items(), key=operator.itemgetter(1)))]

In [9]:
sorted_entropy

[('Pixiebob Shorthair', 1.5455236534727064),
 ('Havana Brown', 1.5455236534727064),
 ('Sealyham Terr', 1.5455236534727064),
 ('Spanish Mastiff', 1.5455236534727064),
 ('German Pinscher', 1.5455236534727064),
 ('Afghan Hound', 1.5455236534727064),
 ('Munchkin Longhair', 1.5455236534727064),
 ('Hovawart', 1.5455236534727064),
 ('Turkish Van', 1.5455236534727064),
 ('Belgian Sheepdog', 1.5455236534727064),
 ('Irish Setter', 1.5455236534727064),
 ('Javanese', 1.5455236534727064),
 ('Wirehaired Pointing Griffon', 1.5455236534727064),
 ('Kuvasz', 1.5455236534727064),
 ('Mexican Hairless', 1.5455236534727064),
 ('Lowchen', 1.5455236534727064),
 ('Burmese', 1.5455236534727064),
 ('Treeing Tennesse Brindle', 1.5455236534727064),
 ('Norwegian Elkhound', 1.5455236534727064),
 ('Cornish Rex', 1.5455236534727064),
 ('Port Water Dog', 1.5455236534727064),
 ('Entlebucher', 1.5455236534727064),
 ('Norwegian Forest Cat', 1.5455236534727064),
 ('Abyssinian', 1.5455236534727064),
 ('Swiss Hound', 1.54552

Feel free to continue in the cross-entropy vein, or use more established methods for dimensionality reduction. Google "sparse binary dimensionality reduction" for more info.

In [10]:
# Convert categorical features into numbers for model fitting

outcome_map = {'Adoption': 0, 'Died': 1, 'Euthanasia': 2, 'Return_to_owner': 3, 'Transfer': 4}
train['OutcomeType'] = [outcome_map[s] for s in train.OutcomeType]

for var in ['AnimalType', 'Sex', 'SterilizationStatus']:
    bin_fit = LabelBinarizer().fit(train[var].append(test[var]))
    train[var] = bin_fit.transform(train[var])
    test[var] = bin_fit.transform(test[var])

for var in ['PrimaryBreed', 'SecondaryBreed', 'Color']:
    label_fit = LabelEncoder().fit(train[var].append(test[var]))
    train[var] = label_fit.transform(train[var])
    test[var] = label_fit.transform(test[var])

train.head()

Unnamed: 0,AnimalID,OutcomeType,AnimalType,Color,Sex,SterilizationStatus,LogAge,Year,Month,Day,Hour,Weekday,Mix,PrimaryBreed,SecondaryBreed
0,A671945,3,1,146,0,1,365,2014,2,12,18.366667,2,1,191,150
1,A656520,2,0,184,1,1,365,2013,10,13,12.733333,6,1,85,150
2,A686464,0,1,97,0,1,730,2015,1,31,12.466667,5,1,168,150
3,A683430,4,0,47,0,0,21,2014,7,11,19.15,4,1,85,150
4,A667013,4,1,311,0,1,730,2013,11,15,12.866667,4,1,134,99


Uncomment some of the lines below if you want to fit a decision tree to the training data or create a submission file.

In [11]:
#clf = DecisionTreeClassifier(max_depth=5, min_samples_leaf=5, min_samples_split=25)
#clf.fit(train[train.columns[2:]], train.OutcomeType)
#clf.score(train[train.columns[2:]], train.OutcomeType)

In [12]:
'''
clf = GridSearchCV(DecisionTreeClassifier(), param_grid={'max_depth':[4,5,6,7,8,9,10,12],
                                                        'min_samples_split':[2, 5, 10, 15, 20, 25],
                                                        'min_samples_leaf':[1, 2, 5, 8, 12, 15, 20]},
                  scoring='log_loss',
                  n_jobs=-1)
clf.fit(train[train.columns[2:]], train.OutcomeType)
'''

"\nclf = GridSearchCV(DecisionTreeClassifier(), param_grid={'max_depth':[4,5,6,7,8,9,10,12],\n                                                        'min_samples_split':[2, 5, 10, 15, 20, 25],\n                                                        'min_samples_leaf':[1, 2, 5, 8, 12, 15, 20]},\n                  scoring='log_loss',\n                  n_jobs=-1)\nclf.fit(train[train.columns[2:]], train.OutcomeType)\n"

In [13]:
#clf.best_params_

In [14]:
#clf.best_score_

In [15]:
# predict on test set and write submission file

'''class_probabilites = clf.predict_proba(test[test.columns[1:]])
submission = 'ID,Adoption,Died,Euthanasia,Return_to_owner,Transfer\n'
for i in range(len(test.ID)):
    submission += str(test.ID[i]) + ',' + ','.join([str(j) for j in class_probabilites[i]]) + '\n'
f = open("submission.csv", "w")
f.write(submission)
f.close()'''

'class_probabilites = clf.predict_proba(test[test.columns[1:]])\nsubmission = \'ID,Adoption,Died,Euthanasia,Return_to_owner,Transfer\n\'\nfor i in range(len(test.ID)):\n    submission += str(test.ID[i]) + \',\' + \',\'.join([str(j) for j in class_probabilites[i]]) + \'\n\'\nf = open("submission.csv", "w")\nf.write(submission)\nf.close()'

Notice that it's technically okay if we predict the probability of some classes to be 0, even if we're being evaluated on the logloss. Kaggle replaces 0 probabilities with 10e-15. But when GridSearchCV attempts to score a model with a given parameter set using logloss, it's not replacing any 0 probability class predictions with 10e-15 - which means if we predicted a class with 0 probability, but it turned out to be correct, then our loss will be log(0) = -infinity and there's no way we can recover! This very likely explains why GridSearchCV chose such a relatively large min_samples_split parameter, so that, with high probability, none of the classes have a 0 percent chance of occurring in any of the leaves.