In [22]:
import numpy as np
import pandas as pd

from sklearn.cross_validation import train_test_split
from sklearn import linear_model,tree,ensemble
from sklearn.metrics import accuracy_score,log_loss
from sklearn.grid_search import GridSearchCV

import sys
sys.path.insert(0,"../libs")
import data_cleaning as dc

In [23]:
# Read the raw training table, keep only the rows whose AnimalType is "Dog",
# and hand that subset to the project's cleaning helper.
train = pd.read_csv("../data/train.csv")
train = dc.massage_df(train[train["AnimalType"] == "Dog"])

In [24]:
# Load the AKC breed table from disk.
akc = pd.read_json("../breeds/akc.json")

# Two hand-written breed records, appended as extra rows to the loaded table.
# NOTE(review): rank -1 looks like a "no rank" placeholder -- confirm.
manual = list(map(pd.Series, [
    {"breed": "American Pit Bull Terrier", "energy": "high",
     "group": "Terrier Group", "rank": -1, "size": "medium"},
    {"breed": "Miniature Poodle", "energy": "medium",
     "group": "Toy Group", "rank": 8, "size": "small"},
]))
akc = akc.append(manual, ignore_index=True)

In [25]:
# Derive a lookup name for each AKC breed: remove every "Dog" substring and
# trim the surrounding whitespace.
# NOTE(review): str.replace also removes "Dog" when it is part of a longer
# word (for example a name that starts with "Dogue") -- confirm that is intended.
akc["dog_breed"] = akc.breed.map(lambda breed: breed.replace("Dog", "").strip())

In [26]:
# Flag rows whose breed name contains "Pit". The membership test already
# yields a real boolean; the previous else-branch returned the *string*
# "False", which is truthy and turned the column into a mixed-type object
# column.
train["has_pit"] = train.akc_name.apply(lambda x: "Pit" in x)

In [27]:
# Number of training rows per breed name, and the names that occur in more
# than 100 rows.
counts = train["akc_name"].value_counts()
popular_breeds = counts.index[(counts > 100).values]

In [35]:
# Collect every breed name used in the training data that has no matching
# `dog_breed` entry in the AKC table, together with the number of training
# rows that carry that name.
# The AKC names are put in a set once so the lookup is not a full array scan
# per name. The statements that used to follow `continue` could never run and
# have been removed.
known_breeds = set(akc.dog_breed.values)
missing = []
for name in train.akc_name.unique():
    if name in known_breeds:
        continue
    missing.append({
        "name": name,
        "number": len(train[train.akc_name == name]),
    })
# end for

In [36]:
# Turn the collected records (keys "name" and "number") into a table.
miss_df = pd.DataFrame(missing)

In [32]:
# Share of each outcome among the rows whose dog_breed is "German Shepherd".
sel = train["dog_breed"] == "German Shepherd"
train.loc[sel, "OutcomeType"].value_counts() / len(train[sel])

Adoption           0.429479
Return_to_owner    0.294790
Transfer           0.231258
Euthanasia         0.040661
Died               0.003812
Name: OutcomeType, dtype: float64

In [31]:
# Share of each outcome among the rows whose dog_breed is "Yorkshire Terrier".
sel = train["dog_breed"] == "Yorkshire Terrier"
train.loc[sel, "OutcomeType"].value_counts() / len(train[sel])

Adoption           0.389423
Return_to_owner    0.360577
Transfer           0.201923
Euthanasia         0.048077
Name: OutcomeType, dtype: float64

In [28]:
# Share of each outcome among the rows whose dog_breed is "Labrador Retriever".
sel = train["dog_breed"] == "Labrador Retriever"
train.loc[sel, "OutcomeType"].value_counts() / len(train[sel])

Adoption           0.439978
Return_to_owner    0.267448
Transfer           0.244556
Euthanasia         0.044668
Died               0.003350
Name: OutcomeType, dtype: float64

In [42]:
# Share of each outcome among the rows whose energy level is "medium".
# NOTE(review): `energy` is only assigned to `train` in a later cell of this
# notebook (unless dc.massage_df already provides it -- TODO confirm), so this
# cell may fail on a fresh top-to-bottom run.
sel = train["energy"] == "medium"
train.loc[sel, "OutcomeType"].value_counts() / len(train[sel])

Adoption           0.423559
Return_to_owner    0.272763
Transfer           0.259588
Euthanasia         0.039786
Died               0.004305
Name: OutcomeType, dtype: float64

In [28]:
# restrict training set by available data: keep only rows whose akc_name has
# a matching dog_breed entry in the AKC table.
# `isin` does the membership test in one vectorized pass instead of scanning
# the AKC array once per row, and `.copy()` makes the result an independent
# frame so that adding columns to `train` afterwards does not write to a
# slice of the unfiltered data (SettingWithCopyWarning).
train = train[train.akc_name.isin(akc.dog_breed.values)].copy()

In [29]:
# Build one {dog_breed -> value} mapping per AKC attribute (energy, rank,
# size) and copy each attribute onto the training rows, looked up through the
# row's akc_name.
akc_dict = akc.set_index("dog_breed")[["energy", "rank", "size"]].to_dict()
for key, lookup in akc_dict.items():
    train[key] = train.akc_name.apply(lambda breed: lookup[breed])
# end for

In [30]:
# Hold out 40% of the rows for evaluation. The fixed random_state makes the
# split -- and therefore every score reported below -- reproducible across
# kernel restarts.
mytrain,mytest = train_test_split(train,test_size=0.4,random_state=42)

In [35]:
# Columns fed to the model.
features = ["neuter_status","age_numeric_years","is_weekend",
            "has_pit","energy","size","rank"]
# these features are linear combinations of the others
drop_features = ["neuter_status_Unknown"]

# Keep only the rows that have a value for every selected feature and report
# which fraction of each split survives. `.loc` is the label-based selector;
# `.ix` no longer exists in current pandas.
usable_mytrain_idx = mytrain[features].dropna().index
print("train utilization: %1.3f" % (float(len(usable_mytrain_idx))/len(mytrain)))
mytrain = mytrain.loc[ usable_mytrain_idx ]
usable_mytest_idx = mytest[features].dropna().index
print("test utilization: %1.3f" % (float(len(usable_mytest_idx))/len(mytest)))
mytest = mytest.loc[ usable_mytest_idx ]

# One-hot encode the training features and drop the redundant dummy column.
mytrain_features = pd.get_dummies(mytrain[features])
mytrain_features = mytrain_features.drop(drop_features,axis=1)
mytrain_outcomes = mytrain.OutcomeType

# Encode the test features, then force exactly the training columns in the
# training order. Encoding the two splits independently can yield different
# column sets (a category value present in only one split), which would
# silently misalign the model inputs. Dummy columns absent from the test
# split are filled with 0; columns unknown to the training set (including
# drop_features) are discarded.
mytest_features = pd.get_dummies(mytest[features])
mytest_features = mytest_features.reindex(columns=mytrain_features.columns,
                                          fill_value=0)
mytest_outcomes = mytest.OutcomeType

train utilization: 1.000
test utilization: 1.000


In [36]:
#estimators = ensemble.AdaBoostClassifier(
#    tree.DecisionTreeClassifier(max_depth=2)
#    ,n_estimators=500
#)
#estimators = ensemble.AdaBoostClassifier(
#    tree.DecisionTreeClassifier(max_depth=3) )
#parameters = {
    #'max_depth' : list(range(1,10)),
    #'min_samples_split' : list(range(2,10))
#    'n_estimators' : [10,50,100,200,500]
#}

In [37]:
#classifier = GridSearchCV(estimators,parameters,n_jobs=8)

In [56]:
# build classifier
# Alternatives tried earlier, kept for reference:
#classifier = linear_model.LogisticRegression()
#classifier = tree.DecisionTreeClassifier(max_depth=3)
#classifier = ensemble.RandomForestClassifier(max_depth=6)
#classifier = ensemble.AdaBoostClassifier(
#    tree.DecisionTreeClassifier(max_depth=2)
#    ,n_estimators=500
#)
# Grid-search gradient boosting over leaf size and tree depth, selecting by
# log-loss. The fixed random_state keeps the fitted model reproducible from
# run to run.
classifier = GridSearchCV(
    ensemble.GradientBoostingClassifier(random_state=0),
    {"min_samples_leaf":[1,2],"max_depth":[2,3,4,5,6]},
    scoring="log_loss",
    n_jobs=4
)

# train (trailing semicolon suppresses the estimator repr in the cell output)
classifier.fit(mytrain_features,mytrain_outcomes);

In [58]:
# Hard class predictions and per-class probabilities for the held-out rows.
predictions = classifier.predict(mytest_features)
prediction_probabilities = classifier.predict_proba(mytest_features)

# check performance
acc = accuracy_score(mytest_outcomes, predictions)
# The probability columns follow the fitted estimator's `classes_` order, so
# the one-hot truth matrix is reindexed to those same labels. Deriving the
# columns from the test labels alone misaligns (or changes the width of) the
# matrix whenever a rare outcome is absent from the test split.
class_labels = classifier.best_estimator_.classes_
true_indicators = pd.get_dummies(mytest_outcomes).reindex(columns=class_labels,
                                                          fill_value=0)
los = log_loss(true_indicators.values, prediction_probabilities)
print("accuracy = {0:1.2f} \nlog-loss = {1:1.3f}".format(acc,los) )

accuracy = 0.55 
log-loss = 1.019


In [26]:
# How many test rows the model assigns to each outcome class.
pd.Series(predictions).value_counts()

Adoption           2525
Return_to_owner    1779
Transfer           1190
Euthanasia          132
Died                  8
dtype: int64

The model over-predicts the happy outcomes (adoptions and returns to owner). In practice there are more transfers and euthanasias than it predicts; compare with the actual test-set counts below.

In [23]:
# Actual number of test rows per outcome class, for comparison with the
# predicted counts.
mytest_outcomes.value_counts()

Adoption           2322
Return_to_owner    1586
Transfer           1404
Euthanasia          309
Died                 13
Name: OutcomeType, dtype: int64