In [11]:
import pandas as pd
import numpy as np

#CrossVal
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import StratifiedKFold as KFold
#GridSearch
from sklearn.grid_search import GridSearchCV
#models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
#ensembles
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier


In [2]:
def IsIntact(x):
    """Collapse a sex/neuter label into one of three categories.

    Returns 'Intact' or 'Unknown' when ``x`` equals that label, and
    'NotIntact' for every other value.
    """
    # Labels that are passed through unchanged; everything else is lumped
    # together as 'NotIntact'.
    for label in ('Intact', 'Unknown'):
        if x == label:
            return label
    return 'NotIntact'

def convertToYears(ageString, default=10.0):
    """Convert an age description such as '2 years' or '3 weeks' to years.

    The leading number of ``ageString`` is divided by the number of that
    unit per year. Units are recognised by substring, in the order
    year, week, month, day.

    Parameters
    ----------
    ageString : str
        Text of the form '<number> <unit>'. Non-string values (for example
        a NaN from a missing CSV cell) are accepted and yield ``default``.
    default : float, optional
        Age in years returned when the value cannot be interpreted.
        Defaults to 10.0.

    Returns
    -------
    float
        Age in years, or ``float(default)`` when the input is not a string,
        has no parsable leading number, or names no recognised unit.
    """
    # (unit substring, how many of that unit make one year)
    units_per_year = (
        ('year', 1.0),
        ('week', 52.1),
        ('month', 12.0),
        ('day', 365.0),
    )
    try:
        for unit, per_year in units_per_year:
            if unit in ageString:
                return float(ageString.split()[0]) / per_year
    except (TypeError, ValueError, IndexError, AttributeError):
        # Only the failures a malformed or missing age can cause are
        # handled here, so unrelated errors (e.g. KeyboardInterrupt) are no
        # longer swallowed.
        return float(default)

    # No known unit: still show the offending value, but return a number
    # instead of an implicit None so the resulting column stays numeric.
    print(ageString)
    return float(default)
    
# Substrings looked for (case-sensitively) in a dog's Breed text.
common_dog_breeds = [
    u'Boxer',
    u'Rottweiler',
    u'Poodle',
    u'Yorkshire',
    u'French',
    u'Beagle',
    u'Bulldog',
    u'Golden',
    u'German',
    u'Labrador',
]

# Substrings looked for (case-sensitively) in the Breed text of non-dogs.
common_cat_breeds = [
    'Maine Coon',
    'Domestic',
    'Persian',
    'RagDoll',
    'Shorthair',
    'Fold',
    'Sphynx',
]


def IsCommon(row):
    """Report whether a row's breed contains one of the common-breed substrings.

    ``row`` must provide 'IsDog' and 'Breed'. Rows whose 'IsDog' equals True
    are checked against ``common_dog_breeds``; all other rows are checked
    against ``common_cat_breeds``. Returns True on the first substring hit,
    otherwise False.
    """
    # Equality (not truthiness) is deliberate: only values equal to True
    # select the dog list.
    if row['IsDog'] == True:
        candidates = common_dog_breeds
    else:
        candidates = common_cat_breeds

    breed_text = row['Breed']
    for fragment in candidates:
        if fragment in breed_text:
            return True
    return False

# Divisors used to scale an age in years per species
# (presumably average lifespans in years -- confirm).
doglife = 12
catlife = 15


def agePercentage(row):
    """Scale a row's 'NewAge' by the divisor for its species.

    Rows whose 'IsDog' equals True are divided by ``doglife``; every other
    row is divided by ``catlife``. Despite the name, the result is a plain
    ratio (1.0 means age == divisor), not a value multiplied by 100.
    """
    lifespan = doglife if row['IsDog'] == True else catlife
    return row['NewAge'] / lifespan


In [3]:
#Testbed for random queries on data
# Scratch cell: loads the raw training CSV and makes an unfiltered X/Y split.
# NOTE(review): the next cell re-reads train.csv and rebinds data, X and Y,
# so nothing defined here survives a top-to-bottom run.
data = pd.read_csv('train.csv')

# Split into X and Y, Remove Unneeded Columns
# (unlike the later cell, 'DateTime' is kept in X here)
X = data.drop(['OutcomeType', 'OutcomeSubtype', 'AnimalID' ], axis=1)
Y = data ['OutcomeType']

# NOTE(review): Counter is not referenced by any visible cell; if it is still
# needed, this import belongs with the others at the top of the notebook.
from collections import Counter


In [82]:
# Load the raw training and test files.
data = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Clean data: drop training rows whose sex is 'Unknown' or whose age is missing.
data = data[(data['SexuponOutcome'] != 'Unknown') & data['AgeuponOutcome'].notnull()]

# Target is the outcome label; features are everything except the outcome
# columns, the row identifier and the timestamp.
Y = data['OutcomeType']
X = data.drop(['OutcomeType', 'OutcomeSubtype', 'AnimalID', 'DateTime'], axis=1)



def preprocessing(X, old_age_years=7):
    """Build the engineered feature frame from raw shelter records.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the columns 'Name', 'AnimalType', 'SexuponOutcome',
        'AgeuponOutcome', 'Breed' and 'Color'. It is not modified.
    old_age_years : float, optional
        Animals strictly older than this many years are flagged 'Old'.
        Defaults to 7.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the six raw columns above are replaced by, in
        order: IsIntact, IsMale, IsDog, hasName, NewAge, AgePercentage, Old,
        IsPureBred, IsPopular. Any other input columns are kept as-is.
    """
    # Work on a copy: the column assignments below would otherwise write
    # into the caller's DataFrame before the first drop() rebinds X.
    X = X.copy()

    # --- Sex / neuter status -------------------------------------------
    # Values look like '<status> <sex>'; astype(str) makes missing values
    # splittable (they simply match neither 'Intact' nor 'Male').
    sex_parts = X['SexuponOutcome'].astype(str).str.split(' ')
    X['IsIntact'] = sex_parts.str.get(0) == 'Intact'
    # Single-word values (e.g. 'Unknown') have no second token -> NaN -> False.
    X['IsMale'] = sex_parts.str.get(1) == 'Male'
    X = X.drop('SexuponOutcome', axis=1)

    # --- Species ---------------------------------------------------------
    X['IsDog'] = X['AnimalType'] == 'Dog'
    X = X.drop('AnimalType', axis=1)

    # --- Named / unnamed -------------------------------------------------
    # A missing name is treated the same as the literal name 'NoName'.
    X['hasName'] = X['Name'].fillna("NoName") != 'NoName'
    X = X.drop('Name', axis=1)

    # --- Age -------------------------------------------------------------
    X['NewAge'] = X['AgeuponOutcome'].apply(convertToYears)
    # Row-wise because the divisor depends on the row's IsDog flag.
    X['AgePercentage'] = X.apply(agePercentage, axis=1)
    X['Old'] = X['NewAge'] > old_age_years
    X = X.drop('AgeuponOutcome', axis=1)

    # --- Breed -----------------------------------------------------------
    # TODO: Create Size by breed.
    # NOTE(review): anything without the substring 'Mix' counts as pure-bred;
    # confirm this is intended for breeds written as crosses of two names.
    X['IsPureBred'] = X['Breed'].apply(lambda breed: "Mix" not in breed)
    X['IsPopular'] = X.apply(IsCommon, axis=1)

    # Breed has been summarised above; Color is dropped until a useful
    # feature is derived from it.
    X = X.drop(['Breed', 'Color'], axis=1)

    return X

    
# Replace the raw training columns with the engineered feature frame.
X = preprocessing(X) 

In [83]:
# Preview the engineered features instead of rendering the whole frame.
X.head(10)

Unnamed: 0,IsIntact,IsMale,IsDog,hasName,NewAge,AgePercentage,Old,IsPureBred,IsPopular
0,False,True,True,True,1.000000,0.083333,False,False,False
1,False,False,False,True,1.000000,0.066667,False,False,True
2,False,True,True,True,2.000000,0.166667,False,False,False
3,True,True,False,False,0.057582,0.003839,False,False,True
4,False,True,True,False,2.000000,0.166667,False,True,True
5,True,False,True,True,0.083333,0.006944,False,True,False
6,True,True,False,True,0.057582,0.003839,False,False,True
8,False,False,True,True,0.416667,0.034722,False,False,False
9,False,False,True,False,1.000000,0.083333,False,True,False
11,False,False,True,False,2.000000,0.166667,False,False,False


In [84]:
#CROSS VALIDATION
# Spot-checks several classifiers, then a soft-voting ensemble of all of
# them, on the same stratified folds. Scores are printed as mean +/- std
# across folds; with this scorer they come out negative (see the recorded
# output), so values closer to zero are better.

seed = 7
processors=1
num_folds=10
num_instances=len(X)
scoring='log_loss'

# NOTE(review): random_state presumably has no effect unless shuffling is
# enabled -- confirm against the installed scikit-learn version.
kfold = KFold( Y, n_folds=num_folds, random_state=seed)

#Algorithm Spot Check

models = []
models.append(('LR', LogisticRegression()))
models.append(('Random Forest', RandomForestClassifier(random_state = seed, max_depth=8, bootstrap=True, n_estimators= 200)))
models.append(('Extra Trees', ExtraTreesClassifier(random_state = seed, max_depth=8, bootstrap=True,n_estimators=100)))
models.append(('GB', GradientBoostingClassifier(random_state = seed, subsample = 0.8 )))

# Evaluate each model in turn
results = []
names = []

for name, model in models:
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring, n_jobs=processors)
    results.append(cv_results)
    names.append(name)
    print("{0}: ({1:.3f}) +/- ({2:.3f})".format(name, cv_results.mean(), cv_results.std()))


#TODO: Hyperparameter tuning 

#Final classifier is a vote using the best classifiers, perhaps using weights     
finalmodels = models
voter = VotingClassifier(finalmodels, voting ='soft')
# NOTE: this rebinds `results` from the per-model list to the ensemble's
# fold scores.
results = cross_val_score(voter, X, Y, cv=kfold, scoring=scoring,n_jobs=processors)
# Function-call form with a single formatted string: prints the same text as
# before and, like the print() in the loop above, also runs under Python 3.
print("Voting Clf: ({0:.3f}) +/- ({1:.3f})".format(results.mean(), results.std()))



LR: (-0.936) +/- (0.013)
Random Forest: (-0.866) +/- (0.012)
Extra Trees: (-0.879) +/- (0.012)
GB: (-0.867) +/- (0.014)
Voting Clf: (-0.874) +/- (0.012)


In [77]:
# Fit the voting ensemble on the full training set and score the test file.
test = pd.read_csv("test.csv")

# Keep the IDs as an (n_rows, 1) column so they can be stacked next to the
# probabilities later; -1 lets numpy infer the row count instead of
# hard-coding the size of one particular test.csv.
ids = test.ID.values.reshape(-1, 1)

test = test.drop(["ID", 'DateTime'], axis =1)
test = preprocessing(test)
# Put the test features in the training column order (and fail loudly if a
# column is missing) before handing them to the fitted model.
test = test[X.columns]

voter.fit(X,Y)
res = voter.predict_proba(test)

In [78]:
# Assemble the submission: one ID column followed by one probability column
# per class, in the order reported by the fitted voter.
reswithid = np.concatenate((ids, res), axis=1)
headers = np.insert(voter.classes_, 0, "ID")

final = pd.DataFrame(reswithid, columns=headers)
# The stacked array presumably holds the IDs as floats, hence the cast back.
final["ID"] = final["ID"].astype(int)
final.to_csv("results.csv", index=False)

In [53]:
# Hyperparameter search for the random forest.
# Seeded like every other model in this notebook so the search is repeatable
# (`seed` is defined in the cross-validation cell).
testmodel = RandomForestClassifier(random_state=seed)

# A plain dict: the stray trailing comma that used to follow the closing
# brace turned this into a one-element tuple by accident.
param_grid = {
    'n_estimators': [100, 200],
    'criterion': ['gini', 'entropy'],
    'max_depth': [8, 10],
    'bootstrap': [True]
}

# NOTE(review): no scoring= or cv= is passed, so this presumably tunes on
# GridSearchCV's defaults rather than on the log-loss folds used for the
# spot check -- confirm that is intended.
clf = GridSearchCV(testmodel, param_grid)



In [54]:
# Run the grid search on the training features, then display the winning
# parameter combination as the cell's output.
clf.fit(X,Y)
clf.best_params_

{'bootstrap': True, 'criterion': 'gini', 'max_depth': 8, 'n_estimators': 200}

In [None]:
IsIntact	IsMale	IsDog	hasName	NewAge
LR: (-0.944) +/- (0.014)
Random Forest: (-1.143) +/- (0.101)
Extra Trees: (-1.244) +/- (0.116)
GB: (-0.868) +/- (0.014)
Voting Clf: (-0.879) +/- (0.014)
    
IsIntact	IsMale	IsDog	hasName	NewAge	AgePercentage
LR: (-0.944) +/- (0.014)
Random Forest: (-1.152) +/- (0.106)
Extra Trees: (-1.243) +/- (0.117)
GB: (-0.869) +/- (0.014)
Voting Clf: (-0.877) +/- (0.015)
In [18]:

IsIntact	IsMale	IsDog	hasName	NewAge	AgePercentage	Old    
LR: (-0.941) +/- (0.014)
Random Forest: (-1.140) +/- (0.099)
Extra Trees: (-1.242) +/- (0.119)
GB: (-0.868) +/- (0.014)
Voting Clf: (-0.876) +/- (0.015)

-----Tuned classifiers------
IsIntact	IsMale	IsDog	hasName	NewAge	AgePercentage	Old	IsPureBred	IsPopular
LR: (-0.936) +/- (0.013)
Random Forest: (-0.866) +/- (0.012)
Extra Trees: (-0.879) +/- (0.012)
GB: (-0.867) +/- (0.014)
Voting Clf: (-0.874) +/- (0.012)
