In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp

import random
import math

from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.metrics import classification_report, accuracy_score

import features as feat

# Load the raw training data; the relative path means train.csv must be in the
# current working directory.
df = pd.read_csv("train.csv")

def prepare_features(df: pd.DataFrame) -> None:
    """Add the engineered feature columns to ``df`` and drop the raw columns.

    The frame is modified in place and nothing is returned. Missing values in
    'SexuponOutcome' and 'AgeuponOutcome' are replaced with placeholders first,
    then each derived column is computed by applying one helper from the
    ``features`` module (imported as ``feat``) element-wise to a source column.
    Finally the raw identifier/text columns are removed.

    Args:
        df: Frame that must contain the columns 'AnimalID', 'Name', 'DateTime',
            'OutcomeSubtype', 'AnimalType', 'SexuponOutcome', 'AgeuponOutcome',
            'Breed' and 'Color'.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # df[...]: the chained in-place form updates a temporary object under
    # pandas Copy-on-Write and would leave df unchanged.
    df['SexuponOutcome'] = df['SexuponOutcome'].fillna('Unknown')
    df['AgeuponOutcome'] = df['AgeuponOutcome'].fillna("-1 months")

    # (new column, source column, element-wise transform), applied in order.
    # Order matters: it fixes the column order of the resulting frame, and
    # 'IsYoung' reads 'MonthsuponOutcome', which is created earlier in the list.
    derived_columns = (
        ('HasName', 'Name', feat.hasName),
        ('NameLength', 'Name', feat.getNameLength),
        ('ColourLength', 'Color', feat.getColourLength),
        ('Intact', 'SexuponOutcome', feat.intact),
        ('Gender', 'SexuponOutcome', feat.gender),
        ('MonthsuponOutcome', 'AgeuponOutcome', feat.convertAgeToMonths),
        ('YearsuponOutcome', 'AgeuponOutcome', feat.convertAgeToYears),
        ('DaysuponOutcome', 'AgeuponOutcome', feat.convertAgeToDays),
        ('WeeksuponOutcome', 'AgeuponOutcome', feat.convertAgeToWeeks),
        ('IsYoung', 'MonthsuponOutcome', feat.isYoung),
        ('Season', 'DateTime', feat.timeToSeason),
        ('TimeofDay', 'DateTime', feat.timeToHourOfDay),
        ('DayofWeek', 'DateTime', feat.dayOfWeek),
        ('MonthofYear', 'DateTime', feat.monthOfYear),
        ('Year', 'DateTime', feat.timeToYear),
        ('IsShihTzu', 'Breed', feat.isShihTzu),
        ('IsAggressive', 'Breed', feat.isAggressive),
        ('IsMix', 'Breed', feat.isMix),
        ('IsMultiColour', 'Color', feat.isMultiColour),
        ('IsToy', 'Breed', feat.isToy),
        ('IsWorking', 'Breed', feat.isWorking),
        ('IsSporting', 'Breed', feat.isSporting),
        ('IsHound', 'Breed', feat.isHound),
        ('IsNonSporting', 'Breed', feat.isNonSporting),
        ('IsHerding', 'Breed', feat.isHerding),
        ('IsTerrier', 'Breed', feat.isTerrier),
        ('IsShortHair', 'Breed', feat.isShortHair),
        ('IsMediumHair', 'Breed', feat.isMediumHair),
        ('IsLongHair', 'Breed', feat.isLongHair),
        ('IsTabby', 'Color', feat.isTabby),
        # Overwrites the existing column, so it keeps its original position.
        ('AnimalType', 'AnimalType', feat.convertAnimalType),
    )
    for new_column, source_column, transform in derived_columns:
        df[new_column] = df[source_column].apply(transform)

    # Remove the raw columns now that every feature derived from them exists.
    df.drop(['AnimalID',
             'DateTime',
             'OutcomeSubtype',
             'SexuponOutcome',
             'AgeuponOutcome',
             'Breed',
             'Color',
             'Name'], axis=1, inplace=True)
    
    
prepare_features(df)

# Shuffle the rows, then use the first 40% for training and the remaining 60%
# for evaluation.
# NOTE(review): the permutation is unseeded, so the split and the reported
# scores change from run to run.
df = df.iloc[np.random.permutation(len(df))]
df.reset_index(inplace=True, drop=True)
split = int(len(df) * .4)

# Use positional, end-exclusive slicing for all four pieces so the training rows
# [0, split) and the test rows [split, end) never overlap. Mixing iloc with the
# end-inclusive loc previously put row `split` in both sets.
# Column 0 is skipped for the features; this assumes 'OutcomeType' is the first
# column left after prepare_features -- TODO confirm against train.csv.
xTrain = df.iloc[:split, 1:]
yTrain = df['OutcomeType'].iloc[:split]

xTest = df.iloc[split:, 1:]
yTest = df['OutcomeType'].iloc[split:]



# Train classifier.
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
xTrain = xTrain.to_numpy()
yTrain = np.array(yTrain)
# Mean of the feature count and the class count. Not used in this section;
# kept in case later cells rely on it.
nodes = (len(xTrain[0]) + len(np.unique(yTrain))) / 2

clf = RandomForestClassifier(n_estimators = 100,
                            max_features = int(math.sqrt(len(xTrain[0]))),
                             criterion = 'entropy'
                            )

rfc = clf.fit(xTrain,yTrain)
# Predict on an ndarray as well, matching the type the model was fitted on.
yPred = rfc.predict(xTest.to_numpy())
print(classification_report(yTest, yPred))
print("Accuracy: %1.3f" % accuracy_score(yTest, yPred))


             precision    recall  f1-score   support

   Adoption       0.67      0.81      0.73      6428
       Died       0.00      0.00      0.00       121
 Euthanasia       0.56      0.20      0.29       926
Return_to_owner       0.46      0.44      0.45      2867
   Transfer       0.75      0.68      0.71      5696

avg / total       0.65      0.66      0.64     16038

Accuracy: 0.657
