In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

from sklearn.learning_curve import learning_curve
from sklearn import metrics, cross_validation
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, accuracy_score,log_loss, f1_score
from sklearn.grid_search import GridSearchCV 
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the pre-cleaned training and test sets from the local `files/` directory.
trainDF, testDF = (
    pd.read_csv(csv_path)
    for csv_path in ('files/cleanTrain.csv', 'files/cleanTest.csv')
)

In [3]:
# Preview the first rows of the raw training frame.
trainDF.head()

Unnamed: 0,AnimalID,Name,OutcomeType,AnimalType,AgeuponOutcome,Breed,Color,Sex,Neutered,Year,Month,Day,Hour,Minute
0,A671945,Has Name,Return_to_owner,Dog,52,Hybrid,Brown,Male,Neutered,2014,2,12,18,22
1,A656520,Has Name,Euthanasia,Cat,52,Hybrid,Cream,Female,Neutered,2013,10,13,12,44
2,A686464,Has Name,Adoption,Dog,104,Hybrid,Blue,Male,Neutered,2015,1,31,12,28
3,A683430,No Name,Transfer,Cat,3,Hybrid,Blue,Male,Intact,2014,7,11,19,9
4,A667013,No Name,Transfer,Dog,104,Hybrid,Tan,Male,Neutered,2013,11,15,12,52


In [4]:
# Column dtypes and non-null counts for the training frame.
trainDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26729 entries, 0 to 26728
Data columns (total 14 columns):
AnimalID          26729 non-null object
Name              26729 non-null object
OutcomeType       26729 non-null object
AnimalType        26729 non-null object
AgeuponOutcome    26729 non-null int64
Breed             26729 non-null object
Color             26729 non-null object
Sex               26729 non-null object
Neutered          26729 non-null object
Year              26729 non-null int64
Month             26729 non-null int64
Day               26729 non-null int64
Hour              26729 non-null int64
Minute            26729 non-null int64
dtypes: int64(6), object(8)
memory usage: 2.9+ MB


In [5]:
# Column dtypes and non-null counts for the test frame.
testDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11456 entries, 0 to 11455
Data columns (total 13 columns):
ID                11456 non-null int64
Name              11456 non-null object
AnimalType        11456 non-null object
AgeuponOutcome    11456 non-null int64
Breed             11456 non-null object
Color             11456 non-null object
Sex               11456 non-null object
Neutered          11456 non-null object
Year              11456 non-null int64
Month             11456 non-null int64
Day               11456 non-null int64
Hour              11456 non-null int64
Minute            11456 non-null int64
dtypes: int64(7), object(6)
memory usage: 1.1+ MB


In [6]:
# Keep only the predictor columns so both frames share the same schema:
# the training frame loses its identifier and the target column,
# the test frame loses its identifier.
train = trainDF.drop(labels=['AnimalID', 'OutcomeType'], axis=1)
test = testDF.drop(labels='ID', axis=1)

In [7]:
# Confirm the training features after dropping the identifier and target columns.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26729 entries, 0 to 26728
Data columns (total 12 columns):
Name              26729 non-null object
AnimalType        26729 non-null object
AgeuponOutcome    26729 non-null int64
Breed             26729 non-null object
Color             26729 non-null object
Sex               26729 non-null object
Neutered          26729 non-null object
Year              26729 non-null int64
Month             26729 non-null int64
Day               26729 non-null int64
Hour              26729 non-null int64
Minute            26729 non-null int64
dtypes: int64(6), object(6)
memory usage: 2.4+ MB


In [8]:
# Confirm the test features after dropping the identifier column.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11456 entries, 0 to 11455
Data columns (total 12 columns):
Name              11456 non-null object
AnimalType        11456 non-null object
AgeuponOutcome    11456 non-null int64
Breed             11456 non-null object
Color             11456 non-null object
Sex               11456 non-null object
Neutered          11456 non-null object
Year              11456 non-null int64
Month             11456 non-null int64
Day               11456 non-null int64
Hour              11456 non-null int64
Minute            11456 non-null int64
dtypes: int64(6), object(6)
memory usage: 1.0+ MB


In [9]:
# Transform data into numeric values: every object-typed column is replaced
# by the integer codes produced by a LabelEncoder.
lb = LabelEncoder()
categorical_columns = train.columns[train.dtypes == 'object']
for var in categorical_columns:
    # Fit on train and test together so that a category present in only one
    # of the two frames still receives a code and transform() cannot fail on it.
    full_data = pd.concat((train[var], test[var]), axis=0).astype('str')
    lb.fit(full_data)
    # Assign the whole column rather than writing through `.loc[:, var]`:
    # direct assignment replaces the column together with its dtype, whereas
    # `.loc[:, var] = ...` is an in-place set on recent pandas and would leave
    # the encoded integers stored in an object-dtype column.
    train[var] = lb.transform(train[var].astype('str'))
    test[var] = lb.transform(test[var].astype('str'))

In [10]:
# Preview the training features after label encoding.
train.head()

Unnamed: 0,Name,AnimalType,AgeuponOutcome,Breed,Color,Sex,Neutered,Year,Month,Day,Hour,Minute
0,0,1,52,0,4,1,1,2014,2,12,18,22
1,0,0,52,0,8,0,1,2013,10,13,12,44
2,0,1,104,0,3,1,1,2015,1,31,12,28
3,1,0,3,0,3,1,0,2014,7,11,19,9
4,1,1,104,0,23,1,1,2013,11,15,12,52


In [11]:
# Feature matrix and target labels for the labelled (training) data.
X_all = train.values
y_all = trainDF['OutcomeType'].values

# Feature matrix for the unlabelled test data used for the submission.
x_test = test.values

In [12]:
# Hold out 30% of the labelled rows for evaluation; the seed is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X_all,
    y_all,
    random_state=67,
    test_size=0.30,
)

In [13]:
def train_classifier(clf, X_train, y_train):
    """Fit ``clf`` on the given data and print how long the fit took.

    Parameters
    ----------
    clf : object exposing ``fit(X, y)``
        The estimator to train; it is fitted in place.
    X_train, y_train
        Features and labels forwarded unchanged to ``clf.fit``.

    Returns
    -------
    None
        Progress and wall-clock fitting time are printed, not returned.
    """
    model_name = clf.__class__.__name__
    print("Training {}...".format(model_name))
    started_at = time.time()
    clf.fit(X_train, y_train)
    elapsed = time.time() - started_at
    print("Done!\nTraining time (secs): {:.3f}".format(elapsed))
    
def predict_labels(clf, X_train, y_train):
    """Time one ``clf.predict`` call, then return cross-validated log-loss scores.

    Parameters
    ----------
    clf : estimator exposing ``predict``
        Expected to be fitted already, since ``predict`` is called on it.
    X_train, y_train
        Features and labels to score. Despite the names, ``train_predict``
        also passes its held-out split through these parameters.

    Returns
    -------
    The result of ``cross_val_score(clf, X_train, y_train, scoring='log_loss')``.
    """
    model_name = clf.__class__.__name__
    print("Predicting labels using {}...".format(model_name))
    started_at = time.time()
    clf.predict(X_train)  # called only to be timed; the predictions are discarded
    elapsed = time.time() - started_at
    print("Done!\nPrediction time (secs): {:.3f}".format(elapsed))
    # NOTE(review): cross_val_score re-fits the estimator on folds of the data
    # passed here, so the returned scores do not appear to evaluate the model
    # fitted by train_classifier -- confirm this is what is intended.
    return cross_val_score(clf, X_train, y_train, scoring= 'log_loss')

def train_predict(clf, X_train, y_train, X_test, y_test):
    """Train ``clf`` and print its log-loss scores for both data splits.

    Calls ``train_classifier`` on the training split, then ``predict_labels``
    once for the training split and once for the test split, printing each
    returned score. Nothing is returned.
    """
    print ("------------------------------------------")
    print ("Training set size: {}".format(len(X_train)))
    train_classifier(clf, X_train, y_train)

    train_scores = predict_labels(clf, X_train, y_train)
    print ("Log_loss score for training set: {}".format(train_scores))

    test_scores = predict_labels(clf, X_test, y_test)
    print ("Log_loss for test set: {}".format(test_scores))

In [14]:
# Loading the classifiers that will be compared.
# The two tree ensembles draw random numbers while fitting, so they are seeded
# to make the comparison repeatable; 10 matches the random_state used for the
# tuned GradientBoostingClassifier instances later in the notebook.
clf = GaussianNB()
rfc = RandomForestClassifier(random_state=10)
gbc = GradientBoostingClassifier(random_state=10)
knn = KNeighborsClassifier()
log = LogisticRegression()

In [None]:
# Train and score every baseline model on the same train/test split,
# in the order: GaussianNB, random forest, gradient boosting, k-NN, logistic regression.
for estimator in (clf, rfc, gbc, knn, log):
    train_predict(estimator, X_train, y_train, X_test, y_test)

------------------------------------------
Training set size: 18710
Training GaussianNB...
Done!
Training time (secs): 0.049
Predicting labels using GaussianNB...
Done!
Prediction time (secs): 0.009
Log_loss score for training set: [-1.35270304 -1.4415109  -1.40703546]
Predicting labels using GaussianNB...
Done!
Prediction time (secs): 0.002
Log_loss for test set: [-1.42382185 -1.46435222 -1.43835775]
------------------------------------------
Training set size: 18710
Training RandomForestClassifier...
Done!
Training time (secs): 0.145
Predicting labels using RandomForestClassifier...
Done!
Prediction time (secs): 0.029
Log_loss score for training set: [-2.31849779 -2.40809908 -2.28586337]
Predicting labels using RandomForestClassifier...
Done!
Prediction time (secs): 0.013
Log_loss for test set: [-2.53346081 -2.49015741 -2.5197616 ]
------------------------------------------
Training set size: 18710
Training GradientBoostingClassifier...
Done!
Training time (secs): 4.939
Predicting la

In [None]:
# Exhaustive grid search over GradientBoostingClassifier hyper-parameters,
# scored with 'log_loss' under 5-fold cross-validation.
# NOTE: this grid has 10 * 6 * 5 * 19 * 6 * 11 = 376,200 combinations, each
# fitted 5 times, so the cell is very expensive to run.
param_test1 = {
    'n_estimators': list(range(90, 100)),
    'max_depth': list(range(5, 16, 2)),
    'min_samples_leaf': list(range(30, 71, 10)),
    'min_samples_split': list(range(10, 200, 10)),
    'max_features': list(range(1, 12, 2)),
    # Exactly one 'subsample' entry: a dict literal with a repeated key
    # silently keeps only the last value, so a second list would never be
    # searched. Dividing by 100.0 keeps the fractions correct even where `/`
    # is integer division.
    'subsample': [i / 100.0 for i in range(90, 101)],
}
# The fixed keyword values on the base estimator are starting points only;
# every one that also appears in param_test1 is overridden during the search.
gsearch1 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1,
        min_samples_split=500,
        min_samples_leaf=50,
        max_depth=8,
        max_features='sqrt',
        subsample=0.8,
        random_state=10,
    ),
    param_grid=param_test1,
    scoring='log_loss',
    n_jobs=4,
    iid=False,
    cv=5,
)
gsearch1.fit(X_train, y_train)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

In [None]:
# Determining the best tradeoff between the number of estimators and the learning rate.
# Candidate 1: learning_rate=0.1 with 93 estimators.
clf2 = GradientBoostingClassifier(
    n_estimators=93,
    learning_rate=0.1,
    max_depth=7,
    max_features=3,
    min_samples_leaf=40,
    min_samples_split=170,
    subsample=0.96,
    random_state=10,
)

In [None]:
# Candidate 2: learning_rate=0.05 with 186 estimators; other settings as for clf2.
clf3 = GradientBoostingClassifier(
    n_estimators=186,
    learning_rate=0.05,
    max_depth=7,
    max_features=3,
    min_samples_leaf=40,
    min_samples_split=170,
    subsample=0.96,
    random_state=10,
)

In [None]:
# Candidate 3: learning_rate=0.01 with 700 estimators; other settings as for clf2.
clf4 = GradientBoostingClassifier(
    n_estimators=700,
    learning_rate=0.01,
    max_depth=7,
    max_features=3,
    min_samples_leaf=40,
    min_samples_split=170,
    subsample=0.96,
    random_state=10,
)

In [None]:
# 3-fold cross-validated 'log_loss' scores for clf2 on all labelled data.
cross_val_score(clf2, X_all, y_all, cv = 3, scoring = 'log_loss')

In [None]:
# 3-fold cross-validated 'log_loss' scores for clf3 on all labelled data.
cross_val_score(clf3, X_all, y_all, cv = 3, scoring = 'log_loss')

In [None]:
# 3-fold cross-validated 'log_loss' scores for clf4 on all labelled data.
cross_val_score(clf4, X_all, y_all, cv = 3, scoring = 'log_loss')

In [None]:
# Fit the selected model (clf3) on every labelled row, then compute
# per-class probabilities for the Kaggle test features.
pred = clf3.fit(X_all, y_all).predict_proba(x_test)

In [None]:
# One column per class, named after clf3.classes_, holding the predicted probabilities.
submission = pd.DataFrame(pred, columns=clf3.classes_)

In [None]:
# Attach the test-set identifiers that were dropped before modelling.
submission['ID'] = testDF['ID']

In [None]:
# Preview the first rows of the submission frame before writing it out.
submission.head()

In [None]:
# Write the submission without the DataFrame index column.
submission.to_csv('files/submission.csv', index=False)