In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("Resources/pets.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0.1,Unnamed: 0,Animal ID,DateTime_intake,Intake_Type,Condition_intake,Animal_Type_intake,Sex,Age_intake,Breed_intake,Color_intake,...,beagle,terrier,boxer,poodle,rottweiler,dachshund,chihuahua,pit bull,DateTime_length,Days_length
0,0,A730601,2016-07-07 12:11:00,Stray,Normal,Cat,Male,7 months,Domestic Shorthair Mix,Blue Tabby,...,0,0,0,0,0,0,0,0,0 days 20:49:00.000000000,0-7 days
1,1,A683644,2014-07-13 11:02:00,Owner Surrender,Nursing,Dog,Female,4 weeks,Border Collie Mix,Brown/White,...,0,0,0,0,0,0,0,0,115 days 23:04:00.000000000,12 weeks - 6 months
2,2,A676515,2014-04-11 08:45:00,Stray,Normal,Dog,Male,2 months,Pit Bull Mix,White/Brown,...,0,0,0,0,0,0,0,1,3 days 09:53:00.000000000,0-7 days
3,3,A742953,2017-01-31 13:30:00,Stray,Normal,Dog,Male,2 years,Saluki,Sable/Cream,...,0,0,0,0,0,0,0,0,4 days 00:47:00.000000000,0-7 days
4,4,A679549,2014-05-22 15:43:00,Stray,Normal,Cat,Male,1 month,Domestic Shorthair Mix,Black/White,...,0,0,0,0,0,0,0,0,24 days 22:11:00.000000000,3-6 weeks


In [3]:
pet_columns = ["Intake_Type", "Condition_intake", "Animal_Type_intake", "Sex",
               "Color_intake", "Outcome_Type", "Age_Bucket", "Days_length", "retriever", "shepherd", "beagle", 
               "terrier", "boxer", "poodle", "rottweiler", "dachshund", "chihuahua", "pit bull"]

df = df[pet_columns].copy()
df.head()

Unnamed: 0,Intake_Type,Condition_intake,Animal_Type_intake,Sex,Color_intake,Outcome_Type,Age_Bucket,Days_length,retriever,shepherd,beagle,terrier,boxer,poodle,rottweiler,dachshund,chihuahua,pit bull
0,Stray,Normal,Cat,Male,Blue Tabby,Transfer,7-12 months,0-7 days,0,0,0,0,0,0,0,0,0,0
1,Owner Surrender,Nursing,Dog,Female,Brown/White,Adoption,1-6 weeks,12 weeks - 6 months,0,0,0,0,0,0,0,0,0,0
2,Stray,Normal,Dog,Male,White/Brown,Return to Owner,1-6 months,0-7 days,0,0,0,0,0,0,0,0,0,1
3,Stray,Normal,Dog,Male,Sable/Cream,Transfer,1-3 years,0-7 days,0,0,0,0,0,0,0,0,0,0
4,Stray,Normal,Cat,Male,Black/White,Transfer,1-6 months,3-6 weeks,0,0,0,0,0,0,0,0,0,0


In [4]:
df["Outcome_Type"].unique()

array(['Transfer', 'Adoption', 'Return to Owner', 'Euthanasia', 'Died',
       'Rto-Adopt', 'Missing', 'Disposal'], dtype=object)

In [5]:
#Reduce to "Adoption" or "Not" for a binary outcome from model
df = df.replace(['Transfer', 'Return to Owner', 'Euthanasia', 'Died', 'Rto-Adopt', 'Missing', 'Disposal'], 'Not Adopted')
df.head()

Unnamed: 0,Intake_Type,Condition_intake,Animal_Type_intake,Sex,Color_intake,Outcome_Type,Age_Bucket,Days_length,retriever,shepherd,beagle,terrier,boxer,poodle,rottweiler,dachshund,chihuahua,pit bull
0,Stray,Normal,Cat,Male,Blue Tabby,Not Adopted,7-12 months,0-7 days,0,0,0,0,0,0,0,0,0,0
1,Owner Surrender,Nursing,Dog,Female,Brown/White,Adoption,1-6 weeks,12 weeks - 6 months,0,0,0,0,0,0,0,0,0,0
2,Stray,Normal,Dog,Male,White/Brown,Not Adopted,1-6 months,0-7 days,0,0,0,0,0,0,0,0,0,1
3,Stray,Normal,Dog,Male,Sable/Cream,Not Adopted,1-3 years,0-7 days,0,0,0,0,0,0,0,0,0,0
4,Stray,Normal,Cat,Male,Black/White,Not Adopted,1-6 months,3-6 weeks,0,0,0,0,0,0,0,0,0,0


In [6]:
df["Outcome_Type"].unique()

array(['Not Adopted', 'Adoption'], dtype=object)

In [7]:
X = df[["Intake_Type", "Condition_intake", "Animal_Type_intake", "Sex", "Age_Bucket", "retriever", "shepherd", "beagle", 
               "terrier", "boxer", "poodle", "rottweiler", "dachshund", "chihuahua", "pit bull","Days_length"]]
y = df["Outcome_Type"]
print(X.shape, y.shape)

(63429, 16) (63429,)


In [8]:
data = X.copy()

data_encoded = pd.get_dummies(data)
data_encoded.head()

Unnamed: 0,retriever,shepherd,beagle,terrier,boxer,poodle,rottweiler,dachshund,chihuahua,pit bull,...,Age_Bucket_7-12 months,Age_Bucket_Less than 1 week,Days_length_0-7 days,Days_length_1-2 years,Days_length_1-3 weeks,Days_length_12 weeks - 6 months,Days_length_2+ years,Days_length_3-6 weeks,Days_length_6-12 months,Days_length_7-12 weeks
0,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


#split training and testing data
X_train, X_test, y_train, y_test = train_test_split(data_encoded, y, random_state=115, stratify=y)

In [10]:
X_train.head()


Unnamed: 0,retriever,shepherd,beagle,terrier,boxer,poodle,rottweiler,dachshund,chihuahua,pit bull,...,Age_Bucket_7-12 months,Age_Bucket_Less than 1 week,Days_length_0-7 days,Days_length_1-2 years,Days_length_1-3 weeks,Days_length_12 weeks - 6 months,Days_length_2+ years,Days_length_3-6 weeks,Days_length_6-12 months,Days_length_7-12 weeks
6552,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
42614,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
23235,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4492,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
25843,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [11]:
#Random Forest

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf = rf.fit(X_train, y_train)
print(rf.score(X_train, y_train))
print(rf.score(X_test, y_test))

0.7563851926593933
0.7185017026106697


In [14]:
sorted(zip(rf.feature_importances_, X), reverse=True)

[(0.060548546455157434, 'dachshund'),
 (0.042316899268438284, 'rottweiler'),
 (0.0314535779907261, 'chihuahua'),
 (0.02965532157132695, 'boxer'),
 (0.02246883262867711, 'Intake_Type'),
 (0.021901505219850292, 'Sex'),
 (0.021736251136381904, 'terrier'),
 (0.014987239680506874, 'Condition_intake'),
 (0.012742714615417998, 'beagle'),
 (0.011635613019603707, 'retriever'),
 (0.011213547195765063, 'Age_Bucket'),
 (0.00948277661976783, 'Animal_Type_intake'),
 (0.006293409774738232, 'shepherd'),
 (0.0021374606673656774, 'pit bull'),
 (0.0017007495512529845, 'poodle'),
 (0.0010054461480617858, 'Days_length')]

In [16]:
from sklearn.model_selection import GridSearchCV

param_grid = {'max_depth': [4, 7, 10], 'n_estimators': [20, 50, 100]}
grid = GridSearchCV(rf, param_grid, verbose=3)

In [17]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] max_depth=4, n_estimators=20 ....................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ........ max_depth=4, n_estimators=20, score=0.684, total=   0.3s
[CV] max_depth=4, n_estimators=20 ....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV] ........ max_depth=4, n_estimators=20, score=0.681, total=   0.2s
[CV] max_depth=4, n_estimators=20 ....................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s


[CV] ........ max_depth=4, n_estimators=20, score=0.700, total=   0.2s
[CV] max_depth=4, n_estimators=20 ....................................
[CV] ........ max_depth=4, n_estimators=20, score=0.687, total=   0.2s
[CV] max_depth=4, n_estimators=20 ....................................
[CV] ........ max_depth=4, n_estimators=20, score=0.703, total=   0.3s
[CV] max_depth=4, n_estimators=50 ....................................
[CV] ........ max_depth=4, n_estimators=50, score=0.684, total=   0.5s
[CV] max_depth=4, n_estimators=50 ....................................
[CV] ........ max_depth=4, n_estimators=50, score=0.694, total=   0.5s
[CV] max_depth=4, n_estimators=50 ....................................
[CV] ........ max_depth=4, n_estimators=50, score=0.692, total=   0.5s
[CV] max_depth=4, n_estimators=50 ....................................
[CV] ........ max_depth=4, n_estimators=50, score=0.689, total=   0.5s
[CV] max_depth=4, n_estimators=50 ....................................
[CV] .

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:   36.2s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              ra

In [18]:
print(grid.best_params_)
print(grid.best_score_)

{'max_depth': 10, 'n_estimators': 100}
0.7250846792585222


In [19]:
grid.score(X_train, y_train)

0.7322948855395094

In [20]:
grid.score(X_test, y_test)

0.7179341657207718

In [21]:
predictions = grid.predict(X_test)

In [22]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

    Adoption       0.67      0.72      0.70      7118
 Not Adopted       0.76      0.72      0.74      8740

    accuracy                           0.72     15858
   macro avg       0.72      0.72      0.72     15858
weighted avg       0.72      0.72      0.72     15858

