In [1]:
#Import Dependencies
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

import numpy as np
from collections import Counter 

In [2]:
coded_data=pd.read_csv("../Resources/CodedData.csv")
coded_data.head()

Unnamed: 0,type_assist,type_other,type_stray,type_surr,cond_aged,cond_inj,cond_norm,cond_other,cond_sick,Age_Adult,...,hound,herding,toy,terrier,pit bull,cur,g_misc,small,med,large
0,1,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,1,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,1,0,0,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0
4,1,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0


In [3]:
coded_data.columns

Index(['type_assist', 'type_other', 'type_stray', 'type_surr', 'cond_aged',
       'cond_inj', 'cond_norm', 'cond_other', 'cond_sick', 'Age_Adult',
       'Age_Puppy', 'Age_Senior', 'Age_Young', 'Month_Apr', 'Month_Aug',
       'Month_Dec', 'Month_Feb', 'Month_Jan', 'Month_July', 'Month_June',
       'Month_Mar', 'Month_May', 'Month_Nov', 'Month_Oct', 'Month_Sep',
       'Color_Black', 'Color_Black/Brown', 'Color_Black/Tan',
       'Color_Black/White', 'Color_Brindle', 'Color_Brown', 'Color_Brown/Tan',
       'Color_Brown/White', 'Color_Gray', 'Color_Gray/White', 'Color_Merle',
       'Color_Red', 'Color_Red/White', 'Color_Sable', 'Color_Tan',
       'Color_Tricolor', 'Color_White', 'mix', 'sex', 'snstatus', 'restrict',
       'prior', 'LongStay', 'sporting', 'working', 'non-sporting', 'hound',
       'herding', 'toy', 'terrier', 'pit bull', 'cur', 'g_misc', 'small',
       'med', 'large'],
      dtype='object')

In [4]:
#Define feature set, dropping outcome
X=coded_data.drop(columns="LongStay")



In [6]:
#Create a subset of features, based on preliminary analysis.
#The Five binary features are included because they include basic demographic information (sex, spay/neuter), 
#or a priori hypothesis of risk (mixed breed, restricted breed, prior encounters)
#Other features were chosen based on preliminary analysis. Very rare features were excluded.
#Given that the split is at the 75th percentile, 25% is the expected proportion for long stays.
#Common features with greater than 5% difference from expected were included in the subset of features.
Xsub=coded_data[["sex","snstatus","restrict","prior","mix","type_surr","cond_inj","pit bull", "cur","non-sporting",
                "toy", "Age_Young","small","large","Color_Brindle","Color_Brown/White","Color_Gray/White",
                "Color_White"]]

Xsub.head()

Unnamed: 0,sex,snstatus,restrict,prior,mix,type_surr,cond_inj,pit bull,cur,non-sporting,toy,Age_Young,small,large,Color_Brindle,Color_Brown/White,Color_Gray/White,Color_White
0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
1,0,1,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,1,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
4,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0


In [7]:
#Created second subset with less restrictive criteria, included >3% difference 
Xsub2=coded_data[["sex","snstatus","restrict","prior","mix","type_surr","cond_inj","pit bull", "cur","non-sporting",
                "toy", "Age_Young","small","large","Color_Brindle","Color_Brown/White","Color_Gray/White",
                "Color_White","type_assist","hound","sporting","terrier","Age_Senior","Color_Black/Tan","Color_Brown/Tan","Color_Tricolor","Month_Nov"]]
Xsub2.head()

Unnamed: 0,sex,snstatus,restrict,prior,mix,type_surr,cond_inj,pit bull,cur,non-sporting,...,Color_White,type_assist,hound,sporting,terrier,Age_Senior,Color_Black/Tan,Color_Brown/Tan,Color_Tricolor,Month_Nov
0,0,1,0,0,1,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
1,0,1,0,1,1,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
2,0,1,0,1,1,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
3,0,1,0,0,0,1,0,0,0,0,...,0,0,1,0,0,1,0,0,1,0
4,0,1,0,0,0,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,1


In [8]:
#Outcome is prolonged length of stay
y=coded_data["LongStay"]

# Random Forest

In [9]:
#Split into training and testing sets, using default settings of 75/25 stratified split
X_train, X_test, y_train, y_test=train_test_split(X,y,random_state=0)

In [10]:
from sklearn.ensemble import RandomForestClassifier
rf_model=RandomForestClassifier(n_estimators=100, random_state=0)
#Fit classifier to training set
rf_model= rf_model.fit(X_train, y_train)
#Make prediction
predictions = rf_model.predict(X_test)
#Evaluate model
cm=confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, index = ["Actual 0", "Actual 1"], 
                      columns=["Predicted 0", "Predicted 1"])
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,11988,1466
Actual 1,3451,1426


In [11]:
# Score the model
print("Accuracy score (training): {0:.3f}".format(
    rf_model.score(X_train,y_train)))
print("Accuracy score (validation): {0:.3f}".format(
    rf_model.score(X_test,y_test)))
print(classification_report(y_test, predictions))

Accuracy score (training): 0.921
Accuracy score (validation): 0.732
              precision    recall  f1-score   support

           0       0.78      0.89      0.83     13454
           1       0.49      0.29      0.37      4877

    accuracy                           0.73     18331
   macro avg       0.63      0.59      0.60     18331
weighted avg       0.70      0.73      0.71     18331



In [12]:
#Get top 10 features from random forest model
importances = rf_model.feature_importances_
feature_names=X.columns

d={"feature": feature_names,"importance":importances}
importance_df=pd.DataFrame(d).sort_values(by="importance", ascending=False)
importance_df.head(10)

Unnamed: 0,feature,importance
43,sex,0.066046
44,snstatus,0.038348
42,mix,0.035039
46,prior,0.032706
39,Color_Tan,0.026372
28,Color_Black/White,0.025466
32,Color_Brown/White,0.022202
23,Month_Oct,0.021612
15,Month_Dec,0.021537
10,Age_Puppy,0.021155


In [13]:
#Same model with restricted set of features
X_train, X_test, y_train, y_test=train_test_split(Xsub,y,random_state=0)

In [14]:
rf_model=RandomForestClassifier(n_estimators=100, random_state=0)
#Fit classifier to training set
rf_model= rf_model.fit(X_train, y_train)
#Make prediction
predictions = rf_model.predict(X_test)
#Evaluate model
cm=confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, index = ["Actual 0", "Actual 1"], 
                      columns=["Predicted 0", "Predicted 1"])
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,12686,768
Actual 1,4033,844


In [15]:
# Score the model
print("Accuracy score (training): {0:.3f}".format(
    rf_model.score(X_train,y_train)))
print("Accuracy score (validation): {0:.3f}".format(
    rf_model.score(X_test,y_test)))
print(classification_report(y_test, predictions))

Accuracy score (training): 0.764
Accuracy score (validation): 0.738
              precision    recall  f1-score   support

           0       0.76      0.94      0.84     13454
           1       0.52      0.17      0.26      4877

    accuracy                           0.74     18331
   macro avg       0.64      0.56      0.55     18331
weighted avg       0.70      0.74      0.69     18331



In [16]:
#Get top 10 features from random forest model
importances = rf_model.feature_importances_
feature_names=Xsub.columns

d={"feature": feature_names,"importance":importances}
importance_df=pd.DataFrame(d).sort_values(by="importance", ascending=False)
importance_df.head(10)

Unnamed: 0,feature,importance
12,small,0.122367
5,type_surr,0.109322
1,snstatus,0.086351
7,pit bull,0.085495
11,Age_Young,0.074912
0,sex,0.068169
3,prior,0.053581
6,cond_inj,0.05229
4,mix,0.049744
2,restrict,0.042548


In [17]:
#Same model with restricted set of features
X_train, X_test, y_train, y_test=train_test_split(Xsub2,y,random_state=0)

In [18]:
rf_model=RandomForestClassifier(n_estimators=100, random_state=0)
#Fit classifier to training set
rf_model= rf_model.fit(X_train, y_train)
#Make prediction
predictions = rf_model.predict(X_test)
#Evaluate model
cm=confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, index = ["Actual 0", "Actual 1"], 
                      columns=["Predicted 0", "Predicted 1"])
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,12337,1117
Actual 1,3871,1006


In [19]:
# Score the model
print("Accuracy score (training): {0:.3f}".format(
    rf_model.score(X_train,y_train)))
print("Accuracy score (validation): {0:.3f}".format(
    rf_model.score(X_test,y_test)))
print(classification_report(y_test, predictions))

Accuracy score (training): 0.797
Accuracy score (validation): 0.728
              precision    recall  f1-score   support

           0       0.76      0.92      0.83     13454
           1       0.47      0.21      0.29      4877

    accuracy                           0.73     18331
   macro avg       0.62      0.56      0.56     18331
weighted avg       0.68      0.73      0.69     18331



In [20]:
#Get top 10 features from random forest model
importances = rf_model.feature_importances_
feature_names=Xsub2.columns

d={"feature": feature_names,"importance":importances}
importance_df=pd.DataFrame(d).sort_values(by="importance", ascending=False)
importance_df.head(10)

Unnamed: 0,feature,importance
0,sex,0.076515
1,snstatus,0.061743
12,small,0.055565
11,Age_Young,0.054586
5,type_surr,0.054526
3,prior,0.051963
7,pit bull,0.047341
26,Month_Nov,0.046936
4,mix,0.04524
20,sporting,0.041104


# SMOTE oversampling

In [21]:

from imblearn.over_sampling import SMOTE
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced

In [22]:
#Use SMOTE to oversample data
X_train, X_test, y_train, y_test=train_test_split(X,y,random_state=0)
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(
    X_train, y_train
)
Counter(y_resampled)

Counter({0: 40684, 1: 40684})

In [23]:
rf_model=RandomForestClassifier(n_estimators=120, random_state=0, max_depth=18)
#Fit classifier to training set
rf_model= rf_model.fit(X_resampled, y_resampled)
#Make prediction
predictions = rf_model.predict(X_test)
#Evaluate model
cm=confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, index = ["Actual 0", "Actual 1"], 
                      columns=["Predicted 0", "Predicted 1"])
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,9797,3657
Actual 1,2074,2803


In [24]:
print("Accuracy score (training): {0:.3f}".format(
    rf_model.score(X_resampled,y_resampled)))
print("Accuracy score (validation): {0:.3f}".format(
    rf_model.score(X_test,y_test)))
print("Balanced accuracy score: {0:.3f}".format( 
        balanced_accuracy_score(y_test, predictions)))

print(classification_report_imbalanced(y_test, predictions))

Accuracy score (training): 0.809
Accuracy score (validation): 0.687
Balanced accuracy score: 0.651
                   pre       rec       spe        f1       geo       iba       sup

          0       0.83      0.73      0.57      0.77      0.65      0.42     13454
          1       0.43      0.57      0.73      0.49      0.65      0.41      4877

avg / total       0.72      0.69      0.62      0.70      0.65      0.42     18331



# SMOTEENN

In [25]:
from imblearn.combine import SMOTEENN 
sme = SMOTEENN(random_state=0)
X_res, y_res = sme.fit_resample(X_train, y_train)
Counter(y_res)

Counter({0: 13501, 1: 24283})

In [26]:
rf_model=RandomForestClassifier(n_estimators=100, random_state=0)
#Fit classifier to training set
rf_model= rf_model.fit(X_res, y_res)
#Make prediction
predictions = rf_model.predict(X_test)
#Evaluate model
cm=confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, index = ["Actual 0", "Actual 1"], 
                      columns=["Predicted 0", "Predicted 1"])
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,7155,6299
Actual 1,1296,3581


In [27]:
print("Accuracy score (training): {0:.3f}".format(
    rf_model.score(X_resampled,y_resampled)))
print("Accuracy score (validation): {0:.3f}".format(
    rf_model.score(X_test,y_test)))
print("Balanced accuracy score: {0:.3f}".format( 
        balanced_accuracy_score(y_test, predictions)))

print(classification_report_imbalanced(y_test, predictions))

Accuracy score (training): 0.740
Accuracy score (validation): 0.586
Balanced accuracy score: 0.633
                   pre       rec       spe        f1       geo       iba       sup

          0       0.85      0.53      0.73      0.65      0.62      0.38     13454
          1       0.36      0.73      0.53      0.49      0.62      0.40      4877

avg / total       0.72      0.59      0.68      0.61      0.62      0.39     18331

