In [47]:
#Import Dependencies
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

import numpy as np
from collections import Counter 

In [48]:
# Read the dummy-coded dataset from the Resources folder and preview its first rows.
coded_data = pd.read_csv(filepath_or_buffer="../Resources/CodedData.csv")
coded_data.head(n=5)

Unnamed: 0,type_Ereq,type_aband,type_assist,type_stray,type_surr,cond_aged,cond_inj,cond_norm,cond_nurs,cond_other,...,hound,herding,toy,terrier,pit bull,cur,g_misc,small,med,large
0,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,1,0,0,1,0,0,...,1,0,0,0,0,0,0,1,0,0
4,0,0,1,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,1,0,0


In [49]:
# Inspect the complete list of column names in the loaded frame.
coded_data.columns

Index(['type_Ereq', 'type_aband', 'type_assist', 'type_stray', 'type_surr',
       'cond_aged', 'cond_inj', 'cond_norm', 'cond_nurs', 'cond_other',
       'cond_sick', 'Age_Adult', 'Age_Puppy', 'Age_Senior', 'Age_Young',
       'Month_Apr', 'Month_Aug', 'Month_Dec', 'Month_Feb', 'Month_Jan',
       'Month_July', 'Month_June', 'Month_Mar', 'Month_May', 'Month_Nov',
       'Month_Oct', 'Month_Sep', 'Color_Black', 'Color_Black/Brown',
       'Color_Black/Tan', 'Color_Black/White', 'Color_Brindle', 'Color_Brown',
       'Color_Brown/Tan', 'Color_Brown/White', 'Color_Gray',
       'Color_Gray/White', 'Color_Merle', 'Color_Red', 'Color_Red/White',
       'Color_Sable', 'Color_Tan', 'Color_Tricolor', 'Color_White', 'mix',
       'sex', 'snstatus', 'restrict', 'prior', 'LongStay', 'sporting',
       'working', 'non-sporting', 'hound', 'herding', 'toy', 'terrier',
       'pit bull', 'cur', 'g_misc', 'small', 'med', 'large'],
      dtype='object')

In [50]:
# Feature matrix: every column of the coded data except the "LongStay" outcome.
X = coded_data.drop(labels="LongStay", axis="columns")
X.columns


Index(['type_Ereq', 'type_aband', 'type_assist', 'type_stray', 'type_surr',
       'cond_aged', 'cond_inj', 'cond_norm', 'cond_nurs', 'cond_other',
       'cond_sick', 'Age_Adult', 'Age_Puppy', 'Age_Senior', 'Age_Young',
       'Month_Apr', 'Month_Aug', 'Month_Dec', 'Month_Feb', 'Month_Jan',
       'Month_July', 'Month_June', 'Month_Mar', 'Month_May', 'Month_Nov',
       'Month_Oct', 'Month_Sep', 'Color_Black', 'Color_Black/Brown',
       'Color_Black/Tan', 'Color_Black/White', 'Color_Brindle', 'Color_Brown',
       'Color_Brown/Tan', 'Color_Brown/White', 'Color_Gray',
       'Color_Gray/White', 'Color_Merle', 'Color_Red', 'Color_Red/White',
       'Color_Sable', 'Color_Tan', 'Color_Tricolor', 'Color_White', 'mix',
       'sex', 'snstatus', 'restrict', 'prior', 'sporting', 'working',
       'non-sporting', 'hound', 'herding', 'toy', 'terrier', 'pit bull', 'cur',
       'g_misc', 'small', 'med', 'large'],
      dtype='object')

In [51]:
# Restrictive subset of features, based on preliminary analysis.
#  * The five binary features are included because they carry basic demographic
#    information (sex, spay/neuter status) or reflect an a priori hypothesis of
#    risk (mixed breed, restricted breed, prior encounters).
#  * The other features were chosen from the preliminary analysis; very rare
#    features were excluded. Because the outcome is split at the 75th percentile,
#    25% is the expected proportion of long stays, and common features with a
#    greater than 5% difference from that expectation were included.
Xsub = coded_data.loc[
    :,
    [
        # the five binary features
        "sex",
        "snstatus",
        "restrict",
        "prior",
        "mix",
        # features selected from the preliminary analysis
        "type_surr",
        "cond_inj",
        "pit bull",
        "cur",
        "non-sporting",
        "toy",
        "Age_Young",
        "small",
        "large",
        "Color_Brindle",
        "Color_Brown/White",
        "Color_Gray/White",
        "Color_White",
    ],
]
Xsub.head(n=5)

Unnamed: 0,sex,snstatus,restrict,prior,mix,type_surr,cond_inj,pit bull,cur,non-sporting,toy,Age_Young,small,large,Color_Brindle,Color_Brown/White,Color_Gray/White,Color_White
0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
1,0,1,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,1,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
4,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0


In [52]:
# Less restrictive subset: the same 18 columns selected for Xsub, followed by
# nine additional columns.
Xsub2 = coded_data.loc[
    :,
    [
        # columns shared with Xsub (same order)
        "sex",
        "snstatus",
        "restrict",
        "prior",
        "mix",
        "type_surr",
        "cond_inj",
        "pit bull",
        "cur",
        "non-sporting",
        "toy",
        "Age_Young",
        "small",
        "large",
        "Color_Brindle",
        "Color_Brown/White",
        "Color_Gray/White",
        "Color_White",
        # additional columns only present in this wider subset
        "type_assist",
        "hound",
        "sporting",
        "terrier",
        "Age_Senior",
        "Color_Black/Tan",
        "Color_Brown/Tan",
        "Color_Tricolor",
        "Month_Nov",
    ],
]
Xsub2.head(n=5)

Unnamed: 0,sex,snstatus,restrict,prior,mix,type_surr,cond_inj,pit bull,cur,non-sporting,...,Color_White,type_assist,hound,sporting,terrier,Age_Senior,Color_Black/Tan,Color_Brown/Tan,Color_Tricolor,Month_Nov
0,0,1,0,0,1,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
1,0,1,0,1,1,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
2,0,1,0,1,1,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
3,0,1,0,0,0,1,0,0,0,0,...,0,0,1,0,0,1,0,0,1,0
4,0,1,0,0,0,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,1


In [53]:
# Target vector: the "LongStay" column (prolonged length of stay).
y = coded_data.loc[:, "LongStay"]

In [54]:
# Split the full feature set into training and testing partitions.
# These are train_test_split's default settings, written out explicitly:
# a shuffled 75/25 split. The split is NOT stratified (stratify=None), so the
# proportion of long stays in the two partitions is only approximately equal.
# random_state=0 fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    stratify=None,
    random_state=0,
)

# Naive Bayes

In [55]:
# Bernoulli Naive Bayes is used because all of our data are categorical and,
# after dummy coding, every feature is entered as a binary 0/1 column.
from sklearn.naive_bayes import BernoulliNB

# fit() returns the fitted estimator, so the model is created and trained in one step.
NB = BernoulliNB().fit(X_train, y_train)
predictions = NB.predict(X_test)


In [56]:
# Report mean accuracy on the training and held-out partitions, followed by the
# per-class precision / recall / F1 table for the held-out predictions.
print(
    "Accuracy score (training): {0:.3f}".format(NB.score(X_train, y_train)),
    "Accuracy score (validation): {0:.3f}".format(NB.score(X_test, y_test)),
    classification_report(y_test, predictions),
    sep="\n",
)

Accuracy score (training): 0.704
Accuracy score (validation): 0.701
              precision    recall  f1-score   support

           0       0.79      0.81      0.80     13454
           1       0.43      0.39      0.41      4877

    accuracy                           0.70     18331
   macro avg       0.61      0.60      0.61     18331
weighted avg       0.69      0.70      0.70     18331



In [38]:
# Confusion matrix for Naive Bayes, wrapped in a labelled DataFrame:
# rows are the actual class, columns are the predicted class.
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    data=cm,
    columns=["Predicted 0", "Predicted 1"],
    index=["Actual 0", "Actual 1"],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,10924,2530
Actual 1,2963,1914


In [39]:
# Re-split using the restrictive feature subset (Xsub).
# Same seed and 75/25 proportion as the other splits; this overwrites
# X_train / X_test / y_train / y_test from any earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    Xsub, y, test_size=0.25, random_state=0
)

In [40]:
# Fit a fresh Bernoulli Naive Bayes classifier on the current training split,
# predict the held-out partition, and report accuracy plus the per-class table.
NB = BernoulliNB().fit(X_train, y_train)
predictions = NB.predict(X_test)
print(
    "Accuracy score (training): {0:.3f}".format(NB.score(X_train, y_train)),
    "Accuracy score (validation): {0:.3f}".format(NB.score(X_test, y_test)),
    classification_report(y_test, predictions),
    sep="\n",
)

Accuracy score (training): 0.698
Accuracy score (validation): 0.693
              precision    recall  f1-score   support

           0       0.78      0.82      0.80     13454
           1       0.41      0.35      0.38      4877

    accuracy                           0.69     18331
   macro avg       0.59      0.58      0.59     18331
weighted avg       0.68      0.69      0.68     18331



In [41]:
# Confusion matrix for the current predictions, as a labelled DataFrame
# (rows: actual class, columns: predicted class).
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    data=cm,
    columns=["Predicted 0", "Predicted 1"],
    index=["Actual 0", "Actual 1"],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,11022,2432
Actual 1,3187,1690


In [42]:
# Re-split using the less restrictive feature subset (Xsub2).
# Same seed and 75/25 proportion as the other splits; this overwrites
# X_train / X_test / y_train / y_test from any earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    Xsub2, y, test_size=0.25, random_state=0
)

In [43]:
# Fit a fresh Bernoulli Naive Bayes classifier on the current training split,
# predict the held-out partition, and report accuracy plus the per-class table.
NB = BernoulliNB().fit(X_train, y_train)
predictions = NB.predict(X_test)
print(
    "Accuracy score (training): {0:.3f}".format(NB.score(X_train, y_train)),
    "Accuracy score (validation): {0:.3f}".format(NB.score(X_test, y_test)),
    classification_report(y_test, predictions),
    sep="\n",
)

Accuracy score (training): 0.699
Accuracy score (validation): 0.693
              precision    recall  f1-score   support

           0       0.77      0.82      0.80     13454
           1       0.41      0.34      0.37      4877

    accuracy                           0.69     18331
   macro avg       0.59      0.58      0.58     18331
weighted avg       0.68      0.69      0.68     18331



In [44]:
# Confusion matrix for the current predictions, as a labelled DataFrame
# (rows: actual class, columns: predicted class).
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    data=cm,
    columns=["Predicted 0", "Predicted 1"],
    index=["Actual 0", "Actual 1"],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,11035,2419
Actual 1,3205,1672


# Try oversampling with SMOTE

In [45]:

from imblearn.over_sampling import SMOTE
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced

In [46]:
# Rebuild the full-feature split, then use SMOTE to oversample the data.
# Only the training partition is resampled; X_test / y_test are left as split.
# NOTE(review): SMOTE synthesises rows by interpolating between neighbours,
# which is designed for continuous features. Every column here is a 0/1 dummy,
# so a categorical-aware sampler (e.g. imblearn's SMOTEN) may be a better fit
# -- confirm before relying on these results.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)
X_resampled, y_resampled = SMOTE(
    sampling_strategy="auto", random_state=1
).fit_resample(X_train, y_train)
# Class counts after oversampling.
Counter(y_resampled)

Counter({0: 40684, 1: 40684})

In [20]:

# Train Bernoulli Naive Bayes on the SMOTE-resampled training data and predict
# the untouched held-out partition.
NB = BernoulliNB().fit(X_resampled, y_resampled)
predictions = NB.predict(X_test)


In [21]:
# The training accuracy is measured on the resampled data, while the validation
# accuracy is measured on the original held-out partition.
print(
    "Accuracy score (training): {0:.3f}".format(NB.score(X_resampled, y_resampled)),
    "Accuracy score (validation): {0:.3f}".format(NB.score(X_test, y_test)),
    classification_report_imbalanced(y_test, predictions),
    sep="\n",
)

Accuracy score (training): 0.638
Accuracy score (validation): 0.611
                   pre       rec       spe        f1       geo       iba       sup

          0       0.82      0.60      0.63      0.70      0.62      0.38     13454
          1       0.37      0.63      0.60      0.46      0.62      0.38      4877

avg / total       0.70      0.61      0.62      0.63      0.62      0.38     18331



In [22]:
# Confusion matrix for Naive Bayes, wrapped in a labelled DataFrame:
# rows are the actual class, columns are the predicted class.
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    data=cm,
    columns=["Predicted 0", "Predicted 1"],
    index=["Actual 0", "Actual 1"],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,8132,5322
Actual 1,1809,3068


# SMOTEENN

In [23]:
from imblearn.combine import SMOTEENN 
# Resample the training partition with SMOTEENN (seeded for reproducibility).
# NOTE(review): X_train / y_train are whatever the most recently run split cell
# left in the kernel. The recorded execution counts are out of order, so confirm
# this is the full-feature split (it is under Restart & Run All).
sme = SMOTEENN(random_state=0)
X_res, y_res = sme.fit_resample(X_train, y_train)
# Class counts after resampling.
Counter(y_res)

Counter({0: 13503, 1: 24180})

In [24]:
# Train Bernoulli Naive Bayes on the SMOTEENN-resampled data, predict the
# held-out partition, and show the confusion matrix
# (rows: actual class, columns: predicted class).
NB = BernoulliNB().fit(X_res, y_res)
predictions = NB.predict(X_test)
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    data=cm,
    columns=["Predicted 0", "Predicted 1"],
    index=["Actual 0", "Actual 1"],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,6470,6984
Actual 1,1284,3593


In [25]:
# Score the model.
# The training accuracy is measured on the resampled data (X_res, y_res); the
# validation and balanced accuracies are measured on the held-out partition.
print(
    "Accuracy score (training): {0:.3f}".format(NB.score(X_res, y_res)),
    "Accuracy score (validation): {0:.3f}".format(NB.score(X_test, y_test)),
    "Balanced accuracy score: {0:.3f}".format(
        balanced_accuracy_score(y_test, predictions)
    ),
    classification_report_imbalanced(y_test, predictions),
    sep="\n",
)

Accuracy score (training): 0.748
Accuracy score (validation): 0.549
Balanced accuracy score: 0.609
                   pre       rec       spe        f1       geo       iba       sup

          0       0.83      0.48      0.74      0.61      0.60      0.35     13454
          1       0.34      0.74      0.48      0.46      0.60      0.36      4877

avg / total       0.70      0.55      0.67      0.57      0.60      0.35     18331

