In [16]:
#Import Dependencies
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

import numpy as np
from collections import Counter 

In [17]:
coded_data=pd.read_csv("../Resources/CodedData.csv")
coded_data.head()

Unnamed: 0,type_Ereq,type_aband,type_assist,type_stray,type_surr,cond_aged,cond_behav,cond_feral,cond_inj,cond_med,...,hound,herding,toy,terrier,pit bull,cur,g_misc,small,med,large
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
4,0,0,1,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,1,0,0


In [18]:
coded_data.columns

Index(['type_Ereq', 'type_aband', 'type_assist', 'type_stray', 'type_surr',
       'cond_aged', 'cond_behav', 'cond_feral', 'cond_inj', 'cond_med',
       'cond_neonatal', 'cond_norm', 'cond_nurs', 'cond_other', 'cond_preg',
       'cond_sick', 'Age_Adult', 'Age_Puppy', 'Age_Senior', 'Age_Young',
       'Month_Apr', 'Month_Aug', 'Month_Dec', 'Month_Feb', 'Month_Jan',
       'Month_July', 'Month_June', 'Month_Mar', 'Month_May', 'Month_Nov',
       'Month_Oct', 'Month_Sep', 'Color_Black', 'Color_Black/Brown',
       'Color_Black/Tan', 'Color_Black/White', 'Color_Brindle', 'Color_Brown',
       'Color_Brown/Tan', 'Color_Brown/White', 'Color_Gray',
       'Color_Gray/White', 'Color_Merle', 'Color_Red', 'Color_Red/White',
       'Color_Sable', 'Color_Tan', 'Color_Tricolor', 'Color_White', 'mix',
       'sex', 'snstatus', 'restrict', 'prior', 'LongStay', 'sporting',
       'working', 'non-sporting', 'hound', 'herding', 'toy', 'terrier',
       'pit bull', 'cur', 'g_misc', 'small', 'med', 'la

In [19]:
#Define feature set, dropping outcome
X=coded_data.drop(columns="LongStay")
X.columns


Index(['type_Ereq', 'type_aband', 'type_assist', 'type_stray', 'type_surr',
       'cond_aged', 'cond_behav', 'cond_feral', 'cond_inj', 'cond_med',
       'cond_neonatal', 'cond_norm', 'cond_nurs', 'cond_other', 'cond_preg',
       'cond_sick', 'Age_Adult', 'Age_Puppy', 'Age_Senior', 'Age_Young',
       'Month_Apr', 'Month_Aug', 'Month_Dec', 'Month_Feb', 'Month_Jan',
       'Month_July', 'Month_June', 'Month_Mar', 'Month_May', 'Month_Nov',
       'Month_Oct', 'Month_Sep', 'Color_Black', 'Color_Black/Brown',
       'Color_Black/Tan', 'Color_Black/White', 'Color_Brindle', 'Color_Brown',
       'Color_Brown/Tan', 'Color_Brown/White', 'Color_Gray',
       'Color_Gray/White', 'Color_Merle', 'Color_Red', 'Color_Red/White',
       'Color_Sable', 'Color_Tan', 'Color_Tricolor', 'Color_White', 'mix',
       'sex', 'snstatus', 'restrict', 'prior', 'sporting', 'working',
       'non-sporting', 'hound', 'herding', 'toy', 'terrier', 'pit bull', 'cur',
       'g_misc', 'small', 'med', 'large'],
     

In [20]:
#Create a subset of features, based on preliminary analysis.
#The Five binary features are included because they include basic demographic information (sex, spay/neuter), 
#or a priori hypothesis of risk (mixed breed, restricted breed, prior encounters)
#Other features were chosen based on preliminary analysis. Very rare features were excluded.
#Given that the split is at the 75th percentile, 25% is the expected proportion for long stays.
#Common features with greater than 5% difference from expected were included in the subset of features.
Xsub=coded_data[["sex","snstatus","restrict","prior","mix","type_surr","cond_nurs","cond_inj","pit bull", "cur","non-sporting",
                "toy", "Age_Young","small","large","Color_Brindle","Color_Brown/White","Color_Gray/White",
                "Color_White"]]
  
Xsub.head()

Unnamed: 0,sex,snstatus,restrict,prior,mix,type_surr,cond_nurs,cond_inj,pit bull,cur,non-sporting,toy,Age_Young,small,large,Color_Brindle,Color_Brown/White,Color_Gray/White,Color_White
0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
4,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0


In [21]:
#Created second subset including >3% difference
Xsub2=coded_data[["sex","snstatus","restrict","prior","mix","type_surr","cond_nurs","cond_inj","pit bull", "cur","non-sporting",
                "toy", "Age_Young","small","large","Color_Brindle","Color_Brown/White","Color_Gray/White",
                "Color_White","type_assist","hound","sporting","terrier","Age_Senior","Color_Black/Tan","Color_Brown/Tan","Color_Tricolor","Month_Nov"]]
Xsub2.head()

Unnamed: 0,sex,snstatus,restrict,prior,mix,type_surr,cond_nurs,cond_inj,pit bull,cur,...,Color_White,type_assist,hound,sporting,terrier,Age_Senior,Color_Black/Tan,Color_Brown/Tan,Color_Tricolor,Month_Nov
0,0,1,0,0,1,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
1,0,1,0,1,1,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
2,0,1,0,1,1,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
3,0,1,0,0,0,1,0,0,0,0,...,0,0,1,0,0,1,0,0,1,0
4,0,1,0,0,0,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,1


In [22]:
#Outcome is prolonged length of stay
y=coded_data["LongStay"]

In [23]:
#Split into training and testing sets, using default settings of 75/25 stratified split
X_train, X_test, y_train, y_test=train_test_split(X,y,random_state=0)

# Gradient Boosting Classifier

In [24]:
#Gradient Boosting Classifier - select paramaters

from sklearn.ensemble import GradientBoostingClassifier

# Create a classifier object
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=150,
                                            learning_rate=learning_rate,
                                            max_features=8,
                                            max_depth=6,
                                            random_state=0)

    # Fit the model
    classifier.fit(X_train, y_train)
    print("Learning rate: ", learning_rate)

    # Score the model
    print("Accuracy score (training): {0:.3f}".format(
        classifier.score(
            X_train,
            y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
        classifier.score(
            X_test,
            y_test)))
    predictions = classifier.predict(X_test)
    print(classification_report(y_test, predictions))
    print()

Learning rate:  0.05
Accuracy score (training): 0.768
Accuracy score (validation): 0.755
              precision    recall  f1-score   support

           0       0.77      0.96      0.85     13454
           1       0.63      0.20      0.30      4877

    accuracy                           0.76     18331
   macro avg       0.70      0.58      0.58     18331
weighted avg       0.73      0.76      0.71     18331


Learning rate:  0.1
Accuracy score (training): 0.775
Accuracy score (validation): 0.756
              precision    recall  f1-score   support

           0       0.77      0.95      0.85     13454
           1       0.61      0.22      0.33      4877

    accuracy                           0.76     18331
   macro avg       0.69      0.59      0.59     18331
weighted avg       0.73      0.76      0.71     18331


Learning rate:  0.25
Accuracy score (training): 0.789
Accuracy score (validation): 0.757
              precision    recall  f1-score   support

           0       0.78

In [25]:
#Chose learning rate 0.25
classifier = GradientBoostingClassifier(n_estimators=150,
                                        learning_rate=0.25,
                                        max_features=8,
                                        max_depth=6,
                                        random_state=0)

# Fit the model
classifier.fit(X_train, y_train)

# Make Prediction
predictions = classifier.predict(X_test)

In [26]:
# Score the model
print("Accuracy score (training): {0:.3f}".format(
    classifier.score(X_train,y_train)))
print("Accuracy score (validation): {0:.3f}".format(
    classifier.score(X_test,y_test)))
print(classification_report(y_test, predictions))

Accuracy score (training): 0.789
Accuracy score (validation): 0.757
              precision    recall  f1-score   support

           0       0.78      0.94      0.85     13454
           1       0.60      0.25      0.35      4877

    accuracy                           0.76     18331
   macro avg       0.69      0.59      0.60     18331
weighted avg       0.73      0.76      0.72     18331



In [27]:
#Create confusion matrix for Gradient Boosting Classifier
cm1=confusion_matrix(y_test, predictions)
cm1_df = pd.DataFrame(cm1,index = ["Actual 0", "Actual 1"], 
                      columns=["Predicted 0", "Predicted 1"])
cm1_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,12665,789
Actual 1,3672,1205


#  SMOTE oversampling

In [38]:

from imblearn.over_sampling import SMOTE
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced

In [39]:
#Use SMOTE to oversample data
X_train, X_test, y_train, y_test=train_test_split(X,y,random_state=0)
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(
    X_train, y_train
)
Counter(y_resampled)

Counter({0: 40684, 1: 40684})

In [40]:

# Create a classifier object
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=80,
                                            learning_rate=learning_rate,
                                            max_features=10,
                                            max_depth=5,
                                            random_state=0)

    # Fit the model
    classifier.fit(X_resampled, y_resampled)
    print("Learning rate: ", learning_rate)

    # Score the model
    print("Accuracy score (training): {0:.3f}".format(
        classifier.score(
            X_resampled,
            y_resampled)))
    print("Accuracy score (validation): {0:.3f}".format(
        classifier.score(
            X_test,
            y_test)))
    predictions = classifier.predict(X_test)
    print("Balanced accuracy score: {0:.3f}".format( 
        balanced_accuracy_score(y_test, predictions)))
   
    print(classification_report_imbalanced(y_test, predictions))
    print()
    

Learning rate:  0.05
Accuracy score (training): 0.687
Accuracy score (validation): 0.661
Balanced accuracy score: 0.643
                   pre       rec       spe        f1       geo       iba       sup

          0       0.83      0.68      0.60      0.75      0.64      0.41     13454
          1       0.41      0.60      0.68      0.49      0.64      0.41      4877

avg / total       0.71      0.66      0.62      0.68      0.64      0.41     18331


Learning rate:  0.1
Accuracy score (training): 0.707
Accuracy score (validation): 0.669
Balanced accuracy score: 0.645
                   pre       rec       spe        f1       geo       iba       sup

          0       0.83      0.70      0.59      0.76      0.64      0.42     13454
          1       0.41      0.59      0.70      0.49      0.64      0.41      4877

avg / total       0.72      0.67      0.62      0.68      0.64      0.42     18331


Learning rate:  0.25
Accuracy score (training): 0.748
Accuracy score (validation): 0.689


In [41]:
classifier = GradientBoostingClassifier(n_estimators=80,
                                        learning_rate=0.05,
                                        max_features=10,
                                        max_depth=5,
                                        random_state=0)

# Fit the model
classifier.fit(X_resampled, y_resampled)

# Make Prediction
predictions = classifier.predict(X_test)

In [42]:
print("Accuracy score (training): {0:.3f}".format(
    classifier.score(X_resampled,y_resampled)))
print("Accuracy score (validation): {0:.3f}".format(
    classifier.score(X_test,y_test)))
print("Balanced accuracy score: {0:.3f}".format( 
        balanced_accuracy_score(y_test, predictions)))

print(classification_report_imbalanced(y_test, predictions))

Accuracy score (training): 0.687
Accuracy score (validation): 0.661
Balanced accuracy score: 0.643
                   pre       rec       spe        f1       geo       iba       sup

          0       0.83      0.68      0.60      0.75      0.64      0.41     13454
          1       0.41      0.60      0.68      0.49      0.64      0.41      4877

avg / total       0.71      0.66      0.62      0.68      0.64      0.41     18331



In [43]:
#Create confusion matrix for Gradient Boosting Classifier
cm1=confusion_matrix(y_test, predictions)
cm1_df = pd.DataFrame(cm1,index = ["Actual 0", "Actual 1"], 
                      columns=["Predicted 0", "Predicted 1"])
cm1_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,9174,4280
Actual 1,1932,2945


# SMOTEENN

In [44]:
from imblearn.combine import SMOTEENN 
sme = SMOTEENN(random_state=0)
X_res, y_res = sme.fit_resample(X_train, y_train)
Counter(y_res)

Counter({0: 13503, 1: 24180})

In [45]:
# Create a classifier object
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=80,
                                            learning_rate=learning_rate,
                                            max_features=10,
                                            max_depth=5,
                                            random_state=0)

    # Fit the model
    classifier.fit(X_res, y_res)
    print("Learning rate: ", learning_rate)

    # Score the model
    print("Accuracy score (training): {0:.3f}".format(
        classifier.score(
            X_res,
            y_res)))
    print("Accuracy score (validation): {0:.3f}".format(
        classifier.score(
            X_test,
            y_test)))
    predictions = classifier.predict(X_test)
    print("Balanced accuracy score: {0:.3f}".format( 
        balanced_accuracy_score(y_test, predictions)))
    print(classification_report_imbalanced(y_test, predictions))
    print()

Learning rate:  0.05
Accuracy score (training): 0.812
Accuracy score (validation): 0.529
Balanced accuracy score: 0.619
                   pre       rec       spe        f1       geo       iba       sup

          0       0.86      0.43      0.81      0.57      0.59      0.33     13454
          1       0.34      0.81      0.43      0.48      0.59      0.36      4877

avg / total       0.72      0.53      0.71      0.55      0.59      0.34     18331


Learning rate:  0.1
Accuracy score (training): 0.842
Accuracy score (validation): 0.566
Balanced accuracy score: 0.631
                   pre       rec       spe        f1       geo       iba       sup

          0       0.86      0.49      0.77      0.62      0.62      0.37     13454
          1       0.35      0.77      0.49      0.49      0.62      0.39      4877

avg / total       0.72      0.57      0.70      0.59      0.62      0.37     18331


Learning rate:  0.25
Accuracy score (training): 0.881
Accuracy score (validation): 0.597


In [46]:
classifier = GradientBoostingClassifier(n_estimators=80,
                                        learning_rate=0.5,
                                        max_features=12,
                                        max_depth=7,
                                        random_state=0)

# Fit the model
classifier.fit(X_res, y_res)

# Make Prediction
predictions = classifier.predict(X_test)
cm=confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, index = ["Actual 0", "Actual 1"], 
                      columns=["Predicted 0", "Predicted 1"])
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,8200,5254
Actual 1,1604,3273


In [47]:
print("Accuracy score (training): {0:.3f}".format(
    classifier.score(X_resampled,y_resampled)))
print("Accuracy score (validation): {0:.3f}".format(
    classifier.score(X_test,y_test)))
print("Balanced accuracy score: {0:.3f}".format( 
        balanced_accuracy_score(y_test, predictions)))

print(classification_report_imbalanced(y_test, predictions))

Accuracy score (training): 0.755
Accuracy score (validation): 0.626
Balanced accuracy score: 0.640
                   pre       rec       spe        f1       geo       iba       sup

          0       0.84      0.61      0.67      0.71      0.64      0.41     13454
          1       0.38      0.67      0.61      0.49      0.64      0.41      4877

avg / total       0.72      0.63      0.65      0.65      0.64      0.41     18331

