In [1]:
#Import Dependencies
import pandas as pd
from sqlalchemy import create_engine
import psycopg2


from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import datetime as dt
import numpy as np
from collections import Counter 

  """)


In [2]:
#Load Data
import getpass
 
try:
    db_password = getpass.getpass()
except Exception as error:
    print('ERROR', error)
 
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5433/AAC_DogData"
engine = create_engine(db_string)


data=pd.read_sql_table("cleaneddata", con=engine)

········


In [3]:
data.head()

Unnamed: 0.1,index,Unnamed: 0,Animal ID,Intake_Type,Intake_Condition,Breed,Color,Age_in_Years,Intactness,Sex,...,Breed_update,Breed1,Breed2,AKC_group1,Size1,AKC_group2,Size2,Sorted_Color,Restricted,Multiple_Intakes
0,0,0,A006100,Public Assist,Normal,Spinone Italiano Mix,Yellow/White,6.0,Altered,Male,...,Spinone Italiano,Spinone Italiano,,sporting,large,,,Tan,No,0
1,1,1,A006100,Public Assist,Normal,Spinone Italiano Mix,Yellow/White,7.0,Altered,Male,...,Spinone Italiano,Spinone Italiano,,sporting,large,,,Tan,No,1
2,2,2,A006100,Stray,Normal,Spinone Italiano Mix,Yellow/White,10.0,Altered,Male,...,Spinone Italiano,Spinone Italiano,,sporting,large,,,Tan,No,1
3,3,3,A047759,Owner Surrender,Normal,Dachshund,Tricolor,10.0,Altered,Male,...,Dachshund,Dachshund,,hound,small,,,Tricolor,No,0
4,4,4,A134067,Public Assist,Injured,Shetland Sheepdog,Brown/White,16.0,Altered,Male,...,Shetland Sheepdog,Shetland Sheepdog,,herding,small,,,Brown/White,No,0


In [4]:
#Create string labels for months, to aid with dummy coding later

data["in_Month"]=data["in_Month"].astype(str)
nums=("1","2","3","4","5","6","7","8","9","10","11","12")
months=("Jan","Feb","Mar","Apr","May","June","July","Aug","Sep","Oct","Nov","Dec")
data["in_Month"]= data["in_Month"].replace(nums,months)


In [5]:
#Drop original columns not needed
data=data.drop(columns=["Animal ID","Breed", "Color", "Age_in_Years", "index", "Unnamed: 0"])

In [6]:
#Drop rows where Outcome Type is missing or labeled "Missing", disposal and rto-adopt
data=data.dropna(subset=["Outcome_Type"])
data=data.drop(data[data["Outcome_Type"]=="Missing"].index)
data=data.drop(data[data["Outcome_Type"]=="Disposal"].index)
data=data.drop(data[data["Outcome_Type"]=="Died"].index)
data=data.drop(data[data["Outcome_Type"]=="Euthanasia"].index)

In [7]:
Counter(data["Outcome_Type"])

Counter({'Return to Owner': 20165,
         'Transfer': 16533,
         'Adoption': 36373,
         'Rto-Adopt': 661})

In [8]:
#Drop if Intactness=Unknown or missing
data=data.drop(data[data["Intactness"]=="Unknown"].index)
data=data.dropna(subset=["Intactness"])

In [9]:
#rename columns
data=data.rename(columns={"Intake_Type": "Type", "Intake_Condition": "Condition","Intactness":"Spay/Neuter Status", "Breed_Type":"Mixed Breed",
                          "Outcome_Type":"Outcome", "Length_of_Stay":"Length of Stay", "in_Month":"Month", "in_Year":"Year",
                          "AKC_group1":"Breed Type 1", "AKC_group2": "Breed Type 2", "Sorted_Color":"Color","Multiple_Intakes": "Prior Encounters"})
data.head()

Unnamed: 0,Type,Condition,Spay/Neuter Status,Sex,Age,Mixed Breed,Outcome,Length of Stay,Month,Year,Breed_update,Breed1,Breed2,Breed Type 1,Size1,Breed Type 2,Size2,Color,Restricted,Prior Encounters
0,Public Assist,Normal,Altered,Male,Adult,Mix,Return to Owner,2,Mar,2014,Spinone Italiano,Spinone Italiano,,sporting,large,,,Tan,No,0
1,Public Assist,Normal,Altered,Male,Adult,Mix,Return to Owner,2,Dec,2014,Spinone Italiano,Spinone Italiano,,sporting,large,,,Tan,No,1
2,Stray,Normal,Altered,Male,Senior,Mix,Return to Owner,1,Dec,2017,Spinone Italiano,Spinone Italiano,,sporting,large,,,Tan,No,1
3,Owner Surrender,Normal,Altered,Male,Senior,Pure,Transfer,6,Apr,2014,Dachshund,Dachshund,,hound,small,,,Tricolor,No,0
4,Public Assist,Injured,Altered,Male,Senior,Pure,Return to Owner,1,Nov,2013,Shetland Sheepdog,Shetland Sheepdog,,herding,small,,,Brown/White,No,0


In [10]:
#Look at quartiles for length of stay
print(data["Length of Stay"].describe())

count    73322.000000
mean        18.950929
std         48.287832
min          1.000000
25%          3.000000
50%          6.000000
75%         13.000000
max       1914.000000
Name: Length of Stay, dtype: float64


In [11]:
data["Prolonged Stay"]=np.where((data["Length of Stay"]<13),0,1)


In [12]:
data["Prolonged Stay"].value_counts()

0    54138
1    19184
Name: Prolonged Stay, dtype: int64

In [13]:
data.columns

Index(['Type', 'Condition', 'Spay/Neuter Status', 'Sex', 'Age', 'Mixed Breed',
       'Outcome', 'Length of Stay', 'Month', 'Year', 'Breed_update', 'Breed1',
       'Breed2', 'Breed Type 1', 'Size1', 'Breed Type 2', 'Size2', 'Color',
       'Restricted', 'Prior Encounters', 'Prolonged Stay'],
      dtype='object')

In [14]:
tab=pd.crosstab(data["Prolonged Stay"],data["Type"], rownames=["Prolonged Stay"],colnames=["Type"])
tabLS=tab.transpose()
tabLS["total"]=tabLS[0]+tabLS[1]
tabLS["percent"]=tabLS[1]/tabLS["total"]
tabLS

Prolonged Stay,0,1,total,percent
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Abandoned,180,87,267,0.325843
Euthanasia Request,25,7,32,0.21875
Owner Surrender,10239,5539,15778,0.351058
Public Assist,4901,1385,6286,0.220331
Stray,38792,12166,50958,0.238746
Wildlife,1,0,1,0.0


In [15]:
tab=pd.crosstab(data["Prolonged Stay"],data["Condition"], rownames=["Prolonged Stay"],colnames=["Condition"])
tabLS=tab.transpose()
tabLS["total"]=tabLS[0]+tabLS[1]
tabLS["percent"]=tabLS[1]/tabLS["total"]
tabLS

Prolonged Stay,0,1,total,percent
Condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aged,267,59,326,0.180982
Behavior,15,19,34,0.558824
Feral,10,0,10,0.0
Injured,1617,1224,2841,0.430834
Medical,66,38,104,0.365385
Neonatal,29,13,42,0.309524
Normal,50379,16771,67150,0.249754
Nursing,466,598,1064,0.56203
Other,71,52,123,0.422764
Pregnant,42,22,64,0.34375


In [16]:
tab=pd.crosstab(data["Prolonged Stay"],data["Spay/Neuter Status"], rownames=["Prolonged Stay"],colnames=["Spay/Neuter Status"])
tabLS=tab.transpose()
tabLS["total"]=tabLS[0]+tabLS[1]
tabLS["percent"]=tabLS[1]/tabLS["total"]
tabLS

Prolonged Stay,0,1,total,percent
Spay/Neuter Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Altered,19901,6552,26453,0.247685
Intact,34237,12632,46869,0.269517


In [17]:
tab=pd.crosstab(data["Prolonged Stay"],data["Breed Type 1"], rownames=["Prolonged Stay"],colnames=["Breed Type 1"])
tabLS=tab.transpose()
tabLS["total"]=tabLS[0]+tabLS[1]
tabLS["percent"]=tabLS[1]/tabLS["total"]
tabLS

Prolonged Stay,0,1,total,percent
Breed Type 1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cur,1209,651,1860,0.35
herding,8500,3304,11804,0.279905
hound,4364,1173,5537,0.211848
misc,101,51,152,0.335526
non-sporting,1563,304,1867,0.162828
non-working,17,3,20,0.15
pit bull,6826,4868,11694,0.416282
sporting,8911,3725,12636,0.294793
terrier,5012,1292,6304,0.204949
toy,12090,2122,14212,0.14931


In [18]:
tab=pd.crosstab(data["Prolonged Stay"],data["Sex"], rownames=["Prolonged Stay"],colnames=["Sex"])
tabLS=tab.transpose()
tabLS["total"]=tabLS[0]+tabLS[1]
tabLS["percent"]=tabLS[1]/tabLS["total"]
tabLS

Prolonged Stay,0,1,total,percent
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,25236,8758,33994,0.257634
Male,28902,10426,39328,0.265104


In [19]:
tab=pd.crosstab(data["Prolonged Stay"],data["Age"], rownames=["Prolonged Stay"],colnames=["Age"])
tabLS=tab.transpose()
tabLS["total"]=tabLS[0]+tabLS[1]
tabLS["percent"]=tabLS[1]/tabLS["total"]
tabLS

Prolonged Stay,0,1,total,percent
Age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Adult,7822,2973,10795,0.275405
Puppy,27910,8552,36462,0.234546
Senior,4806,1338,6144,0.217773
Young,13497,6137,19634,0.31257


In [20]:
tab=pd.crosstab(data["Prolonged Stay"],data["Mixed Breed"], rownames=["Prolonged Stay"],colnames=["Mixed Breed"])
tabLS=tab.transpose()
tabLS["total"]=tabLS[0]+tabLS[1]
tabLS["percent"]=tabLS[1]/tabLS["total"]
tabLS

Prolonged Stay,0,1,total,percent
Mixed Breed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Mix,45008,16551,61559,0.268864
Pure,9130,2633,11763,0.223837


In [21]:
tab=pd.crosstab(data["Prolonged Stay"],data["Size1"], rownames=["Prolonged Stay"],colnames=["Size1"])
tabLS=tab.transpose()
tabLS["total"]=tabLS[0]+tabLS[1]
tabLS["percent"]=tabLS[1]/tabLS["total"]
tabLS

Prolonged Stay,0,1,total,percent
Size1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
large,16028,8178,24206,0.33785
medium,18057,7497,25554,0.293379
small,20053,3509,23562,0.148926


In [22]:
tab=pd.crosstab(data["Prolonged Stay"],data["Color"], rownames=["Prolonged Stay"],colnames=["Color"])
tabLS=tab.transpose()
tabLS["total"]=tabLS[0]+tabLS[1]
tabLS["percent"]=tabLS[1]/tabLS["total"]
tabLS

Prolonged Stay,0,1,total,percent
Color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Black,3202,1085,4287,0.253091
Black/Brown,3450,1180,4630,0.25486
Black/Tan,3858,982,4840,0.202893
Black/White,8241,3272,11513,0.2842
Brindle,3102,1686,4788,0.35213
Brown,2729,852,3581,0.237922
Brown/Tan,785,212,997,0.212638
Brown/White,5452,2377,7829,0.303615
Gray,561,170,731,0.232558
Gray/White,2092,1117,3209,0.348084


In [23]:
tab=pd.crosstab(data["Prolonged Stay"],data["Restricted"], rownames=["Prolonged Stay"],colnames=["Restricted"])
tabLS=tab.transpose()
tabLS["total"]=tabLS[0]+tabLS[1]
tabLS["percent"]=tabLS[1]/tabLS["total"]
tabLS

Prolonged Stay,0,1,total,percent
Restricted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,37502,10686,48188,0.221756
Yes,16636,8498,25134,0.338108


In [24]:
tab=pd.crosstab(data["Prolonged Stay"],data["Prior Encounters"], rownames=["Prolonged Stay"],colnames=["Prior Encounters"])
tabLS=tab.transpose()
tabLS["total"]=tabLS[0]+tabLS[1]
tabLS["percent"]=tabLS[1]/tabLS["total"]
tabLS

Prolonged Stay,0,1,total,percent
Prior Encounters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,45879,15799,61678,0.256153
1,8259,3385,11644,0.290708


In [25]:
tab=pd.crosstab(data["Prolonged Stay"],data["Year"], rownames=["Prolonged Stay"],colnames=["Year"])
tabLS=tab.transpose()
tabLS["total"]=tabLS[0]+tabLS[1]
tabLS["percent"]=tabLS[1]/tabLS["total"]
tabLS

Prolonged Stay,0,1,total,percent
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,1784,673,2457,0.273911
2014,7320,2598,9918,0.261948
2015,7314,2486,9800,0.253673
2016,7254,2411,9665,0.249457
2017,7640,2136,9776,0.218494
2018,7117,2702,9819,0.275181
2019,8041,2597,10638,0.244125
2020,3579,1669,5248,0.318026
2021,3884,1898,5782,0.32826
2022,205,14,219,0.063927


In [26]:
tab=pd.crosstab(data["Prolonged Stay"],data["Month"], rownames=["Prolonged Stay"],colnames=["Month"])
tabLS=tab.transpose()
tabLS["total"]=tabLS[0]+tabLS[1]
tabLS["percent"]=tabLS[1]/tabLS["total"]
tabLS

Prolonged Stay,0,1,total,percent
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Apr,4087,1383,5470,0.252834
Aug,4333,1532,5865,0.261211
Dec,4830,1643,6473,0.253824
Feb,4212,1428,5640,0.253191
Jan,4815,1524,6339,0.240416
July,4407,1529,5936,0.257581
June,4356,1683,6039,0.278689
Mar,4495,1591,6086,0.26142
May,4595,1510,6105,0.247338
Nov,4545,1886,6431,0.293267


In [27]:
#rename columns
data=data.rename(columns={"Intake_Type": "Type", "Condition": "cond","Spay/Neuter Status":"snstatus",
                          "Length_of_Stay":"LS", "Breed Type 1":"group1", "Breed Type 2": "group2", "Sorted_Color":"color", "Outcome B":"three"})
data.head()

Unnamed: 0,Type,cond,snstatus,Sex,Age,Mixed Breed,Outcome,Length of Stay,Month,Year,...,Breed1,Breed2,group1,Size1,group2,Size2,Color,Restricted,Prior Encounters,Prolonged Stay
0,Public Assist,Normal,Altered,Male,Adult,Mix,Return to Owner,2,Mar,2014,...,Spinone Italiano,,sporting,large,,,Tan,No,0,0
1,Public Assist,Normal,Altered,Male,Adult,Mix,Return to Owner,2,Dec,2014,...,Spinone Italiano,,sporting,large,,,Tan,No,1,0
2,Stray,Normal,Altered,Male,Senior,Mix,Return to Owner,1,Dec,2017,...,Spinone Italiano,,sporting,large,,,Tan,No,1,0
3,Owner Surrender,Normal,Altered,Male,Senior,Pure,Transfer,6,Apr,2014,...,Dachshund,,hound,small,,,Tricolor,No,0,0
4,Public Assist,Injured,Altered,Male,Senior,Pure,Return to Owner,1,Nov,2013,...,Shetland Sheepdog,,herding,small,,,Brown/White,No,0,0


In [28]:
#Recode all binary variables to 0,1

codes=[0,1]
sex=["Male","Female"]
snip=["Intact","Altered"]
mix=["Pure","Mix"]
restrict=["No","Yes"]
prior=["No","Yes"]

data["snstatus"]=data["snstatus"].replace(snip, codes)
data["sex"]=data["Sex"].replace(sex,codes)
data["mix"]=data["Mixed Breed"].replace(mix, codes)
data["restrict"]=data["Restricted"].replace(restrict, codes)
data["prior"]=data["Prior Encounters"].replace(prior, codes)
#Drop source columns
data=data.drop(columns=["Sex", "Mixed Breed","Restricted","Prior Encounters"])


In [29]:
#group medical Cond into one, shorten other conditions
newcond=["med", "med", "med","norm","inj","aged","sick","other","preg","nurs","feral","behav","neonatal"]
cond=["Medical", "Med Urgent", "Med Attn", "Normal","Injured","Aged","Sick","Other","Pregnant","Nursing","Feral","Behavior","Neonatal"]
data["cond"]=data["cond"].replace(cond, newcond)
Counter(data["cond"])

Counter({'norm': 67150,
         'inj': 2841,
         'aged': 326,
         'sick': 1564,
         'med': 104,
         'other': 123,
         'preg': 64,
         'nurs': 1064,
         'feral': 10,
         'behav': 34,
         'neonatal': 42})

In [30]:
#Change labels for type of encounter
newtypes=["assist", "stray", "surr", "Ereq", "aband"]
types=["Public Assist","Stray", "Owner Surrender", "Euthanasia Request", "Abandoned"]
data["type"]=data["Type"].replace(types, newtypes)
data=data.drop(columns=["Type"])
Counter(data["type"])

Counter({'assist': 6286,
         'stray': 50958,
         'surr': 15778,
         'Ereq': 32,
         'aband': 267,
         'Wildlife': 1})

In [31]:
data["LongStay"]=data["Prolonged Stay"]

In [32]:
#Dummy code all categorical variables, then add binary features from original file
dummies=["type", "cond", "Age", "Month", "group1", "group2", "Size1", "Size2","Color"]
coded_data=pd.get_dummies(data[dummies])
coded_data["mix"]=data["mix"]
coded_data["sex"]=data["sex"]
coded_data["snstatus"]=data["snstatus"]
coded_data["restrict"]=data["restrict"]
coded_data["prior"]=data["prior"]

coded_data.head()

Unnamed: 0,type_Ereq,type_Wildlife,type_aband,type_assist,type_stray,type_surr,cond_aged,cond_behav,cond_feral,cond_inj,...,Color_Red/White,Color_Sable,Color_Tan,Color_Tricolor,Color_White,mix,sex,snstatus,restrict,prior
0,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,1,0,1,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,1,0,1,0,1
2,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,1,0,1,0,1
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
4,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [33]:
#Drop Wildlife type column
coded_data=coded_data.drop(columns=["type_Wildlife"])

In [34]:
#Check full list of features
coded_data.columns

Index(['type_Ereq', 'type_aband', 'type_assist', 'type_stray', 'type_surr',
       'cond_aged', 'cond_behav', 'cond_feral', 'cond_inj', 'cond_med',
       'cond_neonatal', 'cond_norm', 'cond_nurs', 'cond_other', 'cond_preg',
       'cond_sick', 'Age_Adult', 'Age_Puppy', 'Age_Senior', 'Age_Young',
       'Month_Apr', 'Month_Aug', 'Month_Dec', 'Month_Feb', 'Month_Jan',
       'Month_July', 'Month_June', 'Month_Mar', 'Month_May', 'Month_Nov',
       'Month_Oct', 'Month_Sep', 'group1_cur', 'group1_herding',
       'group1_hound', 'group1_misc', 'group1_non-sporting',
       'group1_non-working', 'group1_pit bull', 'group1_sporting',
       'group1_terrier', 'group1_toy', 'group1_working', 'group2_cur',
       'group2_herding', 'group2_hound', 'group2_misc', 'group2_non-sporting',
       'group2_pit bull', 'group2_sporting', 'group2_terrier', 'group2_toy',
       'group2_working', 'Size1_large', 'Size1_medium', 'Size1_small',
       'Size2_large', 'Size2_medium', 'Size2_small', 'Color_Black

In [35]:
Counter(coded_data["group1_non-working"])

Counter({0: 73302, 1: 20})

In [36]:
#Drop Wildlife type column
coded_data=coded_data.drop(columns=["group1_non-working"])

In [37]:
#combine group1 and group2. Single variable, 1 if either group1 or group2.
coded_data["sporting"]=np.where((coded_data["group1_sporting"]==1)|(coded_data["group2_sporting"]==1), 1, 0) 
coded_data["working"]=np.where((coded_data["group1_working"]==1)|(coded_data["group2_working"]==1), 1, 0) 
coded_data["non-sporting"]=np.where((coded_data["group1_non-sporting"]==1)|(coded_data["group2_non-sporting"]==1), 1, 0) 
coded_data["hound"]=np.where((coded_data["group1_hound"]==1)|(coded_data["group2_hound"]==1), 1, 0) 
coded_data["herding"]=np.where((coded_data["group1_herding"]==1)|(coded_data["group2_herding"]==1), 1, 0) 
coded_data["toy"]=np.where((coded_data["group1_toy"]==1)|(coded_data["group2_toy"]==1), 1, 0) 
coded_data["terrier"]=np.where((coded_data["group1_terrier"]==1)|(coded_data["group2_terrier"]==1), 1, 0) 
coded_data["pit bull"]=np.where((coded_data["group1_pit bull"]==1)|(coded_data["group2_pit bull"]==1), 1, 0) 
coded_data["cur"]=np.where((coded_data["group1_cur"]==1)|(coded_data["group2_cur"]==1), 1, 0) 
coded_data["g_misc"]=np.where((coded_data["group1_misc"]==1)|(coded_data["group2_misc"]==1), 1, 0) 


#Drop all the unnecessary columns
coded_data=coded_data.drop(columns=["group1_sporting","group2_sporting","group1_working","group2_working",
                        "group1_non-sporting","group2_non-sporting","group1_hound","group2_hound",
                        "group1_herding","group2_herding","group1_toy","group2_toy","group1_terrier",
                        "group2_terrier","group1_pit bull","group2_pit bull","group1_cur","group2_cur",
                        "group1_misc","group2_misc"])

coded_data.head()

Unnamed: 0,type_Ereq,type_aband,type_assist,type_stray,type_surr,cond_aged,cond_behav,cond_feral,cond_inj,cond_med,...,sporting,working,non-sporting,hound,herding,toy,terrier,pit bull,cur,g_misc
0,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0


In [38]:
#combine Size1 and Size2. Single variable, 1 if 1 for either Size1 or Size2.
coded_data["small"]=np.where((coded_data["Size1_small"]==1)|(coded_data["Size2_small"]==1), 1, 0) 
coded_data["med"]=np.where((coded_data["Size1_medium"]==1)|(coded_data["Size2_medium"]==1), 1, 0) 
coded_data["large"]=np.where((coded_data["Size1_large"]==1)|(coded_data["Size2_large"]==1), 1, 0) 

#Drop all the unnecessary columns
coded_data=coded_data.drop(columns=["Size1_small","Size2_small","Size1_medium","Size2_medium","Size1_large","Size2_large"])

coded_data.columns

Index(['type_Ereq', 'type_aband', 'type_assist', 'type_stray', 'type_surr',
       'cond_aged', 'cond_behav', 'cond_feral', 'cond_inj', 'cond_med',
       'cond_neonatal', 'cond_norm', 'cond_nurs', 'cond_other', 'cond_preg',
       'cond_sick', 'Age_Adult', 'Age_Puppy', 'Age_Senior', 'Age_Young',
       'Month_Apr', 'Month_Aug', 'Month_Dec', 'Month_Feb', 'Month_Jan',
       'Month_July', 'Month_June', 'Month_Mar', 'Month_May', 'Month_Nov',
       'Month_Oct', 'Month_Sep', 'Color_Black', 'Color_Black/Brown',
       'Color_Black/Tan', 'Color_Black/White', 'Color_Brindle', 'Color_Brown',
       'Color_Brown/Tan', 'Color_Brown/White', 'Color_Gray',
       'Color_Gray/White', 'Color_Merle', 'Color_Red', 'Color_Red/White',
       'Color_Sable', 'Color_Tan', 'Color_Tricolor', 'Color_White', 'mix',
       'sex', 'snstatus', 'restrict', 'prior', 'sporting', 'working',
       'non-sporting', 'hound', 'herding', 'toy', 'terrier', 'pit bull', 'cur',
       'g_misc', 'small', 'med', 'large'],
     

In [39]:
len(coded_data.columns)

67

In [40]:
#Define coded_data as feature set
X=coded_data.copy()



In [41]:
keep=["pit bull","cur","toy","non-sporting","sex","snstatus","small","med","large","Month_Nov",
     "cond_inj","cond_nurs","restrict","prior","mix","Color_Black/Tan","Color_White","Color_Tricolor",
     "Color_Brindle","Color_Gray/White","Color_Brown/White", "Color_Merle","Age_Young","Age_Senior", "type_surr",
     "type_assist"]
Xsub=coded_data[keep]
Xsub.head()

Unnamed: 0,pit bull,cur,toy,non-sporting,sex,snstatus,small,med,large,Month_Nov,...,Color_White,Color_Tricolor,Color_Brindle,Color_Gray/White,Color_Brown/White,Color_Merle,Age_Young,Age_Senior,type_surr,type_assist
0,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,1,1,0,0,0,...,0,1,0,0,0,0,0,1,1,0
4,0,0,0,0,0,1,1,0,0,1,...,0,0,0,0,1,0,0,1,0,1


In [42]:
#Outcome is prolonged length of stay
y=data["LongStay"]

# Random Forest

In [43]:
#Split into training and testing sets, using default settings of 75/25 stratified split
X_train, X_test, y_train, y_test=train_test_split(X,y,random_state=0)

In [80]:
from sklearn.ensemble import RandomForestClassifier
rf_model=RandomForestClassifier(n_estimators=100, random_state=0)
#Fit classifier to training set
rf_model= rf_model.fit(X_train, y_train)
#Make prediction
predictions = rf_model.predict(X_test)
#Evaluate model
cm=confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, index = ["Actual 0", "Actual 1"], 
                      columns=["Predicted 0", "Predicted 1"])
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,11969,1485
Actual 1,3411,1466


In [81]:
# Score the model
print("Accuracy score (training): {0:.3f}".format(
    rf_model.score(X_train,y_train)))
print("Accuracy score (validation): {0:.3f}".format(
    rf_model.score(X_test,y_test)))
print(classification_report(y_test, predictions))

Accuracy score (training): 0.921
Accuracy score (validation): 0.733
              precision    recall  f1-score   support

           0       0.78      0.89      0.83     13454
           1       0.50      0.30      0.37      4877

    accuracy                           0.73     18331
   macro avg       0.64      0.60      0.60     18331
weighted avg       0.70      0.73      0.71     18331



In [82]:
#Get top 10 features from random forest model
importances = rf_model.feature_importances_
feature_names=coded_data.columns

d={"feature": feature_names,"importance":importances}
importance_df=pd.DataFrame(d).sort_values(by="importance", ascending=False)
importance_df.head(10)

Unnamed: 0,feature,importance
50,sex,0.066959
51,snstatus,0.037722
49,mix,0.032998
53,prior,0.0324
46,Color_Tan,0.02587
35,Color_Black/White,0.025422
17,Age_Puppy,0.022593
30,Month_Oct,0.022244
39,Color_Brown/White,0.021988
22,Month_Dec,0.021232


In [47]:
#Try narrowed list of features
#Split into training and testing sets, using default settings of 75/25 stratified split
X_train, X_test, y_train, y_test=train_test_split(Xsub,y,random_state=0)
rf_model1=RandomForestClassifier(n_estimators=100, random_state=0, max_depth=20)
rf_model1= rf_model1.fit(X_train, y_train)
predictions1 = rf_model1.predict(X_test)
#Evaluate model
cm1=confusion_matrix(y_test, predictions1)
cm_df1 = pd.DataFrame(cm1, index = ["Actual 0", "Actual 1"], 
                      columns=["Predicted 0", "Predicted 1"])
cm_df1

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,12475,979
Actual 1,3880,997


In [48]:
# Score the model
print("Accuracy score (training): {0:.3f}".format(
    rf_model1.score(X_train,y_train)))
print("Accuracy score (validation): {0:.3f}".format(
    rf_model1.score(X_test,y_test)))
print(classification_report(y_test, predictions1))

Accuracy score (training): 0.791
Accuracy score (validation): 0.735
              precision    recall  f1-score   support

           0       0.76      0.93      0.84     13454
           1       0.50      0.20      0.29      4877

    accuracy                           0.73     18331
   macro avg       0.63      0.57      0.56     18331
weighted avg       0.69      0.73      0.69     18331



In [49]:
#Get top 10 features from random forest model
importances1 = rf_model1.feature_importances_
feature_names1=Xsub.columns

d={"feature": feature_names1,"importance":importances1}
importance_df1=pd.DataFrame(d).sort_values(by="importance", ascending=False)
importance_df1.head(10)

Unnamed: 0,feature,importance
4,sex,0.072217
24,type_surr,0.067716
5,snstatus,0.061725
6,small,0.057215
22,Age_Young,0.054986
0,pit bull,0.054066
13,prior,0.050863
9,Month_Nov,0.047077
14,mix,0.046901
7,med,0.042921


# Gradient Boosting Classifier

In [50]:
#Try GradientBoostingClassifier to try another ensemble learning method

from sklearn.ensemble import GradientBoostingClassifier

X_train, X_test, y_train, y_test=train_test_split(X,y,random_state=0)
# Create a classifier object
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=150,
                                            learning_rate=learning_rate,
                                            max_features=8,
                                            max_depth=6,
                                            random_state=0)

    # Fit the model
    classifier.fit(X_train, y_train)
    print("Learning rate: ", learning_rate)

    # Score the model
    print("Accuracy score (training): {0:.3f}".format(
        classifier.score(
            X_train,
            y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
        classifier.score(
            X_test,
            y_test)))
    print()

Learning rate:  0.05
Accuracy score (training): 0.768
Accuracy score (validation): 0.755

Learning rate:  0.1
Accuracy score (training): 0.774
Accuracy score (validation): 0.758

Learning rate:  0.25
Accuracy score (training): 0.789
Accuracy score (validation): 0.756

Learning rate:  0.5
Accuracy score (training): 0.801
Accuracy score (validation): 0.747

Learning rate:  0.75
Accuracy score (training): 0.808
Accuracy score (validation): 0.744

Learning rate:  1
Accuracy score (training): 0.811
Accuracy score (validation): 0.737



In [51]:
# Choose a learning rate and create classifier
classifier = GradientBoostingClassifier(n_estimators=150,
                                        learning_rate=0.1,
                                        max_features=8,
                                        max_depth=6,
                                        random_state=0)

# Fit the model
classifier.fit(X_train, y_train)

# Make Prediction
predictions = classifier.predict(X_test)

In [52]:
# Score the model
print("Accuracy score (training): {0:.3f}".format(
    classifier.score(X_train,y_train)))
print("Accuracy score (validation): {0:.3f}".format(
    classifier.score(X_test,y_test)))
print(classification_report(y_test, predictions))

Accuracy score (training): 0.774
Accuracy score (validation): 0.758
              precision    recall  f1-score   support

           0       0.77      0.95      0.85     13454
           1       0.62      0.23      0.33      4877

    accuracy                           0.76     18331
   macro avg       0.70      0.59      0.59     18331
weighted avg       0.73      0.76      0.71     18331



In [53]:
#Create confusion matrix for Gradient Boosting Classifier
cm1=confusion_matrix(y_test, predictions)
cm1_df = pd.DataFrame(cm1,index = ["Actual 0", "Actual 1"], 
                      columns=["Predicted 0", "Predicted 1"])
cm1_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,12779,675
Actual 1,3762,1115


In [54]:
#small set features
X_train, X_test, y_train, y_test=train_test_split(Xsub,y,random_state=0)
# Choose a learning rate and create classifier
classifier = GradientBoostingClassifier(n_estimators=150,
                                        learning_rate=0.1,
                                        max_features=8,
                                        max_depth=6,
                                        random_state=0)

# Fit the model
classifier.fit(X_train, y_train)

# Make Prediction
predictions = classifier.predict(X_test)
# Score the model
print("Accuracy score (training): {0:.3f}".format(
    classifier.score(X_train,y_train)))
print("Accuracy score (validation): {0:.3f}".format(
    classifier.score(X_test,y_test)))
print(classification_report(y_test, predictions))

Accuracy score (training): 0.765
Accuracy score (validation): 0.749
              precision    recall  f1-score   support

           0       0.77      0.95      0.85     13454
           1       0.58      0.20      0.30      4877

    accuracy                           0.75     18331
   macro avg       0.67      0.58      0.57     18331
weighted avg       0.72      0.75      0.70     18331



In [55]:
#Create confusion matrix for Gradient Boosting Classifier
cm1=confusion_matrix(y_test, predictions)
cm1_df = pd.DataFrame(cm1,index = ["Actual 0", "Actual 1"], 
                      columns=["Predicted 0", "Predicted 1"])
cm1_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,12740,714
Actual 1,3881,996


# Naive Bayes

In [56]:
X_train, X_test, y_train, y_test=train_test_split(X,y,random_state=0)
#Using Bernoulli Naive Bayes because all of our data are categorical, and after dummy coding, entered as 0,1 binary features
from sklearn.naive_bayes import BernoulliNB

NB=BernoulliNB()
NB.fit(X_train, y_train)
predictions = NB.predict(X_test)


In [57]:
print("Accuracy score (training): {0:.3f}".format(
    NB.score(X_train,y_train)))
print("Accuracy score (validation): {0:.3f}".format(
    NB.score(X_test,y_test)))
print(classification_report(y_test, predictions))

Accuracy score (training): 0.704
Accuracy score (validation): 0.701
              precision    recall  f1-score   support

           0       0.79      0.81      0.80     13454
           1       0.43      0.39      0.41      4877

    accuracy                           0.70     18331
   macro avg       0.61      0.60      0.61     18331
weighted avg       0.69      0.70      0.70     18331



In [58]:
#Create confusion matrix for Gradient Boosting Classifier
cm=confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm,index = ["Actual 0", "Actual 1"], 
                      columns=["Predicted 0", "Predicted 1"])
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,10931,2523
Actual 1,2962,1915


# Try oversampling with SMOTE

In [84]:

from imblearn.over_sampling import SMOTE
X_train, X_test, y_train, y_test=train_test_split(X,y,random_state=0)
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(
    X_train, y_train
)
Counter(y_resampled)

Counter({0: 40684, 1: 40684})

In [85]:
rf_model=RandomForestClassifier(n_estimators=100, random_state=0, max_depth=18)
#Fit classifier to training set
rf_model= rf_model.fit(X_resampled, y_resampled)
#Make prediction
predictions = rf_model.predict(X_test)
#Evaluate model
cm=confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, index = ["Actual 0", "Actual 1"], 
                      columns=["Predicted 0", "Predicted 1"])
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,9817,3637
Actual 1,2095,2782


In [86]:
print("Accuracy score (training): {0:.3f}".format(
    rf_model.score(X_resampled,y_resampled)))
print("Accuracy score (validation): {0:.3f}".format(
    rf_model.score(X_test,y_test)))
print("Balanced accuracy score: {0:.3f}".format( 
        balanced_accuracy_score(y_test, predictions)))

print(classification_report_imbalanced(y_test, predictions))

Accuracy score (training): 0.807
Accuracy score (validation): 0.687
Balanced accuracy score: 0.650
                   pre       rec       spe        f1       geo       iba       sup

          0       0.82      0.73      0.57      0.77      0.65      0.42     13454
          1       0.43      0.57      0.73      0.49      0.65      0.41      4877

avg / total       0.72      0.69      0.61      0.70      0.65      0.42     18331



In [61]:

print(rf_model.score(X_train,y_train))
print(rf_model.score(X_test, y_test))
print(classification_report(y_test, predictions))

0.7932207088432652
0.6873056570836288
              precision    recall  f1-score   support

           0       0.82      0.73      0.77     13454
           1       0.43      0.57      0.49      4877

    accuracy                           0.69     18331
   macro avg       0.63      0.65      0.63     18331
weighted avg       0.72      0.69      0.70     18331



In [62]:
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, predictions)

0.65005205809304

In [63]:
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.82      0.73      0.57      0.77      0.65      0.42     13454
          1       0.43      0.57      0.73      0.49      0.65      0.41      4877

avg / total       0.72      0.69      0.61      0.70      0.65      0.42     18331



In [64]:
#Get top 10 features from random forest model
#importances = rf_model.feature_importances_
#feature_names=coded_data.columns

#d={"feature": feature_names,"importance":importances}
#importance_df=pd.DataFrame(d).sort_values(by="importance", ascending=False)
#importance_df.head(10)

In [65]:

# Create a classifier object
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=80,
                                            learning_rate=learning_rate,
                                            max_features=12,
                                            max_depth=7,
                                            random_state=0)

    # Fit the model
    classifier.fit(X_resampled, y_resampled)
    print("Learning rate: ", learning_rate)

    # Score the model
    print("Accuracy score (training): {0:.3f}".format(
        classifier.score(
            X_resampled,
            y_resampled)))
    print("Accuracy score (validation): {0:.3f}".format(
        classifier.score(
            X_test,
            y_test)))
    predictions = classifier.predict(X_test)
    print("Balanced accuracy score: {0:.3f}".format( 
        balanced_accuracy_score(y_test, predictions)))
    print()

Learning rate:  0.05
Accuracy score (training): 0.714
Accuracy score (validation): 0.675
Balanced accuracy score: 0.651

Learning rate:  0.1
Accuracy score (training): 0.744
Accuracy score (validation): 0.684
Balanced accuracy score: 0.648

Learning rate:  0.25
Accuracy score (training): 0.790
Accuracy score (validation): 0.691
Balanced accuracy score: 0.637

Learning rate:  0.5
Accuracy score (training): 0.818
Accuracy score (validation): 0.700
Balanced accuracy score: 0.627

Learning rate:  0.75
Accuracy score (training): 0.825
Accuracy score (validation): 0.699
Balanced accuracy score: 0.625

Learning rate:  1
Accuracy score (training): 0.830
Accuracy score (validation): 0.692
Balanced accuracy score: 0.613



In [66]:
classifier = GradientBoostingClassifier(n_estimators=80,
                                        learning_rate=0.05,
                                        max_features=12,
                                        max_depth=7,
                                        random_state=0)

# Fit the model
classifier.fit(X_resampled, y_resampled)

# Make Prediction
predictions = classifier.predict(X_test)

In [67]:
print("Accuracy score (training): {0:.3f}".format(
    classifier.score(X_resampled,y_resampled)))
print("Accuracy score (validation): {0:.3f}".format(
    classifier.score(X_test,y_test)))
print("Balanced accuracy score: {0:.3f}".format( 
        balanced_accuracy_score(y_test, predictions)))

print(classification_report_imbalanced(y_test, predictions))

Accuracy score (training): 0.714
Accuracy score (validation): 0.675
Balanced accuracy score: 0.651
                   pre       rec       spe        f1       geo       iba       sup

          0       0.83      0.70      0.60      0.76      0.65      0.43     13454
          1       0.42      0.60      0.70      0.50      0.65      0.42      4877

avg / total       0.72      0.67      0.63      0.69      0.65      0.42     18331



In [69]:
#Create confusion matrix for Gradient Boosting Classifier
cm1=confusion_matrix(y_test, predictions)
cm1_df = pd.DataFrame(cm1,index = ["Actual 0", "Actual 1"], 
                      columns=["Predicted 0", "Predicted 1"])
cm1_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,9447,4007
Actual 1,1953,2924


In [70]:
balanced_accuracy_score(y_test, predictions)

0.6508596306359576

In [71]:
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.83      0.70      0.60      0.76      0.65      0.43     13454
          1       0.42      0.60      0.70      0.50      0.65      0.42      4877

avg / total       0.72      0.67      0.63      0.69      0.65      0.42     18331



# SMOTEENN

In [72]:
from imblearn.combine import SMOTEENN 
sme = SMOTEENN(random_state=0)
X_res, y_res = sme.fit_resample(X_train, y_train)
Counter(y_res)

Counter({0: 13516, 1: 24181})

In [73]:
rf_model=RandomForestClassifier(n_estimators=100, random_state=0, max_depth=18)
#Fit classifier to training set
rf_model= rf_model.fit(X_res, y_res)
#Make prediction
predictions = rf_model.predict(X_test)
#Evaluate model
cm=confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, index = ["Actual 0", "Actual 1"], 
                      columns=["Predicted 0", "Predicted 1"])
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,7083,6371
Actual 1,1160,3717


In [74]:
print("Accuracy score (training): {0:.3f}".format(
    rf_model.score(X_resampled,y_resampled)))
print("Accuracy score (validation): {0:.3f}".format(
    rf_model.score(X_test,y_test)))
print("Balanced accuracy score: {0:.3f}".format( 
        balanced_accuracy_score(y_test, predictions)))

print(classification_report_imbalanced(y_test, predictions))

Accuracy score (training): 0.722
Accuracy score (validation): 0.589
Balanced accuracy score: 0.644
                   pre       rec       spe        f1       geo       iba       sup

          0       0.86      0.53      0.76      0.65      0.63      0.39     13454
          1       0.37      0.76      0.53      0.50      0.63      0.41      4877

avg / total       0.73      0.59      0.70      0.61      0.63      0.40     18331



In [75]:
# Create a classifier object
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=80,
                                            learning_rate=learning_rate,
                                            max_features=12,
                                            max_depth=7,
                                            random_state=0)

    # Fit the model
    classifier.fit(X_res, y_res)
    print("Learning rate: ", learning_rate)

    # Score the model
    print("Accuracy score (training): {0:.3f}".format(
        classifier.score(
            X_res,
            y_res)))
    print("Accuracy score (validation): {0:.3f}".format(
        classifier.score(
            X_test,
            y_test)))
    predictions = classifier.predict(X_test)
    print("Balanced accuracy score: {0:.3f}".format( 
        balanced_accuracy_score(y_test, predictions)))
    print()

Learning rate:  0.05
Accuracy score (training): 0.855
Accuracy score (validation): 0.566
Balanced accuracy score: 0.634

Learning rate:  0.1
Accuracy score (training): 0.895
Accuracy score (validation): 0.592
Balanced accuracy score: 0.644

Learning rate:  0.25
Accuracy score (training): 0.940
Accuracy score (validation): 0.621
Balanced accuracy score: 0.646

Learning rate:  0.5
Accuracy score (training): 0.967
Accuracy score (validation): 0.629
Balanced accuracy score: 0.642

Learning rate:  0.75
Accuracy score (training): 0.980
Accuracy score (validation): 0.628
Balanced accuracy score: 0.636

Learning rate:  1
Accuracy score (training): 0.987
Accuracy score (validation): 0.625
Balanced accuracy score: 0.630



In [92]:
classifier = GradientBoostingClassifier(n_estimators=80,
                                        learning_rate=0.25,
                                        max_features=12,
                                        max_depth=7,
                                        random_state=0)

# Fit the model
classifier.fit(X_res, y_res)

# Make Prediction
predictions = classifier.predict(X_test)
cm=confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, index = ["Actual 0", "Actual 1"], 
                      columns=["Predicted 0", "Predicted 1"])
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,7979,5475
Actual 1,1469,3408


In [93]:
# Score the model
print("Accuracy score (training): {0:.3f}".format(
    classifier.score(X_res,y_res)))
print("Accuracy score (validation): {0:.3f}".format(
    classifier.score(X_test,y_test)))
print("Balanced accuracy score: {0:.3f}".format( 
        balanced_accuracy_score(y_test, predictions)))
print(classification_report_imbalanced(y_test, predictions))

Accuracy score (training): 0.940
Accuracy score (validation): 0.621
Balanced accuracy score: 0.646
                   pre       rec       spe        f1       geo       iba       sup

          0       0.84      0.59      0.70      0.70      0.64      0.41     13454
          1       0.38      0.70      0.59      0.50      0.64      0.42      4877

avg / total       0.72      0.62      0.67      0.64      0.64      0.41     18331



In [94]:
NB=BernoulliNB()
NB.fit(X_res, y_res)
predictions = NB.predict(X_test)
cm=confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, index = ["Actual 0", "Actual 1"], 
                      columns=["Predicted 0", "Predicted 1"])
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,6241,7213
Actual 1,1216,3661


In [95]:
# Score the model
print("Accuracy score (training): {0:.3f}".format(
    NB.score(X_res,y_res)))
print("Accuracy score (validation): {0:.3f}".format(
    NB.score(X_test,y_test)))
print("Balanced accuracy score: {0:.3f}".format( 
        balanced_accuracy_score(y_test, predictions)))
print(classification_report_imbalanced(y_test, predictions))

Accuracy score (training): 0.753
Accuracy score (validation): 0.540
Balanced accuracy score: 0.607
                   pre       rec       spe        f1       geo       iba       sup

          0       0.84      0.46      0.75      0.60      0.59      0.34     13454
          1       0.34      0.75      0.46      0.46      0.59      0.36      4877

avg / total       0.70      0.54      0.67      0.56      0.59      0.34     18331

