# Naive Bayes Model for Animal Outcome Prediction
Metis-Classification_Project
14JUN2022
John Tazioli

In [1]:
import pandas as pd
import numpy as np

In [2]:
#load datasets from logistic regression MVP
import pickle

with open('shelter_dogs.pickle','rb') as g:
    x = pickle.load(g)
    
with open('shelter_dogs_y.pickle', 'rb') as g:
    y = pickle.load(g)

In [3]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21989 entries, 0 to 21988
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   breed_euth_rate  21989 non-null  float64
 1   age(days)        21989 non-null  int64  
 2   time_in_shelt    21989 non-null  float64
 3   Feral            21989 non-null  uint8  
 4   Injured          21989 non-null  uint8  
 5   Normal           21989 non-null  uint8  
 6   Nursing          21989 non-null  uint8  
 7   Other            21989 non-null  uint8  
 8   Pregnant         21989 non-null  uint8  
 9   Sick             21989 non-null  uint8  
dtypes: float64(2), int64(1), uint8(7)
memory usage: 665.8 KB


In [4]:
#remap adoption as 1 and euthanasia as 0
y = y.map({'Euthanasia':0, 'Adoption':1})
y[0:10]

0    1
1    1
2    1
3    1
4    1
5    1
6    1
7    1
8    1
9    1
Name: outcome_type, dtype: int64

## Train/Test split
70/30 split for train and test

In [5]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3, random_state=10)

## Resampling:
Euthanasia currently represents 6.4% of the observations. 80%/20% split or lower often causes problems. I will oversample the euthanasia observations at a ratio of 6:1 to make it 29% (6138/20507).

In [6]:
from imblearn.over_sampling import RandomOverSampler

n_pos = np.sum(y_train == 1)
n_neg = np.sum(y_train == 0)

ratio = {1:n_pos, 0:n_neg*6}

ROS = RandomOverSampler(sampling_strategy = ratio, random_state=10)

x_tr_rs, y_tr_rs = ROS.fit_resample(x_train, y_train)

In [7]:
x_tr_rs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20507 entries, 0 to 20506
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   breed_euth_rate  20507 non-null  float64
 1   age(days)        20507 non-null  int64  
 2   time_in_shelt    20507 non-null  float64
 3   Feral            20507 non-null  uint8  
 4   Injured          20507 non-null  uint8  
 5   Normal           20507 non-null  uint8  
 6   Nursing          20507 non-null  uint8  
 7   Other            20507 non-null  uint8  
 8   Pregnant         20507 non-null  uint8  
 9   Sick             20507 non-null  uint8  
dtypes: float64(2), int64(1), uint8(7)
memory usage: 620.9 KB


In [8]:
y_tr_rs.value_counts()

1    14369
0     6138
Name: outcome_type, dtype: int64

## Naive Bayes Models:
Bernoulli NB for categorical condition dummy variables and Gaussian NB for breed euth rate, age, and time in shelter 

In [9]:
#split features into 2 dataframes
x_tr_gaus = x_tr_rs.iloc[:,0:3]
x_tr_bern = x_tr_rs.iloc[:,3:]

x_tst_g = x_test.iloc[:,0:3]
x_tst_b = x_test.iloc[:,3:] 

### Gaussian NB

In [10]:
from sklearn.naive_bayes import GaussianNB

g_nb = GaussianNB()

g_nb.fit(x_tr_gaus, y_tr_rs)

y_g_pred = g_nb.predict(x_tst_g)

y_g_pred = pd.Series(y_g_pred)

In [12]:
from sklearn.metrics import f1_score

print(f"F1: {f1_score(y_test, y_g_pred)}")

F1: 0.9461087593617714


### Bernoulli NB

In [13]:
from sklearn.naive_bayes import BernoulliNB

b_nb = BernoulliNB()

b_nb.fit(x_tr_bern, y_tr_rs)

y_b_pred = b_nb.predict(x_tst_b)

In [14]:
print(f"F1: {f1_score(y_test, y_b_pred)}")

F1: 0.9503419081732334


## Ensemble with Voting Classifier

In [16]:
from sklearn.ensemble import VotingClassifier

model_list = [('gnb',g_nb),
             ('bnb',b_nb)]

vc = VotingClassifier(estimators = model_list,
                     voting = 'hard',
                     weights = None)

vc.fit(x_test,y_test)

vc_y_pred = vc.predict(x_test)

In [17]:
print(f"F1: {f1_score(y_test, vc_y_pred)}")

F1: 0.9545602605863192
