In [1]:
import numpy as np
import pandas as pd
import math
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.base import BaseEstimator, ClassifierMixin
from collections import Counter
from sklearn.metrics import accuracy_score

### 1. Extending the Gaussian Naive Bayes code so that it handles missing values

### My GaussianNB
Reimplementation of a Gaussian Naive Bayes.

In [2]:
class MyGaussianNB(BaseEstimator, ClassifierMixin):          
    
    missing_columns=[]
    def fit(self, Xt, yt):
        self.var_smoothing = 1e-9   # zero variance will cause division by zero errors.
        self.Xt = Xt
        self.yt = yt
        self.n_feat = Xt.shape[1]
        self.mus = {}
        self.sig_sqs = {}
        self.priors = {}

        
        c_dict = Counter(self.yt)
        
        for c in c_dict.keys():
            self.mus[c] = np.zeros(self.n_feat) # where the means will be stored
            self.sig_sqs[c] = np.zeros(self.n_feat) # where the variances will be stored
            self.priors[c] = c_dict[c]/Xt.shape[0]
            
            mask = self.yt == c
            X_tr_c = self.Xt[mask, :] # the rows for this class label
            
            for f in range(self.n_feat):
                self.mus[c][f] = np.mean(X_tr_c[:,f])
                self.sig_sqs[c][f] = np.var(X_tr_c[:,f] + self.var_smoothing)  #var              
        #print(self.mus)
        #print(self.sig_sqs)
        
        return self
    
    # Fit function for handling missing values univariate
    def fit_missing_uni(self, Xt, yt,limit): # This function takes 3 arguments Trainining data, Training Features and limit of missing values
        Xt=pd.DataFrame(Xt) # convert into data frame
        percent_missing = list(Xt.isnull().sum() * 100 / len(Xt)) # check the percentage of missing values
        columns_dropped=[] # initializing the list of columns need to be dropped
        
        for i in range(len(percent_missing)): 
            if percent_missing[i] >= limit: # check if the missing values percentage is greater than defined limit
                columns_dropped.append(Xt.columns[i]) 
        
        self.missing_columns= columns_dropped       
        Xt=Xt.drop(columns_dropped,axis=1) # dropped the columnns having missing values greater than defined limits
        
        
        if len(Xt.columns) ==0: # if all columns dropped just return and print the message
            return
        else:
            for i in Xt.columns:  
                Xt[i]=Xt[i].fillna(Xt[i].median()) # impute the missing values of features having less missing values than threshold

        Xt=np.array(Xt)
        
        self.var_smoothing = 1e-9   # zero variance will cause division by zero errors.
        self.Xt = Xt
        self.yt = yt
        self.n_feat = Xt.shape[1]
        self.mus = {}
        self.sig_sqs = {}
        self.priors = {}

        c_dict = Counter(self.yt)
        
        for c in c_dict.keys():
            self.mus[c] = np.zeros(self.n_feat) # where the means will be stored
            self.sig_sqs[c] = np.zeros(self.n_feat) # where the variances will be stored
            self.priors[c] = c_dict[c]/Xt.shape[0]
            
            mask = self.yt == c
            X_tr_c = self.Xt[mask, :] # the rows for this class label
            
            for f in range(self.n_feat):
                self.mus[c][f] = np.mean(X_tr_c[:,f])
                self.sig_sqs[c][f] = np.var(X_tr_c[:,f] + self.var_smoothing)  #var              
        #print(self.mus)
        #print(self.sig_sqs)
        
        return self
    
    
    # Fit function for handling missing values multivariate
    def fit_missing_mul(self, Xt, yt,limit): # This function takes 3 arguments Trainining data, Training Features and limit of missing values
        Xt=pd.DataFrame(Xt) # convert into data frame
        percent_missing = list(Xt.isnull().sum() * 100 / len(Xt)) # check the percentage of missing values
        columns_dropped=[] # initializing the list of columns need to be dropped
        
        for i in range(len(percent_missing)): 
            if percent_missing[i] >= limit: # check if the missing values percentage is greater than defined limit
                columns_dropped.append(Xt.columns[i]) 
        
        self.missing_columns= columns_dropped       
        Xt=Xt.drop(columns_dropped,axis=1) # dropped the columnns having missing values greater than defined limits
        
        
        if len(Xt.columns) ==0: # if all columns dropped just return and print the message
            return
        else:
            #for i in Xt.columns:  
            Xt = Xt.fillna( Xt.median()) # impute the missing values of features having less missing values than threshold

        Xt=np.array(Xt)
        
        self.var_smoothing = 1e-9   # zero variance will cause division by zero errors.
        self.Xt = Xt
        self.yt = yt
        self.n_feat = Xt.shape[1]
        self.mus = {}
        self.sig_sqs = {}
        self.priors = {}

        c_dict = Counter(self.yt)
        
        for c in c_dict.keys():
            self.mus[c] = np.zeros(self.n_feat) # where the means will be stored
            self.sig_sqs[c] = np.zeros(self.n_feat) # where the variances will be stored
            self.priors[c] = c_dict[c]/Xt.shape[0]
            
            mask = self.yt == c
            X_tr_c = self.Xt[mask, :] # the rows for this class label
            
            for f in range(self.n_feat):
                self.mus[c][f] = np.mean(X_tr_c[:,f])
                self.sig_sqs[c][f] = np.var(X_tr_c[:,f] + self.var_smoothing)  #var              
        #print(self.mus)
        #print(self.sig_sqs)
        
        return self
    
    
    # The predictions for univariate imputation.
    def predict_uni(self, Xtes):
        Xtes=pd.DataFrame(Xtes)
        Xtes=Xtes.drop(self.missing_columns,axis=1) # dropping the features that we dropped in training data
        
        for i in Xtes.columns:   # Handling missing values univariate
                Xtes[i]=Xtes[i].fillna(Xtes[i].median()) 
          
        Xtes=np.array(Xtes)
        
        self.Xtes = Xtes
         
        res_list = []
        for sample in Xtes:
            res_list.append(self.predict_single(sample))
            
        return np.array(res_list)
    
     # The predictions for multi-variate imputation.
    def predict_mul(self, Xtes):
        Xtes=pd.DataFrame(Xtes)
        Xtes=Xtes.drop(self.missing_columns,axis=1) # dropping the features that we dropped in training data
        
        # Handling missing values univariate
        Xtes=Xtes.fillna(Xtes.median()) 
        
        Xtes=np.array(Xtes)
        
        self.Xtes = Xtes
         
        res_list = []
        for sample in Xtes:
            res_list.append(self.predict_single(sample))
            
        return np.array(res_list)
    
        # The predictions are the most common class in the training set.
    def predict(self, Xtes):
        #print("Predicting MGNB")
        self.Xtes = Xtes
         
        res_list = []
        for sample in Xtes:
            res_list.append(self.predict_single(sample))
            
        return np.array(res_list)
    
    
    def predict_single(self, x_single):
        probs = {}
        for c in self.priors.keys():   # for each of the class labels
            probs[c] = self.priors[c]
            for i, f in enumerate(x_single):
                t1 = 1/math.sqrt(2*math.pi*self.sig_sqs[c][i])
                num = (f - self.mus[c][i])**2
                den = 2*self.sig_sqs[c][i]
                pxi_y = t1 * math.exp(-num/den)
                probs[c] = probs[c] * pxi_y
                #print(t1, num, den, pxi_y)
                #print(probs)
            #print(c, self.priors[c])
        return max(probs, key=probs.get) # Return the key with the largest value
    

### Commenting on my design decisions
#### How did I handle the missing values?

4 functions were created to handle the missing values.
<br>
1) fit_missing_uni(data, labels, missing)
<br>
This function takes 3 arguments as its input which are; data, labels and the percentage of missing values.<br>
This function calculates the missing percentage of the missing values and removes the features having more missing values than the defined threshold. <br>
Once the features are removed this function imputes the missing values by univariate Median.<br>

<br>
2) fit_missing_mul(data, labels, missing)
<br>
This function takes 3 arguments as its inputs which are; data, labels and the percentage of missing values.<br>
This function calculates the missing percentage of the missing values and removes the features having missing values more than the defined threshold. <br>
Once the features are removed this function imputes the missing values by multivariate Median.<br>

3) predict_uni(data)
<br>
This function takes 1 argument which is the test data. <br>
In the next step it removes the features that were selected by the "fit_missing_uni" function, due to exceeding the missing values threshold.<br>
Once removing the column this functions replaces the missing values by the univariate median to handle the test data properly.<br>

4) predict_mul()
<br>
This function takes 1 argument which is the test data. <br>
In the next step, it removes the features that are selected by the "fit_missing_mul" function, due to exceeding the missing values threshold.<br>
Once removing the column this functions replaces the missing values by the multivariate median to handle the test data properly.<br>

### 2. Test the performance of my implementation against the scikit-learn GaussianNB using missing value imputation

#### Penguins Dataset 20% missing values

In [3]:
# Enter the limited percentage of missing values here, I've enterend 50%
LIMIT=50

In [4]:
# LOADING FILE WITH 20% MISSING VALUES
df = pd.read_csv('PenguinsMV0.2.csv', index_col = 0, na_values = '?')
print(df.shape)

percent_missing = df.isnull().sum() * 100 / len(df)
y=df["species"]
X=df.drop(["species"],axis=1)

# Scaling of data
sc=StandardScaler()
X=sc.fit_transform(X)

# Splitting of data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=1/2)


(333, 5)


#### 2.1.0 My Naive bayes For Handling Missing Values Itself univariate

In [5]:
# My guassian naive bayes

nb=MyGaussianNB()

In [6]:
# Training the model 
nb.fit_missing_uni(X_train,y_train,LIMIT)

MyGaussianNB()

In [7]:
predi=nb.predict_uni(X_test)

In [8]:
# Accuracy of model
print(accuracy_score(predi,y_test))

0.8982035928143712


**Cross Validtion K-fold**

In [9]:
from sklearn.model_selection import KFold

In [10]:
kf = KFold(n_splits=5)
kf.get_n_splits(X)

5

In [11]:
i=1
for train_index, test_index in kf.split(X):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_tr, X_ts = X[train_index], X[test_index]
    y_tr, y_ts = y[train_index], y[test_index]
    nb.fit_missing_uni(X_tr,y_tr,LIMIT)
    predi=nb.predict_uni(X_ts)
    print("Model accuracy at fold "+str(i)+" =", accuracy_score(predi,y_ts))
    i+=1

Model accuracy at fold 1 = 1.0
Model accuracy at fold 2 = 0.9701492537313433
Model accuracy at fold 3 = 0.9850746268656716
Model accuracy at fold 4 = 0.9848484848484849
Model accuracy at fold 5 = 0.5454545454545454


#### 2.1.1 My Naive bayes For Handling Missing Values Itself Multivariate

In [12]:
# My guassian naive bayes

nb=MyGaussianNB()

In [13]:
# Training the model 
nb.fit_missing_mul(X_train,y_train,LIMIT)

MyGaussianNB()

In [14]:
predi=nb.predict_mul(X_test)

In [15]:
# Accuracy of model
print(accuracy_score(predi,y_test))

0.8982035928143712


**Cross validation**

In [16]:
i=1
for train_index, test_index in kf.split(X):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_tr, X_ts = X[train_index], X[test_index]
    y_tr, y_ts = y[train_index], y[test_index]
    nb.fit_missing_mul(X_tr,y_tr,LIMIT)
    predi=nb.predict_mul(X_ts)
    print("Model accuracy at fold "+str(i)+" =", accuracy_score(predi,y_ts))
    i+=1

Model accuracy at fold 1 = 1.0
Model accuracy at fold 2 = 0.9701492537313433
Model accuracy at fold 3 = 0.9850746268656716
Model accuracy at fold 4 = 0.9848484848484849
Model accuracy at fold 5 = 0.5454545454545454


#### 2.1.2 Naive Bayes Sciket Learn handling missing values Explicitly Univariate

In [17]:
# Loading the dataset again and imputing the missing values because naive bayes did not handle missing values in sk learn
df = pd.read_csv('PenguinsMV0.2.csv', index_col = 0, na_values = '?')
print(df.shape)

percent_missing = df.isnull().sum() * 100 / len(df)

y=df["species"]
X=df.drop(["species"],axis=1)

for i in X.columns:
    X[i]= X[i].fillna(X[i].median())

    
# Scaling the dataset   
sc=StandardScaler()
X=sc.fit_transform(X)

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=1/2)


(333, 5)


In [18]:
gnb=GaussianNB()

In [19]:
model= gnb.fit(X_train,y_train)

In [20]:
predi=gnb.predict(X_test)

In [21]:
# Accuracy of model
print(accuracy_score(predi,y_test))

0.9401197604790419


**Cross validation**

In [22]:
i=1
for train_index, test_index in kf.split(X):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_tr, X_ts = X[train_index], X[test_index]
    y_tr, y_ts = y[train_index], y[test_index]
    gnb.fit(X_tr,y_tr)
    predi=gnb.predict(X_ts)
    print("Model accuracy at fold "+str(i)+" =", accuracy_score(predi,y_ts))
    i+=1

Model accuracy at fold 1 = 0.9253731343283582
Model accuracy at fold 2 = 0.8805970149253731
Model accuracy at fold 3 = 0.9850746268656716
Model accuracy at fold 4 = 0.9696969696969697
Model accuracy at fold 5 = 0.25757575757575757


 #### 2.1.3 My Naive Bayes handling missing values Explicitly Univariate

In [23]:
nb=MyGaussianNB()

In [24]:
# Training the model 
nb.fit(X_train,y_train)

MyGaussianNB()

In [25]:
predi=nb.predict_uni(X_test)

In [26]:
# Accuracy of model
print(accuracy_score(predi,y_test))

0.9401197604790419


**Cross validation**

In [27]:
i=1
for train_index, test_index in kf.split(X):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_tr, X_ts = X[train_index], X[test_index]
    y_tr, y_ts = y[train_index], y[test_index]
    nb.fit(X_tr,y_tr)
    predi=nb.predict(X_ts)
    print("Model accuracy at fold "+str(i)+" =", accuracy_score(predi,y_ts))
    i+=1

Model accuracy at fold 1 = 0.9253731343283582
Model accuracy at fold 2 = 0.8805970149253731
Model accuracy at fold 3 = 0.9850746268656716
Model accuracy at fold 4 = 0.9696969696969697
Model accuracy at fold 5 = 0.25757575757575757


#### 2.1.4 Naive Bayes Sciket Learn handling missing values Explicitly Multivariate

In [28]:
# Loading the dataset again and imputing the missing values because naive bayes did not handle missing values in sk learn
df = pd.read_csv('PenguinsMV0.2.csv', index_col = 0, na_values = '?')
print(df.shape)

percent_missing = df.isnull().sum() * 100 / len(df)


y=df["species"]
X=df.drop(["species"],axis=1)


X= X.fillna(X.median())

    
# Scaling the dataset   
sc=StandardScaler()
X=sc.fit_transform(X)

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=1/2)

(333, 5)


In [29]:
gnb=GaussianNB()

In [30]:
model= gnb.fit(X_train,y_train)

In [31]:
predi=gnb.predict(X_test)

In [32]:
# Accuracy of model
print(accuracy_score(predi,y_test))

0.9401197604790419


**Cross Validation**

In [33]:
i=1
for train_index, test_index in kf.split(X):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_tr, X_ts = X[train_index], X[test_index]
    y_tr, y_ts = y[train_index], y[test_index]
    gnb.fit(X_tr,y_tr)
    predi=gnb.predict(X_ts)
    print("Model accuracy at fold "+str(i)+" =", accuracy_score(predi,y_ts))
    i+=1

Model accuracy at fold 1 = 0.9253731343283582
Model accuracy at fold 2 = 0.8805970149253731
Model accuracy at fold 3 = 0.9850746268656716
Model accuracy at fold 4 = 0.9696969696969697
Model accuracy at fold 5 = 0.25757575757575757


 #### 2.1.5 My Naive Bayes handling missing values Explicitly Multivariate

In [34]:
nb=MyGaussianNB()

In [35]:
# Training the model 
nb.fit(X_train,y_train)

MyGaussianNB()

In [36]:
predi=nb.predict_uni(X_test)

In [37]:
# Accuracy of model
print(accuracy_score(predi,y_test))

0.9401197604790419


**Cross Validation**

In [38]:
i=1
for train_index, test_index in kf.split(X):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_tr, X_ts = X[train_index], X[test_index]
    y_tr, y_ts = y[train_index], y[test_index]
    nb.fit(X_tr,y_tr)
    predi=nb.predict(X_ts)
    print("Model accuracy at fold "+str(i)+" =", accuracy_score(predi,y_ts))
    i+=1

Model accuracy at fold 1 = 0.9253731343283582
Model accuracy at fold 2 = 0.8805970149253731
Model accuracy at fold 3 = 0.9850746268656716
Model accuracy at fold 4 = 0.9696969696969697
Model accuracy at fold 5 = 0.25757575757575757


### Penguins Dataset using 40% missing values

We will now carry out further testing but this time using the Penguins dataset with 40% missing values

In [39]:
df = pd.read_csv('PenguinsMV0.4.csv', index_col = 0, na_values = '?')
print(df.shape)

percent_missing = df.isnull().sum() * 100 / len(df)
y=df["species"]
X=df.drop(["species"],axis=1)

# Scaling of data
sc=StandardScaler()
X=sc.fit_transform(X)

# Splitting of data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=1/2)



(333, 5)


 #### 2.2.0 My Naive bayes For Handling Missing Values Itself univariate

In [40]:
# My guassian naive bayes

nb=MyGaussianNB()

In [41]:
# Training the model 
nb.fit_missing_uni(X_train,y_train,LIMIT)

MyGaussianNB()

In [42]:
predi=nb.predict_uni(X_test)

In [43]:
# Accuracy of model
print(accuracy_score(predi,y_test))

0.8263473053892215


**Cross Validtion K-fold**

In [44]:
kf = KFold(n_splits=5)
kf.get_n_splits(X)

5

In [45]:
i=1
for train_index, test_index in kf.split(X):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_tr, X_ts = X[train_index], X[test_index]
    y_tr, y_ts = y[train_index], y[test_index]
    nb.fit_missing_uni(X_tr,y_tr,LIMIT)
    predi=nb.predict_uni(X_ts)
    print("Model accuracy at fold "+str(i)+" =", accuracy_score(predi,y_ts))
    i+=1

Model accuracy at fold 1 = 1.0
Model accuracy at fold 2 = 0.9850746268656716
Model accuracy at fold 3 = 0.8805970149253731
Model accuracy at fold 4 = 0.9696969696969697
Model accuracy at fold 5 = 0.3181818181818182


 #### 2.2.1 My Naive bayes For Handling Missing Values Itself Multivariate

In [46]:
# Training the model 
nb.fit_missing_mul(X_train,y_train,LIMIT)

MyGaussianNB()

In [47]:
predi=nb.predict_mul(X_test)

In [48]:
# Accuracy of model
print(accuracy_score(predi,y_test))

0.8263473053892215


**Cross Validtion**

In [49]:
i=1
for train_index, test_index in kf.split(X):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_tr, X_ts = X[train_index], X[test_index]
    y_tr, y_ts = y[train_index], y[test_index]
    nb.fit_missing_mul(X_tr,y_tr,LIMIT)
    predi=nb.predict_mul(X_ts)
    print("Model accuracy at fold "+str(i)+" =", accuracy_score(predi,y_ts))
    i+=1

Model accuracy at fold 1 = 1.0
Model accuracy at fold 2 = 0.9850746268656716
Model accuracy at fold 3 = 0.8805970149253731
Model accuracy at fold 4 = 0.9696969696969697
Model accuracy at fold 5 = 0.3181818181818182


#### 2.2.2 Naive Bayes Sciket Learn handling missing values Explicitly Univariate

In [50]:
# Loading the dataset again and imputing the missing values because naive bayes did not handle missing values in sk learn
df = pd.read_csv('penguinsMV0.4.csv', index_col = 0, na_values = '?')
print(df.shape)

percent_missing = df.isnull().sum() * 100 / len(df)





y=df["species"]
X=df.drop(["species"],axis=1)

for i in X.columns:
    X[i]= X[i].fillna(X[i].median())

    
    
# Scaling the dataset   
sc=StandardScaler()
X=sc.fit_transform(X)

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=1/2)


(333, 5)


In [51]:
gnb=GaussianNB()

In [52]:
model= gnb.fit(X_train,y_train)

In [53]:
predi=gnb.predict(X_test)

In [54]:
# Accuracy of model
print(accuracy_score(predi,y_test))

0.8083832335329342


**Cross validation**

In [55]:
i=1
for train_index, test_index in kf.split(X):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_tr, X_ts = X[train_index], X[test_index]
    y_tr, y_ts = y[train_index], y[test_index]
    gnb.fit(X_tr,y_tr)
    predi=gnb.predict(X_ts)
    print("Model accuracy at fold "+str(i)+" =", accuracy_score(predi,y_ts))
    i+=1

Model accuracy at fold 1 = 0.6865671641791045
Model accuracy at fold 2 = 0.6268656716417911
Model accuracy at fold 3 = 0.8507462686567164
Model accuracy at fold 4 = 0.9545454545454546
Model accuracy at fold 5 = 0.0


#### 2.2.3 My Naive Bayes handling missing values Explicitly Univariate

In [56]:
nb=MyGaussianNB()

In [57]:
# Training the model 
nb.fit(X_train,y_train)

MyGaussianNB()

In [58]:
predi=nb.predict_uni(X_test)

In [59]:
# Accuracy of model
print(accuracy_score(predi,y_test))

0.8083832335329342


**Cross validation**

In [60]:
i=1
for train_index, test_index in kf.split(X):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_tr, X_ts = X[train_index], X[test_index]
    y_tr, y_ts = y[train_index], y[test_index]
    nb.fit(X_tr,y_tr)
    predi=nb.predict(X_ts)
    print("Model accuracy at fold "+str(i)+" =", accuracy_score(predi,y_ts))
    i+=1

Model accuracy at fold 1 = 0.6865671641791045
Model accuracy at fold 2 = 0.6268656716417911
Model accuracy at fold 3 = 0.8507462686567164
Model accuracy at fold 4 = 0.9545454545454546
Model accuracy at fold 5 = 0.0


#### 2.2.4 Naive Bayes Sciket Learn handling missing values Explicitly Multivariate

In [61]:
# Loading the dataset again and imputing the missing values because naive bayes did not handle missing values in sk learn
df = pd.read_csv('penguinsMV0.4.csv', index_col = 0, na_values = '?')
print(df.shape)

percent_missing = df.isnull().sum() * 100 / len(df)


y=df["species"]
X=df.drop(["species"],axis=1)

X= X.fillna(X.median())

    
# Scaling the dataset   
sc=StandardScaler()
X=sc.fit_transform(X)

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=1/2)

(333, 5)


In [62]:
gnb=GaussianNB()

In [63]:
model= gnb.fit(X_train,y_train)

In [64]:
predi=gnb.predict(X_test)

In [65]:
# Accuracy of model
print(accuracy_score(predi,y_test))

0.8083832335329342


**Cross Validation**

In [66]:
i=1
for train_index, test_index in kf.split(X):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_tr, X_ts = X[train_index], X[test_index]
    y_tr, y_ts = y[train_index], y[test_index]
    gnb.fit(X_tr,y_tr)
    predi=gnb.predict(X_ts)
    print("Model accuracy at fold "+str(i)+" =", accuracy_score(predi,y_ts))
    i+=1

Model accuracy at fold 1 = 0.6865671641791045
Model accuracy at fold 2 = 0.6268656716417911
Model accuracy at fold 3 = 0.8507462686567164
Model accuracy at fold 4 = 0.9545454545454546
Model accuracy at fold 5 = 0.0


##### 2.2.5 My Naive Bayes handling missing values Explicitly Multivariate

In [67]:
nb=MyGaussianNB()

In [68]:
# Training the model 
nb.fit(X_train,y_train)

MyGaussianNB()

In [69]:
predi=nb.predict_uni(X_test)

In [70]:
# Accuracy of model
print(accuracy_score(predi,y_test))

0.8083832335329342


**Cross Validation**

In [71]:
i=1
for train_index, test_index in kf.split(X):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_tr, X_ts = X[train_index], X[test_index]
    y_tr, y_ts = y[train_index], y[test_index]
    nb.fit(X_tr,y_tr)
    predi=nb.predict(X_ts)
    print("Model accuracy at fold "+str(i)+" =", accuracy_score(predi,y_ts))
    i+=1

Model accuracy at fold 1 = 0.6865671641791045
Model accuracy at fold 2 = 0.6268656716417911
Model accuracy at fold 3 = 0.8507462686567164
Model accuracy at fold 4 = 0.9545454545454546
Model accuracy at fold 5 = 0.0


## Commenting on the results of my evaluation

We got comparable results to scikit learn when we dealt with the missing values both implicitly and explicitly in the sklearn naive bayes.<br>
There is a minute difference between the accuracies of both models because we handle missing values implicitly and impute the median values. Due to this impution, there is a minor difference between the median of the whole dataset X and and a subset of the data set Xtest. The difference between the medians of both datasets cause a small disparity in the data set resulting in a slight difference in accuracy (approximately 1% - 2%).<br>
<br>
In the case of the other models, we obtained the same accuracy as the well as cross validation. This is because the median of the data by which we impute the missing values are the same.
