In [None]:
import pandas as pd
import numpy as np
from pprint import pprint
#from fractions import Fraction

In [None]:
# Load the training data from the local drive through the Colab upload widget.
# NOTE(review): google.colab is presumably only importable inside Colab - confirm before running elsewhere.
from google.colab import files
uploaded = files.upload()
import io
# The upload result is looked up by file name, so the file must be named exactly
# 'ndata3.csv'; any other name raises KeyError here.
happiness_train= pd.read_csv(io.BytesIO(uploaded['ndata3.csv']))
happiness_train.head(3)

In [None]:
# Load the test data from the local drive through the Colab upload widget.
# Note: this rebinds `uploaded`, so the training upload is no longer reachable by that name.
from google.colab import files
uploaded = files.upload()
import io
# The upload result is looked up by file name, so the file must be named exactly
# 'ntest3.csv'; any other name raises KeyError here.
happiness_test = pd.read_csv(io.BytesIO(uploaded['ntest3.csv']))
happiness_test.head(3)

Saving ntest3.csv to ntest3.csv


Unnamed: 0,D,X1,X2,X3,X4,X5,X6
0,0,5,1,4,4,4,5
1,0,5,2,2,4,4,5
2,0,5,3,5,4,5,5


In [None]:
# Split the training frame into X (all training examples, feature columns X1..X6 only)
# and Y (the ground-truth column D for the same rows, in the same order).
X=happiness_train.loc[:,"X1":"X6"]
Y=happiness_train.loc[:,"D"]
# Quick sanity check: first rows of both parts and the shape of X.
print(X.head(3),"\n", Y.head(3), X.shape)

   X1  X2  X3  X4  X5  X6
0   3   3   3   4   2   4
1   3   2   3   5   4   3
2   5   3   3   3   3   5 
 0    0
1    0
2    1
Name: D, dtype: int64 (129, 6)


#Start the training

In [None]:
# Feature names, taken from the column labels of X.
features=X.columns
features

In [None]:
# Distinct class labels found in Y, and how many training rows carry each label
# (gcount[i] is the count for ground_truths[i]).
ground_truths,gcount = np.unique(Y,return_counts=True)
type(ground_truths)

###Compute the prior probability values

In [None]:
# Prior probability of each class: its share of the training rows, in the same order as ground_truths.
prior_prob=gcount/np.sum(gcount)
prior_prob

### Define a dictionary of DataFrames, one per feature, to serve as the contingency tables of the Naive Bayes classifier. The rows (index) of each table are the distinct values of that attribute, and the columns are the ground-truth values.

In [None]:
# One table per feature: rows are the distinct values of that feature in X,
# columns are the class labels. No data is supplied, so the cells start out empty
# and are filled in by the following cell.
d = {name: pd.DataFrame(index=np.unique(X.loc[:,name]),columns=ground_truths) for name in features}
pprint(d)

### Start filling in the cells of the DataFrames. Each cell stores the conditional probability of an attribute value given a ground-truth value. It is computed as the number of occurrences of the attribute value among the examples with that ground-truth value, divided by the number of occurrences of that ground-truth value.

In [None]:
# Fill every feature's contingency table with P(feature value | class).
for name in features:
  feature_col = X.loc[:, name]
  for gt in ground_truths:
    # Select the rows of this class by comparing against the label itself.
    # (Looking the label up as an index into ground_truths only works when the
    # labels happen to be exactly 0..k-1.)
    class_values = feature_col[Y == gt].dropna()
    values, counts = np.unique(class_values, return_counts=True)
    prob_val = counts / np.sum(counts)  # one column of the contingency table for this feature
    for value, prob in zip(values, prob_val):
      d[name].loc[value, gt] = prob
# A value never observed together with a class is still empty (NaN) at this point.
# Store it as probability 0; the Laplacian correction in the next step removes these zeros.
for name in features:
  d[name] = d[name].astype(float).fillna(0.0)
pprint(d)


In [None]:
#Laplacian Correction
for f in range(len(features)):
  for gt in ground_truths:
    if(d[features[f]].loc[:,gt]==0).any():
      currentval=d[features[f]].loc[:,gt]
      newval=(currentval*10+1)/(10+len(d[features[f]]))
      d[features[f]].loc[:,gt]=newval
pprint(d)

###Test Phase

In [None]:
# Show the first test row and the overall shape of the test set.
print(happiness_test.head(1),happiness_test.shape)

In [None]:
# Pick one test example by row label and keep only its feature columns X1..X6.
testind=5
test_vector= happiness_test.loc[testind,"X1":"X6"]
test_vector

In [None]:
# Score each class for test_vector as prior * product of the per-feature likelihoods.
pred_prob = np.zeros(len(ground_truths))
for count, gt in enumerate(ground_truths):
  # Start from the class prior so that class imbalance is taken into account.
  cval = prior_prob[count]
  for name in features:
    # Look the feature value up by its column name rather than by position.
    cval = d[name].loc[test_vector[name], gt] * cval
  pred_prob[count] = cval

In [None]:
# Per-class scores, in the same order as ground_truths.
print(pred_prob) 

In [None]:
ground_truths[np.argmax(pred_prob)] # predicted class: the label with the highest score

In [None]:
# Ground-truth label (column D) of the same test row, for comparison with the prediction.
print("Actual Class in the dataset: ", happiness_test.loc[testind,"D"])

#Packing everything together by defining the NB classifier class

In [None]:
class NB:
    """Naive Bayes classifier for categorical features held in pandas objects.

    Attributes:
        num_examples, num_features: shape of the X given to the constructor.
        features: column labels of X.
        ground_truths: distinct class labels found in Y (sorted by np.unique).
        gcount: number of examples per class, aligned with ground_truths.
        prior_prob: class priors, aligned with ground_truths.
        d: dict mapping each feature name to its contingency table, a DataFrame
           whose rows are the feature's distinct values and whose columns are
           the class labels. After train() each cell holds P(value | class).
    """

    def __init__(self, X, Y):
        """Record the classes and feature values found in the training data.

        Args:
            X: DataFrame of categorical features, one column per feature.
            Y: Series of class labels for the rows of X.
        """
        self.num_examples, self.num_features = X.shape
        self.features = X.columns
        self.ground_truths, self.gcount = np.unique(Y, return_counts=True)
        self.prior_prob = self.gcount / np.sum(self.gcount)
        # Empty tables; train() fills them.
        self.d = {
            name: pd.DataFrame(index=np.unique(X.loc[:, name]), columns=self.ground_truths)
            for name in self.features
        }

    # Training phase
    def train(self, X, Y):
        """Fill every contingency table with P(feature value | class).

        The tables are rebuilt from raw counts on every call, so calling
        train() again with the same data gives the same tables.

        Args:
            X: DataFrame with the feature columns seen by the constructor.
            Y: Series of class labels for the rows of X.
        """
        for name in self.features:
            feature_col = X.loc[:, name]
            counts_table = pd.DataFrame(
                0.0, index=self.d[name].index, columns=self.ground_truths
            )
            for gt in self.ground_truths:
                # Select rows by the label itself; using the label as a position
                # into ground_truths is only valid when labels are 0..k-1.
                observed = feature_col[Y == gt].dropna()
                values, counts = np.unique(observed, return_counts=True)
                for value, count in zip(values, counts):
                    counts_table.loc[value, gt] = count
            # A value that was not known at construction time adds a new row
            # whose other cells are empty; treat those as zero observations.
            counts_table = counts_table.fillna(0.0)

            n_values = len(counts_table)
            likelihood = counts_table.copy()
            for gt in self.ground_truths:
                class_counts = counts_table[gt]
                class_total = class_counts.sum()
                if (class_counts == 0).any():
                    # Laplacian correction with the real class size: one
                    # pseudo-observation per feature value.
                    likelihood[gt] = (class_counts + 1) / (class_total + n_values)
                else:
                    likelihood[gt] = class_counts / class_total
            self.d[name] = likelihood

    # Predicting the unknown label of a feature vector
    def predict(self, test_vector):
        """Return the most probable class label for one example.

        Args:
            test_vector: feature values of one example, in the same order as
                self.features (a pandas Series, list or array).

        Returns:
            The element of ground_truths with the highest prior * likelihood.

        Raises:
            KeyError: if a feature value is not a row of that feature's table.
        """
        # Read the values by position explicitly; integer keys on a
        # label-indexed Series are not a reliable way to do that.
        values = np.asarray(test_vector).ravel()
        # Start each class score from its prior.
        pred_prob = np.array(self.prior_prob, dtype=float)
        for count, gt in enumerate(self.ground_truths):
            for f, name in enumerate(self.features):
                pred_prob[count] *= self.d[name].loc[values[f], gt]
        return self.ground_truths[np.argmax(pred_prob)]
    

In [None]:
nb=NB(X,Y)# create the NB classifier object from the training features and labels
nb.train(X,Y) # fill the classifier's contingency tables from the same data

In [None]:
# Pick the test example to classify and keep only its feature columns X1..X6.
testind=3
test_vector= happiness_test.loc[testind,"X1":"X6"]
#test_vector

In [None]:
# Classify the selected test example with the hand-written classifier.
print("Predicted class:",nb.predict(test_vector))

Predicted class: 0


In [None]:
# Ground-truth label (column D) of the same test row, for comparison with the prediction.
print("Actual Class in the dataset: ", happiness_test.loc[testind,"D"])

Actual Class in the dataset:  1


###Write code in the following cells to test all instances of the given testdata. Form a confusion matrix and find the precision, recall, f-score, accuracy, and error rate.

# Use the scikit-learn library to train and test the Naive Bayes classifier

In [None]:
# Fit scikit-learn's categorical Naive Bayes on the same training data,
# passing the features as a plain array.
from sklearn.naive_bayes import CategoricalNB
clf = CategoricalNB()
clf.fit(X.values, Y)

CategoricalNB()

In [None]:
# Same test example as before, reshaped into a 2-D array with a single row
# (one sample, all features) for clf.predict.
testind=3
test_vector= happiness_test.loc[testind,"X1":"X6"].to_numpy().reshape(1,-1)

In [None]:
# Classify the selected test example with the scikit-learn model.
print("predicted class", clf.predict(test_vector))

predicted class [0]


In [None]:
# Ground-truth label (column D) of the same test row, for comparison with the prediction.
print("Actual Class in the dataset: ", happiness_test.loc[testind,"D"])

Actual Class in the dataset:  1


### Form a confusion matrix and find precision, recall, accuracy, error rate, etc. using the scikit-learn library