In [27]:
import numpy as np
from sklearn import datasets

# Fit Function

In [49]:
def fit(X_train,Y_train):
  result = {}                   #dict correspond to every possible value
  class_values = set(Y_train)   #to find distinct values in y
  for current_class in class_values:
    result[current_class ] = {}  #dict of all possible features
    result["total_data"] = len(Y_train)
    current_class_rows = (Y_train == current_class) #training data which has the class as current class
    X_train_current = X_train[current_class_rows] #we'll get those rows where value is true bcz in above step we got an array of true false values
    Y_train_current = Y_train[current_class_rows]
    num_features = X_train.shape[1] #m*n and number of features is n i.e. columns
    result[current_class]["total_count"] = len(Y_train_current)#total count of training data which belongs to the current class
    for j in range(1,num_features+1):
      j_1 = j-1 #to keep the feature names from 1 to n 
      result[current_class][j] = {}   #all the possible value a particular feature can take
      all_possible_values = set(X_train[:,j_1])  #[:,j_1] only the Jth column and set will get us the unique values in it
      for current_value in all_possible_values:
        result[current_class][j][current_value] =  (X_train_current[:,j_1] == current_value).sum() #for each value we need to store the count, count of all training datapoints where y is current class how many of them jth value is the current value
  return result

# Predict Function

In [50]:
def probability(dictionary,x,current_class):
  output = np.log(dictionary[current_class]["total_count"]/dictionary["total_data"])  #using log probab instead of prob
  num_features = len(dictionary[current_class].keys())-1; #number of features we have
  for j in range(1,num_features+1):
    xj = x[j-1]
    count_current_class_with_value_xj = dictionary[current_class][j][xj]+1 #Laplace Correction
    count_current_class = dictionary[current_class]["total_count"] + len(dictionary[current_class][j].keys()) #how many different values j can have
    current_xj_probablity = np.log(count_current_class_with_value_xj) - np.log(count_current_class)
    output = output + current_xj_probablity
  return output



In [51]:
def predictSinglePoint(dictionary,x):
  classes = dictionary.keys()   #.keys() returns all the variable keys in the dictionary
  best_p = -1000
  best_class = -1
  first_run = True
  for current_class in classes:
    if current_class == "total_data":
      continue
    p_current_class = probability(dictionary,x,current_class)
    if(first_run or p_current_class>best_p):
      best_p = p_current_class
      best_class = current_class
    first_run = False
  return best_class

In [52]:
def predict(dictionary,X_test):
  y_pred = []
  for x in X_test:
    x_class = predictSinglePoint(dictionary,x)
    y_pred.append(x_class)
  return y_pred 

# Iris DataSet

#### Converting continous data to Discrete data for classification

In [53]:
def makeLabelled(column):
  second_limit = column.mean()
  first_limit = 0.5*second_limit
  third_limit = 1.5*second_limit
  for i in range(0,len(column)):
    if(column[i]<first_limit):
      column[i] = 0
    elif (column[i]<second_limit):
      column[i] = 1
    elif column[i]<third_limit:
      column[i] = 2
    else:
      column[i] = 3
  return column

In [54]:
iris = datasets.load_iris()
X = iris.data
Y = iris.target

In [55]:
for i in range(0,X.shape[-1]):
  X[:,i] = makeLabelled(X[:,i])

In [56]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.25,random_state=0) #split of 75 25

In [57]:
dictionary = fit(X_train,Y_train)

In [58]:
Y_pred = predict(dictionary,X_test)

In [60]:
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(Y_test,Y_pred))
print(confusion_matrix(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.94      1.00      0.97        16
           2       1.00      0.89      0.94         9

    accuracy                           0.97        38
   macro avg       0.98      0.96      0.97        38
weighted avg       0.98      0.97      0.97        38

[[13  0  0]
 [ 0 16  0]
 [ 0  1  8]]


### Naive Bayes using Continous Value without converting them Discrete

In [63]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train,Y_train)
Y_pred = clf.predict(X_test)
print(classification_report(Y_test,Y_pred))
print(confusion_matrix(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       1.00      0.85      0.92        13
           1       0.76      1.00      0.86        16
           2       1.00      0.67      0.80         9

    accuracy                           0.87        38
   macro avg       0.92      0.84      0.86        38
weighted avg       0.90      0.87      0.87        38

[[11  2  0]
 [ 0 16  0]
 [ 0  3  6]]


## Conclusion:- Our accuracy is decreased in this case because Iris dataset points are not strictly following the Gaussian Curve but for most of the datasets having continous value our GaussianNB() works great and accuracy improved majorly