# Implementing Naive Baye's with and without using sklearn

## Gaussian Naive Baye's with and without sklearn

**Impoting necessary libraries and datasets**

In [1]:
import numpy as np
import random
from sklearn import datasets,metrics
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
import math
from sklearn import model_selection

**Writing own class for Gaussian Naïve Baye's**

In [2]:
class gaussClf:
  # Separates the data by classes
    def separate_by_classes(self, X, y):
        
        self.classes = np.unique(y)
        classes_index = {}
        subdatasets = {}
        cls, counts = np.unique(y, return_counts=True)
        self.class_freq = dict(zip(cls, counts))
        print("Class frequency = {}".format(self.class_freq))
        for class_type in self.classes:
            classes_index[class_type] = np.argwhere(y==class_type)
            subdatasets[class_type] = X[classes_index[class_type], :]
            self.class_freq[class_type] = self.class_freq[class_type]/sum(list(self.class_freq.values()))
        return subdatasets

  # Fitting the data
    def fit(self, X, y):
      separated_X = self.separate_by_classes(X, y)
      # print(separated_X)
      # print(self.classes)

      self.means = {}
      self.std = {}
      for class_type in self.classes:
          # Here we calculate the mean and the standart deviation from datasets
          # print(np.mean(separated_X[class_type], axis=0))
          self.means[class_type] = np.mean(separated_X[class_type], axis=0)[0]
          self.std[class_type] = np.std(separated_X[class_type], axis=0)[0]

  # Calculating the probability by Gaussian Formula
    def calculate_probability(self, x, mean, stdev):
      exponent = math.exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
      return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

  # Calculating probrabilty for our each datapoint to belong to a class
    def predict_proba(self, X):
      self.class_prob = {}
      for cls in self.classes:
          prob = self.class_freq[cls]/len(y)
          self.class_prob[cls] = prob
          for i in range(len(self.means)):
              self.class_prob[cls] *= self.calculate_probability(X[i], self.means[cls][i], self.std[cls][i])
          
      return self.class_prob
    
  # Predicting the result
    def predict(self, X):
      ''' This funtion predicts the class of a sample '''
      pred = []
      for x in X:
          pred_class = None
          max_prob = 0
          for cls, prob in self.predict_proba(x).items():
              if prob>max_prob:
                  max_prob = prob
                  pred_class = cls
          pred.append(pred_class)
      return pred
      

**Splitting the data into training and testing sets**

In [3]:
X,y = datasets.load_iris(return_X_y=True)
X_train,X_test,y_train,y_test = model_selection.train_test_split(X,y,random_state=1)

**Predicting using own classifier**

In [4]:
clf_scratch = gaussClf()
clf_scratch.fit(X_train,y_train)
prediction = clf_scratch.predict(X_test)

metrics.accuracy_score(y_true=y_test,y_pred=prediction,normalize=True)

Class frequency = {0: 37, 1: 34, 2: 41}


0.9473684210526315

**Predicting using library function**

In [5]:
clf = GaussianNB()
clf.fit(X_train,y_train)
prediction = clf.predict(X_test)
metrics.accuracy_score(y_true=y_test,y_pred=prediction,normalize=True)

0.9736842105263158

## Multinomial and Bernoulli Naive Baye's using sklearn

**Importing Libraries and Reading the data**

In [61]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
df = pd.read_csv('/content/drive/My Drive/spam_ham_dataset.csv',delimiter=',')
df

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


**Tokenizing the text data and vectorizing it**

In [71]:
cv = CountVectorizer(stop_words='english',ngram_range=(1,1))
X = cv.fit_transform(df['text'])
y = df['label_num']

df.label_num.value_counts()

0    3672
1    1499
Name: label_num, dtype: int64

**Dividing the data into training and testing sets**

In [63]:
X_train,X_test,y_train,y_test = model_selection.train_test_split(X,y,stratify=y,random_state=1)

**Classification using Multinomial**

In [65]:
clfMNB = MultinomialNB()
clfMNB.fit(X_train,y_train)
prediction = clfMNB.predict(X_test)
metrics.accuracy_score(y_true=y_test,y_pred = prediction)

0.9822119102861562

**Classification using Bernoulli**

In [66]:
clfMNB = BernoulliNB()
clfMNB.fit(X_train,y_train)
prediction = clfMNB.predict(X_test)
metrics.accuracy_score(y_true=y_test,y_pred = prediction)

0.8499613302397525