# Naive Bayes (NB) from scratch

## Some important concepts in Python

In [None]:
# Prerequisites: enumerate, lambda, list comprehension
# 1. enumerate
# range(10) yields the integers 0..9
for i in range(10):
  print(i)

mylist = [1, 2, 4, 8]
# iterating a list directly yields its values (no positions)
for i in mylist:
  print(i)

# range(len(...)) yields only the positions 0..len-1
for i in range(len(mylist)):
  print(i)

# positions plus values, via manual indexing
for i in range(len(mylist)):
  print(i, ": ", mylist[i])

# enumerate is an alternative to the manual-indexing loop above
for idx, val in enumerate(mylist): # yields (index, value) pairs, replacing range + len + indexing
  print(idx, ": ", val)

In [None]:
# 2. lambda function
def square_if_even(x):
  """Return ``x`` squared when ``x`` is even; otherwise return ``x`` unchanged."""
  remainder = x % 2
  # Guard clause: odd inputs pass straight through.
  if remainder != 0:
    return x
  return x ** 2

# Only the last expression of a cell is echoed, so the result for 5 is not displayed.
square_if_even(5)
square_if_even(4)

16

In [None]:
# A lambda is an anonymous single-expression function; here it is bound to a name.
# Despite the name, `square` takes two arguments and squares their sum.
square = lambda x, y: (x+y)**2
square(5, 2)

49

In [None]:
# Lambda counterpart of square_if_even: a conditional expression replaces the if/else statement.
even_square = lambda x: x**2 if x%2==0 else x

# odd input is returned unchanged, even input is squared
print(even_square(5))
print(even_square(4))

5
16


In [None]:
# 3. list comprehension
# Baseline: build the list with an explicit loop and append.
mystring = []  # NOTE: holds a list of ints despite the name
for i in range(10):
  mystring.append(i)


print(mystring)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [None]:
# Same result as the loop + append version, written as a list comprehension.
mylistcom = [i for i in range(10)]
print(mylistcom)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


## Naive Bayes from scratch

In [None]:
import numpy as np

# Naming conventions:
# camelCase: gaussianNaiveBayes
# PascalCase (CapWords): GaussianNaiveBayes ---> class
# snake_case: gaussian_naive_bayes ---> function

class GaussianNaiveBayes:
  """Gaussian Naive Bayes classifier implemented with NumPy.

  ``fit`` estimates, for every class, the per-feature mean and variance and
  the class prior. ``predict`` assigns each sample to the class with the
  largest log-posterior (log prior + sum of per-feature Gaussian
  log-densities).
  """

  # Added to every estimated variance so that a feature that is constant
  # within a class does not produce a zero variance (division by zero).
  _VAR_EPSILON = 1e-9

  def fit(self, X, y):
    """Estimate per-class means, variances and priors.

    Args:
      X: 2-D array-like of shape (n_samples, n_features).
      y: 1-D array-like of class labels, one per row of ``X``.
    """
    X = np.asarray(X, dtype=np.float64)
    y = np.asarray(y)
    n_sample, n_feature = X.shape
    self._classes = np.unique(y)
    n_classes = len(self._classes)
    # float64 keeps the full precision of the computed statistics.
    self._mean = np.zeros((n_classes, n_feature), dtype=np.float64)
    self._var = np.zeros((n_classes, n_feature), dtype=np.float64)
    self._prior = np.zeros(n_classes, dtype=np.float64)

    # Row i of each table holds the statistics of class self._classes[i].
    for i, c in enumerate(self._classes):
      X_for_class_c = X[y == c]
      self._mean[i, :] = X_for_class_c.mean(axis=0)
      self._var[i, :] = X_for_class_c.var(axis=0) + self._VAR_EPSILON
      self._prior[i] = X_for_class_c.shape[0] / float(n_sample)

  def likelihood(self, class_idx, x):
    """Return the per-feature Gaussian density of ``x`` under one class.

    Args:
      class_idx: row index into the fitted mean/variance tables.
      x: 1-D array of feature values.

    Returns:
      Array with one density value per feature.
    """
    mean = self._mean[class_idx]
    var = self._var[class_idx]
    # The parentheses matter: the exponent is -(x - mean)^2 / (2 * var).
    num = np.exp(-((x - mean) ** 2) / (2 * var))  # numerator
    denom = np.sqrt(2 * np.pi * var)  # denominator
    return num / denom

  def _log_likelihood(self, class_idx, x):
    """Return the per-feature Gaussian log-density of ``x`` under one class.

    Computed directly in log space so that very small densities do not
    underflow to 0 before the logarithm is taken.
    """
    mean = self._mean[class_idx]
    var = self._var[class_idx]
    return -0.5 * np.log(2 * np.pi * var) - ((x - mean) ** 2) / (2 * var)

  def predict(self, X):
    """Return an array with the predicted class label for every row of ``X``."""
    X = np.asarray(X, dtype=np.float64)
    y_pred = [self._classify_sample(x) for x in X]
    return np.array(y_pred)

  def _classify_sample(self, x):
    """Return the label whose log-posterior is largest for the sample ``x``."""
    posteriors = []

    for i in range(len(self._classes)):
      log_prior = np.log(self._prior[i])
      # Naive (independence) assumption: feature log-densities are summed.
      log_like = np.sum(self._log_likelihood(i, x))
      posteriors.append(log_prior + log_like)

    return self._classes[np.argmax(posteriors)]

In [None]:
# imports
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
import time

In [None]:
import random
# Synthesize a two-class dataset, then hold out a quarter of it for testing.
X, y = make_classification(
  n_samples=100000,
  n_features=20,
  n_classes=2,
  random_state=42,
)
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.25, random_state=42
)

In [None]:
# Time the from-scratch model: the measured interval covers construction, fit and predict.
start_time = time.perf_counter()
gnb = GaussianNaiveBayes()
# training phase
gnb.fit(X_train, y_train)
# inference on the test split
y_pred = gnb.predict(X_test)

end_time = time.perf_counter()
# accuracy is computed after the timer stops, so it is excluded from the duration
print(f'duration of the manual model was: {end_time-start_time}')
print(f'acc for manual model was: {accuracy_score(y_test, y_pred)}')

duration of the manual model was: 1.559659996999926
acc for manual model was: 0.8572


In [None]:
# Time scikit-learn's GaussianNB over the same steps as the from-scratch model
# (construction, fit, predict) so the two durations are comparable.
start_time = time.perf_counter()
sk_gnb = GaussianNB()
# training phase
sk_gnb.fit(X_train, y_train)
y_pred = sk_gnb.predict(X_test)

end_time = time.perf_counter()
# Scoring happens after the timer stops, so it is not counted in the duration.
print(f'duration of the sklearn model was: {end_time-start_time}')
print(f'acc for sklearn model was: {accuracy_score(y_test, y_pred)}')

duration of the manual model was: 0.052607327999794506
acc for manual model was: 0.85908
