# Simple Naive Bayes Classifier from scratch

In [1]:
# load required library
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_iris
import random
import math
# Load the Iris dataset: x holds the feature rows, y the integer class label of each row
x, y = load_iris(return_X_y=True)

### Split data (k-fold)

In [2]:
# Shuffle the samples and split them into 5 equally sized folds
# (no separate train / test split is made).

# Pair every feature row with its label so both are shuffled together
data = [[features, label] for features, label in zip(x, y)]
random.shuffle(data)

k_folds = 5

# Split features / labels back apart
x_train = np.array([features for features, _ in data])
y_train = np.array([label for _, label in data])

# Reshape into folds: (k_folds, samples per fold, features per sample).
# The feature count is read from the data instead of being hard-coded, and
# the reshape requires the number of samples to be divisible by k_folds.
x_train = x_train.reshape(k_folds, -1, x_train.shape[-1])
y_train = y_train.reshape(k_folds, -1)


In [3]:
# Show the shape of the feature folds, then of the label folds, one per line
print(x_train.shape, y_train.shape, sep="\n")

(5, 30, 4)
(5, 30)


### Naive Bayes Classifier implementation

In [4]:
def sep_by_class(x: np.array, y: np.array, k: int):
	"""Group the samples of x by their class label

	Each label is used directly as a list index, so labels are expected to be
	integers between 0 and k-1.

	Args:
		x (np.array): features, one entry per sample
		y (np.array): labels, one entry per sample
		k (int): The number of classes

	Returns:
		list[list]: k lists of samples. Index 0 holds class 0; index 1, class 1; ...
	"""
	assert len(x) == len(y)

	grouped = [[] for _ in range(k)]
	for sample, label in zip(x, y):
		# Store a plain list copy of the sample under its class
		grouped[label].append(list(sample))

	return grouped


In [5]:
# Mean, variance, standard deviation
def mean(input):
	"""Arithmetic mean of a non-empty sequence of numbers"""
	total = sum(input)
	return total / len(input)

def var(input):
	"""Population variance: the average squared deviation from the mean (divides by n, not n - 1)"""
	center = mean(input)
	squared_deviations = [(value - center) ** 2 for value in input]
	return sum(squared_deviations) / len(input)

def standard_dev(input):
	"""Population standard deviation: the square root of var(input)"""
	variance = var(input)
	return math.sqrt(variance)

In [6]:
def summary(input):
	"""Summarise every column (feature) of the input

	Args:
		input (list): The samples, one row of feature values per sample

	Returns:
		list: One (mean, standard deviation, nb of elements) tuple per column
	"""
	stats = []
	# zip(*input) transposes the rows so that each iteration yields one column
	for column in zip(*input):
		stats.append((mean(column), standard_dev(column), len(column)))
	return stats

In [7]:
def gaussian_proba(x, mean, standard_dev):
	"""Evaluate the Gaussian (normal) probability density at x

	Computes exp(-(x - mean)^2 / (2 * standard_dev^2)) / (sqrt(2 * pi) * standard_dev)

	Args:
		x (float): The value to evaluate
		mean (float): Mean of the distribution
		standard_dev (float): Standard deviation of the distribution, must be non-zero

	Returns:
		float: The density at x (a likelihood; it can exceed 1 when standard_dev is small)
	"""
	exponent = -((x - mean) ** 2) / (2 * standard_dev**2)
	# The whole product sqrt(2 * pi) * standard_dev is the denominator: the
	# density shrinks, not grows, as the standard deviation increases
	return math.exp(exponent) / (math.sqrt(2 * math.pi) * standard_dev)

In [8]:
def accuracy(y, y_pred):
	"""Percentage of predictions that match the true class

	Args:
		y (array): The true class
		y_pred (array): Predicted class

	Returns:
		float: The % of correct predictions (between 0 and 100)
	"""
	assert len(y) == len(y_pred)

	hits = sum(1 for truth, guess in zip(y, y_pred) if guess == truth)
	return hits * 100 / len(y_pred)

In [9]:
def get_class_proba(summaries: list, x):
	"""Calculate the (unnormalised) probability of each class for one sample

	The score of a class is its prior multiplied by the Gaussian likelihood of
	every feature. The prior is the share of training samples belonging to the
	class, taken from the element count stored in the summaries. A class
	without any training sample gets a score of 0.

	Args:
		summaries (list): Ordered by class, index 0, class 0; ... Each entry is
			a list of (mean, standard deviation, nb of elements), one per feature
		x (array): 1 sample of features

	Returns:
		dict: class index -> score. The scores are not normalised to sum to 1
	"""
	# All features of a class are summarised from the same rows, so the count
	# of the first feature is the class size; no feature means no sample
	counts = [features[0][2] if features else 0 for features in summaries]
	total = sum(counts)

	probas = {}
	for i, features in enumerate(summaries):
		# Start from the class prior rather than 1
		probas[i] = counts[i] / total if total else 0
		for j, (mean, stddev, _) in enumerate(features):
			probas[i] *= gaussian_proba(x[j], mean, stddev)

	return probas

In [10]:
def predict(summaries: list, x):
	"""Return the label for which get_class_proba gives the highest probability

	Label 0 is returned when no class has a probability above 0. On a tie the
	class encountered first wins.
	"""
	best_label, best_proba = 0, 0
	scores = get_class_proba(summaries, x)
	for label in scores:
		# Strictly greater: an equal score never replaces the current best
		if scores[label] > best_proba:
			best_label, best_proba = label, scores[label]
	return best_label

In [11]:
def get_predictions(summaries: list, x):
	"""Predict the class of every sample

	Args:
		summaries (list): Ordered by class, index 0, class 0; ...
		x (iterable): The samples, one entry of feature values per sample

	Returns:
		list: The predicted class of each sample, in the order of x
	"""
	return [predict(summaries, sample) for sample in x]

In [12]:
def naive_bayes(x, y, k):
	"""Fit Gaussian naive Bayes on one fold and predict the class of that same fold

	NOTE: the predictions are made on the very samples the summaries were
	computed from; no held-out data is involved.

	Args:
		x (list): One fold of sample
		y (list): The associated fold of labels
		k (int): Number of classes

	Returns:
		list: The predicted class of every sample of x
	"""
	# For each class, the (mean, standard deviation, count) of every feature
	summaries = []
	for class_samples in sep_by_class(x, y, k):
		summaries.append(summary(class_samples))
	# Predict the class of each sample of the fold
	return get_predictions(summaries, x)

### Train

$y \in \{0, 1, 2\}$ => number of classes k = 3

In [13]:
# Number of classes
k = 3

# For each fold, apply naive Bayes and get accuracy.
# NOTE: every fold is scored against the predictions made for that same fold
# (the fold passed to naive_bayes is also the one passed to accuracy).

acc = []
for i in range(k_folds):
	print(f"\n------------------------ Fold {i} ---------------------\n")
	y_pred = naive_bayes(x_train[i], y_train[i], k)
	_acc = accuracy(y_train[i], y_pred)
	acc.append(_acc)
	print(f"Accuracy: {_acc}%")


# Average the per-fold accuracies
avg_accuracy_folds = sum(acc) / len(acc)

print(f"\n\nAvg folds accuracy: {round(avg_accuracy_folds, 2)}%")


------------------------ Fold 0 ---------------------

Accuracy: 93.33333333333333%

------------------------ Fold 1 ---------------------

Accuracy: 96.66666666666667%

------------------------ Fold 2 ---------------------

Accuracy: 100.0%

------------------------ Fold 3 ---------------------

Accuracy: 96.66666666666667%

------------------------ Fold 4 ---------------------

Accuracy: 93.33333333333333%


Avg folds accuracy: 96.0%
