# Naîve Bayes

## [Workshop 1](https://machinelearningmastery.com/naive-bayes-classifier-scratch-python/)

### 0x000 Init

In [82]:
from math import pi, sqrt, exp
from csv import reader
from random import seed, randrange

# Load a CSV file
def load_csv(filename):
	dataset = list()
	with open(filename, 'r') as file:
		csv_reader = reader(file)
		for row in csv_reader:
			if not row:
				continue
			dataset.append(row)
	return dataset

iris_data = load_csv('./iris.csv')
test_dataset = [
    [3.393533211,2.331273381,0],
    [3.110073483,1.781539638,0],
    [1.343808831,3.368360954,0],
    [3.582294042,4.67917911,0],
    [2.280362439,2.866990263,0],
    [7.423436942,4.696522875,1],
    [5.745051997,3.533989803,1],
    [9.172168622,2.511101045,1],
    [7.792783481,3.424088941,1],
    [7.939820817,0.791637231,1]
]

### 0x200 Iris Data Normalization

In [83]:
# Convert string column to float
def str_column_to_float(dataset, column):
	for row in dataset:
		row[column] = float(row[column].strip())

# Convert string column to integer
def str_column_to_int(dataset, column):
	class_values = [row[column] for row in dataset]
	unique = set(class_values)
	lookup = dict()
	for i, value in enumerate(unique):
		lookup[value] = i
	for row in dataset:
		row[column] = lookup[row[column]]
	return lookup

for i in range(len(iris_data[0])-1):
	str_column_to_float(iris_data, i)
	
# convert class column to integers
str_column_to_int(iris_data, len(iris_data[0])-1)

{'Iris-setosa': 0, 'Iris-virginica': 1, 'Iris-versicolor': 2}

### 0x300 Seperate By Class

In [84]:
# Split the dataset by class values, returns a dictionary
def separate_by_class(dataset:list) -> dict:
	separated = dict()
	for i in range(len(dataset)):
		vector = dataset[i]
		class_value = vector[-1]
		if (class_value not in separated):
			separated[class_value] = list()
		separated[class_value].append(vector[:-1])

	return separated

# -[Test]----------------------------------------------
separated = separate_by_class(iris_data)
for label in separated:
	print(label)
	for row in separated[label]:
		print(row)

0
[5.1, 3.5, 1.4, 0.2]
[4.9, 3.0, 1.4, 0.2]
[4.7, 3.2, 1.3, 0.2]
[4.6, 3.1, 1.5, 0.2]
[5.0, 3.6, 1.4, 0.2]
[5.4, 3.9, 1.7, 0.4]
[4.6, 3.4, 1.4, 0.3]
[5.0, 3.4, 1.5, 0.2]
[4.4, 2.9, 1.4, 0.2]
[4.9, 3.1, 1.5, 0.1]
[5.4, 3.7, 1.5, 0.2]
[4.8, 3.4, 1.6, 0.2]
[4.8, 3.0, 1.4, 0.1]
[4.3, 3.0, 1.1, 0.1]
[5.8, 4.0, 1.2, 0.2]
[5.7, 4.4, 1.5, 0.4]
[5.4, 3.9, 1.3, 0.4]
[5.1, 3.5, 1.4, 0.3]
[5.7, 3.8, 1.7, 0.3]
[5.1, 3.8, 1.5, 0.3]
[5.4, 3.4, 1.7, 0.2]
[5.1, 3.7, 1.5, 0.4]
[4.6, 3.6, 1.0, 0.2]
[5.1, 3.3, 1.7, 0.5]
[4.8, 3.4, 1.9, 0.2]
[5.0, 3.0, 1.6, 0.2]
[5.0, 3.4, 1.6, 0.4]
[5.2, 3.5, 1.5, 0.2]
[5.2, 3.4, 1.4, 0.2]
[4.7, 3.2, 1.6, 0.2]
[4.8, 3.1, 1.6, 0.2]
[5.4, 3.4, 1.5, 0.4]
[5.2, 4.1, 1.5, 0.1]
[5.5, 4.2, 1.4, 0.2]
[4.9, 3.1, 1.5, 0.1]
[5.0, 3.2, 1.2, 0.2]
[5.5, 3.5, 1.3, 0.2]
[4.9, 3.1, 1.5, 0.1]
[4.4, 3.0, 1.3, 0.2]
[5.1, 3.4, 1.5, 0.2]
[5.0, 3.5, 1.3, 0.3]
[4.5, 2.3, 1.3, 0.3]
[4.4, 3.2, 1.3, 0.2]
[5.0, 3.5, 1.6, 0.6]
[5.1, 3.8, 1.9, 0.4]
[4.8, 3.0, 1.4, 0.3]
[5.1, 3.8, 1.6, 0.2]
[4.6, 3.2, 

### 0x300 Mean, Variance & Standard Deviation

In [85]:
def mean(numbers):
    return sum(numbers) / len(numbers)

def variance(numbers, avg=None):
    _m = mean(numbers) if avg == None else avg
    return sum([(x - _m)**2 for x in numbers]) / len(numbers)

def stdev(numbers, avg=None):
    _mean = mean(numbers) if avg == None else avg
    return sqrt(variance(numbers, _mean))

# -[Test]----------------------------------------------
x = [1, 2, 3, 4]
print(f"{{mean={mean(x)}, variance={variance(x):.3f}, standard_deviation={stdev(x):.3f}}}")

{mean=2.5, variance=1.250, standard_deviation=1.118}


### 0x400 Summarize Dataset

In [86]:
# Calculate the mean, stdev and count for each column in a dataset
def summarize_dataset(dataset):
    summaries = [(mean(column), stdev(column), len(column)) for column in zip(*dataset)]
    del(summaries[-1])
    return summaries

# -[Test]----------------------------------------------
summary = summarize_dataset(iris_data)
print(summary)

[(5.843333333333335, 0.8253012917851409, 150), (3.0540000000000007, 0.4321465800705435, 150), (3.7586666666666693, 1.7585291834055201, 150), (1.1986666666666672, 0.760612618588172, 150)]


In [87]:
# Split dataset by class then calculate statistics for each row
def summarize_by_class(dataset):
    separated = separate_by_class(dataset)
    summaries = dict()
    for class_value, rows in separated.items():
        summaries[class_value] = summarize_dataset(rows)
    return summaries

# -[Test]----------------------------------------------
summary = summarize_by_class(iris_data)
for label in summary:
    print(f"class = {label}")
    for row in summary[label]:
        print(f" - {row}")

class = 0
 - (5.005999999999999, 0.348946987377739, 50)
 - (3.4180000000000006, 0.37719490982779713, 50)
 - (1.464, 0.17176728442867115, 50)
class = 2
 - (5.936, 0.5109833656783752, 50)
 - (2.7700000000000005, 0.31064449134018135, 50)
 - (4.26, 0.4651881339845204, 50)
class = 1
 - (6.587999999999998, 0.6294886813914925, 50)
 - (2.9739999999999998, 0.319255383666431, 50)
 - (5.552, 0.5463478745268441, 50)


### 0x500 Gaussian Probability Density Function

In [88]:
def gaussian_pdf(x, mean, stdev):
    exponent = exp(-((x - mean) ** 2 / (2 * stdev**2)))
    return (1 / (sqrt(2 * pi) * stdev)) * exponent

# -[Test]----------------------------------------------
print(gaussian_pdf(1.0, 1.0, 1.0))
print(gaussian_pdf(2.0, 1.0, 1.0))
print(gaussian_pdf(0.0, 1.0, 1.0))

0.3989422804014327
0.24197072451914337
0.24197072451914337


### 0x600 Class Probabilities

In [91]:
# Calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(summaries, row):
	total_rows = sum([summaries[label][0][2] for label in summaries])
	probabilities = dict()
	for class_value, class_summaries in summaries.items():
		probabilities[class_value] = summaries[class_value][0][2]/float(total_rows)
		for i in range(len(class_summaries)):
			mean, stdev, _ = class_summaries[i]
			probabilities[class_value] *= gaussian_pdf(row[i], mean, stdev)
	return probabilities

# -[Test]----------------------------------------------
summaries = summarize_by_class(iris_data)
probabilities = calculate_class_probabilities(summary, iris_data[0])
print(probabilities)

{0: 0.8225776209843491, 2: 2.944966420841627e-11, 1: 8.735016114208001e-16}


### 0x700 Iris Flower Species Case Study

In [90]:
# Convert string column to float
def str_column_to_float(dataset, column):
	for row in dataset:
		row[column] = float(row[column].strip())

# Convert string column to integer
def str_column_to_int(dataset, column):
	class_values = [row[column] for row in dataset]
	unique = set(class_values)
	lookup = dict()
	for i, value in enumerate(unique):
		lookup[value] = i
	for row in dataset:
		row[column] = lookup[row[column]]
	return lookup

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
	dataset_split = list()
	dataset_copy = list(dataset)
	fold_size = int(len(dataset) / n_folds)
	for _ in range(n_folds):
		fold = list()
		while len(fold) < fold_size:
			index = randrange(len(dataset_copy))
			fold.append(dataset_copy.pop(index))
		dataset_split.append(fold)
	return dataset_split

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
	correct = 0
	for i in range(len(actual)):
		if actual[i] == predicted[i]:
			correct += 1
	return correct / float(len(actual)) * 100.0

# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
	folds = cross_validation_split(dataset, n_folds)
	scores = list()
	for fold in folds:
		train_set = list(folds) # copy of folds
		train_set.remove(fold)  # remove the current fold from copy
		train_set = sum(train_set, [])
		test_set = list()
		for row in fold:
			row_copy = list(row)
			test_set.append(row_copy)
			row_copy[-1] = None
		predicted = algorithm(train_set, test_set, *args)
		actual = [row[-1] for row in fold]
		accuracy = accuracy_metric(actual, predicted)
		scores.append(accuracy)
	return scores

# Predict the class for a given row
def predict(summaries, row):
	probabilities = calculate_class_probabilities(summaries, row)
	best_label, best_prob = None, -1
	for class_value, probability in probabilities.items():
		if best_label is None or probability > best_prob:
			best_prob = probability
			best_label = class_value
	return best_label

# Naive Bayes Algorithm
def naive_bayes(train, test):
	summarize = summarize_by_class(train)
	predictions = list()
	for row in test:
		output = predict(summarize, row)
		predictions.append(output)
	return(predictions)

# Test Naive Bayes on Iris Dataset
seed(1)

# evaluate algorithm
n_folds = 5
scores = evaluate_algorithm(iris_data, naive_bayes, n_folds)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Scores: [80.0, 96.66666666666667, 83.33333333333334, 86.66666666666667, 96.66666666666667]
Mean Accuracy: 88.667%
