In [2]:
!/opt/venv/bin/python -m pip install --upgrade pip
# Handling pip upgrades

import pandas as pd
import numpy as np
import math

# Making plotly as the backend for pandas
!pip install plotly
pd.options.plotting.backend = "plotly"

# Setting the theme
import plotly.io as pio
import plotly.express as px
pio.templates.default = "plotly_white"

from sklearn.model_selection import train_test_split
import pprint

Requirement already up-to-date: pip in /opt/venv/lib/python3.7/site-packages (20.2.4)


In [11]:
iris_set = pd.read_csv("./iris/iris.data")
# add the column names, as it doesn't have it
attributes = ["sepal_length", "sepal_width", "petal_length", "petal_width", "species"]
iris_set.columns = attributes
iris_set = iris_set.sample(frac=1).reset_index(drop=True) # Shuffle
iris_set

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.9,3.0,4.2,1.5,Iris-versicolor
1,5.4,3.9,1.7,0.4,Iris-setosa
2,5.6,3.0,4.5,1.5,Iris-versicolor
3,6.4,3.2,5.3,2.3,Iris-virginica
4,5.4,3.0,4.5,1.5,Iris-versicolor
...,...,...,...,...,...
144,6.1,2.9,4.7,1.4,Iris-versicolor
145,6.4,2.8,5.6,2.2,Iris-virginica
146,5.6,2.9,3.6,1.3,Iris-versicolor
147,4.8,3.1,1.6,0.2,Iris-setosa


In [12]:
X = iris_set[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
X = np.array(X)
X[:3]

array([[5.9, 3. , 4.2, 1.5],
       [5.4, 3.9, 1.7, 0.4],
       [5.6, 3. , 4.5, 1.5]])

In [13]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

Y = iris_set['species'].to_numpy()
Y = le.fit_transform(Y)
Y[:3]

array([1, 0, 1])

In [14]:
# Split the dataset by class values, returns a dictionary
def separate_by_class(X,Y):
	separated = dict()
	for i in range(len(X)):
		vector = X[i]
		class_value = Y[i]
		if (class_value not in separated):
			separated[class_value] = list()
		separated[class_value].append(vector)
	return separated

In [22]:
separated = separate_by_class(X,Y)

for label in separated:
    print(label)
    print(separated[label][:3])

1
[array([5.9, 3. , 4.2, 1.5]), array([5.6, 3. , 4.5, 1.5]), array([5.4, 3. , 4.5, 1.5])]
0
[array([5.4, 3.9, 1.7, 0.4]), array([5. , 3.3, 1.4, 0.2]), array([5. , 3.6, 1.4, 0.2])]
2
[array([6.4, 3.2, 5.3, 2.3]), array([7.2, 3.6, 6.1, 2.5]), array([6.4, 2.8, 5.6, 2.1])]


In [23]:
def mean(numbers):
	return sum(numbers)/float(len(numbers))

In [24]:
# Calculate the standard deviation of a list of numbers
def stdev(numbers):
	avg = mean(numbers)
	variance = sum([(x-avg)**2 for x in numbers]) / float(len(numbers)-1)
	return math.sqrt(variance)

In [25]:
# Calculate the mean, stdev and count for each column in a dataset
def summarize_dataset(X):
	summaries = [(mean(column), stdev(column), len(column)) for column in zip(*X)]
	return summaries

In [26]:
summary = summarize_dataset(X)
print(summary)

[(5.848322147651007, 0.8285940572656175, 149), (3.051006711409395, 0.4334988777167476, 149), (3.7744966442953025, 1.7596511617753423, 149), (1.2053691275167786, 0.7612920413899603, 149)]


In [27]:
# Split dataset by class then calculate statistics for each row
def summarize_by_class(X,Y):
	separated = separate_by_class(X,Y)
	summaries = dict()
	for class_value, rows in separated.items():
		summaries[class_value] = summarize_dataset(rows)
	return summaries

In [48]:
summary = summarize_by_class(X,Y)
for label in summary:
    print(label)
    print(np.asarray(["Mean","STD","Len"]))
    for row in summary[label]:
        print(np.asarray(row))

1
['Mean' 'STD' 'Len']
[ 5.936       0.51617115 50.        ]
[ 2.77        0.31379832 50.        ]
[ 4.26        0.46991098 50.        ]
[ 1.326       0.19775268 50.        ]
0
['Mean' 'STD' 'Len']
[ 5.00408163  0.35587872 49.        ]
[ 3.41632653  0.38478725 49.        ]
[ 1.46530612  0.17506073 49.        ]
[ 0.24489796  0.10813037 49.        ]
2
['Mean' 'STD' 'Len']
[ 6.588       0.63587959 50.        ]
[ 2.974       0.32249664 50.        ]
[ 5.552      0.5518947 50.       ]
[ 2.026       0.27465006 50.        ]


In [50]:
# Calculate the Gaussian probability distribution function for x
def calculate_probability(x, mean, stdev):
	exponent = math.exp(-((x-mean)**2 / (2 * stdev**2 )))
	return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

In [51]:
print(calculate_probability(1.0, 1.0, 1.0))
print(calculate_probability(2.0, 1.0, 1.0))
print(calculate_probability(0.0, 1.0, 1.0))

0.3989422804014327
0.24197072451914337
0.24197072451914337


In [52]:
# Calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(summaries, row):
	total_rows = sum([summaries[label][0][2] for label in summaries])
	probabilities = dict()
	for class_value, class_summaries in summaries.items():
		probabilities[class_value] = summaries[class_value][0][2]/float(total_rows)
		for i in range(len(class_summaries)):
			mean, stdev, count = class_summaries[i]
			probabilities[class_value] *= calculate_probability(row[i], mean, stdev)
	return probabilities

In [71]:
# Predict the class for a given row
def predict(summaries, row):
	probabilities = calculate_class_probabilities(summaries, row)
	best_label, best_prob = None, -1
	for class_value, probability in probabilities.items():
		if best_label is None or probability > best_prob:
			best_prob = probability
			best_label = class_value
	return best_label

In [72]:
summaries = summarize_by_class(X,Y)
probabilities = predict(summaries, X[0])
probabilities

1

In [60]:
# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
	correct = 0
	for i in range(len(actual)):
		if actual[i] == predicted[i]:
			correct += 1
	return correct / float(len(actual)) * 100.0

In [76]:
# Naive Bayes Algorithm
def naive_bayes(X_train, X_test, Y_train, Y_test):
	summarize = summarize_by_class(X_train, Y_train)
	predictions = list()
	for row in X_test:
		output = predict(summarize, row)
		predictions.append(output)
	return(predictions)

In [74]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15)

In [77]:
predicted = naive_bayes(X_train, X_test, Y_train, Y_test)

In [81]:
print("The accuracy is", accuracy_metric(Y_test, predicted))

The accuracy is 95.65217391304348
