<h1>Naive Bayes Classifier</h1>

In [1]:
import numpy as np
import pandas as pd
import math
import time
import warnings
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore")

<h2>Input and Model Splitting</h2>

In [2]:
t1 = time.time()
mtx_arr = np.zeros((2225, 9635), dtype=np.float16)
with open('bbc.mtx', 'r') as mtx:
    next(mtx)
    next(mtx)
    for line in mtx:
        line = line[:-1]
        val = line.split()
        mtx_arr[int(val[1])-1][int(val[0])-1] = float(val[2])

actual_class = np.empty(shape=(2225, 1), dtype=int)
with open('bbc.classes', 'r') as classes:
    next(classes)
    next(classes)
    next(classes)
    next(classes)
    for line in classes:
        line = line[:-1]
        val = [int(i) for i in line.split()]
        actual_class[val[0]][0] = val[1]

t2 = time.time()
X_train, X_test, Y_train, Y_test = train_test_split(mtx_arr, actual_class, random_state=12345)
print("File input and partition completed in", t2-t1, "seconds")

File input and partition completed in 0.3440842628479004 seconds


<h2>Naive Bayes Classifier Training</h2>

In [3]:
def find_prob_classifier(x, y):
    # Compute P(x_i|c) for all x_i in x while adjusting for 0 probabilities
    # Compute P(c = i), 0 <= i <= 4 
    # Compute P(x_i) for all columns in x
    prob_c = []
    prob_xic = []
    prob_xi = []
    lst_x = [row.tolist() for row in x]
    lst_y = [row.tolist() for row in y]
    for i in range(5):
        class_i = list(map(lambda n: n==[i], lst_y))
        total = sum([int(val) for val in class_i])
        cond = [0] * np.shape(x)[1]
        for j in range(np.size(y)):
            if class_i[j]:
                for k in range(len(lst_x[j])):
                    cond[k]+=min(lst_x[j][k], 1.0)
        prob_xic.append([(i+1)/(total+2) for i in cond])
        total = total/np.size(y)
        prob_c.append(total)
        
    lst_xt = [row.tolist() for row in x.T]
    for i in lst_xt:
        bin_i = list(map(lambda x: not math.isclose(x, 0.0, rel_tol=0.1), i))
        prob = sum([int(val) for val in bin_i])/np.shape(x)[0]
        prob_xi.append(prob)
    return (prob_xic, prob_c, prob_xi)
        

<h2>Prediction Function</h2>

In [4]:
def predict(prob_xic, prob_c, prob_xi, point):
    prob_list = []
    for i in range(5):
        prob_cxi = math.log2(prob_c[i])
        bin_point = list(map(lambda x: math.isclose(x, 0.0, rel_tol=0.1), point))
        prob_x = 0
        for j in range(len(point)):
            if not bin_point[j]:
                prob_cxi = prob_cxi + math.log2(prob_xic[i][j])
                prob_x += prob_xi[j]
        prob_cxi = prob_cxi - math.log2(prob_x)
        prob_list.append(prob_cxi)

    return prob_list


<h2>Train Bayes Classifier</h2>

In [5]:
t1 = time.time()
prob_xic, prob_c, prob_xi = find_prob_classifier(X_train, Y_train)
t2 = time.time()
print("Training Naive Bayes classifier completed in", t2-t1, "seconds")


Training Naive Bayes classifier completed in 6.60952353477478 seconds


<h2>Error Rate Computation</h2>

In [6]:
t1 = time.time()
X_test_error = 0
X_train_error = 0
for i in range(np.shape(X_test)[0]):
    pred = predict(prob_xic, prob_c, prob_xi, X_test[i])
    if pred.index(max(pred)) != Y_test[i]:
        X_test_error += 1
for i in range(np.shape(X_train)[0]):
    pred = predict(prob_xic, prob_c, prob_xi, X_train[i])
    if pred.index(max(pred)) != Y_train[i]:
        X_train_error += 1

print("Accuracy of classifier on training dataset", 1-(X_train_error/np.shape(X_train)[0]))
print("Accuracy of classifier on test dataset: ", 1-(X_test_error/np.shape(X_test)[0]))
t2 = time.time()
print("Testing Naive Bayes classifier completed in", t2-t1, "seconds")

Accuracy of classifier on training dataset 0.9574340527577938
Accuracy of classifier on test dataset:  0.9174147217235189
Testing Naive Bayes classifier completed in 22.666558742523193 seconds


<h1>Gaussian Classifier</h1>

<h2>Finding Covariance Matrix</h2>

In [7]:
def covar(x, y):
    return np.sum(x*y)/(len(x)-1)

def train_gaussian(x, y):
    # Finds the diagonal of the covariance matrix
    # Assumes the correlation of features is 0
    lst_x = [row.tolist() for row in x]
    lst_xt = [row.tolist() for row in x.T]
    lst_y = [row.tolist() for row in y]
    size_x = np.shape(x)[0]
    mu = []
    for i in range(5):
        class_i = list(map(lambda n: n==[i], lst_y))
        mu_i = [0] * np.shape(x)[1]
        lst_xi = []
        # Compute mean 
        for j in range(np.size(y)):
            if class_i[j]:
                lst_xi.append(lst_xt[j])
                for k in range(len(lst_x[j])):
                    mu_i[k]+=(lst_x[j][k]/size_x)
        
        mu.append(mu_i)
    # Find variance
    cov = np.zeros([len(lst_xt), len(lst_xt)])
 
    for i in range(len(lst_xt)):
        lst_xt[i] = lst_xt[i]-np.mean(lst_xt[i])

    for i in range(len(lst_xt)):
        cov[i][i] = covar(lst_xt[i], lst_xt[i]) + 0.000000001
    return (mu, cov)

In [8]:
t1 = time.time()
mu, cov = train_gaussian(X_train, Y_train)
inv_cov = np.linalg.inv(cov)
t2 = time.time()
print("Training Gaussian classifier completed in", t2-t1, "seconds")

Training Gaussian classifier completed in 17.348944902420044 seconds


In [9]:
def gauss_predict(prob_c, mu, inv_cov, point):
    # Predicts datapoint using Gaussian Class Conditionals
    # Computes the log joint pdf of every possible classification
    pred = []
    for i in range(5):
        logpdf = -0.5 * (point-mu[i]) @ inv_cov @ (point-mu[i])
        logpdf += math.log(prob_c[i])
        pred.append(logpdf)

    return pred


In [10]:
t1 = time.time()
X_test_error = 0
X_train_error = 0
for i in range(np.shape(X_test)[0]):
    pred = gauss_predict(prob_c, mu, inv_cov, X_test[i])
    if pred.index(max(pred)) != Y_test[i]:
        X_test_error += 1
for i in range(np.shape(X_train)[0]):
    pred = gauss_predict(prob_c, mu, inv_cov, X_train[i])
    if pred.index(max(pred)) != Y_train[i]:
        X_train_error += 1

print("Accuracy of classifier on training dataset", 1-(X_train_error/np.shape(X_train)[0]))
print("Accuracy of classifier on test dataset: ", 1-(X_test_error/np.shape(X_test)[0]))
t2 = time.time()
print("Testing Gaussian classifier completed in", t2-t1, "seconds")

Accuracy of classifier on training dataset 0.9892086330935251
Accuracy of classifier on test dataset:  0.9712746858168761
Testing Gaussian classifier completed in 300.52689003944397 seconds


<h1>k-Nearest Neighbors Classification</h1>

In [11]:
import heapq
def kNN(XY_arr, point, k):
    dist = lambda x: np.sum((x-point)**2)
    heap = []
    for i in range(len(XY_arr)):
        d = dist(XY_arr[i][1:])
        if(len(heap) < k):
            heapq.heappush(heap, (-d, i))
        else:
            if -d > heap[0][0]:
                heapq.heappop(heap)
                heapq.heappush(heap, (-d, i))
    
    k_nearest = []
    for i in heap:
        k_nearest.append(XY_arr[i[1]][0])
    return k_nearest

def counter(k_nearest):
    count = []
    for i in range(5):
        count.append(k_nearest.count(i))
    return count.index(max(count))

In [13]:
XY_train = np.hstack((Y_train, X_train))
t1 = time.time()
X_test_error = 0
X_train_error = 0
for i in range(np.shape(X_test)[0]):
    pred = counter(kNN(XY_train, X_test[i], 1))
    if pred != Y_test[i]:
        X_test_error += 1
for i in range(np.shape(X_train)[0]):
    pred = counter(kNN(XY_train, X_train[i], 1))
    if pred != Y_train[i]:
        X_train_error += 1

print("Accuracy of classifier on training dataset (k=1)", 1-(X_train_error/np.shape(X_train)[0]))
print("Accuracy of classifier on test dataset (k=1): ", 1-(X_test_error/np.shape(X_test)[0]))
t2 = time.time()
print("Testing k=1 nearest neighbors classifier completed in", t2-t1, "seconds")

t1 = time.time()
X_test_error = 0
X_train_error = 0
for i in range(np.shape(X_test)[0]):
    pred = counter(kNN(XY_train, X_test[i], 3))
    if pred != Y_test[i]:
        X_test_error += 1
for i in range(np.shape(X_train)[0]):
    pred = counter(kNN(XY_train, X_train[i], 3))
    if pred != Y_train[i]:
        X_train_error += 1

print("Accuracy of classifier on training dataset (k=3)", 1-(X_train_error/np.shape(X_train)[0]))
print("Accuracy of classifier on test dataset (k=3): ", 1-(X_test_error/np.shape(X_test)[0]))
t2 = time.time()
print("Testing k=3 nearest neighbors classifier completed in", t2-t1, "seconds")

t1 = time.time()
X_test_error = 0
X_train_error = 0
for i in range(np.shape(X_test)[0]):
    pred = counter(kNN(XY_train, X_test[i], 6))
    if pred != Y_test[i]:
        X_test_error += 1
for i in range(np.shape(X_train)[0]):
    pred = counter(kNN(XY_train, X_train[i], 6))
    if pred != Y_train[i]:
        X_train_error += 1

print("Accuracy of classifier on training dataset (k=6)", 1-(X_train_error/np.shape(X_train)[0]))
print("Accuracy of classifier on test dataset (k=6): ", 1-(X_test_error/np.shape(X_test)[0]))
t2 = time.time()
print("Testing k=6 nearest neighbors classifier completed in", t2-t1, "seconds")

Accuracy of classifier on training dataset (k=1) 1.0
Accuracy of classifier on test dataset (k=1):  0.7719928186714542
Testing k=1 nearest neighbors classifier completed in 137.34357976913452 seconds
Accuracy of classifier on training dataset (k=3) 0.8836930455635492
Accuracy of classifier on test dataset (k=3):  0.7360861759425494
Testing k=3 nearest neighbors classifier completed in 137.4279887676239 seconds
Accuracy of classifier on training dataset (k=6) 0.7236211031175059
Accuracy of classifier on test dataset (k=6):  0.63016157989228
Testing k=6 nearest neighbors classifier completed in 137.30616307258606 seconds
