# Project 1: Naïve Baye's Classifer

In [43]:
import numpy as np  #import statement for the NumPy library used to store the data into NumPy arrays and perform mathematical computations
import pandas as pd #import statement for the Pandas library used to handle the dataset

In [44]:
df = pd.read_excel('Data.xlsx') #creating and initializing a DataFrame (df) to store Excel file dataset
features = df.iloc[:, :30].to_numpy() #takes all the rows and all the attribute columns (x1-x30) and stores them in a NumPy 2D array
labels = df.iloc[:, 30].to_numpy() #takes all the rows and the class columns for each row (malignant=0, benign=1) and stores them in a NumPy 1D array
data_size = labels.shape[0] #the total number of rows in the dataset stored as an integer variable

In [45]:
def shuffle_arrays(a, b):
    if len(a) != len(b):
        return              #checks if the arrays are not the same length, if so, break out of the function because they cannot be shuffled in unison
    idx = np.random.permutation(len(a)) #generates an array of random indices from 0 to 568 (in this case)
    return a[idx], b[idx] #returns the arrays reshuffled 


In [46]:
shuffled_features, shuffled_labels = shuffle_arrays(features, labels) #calls the function on the features and labels arrays to shuffle them in unison
train_size = int(data_size * 0.75) #to split the shuffled arrays into 75% training data and remaining 25% test data, 
train_features, train_labels = shuffled_features[:train_size], shuffled_labels[:train_size] 
test_features, test_labels = shuffled_features[train_size:], shuffled_labels[train_size:] #creates a training and testing array respectively for the shuffled features and labels according to the 75:25 split

In [47]:
mal_features = train_features[train_labels == 0] #index the training set feature array using a boolean mask such that the indices correspond to 0 labels
ben_features = train_features[train_labels == 1] #index the training set feature array using a boolean mask such that the indices correspond to 1 labels
mal_prior = mal_features.shape[0] / train_features.shape[0] #prior for malignant class 
ben_prior = ben_features.shape[0] / train_features.shape[0] #prior for benign class
mean_mal = np.mean(mal_features, axis=0)
mean_ben = np.mean(ben_features, axis=0) #using the numpy mean function, the axis=0 parameter ensures the mean is for each column, returns an array consisting of the mean for each 30 features
var_mal = np.var(mal_features, axis=0)
var_ben = np.var(ben_features, axis=0) #using the numpy var function, like previously, the axis=0 parameter ensures the variance is for each column, returns an array consisting of the variance for each 30 features

In [48]:
def calculate_bayes(sample, mean, variance):
    if not (len(sample) == len(mean) == len(variance)):
        return None
    var = np.where(variance == 0, 1e-6, variance)
    p = -0.5 * np.log(2 * np.pi * var) - (((sample - mean) ** 2) / (2 * var))
    return p

def predict_class(sample, mean_mal, mean_ben, var_mal, var_ben, mal_prior, ben_prior):
    if not (len(sample) == len(mean_mal) == len(var_mal) == len(mean_ben) == len(var_ben)):
        return None
    mal_total = np.log(mal_prior) + np.sum(calculate_bayes(sample, mean_mal, var_mal))
    ben_total = np.log(ben_prior) + np.sum(calculate_bayes(sample, mean_ben, var_ben))

    return np.argmax([mal_total, ben_total])

In [None]:
correct = 0
total = test_labels.shape[0]
for i in range(total):
    prediction = predict_class(test_features[i], mean_mal, var_mal, mean_ben, var_ben, mal_prior, ben_prior)
    if prediction == test_labels[i]:
        correct += 1
accuracy = correct / total
print(f"Test Accuracy: {accuracy}")

    

Test Accuracy: 0.3706293706293706
143
