# This project demonstrates the implementation and working of the Naive Bayes algorithm on:
## 1). Iris Dataset.
## 2). Hayes-Roth dataset.
## 3). Car-evaluation dataset.
## 4). Breast Cancer dataset.

# Creator: Mihir Ingole

https://machinelearningmastery.com/naive-bayes-classifier-scratch-python

# Main Code


In [1]:
from random import randrange
from random import seed
from math import pi, sqrt, exp




def datasetSeparated(dataset):
    '''
    Group the records of a dataset by their class label.

    Input: sequence of records whose last element is the class label.
    Output: dictionary mapping every class label to the list of its records
            (label stripped off), in the order they appear in the dataset.
    '''
    grouped = {}
    for record in dataset:
        # The label is the last element; everything before it is feature data.
        grouped.setdefault(record[-1], []).append(record[:-1])
    return grouped

def meanAndStdDevSummary(values):
    '''
    Summarise, feature by feature, the records belonging to one class.

    Input: list of records of a single class, without the class label. Every
           feature value must be convertible with float().
    Output: list with one tuple per feature: (mean, sample standard deviation,
            number of records). The standard deviation uses the n-1
            denominator; for a class with a single record it is reported as
            0.0 instead of dividing by zero. An empty input yields an empty list.
    '''
    if not values:
        return []

    count = len(values)
    summaries = []
    for ifeat in range(len(values[0])):
        column = [float(record[ifeat]) for record in values]

        # Accumulate term by term so the floating point result does not depend
        # on the summation strategy of the builtin sum().
        total = 0
        for value in column:
            total += value
        avg = total / count

        variance = 0
        if count > 1:
            for value in column:
                variance += ((value - avg) ** 2) / float(count - 1)
        stddev = variance ** 0.5
        summaries.append((avg, stddev, count))

    return summaries
    
def summarizedClasses(separatedDataset):
    '''
    Build the per-class feature summaries.

    Input: dictionary mapping class labels to the records of that class
           (the output of datasetSeparated()).
    Output: dictionary with the same class labels as keys and the result of
            meanAndStdDevSummary() on the class records as values.
    '''
    return {
        label: meanAndStdDevSummary(records)
        for label, records in separatedDataset.items()
    }

def gaussianpdf(x, mean, stddev):
    '''
    Return the Gaussian probability density of a feature value.

    Input: feature value of a data element, and the mean and standard deviation
           of that feature computed from the records of one class.
    Output: value of the normal density N(mean, stddev^2) at x. When stddev is
            0 the density is undefined and 0 is returned, which rules the
            class out for that test element.
    '''
    # Only a zero standard deviation makes the formula undefined; a zero mean
    # is a perfectly valid distribution and must still be evaluated.
    if stddev == 0:
        return 0
    exponent = exp(-((x - mean)**2) / (2 * (stddev**2)))
    return (1/(sqrt(2 * pi) * stddev)) * exponent

def calculate_class_prob(dataset, classes_summarized, testSubject):
    '''
    Score a test element against every class with the naive bayes rule
    P(class|X1, X2, ... , Xn) = P(X1|class) * P(X2|class) * .... * P(Xn|class) * P(class)
    where n is the total number of features.

    Input: dataset (only its length is used, as the total number of records),
           summarized classes (output of summarizedClasses() function) and the
           test data without its class label.
    Output: dictionary with the class labels as keys and the (unnormalised)
            probability of the test data for that class as values.
    '''
    n_records = len(dataset)

    scores = {}
    for label in classes_summarized:
        feature_stats = classes_summarized[label]
        # Prior P(class): record count of the class (stored in every feature
        # summary) over the total number of records.
        likelihood = feature_stats[0][-1] / n_records
        for position, (mean, stddev, _) in enumerate(feature_stats):
            likelihood *= gaussianpdf(float(testSubject[position]), mean, stddev)
        scores[label] = likelihood
    return scores

def creatingKFolds(dataset,k_folds):
    '''
    Split the dataset into k random folds for cross validation.

    Input: entire dataset with class labels and value of k.
    Output: list containing k folds, each holding len(dataset) // k records
            drawn at random without replacement. Records left over when the
            dataset size is not a multiple of k are not placed in any fold.
            The input dataset itself is not modified.
    '''
    remaining = list(dataset)
    fold_size = len(remaining) // k_folds
    return [
        [remaining.pop(randrange(len(remaining))) for _ in range(fold_size)]
        for _ in range(k_folds)
    ]

def evaluateNaiveBayes(dataset,k_folds):
    '''
    Run k fold cross validation of the naive bayes classifier.

    For every fold: train on the other k-1 folds, predict the class of each
    record of the held-out fold as the class with the highest probability and
    record the percentage of correct predictions.

    Input: entire dataset (class label last in every record) and value of k.
    Output: Prints the accuracies of all the k folds and returns their average,
            i.e. the final cross validation accuracy in percent.
    '''
    folds = creatingKFolds(dataset, k_folds)

    fold_accuracies = []
    for held_out_index, testFold in enumerate(folds):
        # Training data is the concatenation of every fold except the held-out one.
        trainingData = []
        for fold_index, fold in enumerate(folds):
            if fold_index != held_out_index:
                trainingData.extend(fold)

        classes_summarized = summarizedClasses(datasetSeparated(trainingData))

        hits = 0
        for record in testFold:
            probabilities = calculate_class_prob(trainingData, classes_summarized, record[:-1])
            if max(probabilities, key=probabilities.get) == record[-1]:
                hits += 1
        fold_accuracies.append(hits / len(testFold) * 100)

    print(f'Following is the accuracies of all the {k_folds} folds respectively: \n{fold_accuracies}')
    return sum(fold_accuracies)/k_folds

def testNaiveBayes(dataset,testSubject):
    '''
    Return the predicted class of the test element.

    Input: dataset used for training (class label last in every record) and a
           test record in the same layout; its trailing class label is ignored
           for the prediction.
    Output: the class label with the highest naive bayes probability.
    '''
    features = testSubject[:-1]
    classes_summarized = summarizedClasses(datasetSeparated(dataset))
    probabilities = calculate_class_prob(dataset, classes_summarized, features)
    return max(probabilities, key=probabilities.get)
    

# Iris Dataset (Optional for testing the algorithm)

In [2]:
#Iris dataset preprocessing
# Read the file inside a context manager so the handle is closed as soon as
# the content has been loaded.
with open("iris.txt", 'r') as iris_file:
    iris = iris_file.read().split()

# Every whitespace separated token is one comma separated record.
irisDataset = [record.split(',') for record in iris]

In [3]:
#Testing the classifier on a single, fixed datapoint (record 69 of the Iris data).
# NOTE: testNaiveBayes summarises the whole of irisDataset, so this record is
# also part of the data the classifier is trained on.

testSubject = irisDataset[69]
print(testNaiveBayes(irisDataset,testSubject))

Iris-versicolor


In [4]:
#Ten fold cross validation on the Iris Dataset.
# Seeding the generator fixes the random fold assignment, so the accuracies
# are reproducible between runs.
seed(1)
evaluateNaiveBayes(irisDataset,10)

Following is the accuracies of all the 10 folds respectively: 
[86.66666666666667, 100.0, 93.33333333333333, 100.0, 100.0, 100.0, 100.0, 86.66666666666667, 86.66666666666667, 100.0]


95.33333333333333

# Hayes_roth dataset

https://www.geeksforgeeks.org/python-removing-first-element-of-list/

In [5]:
#Hayes_roth dataset preprocessing
import pandas as pd

# Read the file inside a context manager so the handle is closed as soon as
# the content has been loaded.
with open("hayes-roth.data", 'r') as hayes_roth_file:
    hayes_roth_data = hayes_roth_file.read().split()

# Every whitespace separated token is one comma separated record. The first
# field of each record is removed since it contains a distinct value for every
# record and is therefore of no use as a feature.
hrTrainingData = [record.split(',')[1:] for record in hayes_roth_data]

df_hayesRoth = pd.DataFrame(hrTrainingData)
display(df_hayesRoth)

Unnamed: 0,0,1,2,3,4
0,2,1,1,2,1
1,2,1,3,2,2
2,3,1,4,1,3
3,2,4,2,2,3
4,1,1,3,4,3
...,...,...,...,...,...
127,1,1,4,3,3
128,2,1,2,1,1
129,1,2,1,2,2
130,1,2,2,1,2


https://www.statology.org/one-hot-encoding-in-python/                                           
https://datagy.io/sklearn-one-hot-encode/                                                      
https://www.geeksforgeeks.org/ml-one-hot-encoding-of-datasets-in-python/                    
https://www.geeksforgeeks.org/delete-a-column-from-a-pandas-dataframe/                               
https://www.geeksforgeeks.org/how-to-convert-numpy-array-to-list/                                      
https://stackoverflow.com/questions/32078737/create-pandas-dataframe-manually-without-columns-name

In [6]:
#One Hot Encoding

# Here we are performing one hot encoding on the features of the Hayes-Roth
# dataset. Column 4 (the class label) is set aside before encoding and
# appended again, unchanged, as the last element of every encoded record.

from sklearn.preprocessing import OneHotEncoder

temp = df_hayesRoth[4].to_list()
# drop() returns a new frame, so df_hayesRoth itself keeps its label column.
df_hayesRoth_input = df_hayesRoth.drop(columns=[4])

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# name; fall back to the old keyword on releases that predate the rename.
try:
    encoder = OneHotEncoder(categories='auto',
                            drop=None,
                            sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(categories='auto',
                            drop=None,
                            sparse=False)

transformed = encoder.fit_transform(df_hayesRoth_input).tolist()

for encoded_row, label in zip(transformed, temp):
    encoded_row.append(label)

oneHotEncodedHayesRoth = transformed

print('Dataset after One Hot encoding: ')
display(pd.DataFrame(oneHotEncodedHayesRoth))

Dataset after One Hot encoding: 


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2
2,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,3
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,3
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,3
128,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1
129,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2
130,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2


In [7]:
# The below code performs ten fold cross validation after one hot encoding the
# Hayes-Roth dataset. The seed fixes the random fold assignment.
seed(1)
evaluateNaiveBayes(oneHotEncodedHayesRoth,10)

Following is the accuracies of all the 10 folds respectively: 
[23.076923076923077, 23.076923076923077, 30.76923076923077, 30.76923076923077, 30.76923076923077, 30.76923076923077, 15.384615384615385, 23.076923076923077, 7.6923076923076925, 7.6923076923076925]


22.307692307692303

In [8]:
# Testing the naive bayes classifier on a single, fixed datapoint (record 69
# of the Hayes-Roth data). The record is also part of the data the classifier
# is trained on, since testNaiveBayes summarises the whole of hrTrainingData.

testSubject = hrTrainingData[69]
print(f'Predicted value: {testNaiveBayes(hrTrainingData,testSubject)} \nActual value: {testSubject[-1]}')

Preficted value: 2 
Actual value: 2


In [9]:
#Label Encoding

# The Hayes Roth dataset is already label encoded, so it is evaluated as it
# was loaded.
# Ten fold cross validation on the Hayes-Roth Dataset; the seed fixes the
# random fold assignment.
seed(1)
evaluateNaiveBayes(hrTrainingData, 10)

Following is the accuracies of all the 10 folds respectively: 
[46.15384615384615, 69.23076923076923, 69.23076923076923, 100.0, 61.53846153846154, 84.61538461538461, 53.84615384615385, 53.84615384615385, 69.23076923076923, 61.53846153846154]


66.92307692307693

# Car evaluation

In [10]:
#Car-evaluation dataset preprocessing

import pandas as pd

# Read the file inside a context manager so the handle is closed as soon as
# the content has been loaded.
with open("car.data", 'r') as car_file:
    car_data = car_file.read().split()

# Every whitespace separated token is one comma separated record.
carTraininData = [record.split(',') for record in car_data]

df_car = pd.DataFrame(carTraininData)

In [11]:
# Preview the first rows of the raw car-evaluation table.
df_car.head()

Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


https://www.statology.org/one-hot-encoding-in-python/                                           
https://datagy.io/sklearn-one-hot-encode/                                                      
https://www.geeksforgeeks.org/ml-one-hot-encoding-of-datasets-in-python/                    
https://www.geeksforgeeks.org/delete-a-column-from-a-pandas-dataframe/                               
https://www.geeksforgeeks.org/how-to-convert-numpy-array-to-list/                                      
https://stackoverflow.com/questions/32078737/create-pandas-dataframe-manually-without-columns-name

In [12]:
#One Hot Encoding

# Here we are performing one hot encoding on the features of the
# car-evaluation dataset. Column 6 (the class label) is set aside before
# encoding and appended again, unchanged, as the last element of every
# encoded record.

from sklearn.preprocessing import OneHotEncoder

temp = df_car[6].to_list()
# drop() returns a new frame, so df_car itself keeps its label column.
df_car_input = df_car.drop(columns=[6])

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# name; fall back to the old keyword on releases that predate the rename.
try:
    encoder = OneHotEncoder(categories='auto',
                            drop=None,
                            sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(categories='auto',
                            drop=None,
                            sparse=False)

transformed = encoder.fit_transform(df_car_input).tolist()

for encoded_row, label in zip(transformed, temp):
    encoded_row.append(label)

oneHotEncodedCarEval = transformed

print('Dataset after One Hot encoding: ')
display(pd.DataFrame(oneHotEncodedCarEval))

Dataset after One Hot encoding: 


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,unacc
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,unacc
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,unacc
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,unacc
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,unacc
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1723,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,good
1724,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,vgood
1725,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,unacc
1726,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,good


In [13]:
# The below code performs ten fold cross validation after one hot encoding the
# car-evaluation dataset. The seed fixes the random fold assignment.
seed(1)
evaluateNaiveBayes(oneHotEncodedCarEval, 10)

Following is the accuracies of all the 10 folds respectively: 
[71.51162790697676, 68.02325581395348, 68.6046511627907, 66.86046511627907, 73.83720930232558, 70.34883720930233, 68.02325581395348, 72.09302325581395, 70.34883720930233, 69.76744186046511]


69.94186046511626

https://www.geeksforgeeks.org/how-to-convert-pandas-dataframe-into-a-list/

In [14]:
#Label Encoding

# Here we are performing label encoding on the features of the car-evaluation
# dataset. The columns of df_car are overwritten in place; column 6 (the class
# label) keeps its original string values.

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
for feature in [column for column in df_car if column != 6]:
    # The encoder is re-fitted for every column, so each feature gets its own
    # integer coding.
    df_car[feature] = encoder.fit_transform(df_car[feature])

car_evaluation = df_car.values.tolist()
print('Dataset after label encoding: ')
display(car_evaluation)

Dataset after label encoding: 


[[3, 3, 0, 0, 2, 1, 'unacc'],
 [3, 3, 0, 0, 2, 2, 'unacc'],
 [3, 3, 0, 0, 2, 0, 'unacc'],
 [3, 3, 0, 0, 1, 1, 'unacc'],
 [3, 3, 0, 0, 1, 2, 'unacc'],
 [3, 3, 0, 0, 1, 0, 'unacc'],
 [3, 3, 0, 0, 0, 1, 'unacc'],
 [3, 3, 0, 0, 0, 2, 'unacc'],
 [3, 3, 0, 0, 0, 0, 'unacc'],
 [3, 3, 0, 1, 2, 1, 'unacc'],
 [3, 3, 0, 1, 2, 2, 'unacc'],
 [3, 3, 0, 1, 2, 0, 'unacc'],
 [3, 3, 0, 1, 1, 1, 'unacc'],
 [3, 3, 0, 1, 1, 2, 'unacc'],
 [3, 3, 0, 1, 1, 0, 'unacc'],
 [3, 3, 0, 1, 0, 1, 'unacc'],
 [3, 3, 0, 1, 0, 2, 'unacc'],
 [3, 3, 0, 1, 0, 0, 'unacc'],
 [3, 3, 0, 2, 2, 1, 'unacc'],
 [3, 3, 0, 2, 2, 2, 'unacc'],
 [3, 3, 0, 2, 2, 0, 'unacc'],
 [3, 3, 0, 2, 1, 1, 'unacc'],
 [3, 3, 0, 2, 1, 2, 'unacc'],
 [3, 3, 0, 2, 1, 0, 'unacc'],
 [3, 3, 0, 2, 0, 1, 'unacc'],
 [3, 3, 0, 2, 0, 2, 'unacc'],
 [3, 3, 0, 2, 0, 0, 'unacc'],
 [3, 3, 1, 0, 2, 1, 'unacc'],
 [3, 3, 1, 0, 2, 2, 'unacc'],
 [3, 3, 1, 0, 2, 0, 'unacc'],
 [3, 3, 1, 0, 1, 1, 'unacc'],
 [3, 3, 1, 0, 1, 2, 'unacc'],
 [3, 3, 1, 0, 1, 0, 'unacc'],
 [3, 3, 1,

In [15]:
# The below code performs ten fold cross validation after label encoding the
# car-evaluation dataset. The seed fixes the random fold assignment.
seed(1)
evaluateNaiveBayes(car_evaluation, 10)

Following is the accuracies of all the 10 folds respectively: 
[76.16279069767442, 72.09302325581395, 73.25581395348837, 72.67441860465115, 75.0, 75.0, 71.51162790697676, 79.65116279069767, 73.83720930232558, 74.4186046511628]


74.36046511627907

# Breast Cancer

In [16]:
#Breast Cancer dataset preprocessing

# Read the file inside a context manager so the handle is closed as soon as
# the content has been loaded.
with open("breast-cancer.data", 'r') as breast_cancer_file:
    breastCancer = breast_cancer_file.read().split()

# Every whitespace separated token is one comma separated record.
breastCancerData = [record.split(',') for record in breastCancer]

df_breast_cancer = pd.DataFrame(breastCancerData)

In [17]:
# Preview the first rows of the raw breast cancer table.
df_breast_cancer.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


https://datagy.io/sklearn-one-hot-encode/                                                
https://www.geeksforgeeks.org/ml-one-hot-encoding-of-datasets-in-python/                                 
https://www.geeksforgeeks.org/delete-a-column-from-a-pandas-dataframe/

In [18]:
#One Hot Encoding

# Here we are performing one hot encoding on the features of the breast cancer
# dataset. Column 9 is set aside before encoding and appended again, unchanged,
# as the last element of every encoded record, where the classifier reads the
# class label from.
# NOTE(review): in the raw table column 9 holds yes/no values while column 0
# holds the recurrence label; confirm that column 9 is the intended target.

from sklearn.preprocessing import OneHotEncoder

temp = df_breast_cancer[9].to_list()
# drop() returns a new frame, so df_breast_cancer itself keeps column 9.
df_breast_cancer_input = df_breast_cancer.drop(columns=[9])

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# name; fall back to the old keyword on releases that predate the rename.
try:
    encoder = OneHotEncoder(categories='auto',
                            drop=None,
                            sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(categories='auto',
                            drop=None,
                            sparse=False)

transformed = encoder.fit_transform(df_breast_cancer_input).tolist()

for encoded_row, label in zip(transformed, temp):
    encoded_row.append(label)

oneHotEncodedBCdata = transformed

In [19]:
#Displaying the one-hot encoded breast cancer dataset (class label in the last column).
display(pd.DataFrame(oneHotEncodedBCdata))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,34,35,36,37,38,39,40,41,42,43
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,no
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,no
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,no
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,no
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,no
282,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,yes
283,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,no
284,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,no


In [20]:
# The below code performs ten fold cross validation after one hot encoding the
# breast cancer dataset. The seed fixes the random fold assignment.
seed(1)
evaluateNaiveBayes(oneHotEncodedBCdata, 10)

Following is the accuracies of all the 10 folds respectively: 
[82.14285714285714, 71.42857142857143, 57.14285714285714, 64.28571428571429, 75.0, 89.28571428571429, 89.28571428571429, 82.14285714285714, 85.71428571428571, 67.85714285714286]


76.42857142857142

In [21]:
#Label Encoding
# Here we are performing label encoding on the features of the breast cancer
# dataset. The columns of df_breast_cancer are overwritten in place; column 9
# (used as the class label) keeps its original string values.
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
for feature in df_breast_cancer:
    if feature != 9:
        # The encoder is re-fitted for every column, so each feature gets its
        # own integer coding.
        df_breast_cancer[feature] = encoder.fit_transform(df_breast_cancer[feature])

labelEncodedBCdataset = df_breast_cancer.values.tolist()

In [22]:
#Displaying the label encoded breast cancer dataset.
display(labelEncodedBCdataset)

[[0, 1, 2, 5, 0, 1, 2, 0, 2, 'no'],
 [0, 2, 2, 3, 0, 1, 1, 1, 5, 'no'],
 [0, 2, 2, 3, 0, 1, 1, 0, 2, 'no'],
 [0, 4, 0, 2, 0, 1, 1, 1, 3, 'no'],
 [0, 2, 2, 0, 0, 1, 1, 1, 4, 'no'],
 [0, 4, 0, 2, 0, 1, 1, 0, 2, 'no'],
 [0, 3, 2, 4, 0, 1, 1, 0, 2, 'no'],
 [0, 4, 0, 3, 0, 1, 0, 0, 2, 'no'],
 [0, 2, 2, 10, 0, 1, 1, 0, 2, 'no'],
 [0, 2, 2, 3, 0, 1, 1, 1, 3, 'no'],
 [0, 2, 2, 0, 0, 1, 2, 0, 1, 'no'],
 [0, 3, 0, 4, 0, 1, 1, 0, 2, 'no'],
 [0, 4, 1, 1, 0, 1, 0, 0, 5, 'no'],
 [0, 3, 0, 4, 0, 1, 2, 0, 5, 'no'],
 [0, 2, 2, 5, 0, 1, 2, 0, 3, 'no'],
 [0, 4, 1, 5, 0, 1, 0, 0, 2, 'no'],
 [0, 2, 2, 2, 0, 1, 1, 0, 2, 'no'],
 [0, 3, 2, 5, 0, 1, 2, 0, 2, 'no'],
 [0, 4, 0, 5, 0, 1, 2, 0, 2, 'no'],
 [0, 3, 0, 5, 0, 1, 0, 1, 5, 'no'],
 [0, 3, 0, 7, 0, 1, 1, 0, 2, 'no'],
 [0, 4, 0, 2, 0, 1, 1, 0, 2, 'no'],
 [0, 1, 2, 4, 0, 1, 1, 1, 2, 'no'],
 [0, 3, 2, 7, 0, 1, 1, 0, 3, 'no'],
 [0, 3, 2, 6, 0, 1, 1, 1, 3, 'no'],
 [0, 2, 2, 4, 0, 1, 1, 0, 3, 'no'],
 [0, 3, 2, 3, 0, 1, 0, 0, 2, 'no'],
 [0, 4, 0, 4, 0, 1, 2, 1, 3

In [23]:
# The below code performs ten fold cross validation after label encoding the
# breast cancer dataset. The seed fixes the random fold assignment.
seed(1)
evaluateNaiveBayes(labelEncodedBCdataset, 10)

Following is the accuracies of all the 10 folds respectively: 
[75.0, 78.57142857142857, 60.71428571428571, 67.85714285714286, 75.0, 92.85714285714286, 89.28571428571429, 78.57142857142857, 82.14285714285714, 75.0]


77.5