# Overview

In this Project, we will implement a **Naive Bayes** classifier, apply it to various classification datasets, and explore evaluation paradigms as well as the impact of individual features.

In [1]:
# This function open a csv file and read the data into a useable format
def preprocess(filename):
    data = open(filename).readlines()
    datalines = []
    for line in data:
        datalines.append(line.strip().split(","))
    header = datalines[0]
    datalines = datalines[1:]
    
    features = []
    labels = []
    
    for line in datalines:
        features.append([float(i) for i in line[2:]])
        labels.append(line[1])
        
    return (features,labels,header[2:])
       

In [2]:
# This function build a supervised NB model
import math

def train(features, labels,header):
    total = len(labels)
    
    priors = dict()
    for label in labels:
        if label in priors.keys():
            priors[label] += 1
        else:
            priors.setdefault(label, 1)
        
    attri = dict()
    likelihoods = dict()
    
    for name in header:
        attri[name] = dict()
        likelihoods[name] = dict()
        for item in priors:
            attri[name][item] = []
            likelihoods[name][item] = []
            
    n = len(features)
    for i in range(n):
        m = len(features[i])
        for j in range(m):
            attri[header[j]][labels[i]].append(features[i][j])
    for name in header:
        for item in priors:
            mean = sum(attri[name][item])/len(attri[name][item])
            standard_deviation = math.sqrt(sum([(value - mean)**2 for value in attri[name][item]])/len(attri[name][item]))
            likelihoods[name][item] = [mean, standard_deviation]
    return likelihoods
    
   

In [3]:
# This function predict the class for a set of instances, based on a trained model
def predict(likelihoods,test,labels,header):
    total = len(labels)
    
    priors = dict()
    for label in labels:
        if label in priors.keys():
            priors[label] += 1
        else:
            priors.setdefault(label, 1)
    
    for item in priors:
        priors[item] = math.log(priors[item] / total)
    
    predict_list = []
        
    for line in test:
        n = len(line)
        gause_value = dict()
        for item in priors:
            temp = 0
            for i in range(n):
                x = line[i]
                mean = likelihoods[header[i]][item][0]
                standard_devi = likelihoods[header[i]][item][1]
                if standard_devi == 0:
                    gaussian = 0
                else:
                    gaussian = (-1/2)*math.log(math.pi)-(1/2)*math.log(2)-math.log(standard_devi)-(1/2*((x-mean)/standard_devi)**2)*math.log(math.exp(1))
                temp += gaussian
            gause_value[item] = priors[item] + temp
        predict_list.append(gause_value)

    return predict_list

In [4]:
# This function evaluate a set of predictions
import numpy as np
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

def evaluate(predict_list, labels):
    pred = []
    true = labels

    for i in predict_list:
        pred.append(max(i, key=i.get))

        
## Calculate accuracy, precision, recall, and f_score with sklearn
        
    accuracy = accuracy_score(true, pred)
    precision = precision_score(true, pred, average='weighted', zero_division=0)
    recall = recall_score(true, pred, average='weighted', zero_division=0)
    f1 = f1_score(true, pred, average='weighted', zero_division=0)
    c_m = confusion_matrix(true, pred)
    return (accuracy,precision,recall,f1,c_m)



## Calculate accuracy, precision, recall, and f_score without sklearn

#     total = len(labels)
#     priors = Counter(labels)
#     c = len(priors)
#     precision = 0
#     recall = 0
#     accuracy = 0
    
#     for i in range(total):
#         if pre[i] == labels[i]:
#             accuracy += 1
        
        
#     for item in priors:
#         TP = 0
#         TN = 0
#         FP = 0
#         FN = 0
        
#         for i in range(total):
#             if pre[i] == labels[i]:
#                 if labels[i] == item:
#                     TP += 1
#                 else:
#                     TN += 1
#             else:
#                 if labels[i] == item:
#                     FN += 1
#                 elif pre[i] == item:
#                     FP += 1
#                 else:
#                     TN += 1
#         precision = precision + (TP/(TP+FP))
#         recall = recall + (TP/(TP+FN))
        
#     accuracy = accuracy / total
#     macro_precision = precision / c
#     macro_recall = recall / c
#     b = 1
#     f_score = ((1+ b**2)*macro_precision*macro_recall) / ((b**2)*macro_precision + macro_recall)
    
#     return (accuracy,macro_precision,macro_recall,f_score)


In [5]:
# This cell act as our "main" function where we call the above functions 
# on the full OBJECTIVITY data set, and print the evaluation score.



# First, read in the data and apply NB model to the OBJECTIVITY data

features, labels, header = preprocess("objectivity.csv")
likelihoods = train(features, labels, header)
test = features
predict_list = predict(likelihoods, test, labels, header)






# Second, print the full evaluation results from the evaluate() function

accuracy, precision, recall, f_score, c_m = evaluate(predict_list, labels)

print("Confusion matrix:")
print(c_m, "\n")
print("Accuracy: ",accuracy)
print("Precision: ",precision)
print("Recall: ",recall)
print("F_score: ",f_score)
print("\n")


# Third, print data statistics and model predictions, as instructed below 
# N is the total number of instances, F the total number of features, L the total number of labels
# The "class probabilities" may be unnormalized
# The "predicted class ID" must be in range (0, L)

N = len(features)
F = len(header)
L = len(Counter(labels))

N_3 = zip(predict_list[N-3].values(), predict_list[N-3].keys())
N_3_ID, N_3_value = max(enumerate(N_3), key=lambda x: x[1][0])

N_2 = zip(predict_list[N-2].values(), predict_list[N-2].keys())
N_2_ID, N_2_value = max(enumerate(N_2), key=lambda x: x[1][0])

N_1 = zip(predict_list[N-1].values(), predict_list[N-1].keys())
N_1_ID, N_1_value = max(enumerate(N_1), key=lambda x: x[1][0])

print("Feature vectors of instances [0, 1, 2]: ", features[0],features[1],features[2])

print("\nNumber of instances (N): ", N)
print("Number of features (F): ", F)
print("Number of labels (L): ", L)

print("\n\nPredicted class probabilities for instance N-3: ", predict_list[N-3])
print("Predicted class ID for instance N-3: ", (N_3_ID,N_3_value[1]))
print("\nPredicted class probabilities for instance N-2: ", predict_list[N-2])
print("Predicted class ID for instance N-2: ", (N_2_ID,N_2_value[1]))
print("\nPredicted class probabilities for instance N-1: ", predict_list[N-1])
print("Predicted class ID for instance N-1: ", (N_1_ID,N_1_value[1]))


Confusion matrix:
[[555  80]
 [128 237]] 

Accuracy:  0.792
Precision:  0.7888820429447003
Recall:  0.792
F_score:  0.7884679088105591


Feature vectors of instances [0, 1, 2]:  [109.0, 8.0, 0.0, 0.0, 0.0, 0.0, 12.0, 0.0, 1.0, 2.0, 2.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 3.0] [309.0, 35.0, 0.0, 0.0, 2.0, 1.0, 17.0, 0.0, 7.0, 5.0, 16.0, 0.0, 0.0, 0.0, 7.0, 0.0, 7.0, 0.0, 0.0, 19.0, 1.0, 0.0, 10.0] [149.0, 15.0, 0.0, 0.0, 0.0, 1.0, 4.0, 0.0, 4.0, 0.0, 7.0, 0.0, 1.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 6.0, 0.0, 0.0, 2.0]

Number of instances (N):  1000
Number of features (F):  23
Number of labels (L):  2


Predicted class probabilities for instance N-3:  {'objective': -150.516960152298, 'subjective': -82.82526101853317}
Predicted class ID for instance N-3:  (1, 'subjective')

Predicted class probabilities for instance N-2:  {'objective': -66.35717394938793, 'subjective': -83.32555354318245}
Predicted class ID for instance N-2:  (0, 'objective')

Predicted class probabilitie

In [6]:
# This cell act as our "main" function where we call the above functions 
# on the full ADULT data set, and print the evaluation score.



# First, read in the data and apply NB model to the ADULT data

features, labels, header = preprocess("adult.csv")
likelihoods = train(features, labels, header)
test = features
predict_list = predict(likelihoods, test, labels, header)






# Second, print the full evaluation results from the evaluate() function

accuracy, precision, recall, f_score, c_m = evaluate(predict_list, labels)

print("Confusion matrix:")
print(c_m, "\n")
print("Accuracy: ",accuracy)
print("Precision: ",precision)
print("Recall: ",recall)
print("F_score: ",f_score)
print("\n")



# Third, print data statistics and model predictions, as instructed below 
# N is the total number of instances, F the total number of features, L the total number of labels
# The "class probabilities" may be unnormalized
# The "predicted class ID" must be in range (0, L)

N = len(features)
F = len(header)
L = len(Counter(labels))

N_3 = zip(predict_list[N-3].values(), predict_list[N-3].keys())
N_3_ID, N_3_value = max(enumerate(N_3), key=lambda x: x[1][0])

N_2 = zip(predict_list[N-2].values(), predict_list[N-2].keys())
N_2_ID, N_2_value = max(enumerate(N_2), key=lambda x: x[1][0])

N_1 = zip(predict_list[N-1].values(), predict_list[N-1].keys())
N_1_ID, N_1_value = max(enumerate(N_1), key=lambda x: x[1][0])

print("Feature vectors of instances [0, 1, 2]: ", features[0],features[1],features[2])

print("\nNumber of instances (N): ", N)
print("Number of features (F): ", F)
print("Number of labels (L): ", L)

print("\n\nPredicted class probabilities for instance N-3: ", predict_list[N-3])
print("Predicted class ID for instance N-3: ", (N_3_ID,N_3_value[1]))
print("\nPredicted class probabilities for instance N-2: ", predict_list[N-2])
print("Predicted class ID for instance N-2: ", (N_2_ID,N_2_value[1]))
print("\nPredicted class probabilities for instance N-1: ", predict_list[N-1])
print("Predicted class ID for instance N-1: ", (N_1_ID,N_1_value[1]))


Confusion matrix:
[[1817   96]
 [ 389  198]] 

Accuracy:  0.806
Precision:  0.7883973393527856
Recall:  0.806
F_score:  0.7806400438266895


Feature vectors of instances [0, 1, 2]:  [31.0, 142470.0, 13.0, 0.0, 0.0, 40.0] [31.0, 323069.0, 9.0, 0.0, 0.0, 20.0] [25.0, 122489.0, 13.0, 0.0, 1726.0, 60.0]

Number of instances (N):  2500
Number of features (F):  6
Number of labels (L):  2


Predicted class probabilities for instance N-3:  {'<=50K': -38.94392814438511, '>50K': -41.45333754859243}
Predicted class ID for instance N-3:  (0, '<=50K')

Predicted class probabilities for instance N-2:  {'<=50K': -36.74339621680319, '>50K': -41.86399859812137}
Predicted class ID for instance N-2:  (0, '<=50K')

Predicted class probabilities for instance N-1:  {'<=50K': -36.405924581549826, '>50K': -40.77617081107401}
Predicted class ID for instance N-1:  (0, '<=50K')


In [7]:
# This cell act as our "main" function where we call the above functions 
# on the full ABSENTEEISM data set, and print the evaluation score.



# First, read in the data and apply NB model to the ABSENTEEISM data

features, labels, header = preprocess("absenteeism.csv")
likelihoods = train(features, labels, header)
test = features
predict_list = predict(likelihoods, test, labels, header)






# Second, print the full evaluation results from the evaluate() function

accuracy, precision, recall, f_score, c_m = evaluate(predict_list, labels)

print("Confusion matrix:")
print(c_m, "\n")
print("Accuracy: ",accuracy)
print("Precision: ",precision)
print("Recall: ",recall)
print("F_score: ",f_score)
print("\n")



# Third, print data statistics and model predictions, as instructed below 
# N is the total number of instances, F the total number of features, L the total number of labels
# The "class probabilities" may be unnormalized
# The "predicted class ID" must be in range (0, L)

N = len(features)
F = len(header)
L = len(Counter(labels))

N_3 = zip(predict_list[N-3].values(), predict_list[N-3].keys())
N_3_ID, N_3_value = max(enumerate(N_3), key=lambda x: x[1][0])

N_2 = zip(predict_list[N-2].values(), predict_list[N-2].keys())
N_2_ID, N_2_value = max(enumerate(N_2), key=lambda x: x[1][0])

N_1 = zip(predict_list[N-1].values(), predict_list[N-1].keys())
N_1_ID, N_1_value = max(enumerate(N_1), key=lambda x: x[1][0])

print("Feature vectors of instances [0, 1, 2]: ", features[0],features[1],features[2])

print("\nNumber of instances (N): ", N)
print("Number of features (F): ", F)
print("Number of labels (L): ", L)

print("\n\nPredicted class probabilities for instance N-3: ", predict_list[N-3])
print("Predicted class ID for instance N-3: ", (N_3_ID,N_3_value[1]))
print("\nPredicted class probabilities for instance N-2: ", predict_list[N-2])
print("Predicted class ID for instance N-2: ", (N_2_ID,N_2_value[1]))
print("\nPredicted class probabilities for instance N-1: ", predict_list[N-1])
print("Predicted class ID for instance N-1: ", (N_1_ID,N_1_value[1]))


Confusion matrix:
[[11  8  5  2  4 14  0]
 [ 4 13 17  6 24 24  0]
 [ 2  9 59  3 46 31  7]
 [ 1  3  4  3  1  3  1]
 [ 3  9 28  0 42 29  1]
 [17  6 33  7 44 96  5]
 [ 2  2  2  1  4 11  6]] 

Accuracy:  0.35222052067381315
Precision:  0.35629210861982685
Recall:  0.35222052067381315
F_score:  0.3496703185018923


Feature vectors of instances [0, 1, 2]:  [18.0, 50.0, 239.554, 97.0, 1.0, 0.0, 98.0, 178.0, 31.0] [18.0, 38.0, 239.554, 97.0, 0.0, 0.0, 89.0, 170.0, 31.0] [13.0, 33.0, 239.554, 97.0, 2.0, 1.0, 90.0, 172.0, 30.0]

Number of instances (N):  653
Number of features (F):  9
Number of labels (L):  7


Predicted class probabilities for instance N-3:  {'0': -35.04508390851188, '2': -45.433974146408886, '8': -36.67209742868353, '>24': -68.21715642872516, '1': -49.765047685902246, '3': -43.25400339703182, '24': -228.76823170460926}
Predicted class ID for instance N-3:  (0, '0')

Predicted class probabilities for instance N-2:  {'0': -28.104802702828287, '2': -29.32670678731827, '8': -27.91

## Evaluation

In [18]:
# Cross-validation

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

file_name = ["objectivity.csv","adult.csv","absenteeism.csv"]

for name in file_name:
    features, labels, header = preprocess(name)
    avg_accuracy, avg_precision, avg_recall, avg_f_score = [],[],[],[]
    cross = 10
    skf = StratifiedKFold(n_splits=cross, shuffle=True)

    for x, y in skf.split(features, labels):
        train_features = []
        train_labels = []
        test_features = []
        test_labels = []

        for i in x:
            train_features.append(features[i])
            train_labels.append(labels[i])
        for j in y:
            test_features.append(features[j])
            test_labels.append(labels[j])

        likelihoods = train(train_features, train_labels, header)
        predict_list = predict(likelihoods, test_features, labels, header)

        accuracy, precision, recall, f_score, c_m = evaluate(predict_list, test_labels)

        avg_accuracy.append(accuracy)
        avg_precision.append(precision)
        avg_recall.append(recall)
        avg_f_score.append(f_score)
        
    print("File name: ", name)
    print("avg_accuracy: ", np.array(avg_accuracy).mean())
    print("avg_precision: ", np.array(avg_precision).mean())
    print("avg_recall: ", np.array(avg_recall).mean())
    print("avg_f_score: ", np.array(avg_f_score).mean())
    print("\n")
    

File name:  objectivity.csv
avg_accuracy:  0.788
avg_precision:  0.7861520714461183
avg_recall:  0.788
avg_f_score:  0.7839687710960465


File name:  adult.csv
avg_accuracy:  0.8048
avg_precision:  0.7870905780147466
avg_recall:  0.8048
avg_f_score:  0.7793134267919625


File name:  absenteeism.csv
avg_accuracy:  0.31867132867132864
avg_precision:  0.31452519967829023
avg_recall:  0.31867132867132864
avg_f_score:  0.3095787508522922


