In [1]:
# This function opens a CSV data file and transforms it into a usable format (a list of rows, each row a list of string fields)
import numpy as np
def load_data(path="student.csv"):
    """Read a comma-separated text file into a list of rows.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to "student.csv" in the current working
        directory, which is what the rest of the notebook expects.

    Returns
    -------
    list of list of str
        One inner list per line of the file, obtained by splitting the line
        on commas. Leading/trailing whitespace of the whole file is stripped
        first, so a trailing newline does not produce an empty last row.
        No quoting or escaping is interpreted; every field stays a string.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. FileNotFoundError).
    """
    # The context manager closes the file handle even if reading fails;
    # a bare open(...).read() leaves that to the garbage collector.
    with open(path) as handle:
        data = handle.read().strip()
    return [line.split(",") for line in data.split("\n")]


In [2]:
# This function splits a data set into a training set and hold-out test set
from sklearn.model_selection import train_test_split
import numpy

def split_data(datafields, random_number):
    """Split parsed rows into a header and hold-out training/test arrays.

    Parameters
    ----------
    datafields : list
        Parsed rows; the first entry is treated as the column labels and
        the remaining entries as instances.
    random_number : int
        Passed to train_test_split as random_state so the split is
        repeatable.

    Returns
    -------
    tuple
        (labels, train, test), each converted to a numpy array. The test
        part is requested as 10% of the instances (test_size=0.1).
    """
    header, rows = datafields[0], datafields[1:]
    train_rows, test_rows = train_test_split(
        rows, test_size=0.1, random_state=random_number
    )
    # Hand back arrays rather than lists so callers can slice columns
    # (e.g. test[:, -1] for the grade column).
    train_array = numpy.array(train_rows)
    test_array = numpy.array(test_rows)
    header_array = numpy.array(header)
    return header_array, train_array, test_array

# Use 90% of the data for training and 10% for testing, so the result can be compared with 10-fold cross-validation later

In [3]:
# This function builds a supervised NB model and returns prior/ likelihood dictionaries
def train(data_label, train_set):
    """Fit a categorical Naive Bayes model by counting.

    Parameters
    ----------
    data_label : sequence
        Column labels; the last one belongs to the grade column and is not
        used as a feature label.
    train_set : sequence of sequences
        Training instances. The last element of each instance is its grade,
        which must be one of "A+", "A", "B", "C", "D", "F".

    Returns
    -------
    tuple of (dict, dict)
        prior : grade -> share of training instances with that grade,
            rounded to 4 decimals (all six grades are always present).
        likelihood : grade -> label -> value -> share of that grade's
            instances having this value, rounded to 4 decimals. Only grades
            that occur in train_set appear as keys.

    Raises
    ------
    KeyError
        If an instance carries a grade outside the six known grades.
    ZeroDivisionError
        If train_set is empty.
    """
    grade_counts = dict.fromkeys(("A+", "A", "B", "C", "D", "F"), 0)
    likelihood = {}

    # Pass 1: raw counts per grade, and per (grade, label, value).
    for row in train_set:
        grade = row[-1]
        grade_counts[grade] += 1
        per_grade = likelihood.setdefault(grade, {})
        for position, value in enumerate(row[:-1]):
            value_counts = per_grade.setdefault(data_label[position], {})
            value_counts[value] = value_counts.get(value, 0) + 1

    # Priors: relative frequency of each grade among all instances.
    n_instances = len(train_set)
    prior = {
        grade: round(count / n_instances, 4)
        for grade, count in grade_counts.items()
    }

    # Pass 2: overwrite each count with the conditional relative frequency
    # within its grade. The grade column's own label is skipped.
    for grade, per_grade in likelihood.items():
        for label in data_label[:-1]:
            value_counts = per_grade[label]
            for value in value_counts:
                value_counts[value] = round(
                    value_counts[value] / grade_counts[grade], 4
                )
    return prior, likelihood



In [4]:
# This function predicts the grade for an instance or a set of instances, based on a trained model 
def predict(data_label, test_set, prior, likelihood, epsilon=0.0001):
    """Predict a grade for every instance using a trained Naive Bayes model.

    Parameters
    ----------
    data_label : sequence
        Column labels, positionally aligned with the feature columns.
    test_set : sequence of sequences
        Instances to classify. The last element of each instance is ignored
        (it is the true-grade column).
    prior : dict
        grade -> prior probability; must contain all six grades.
    likelihood : dict
        grade -> label -> value -> conditional probability. Grades that
        never occurred in training may be missing or map to an empty dict.
    epsilon : float, optional
        Probability substituted for a feature value that was never observed
        with the grade (epsilon smoothing). Defaults to 0.0001.

    Returns
    -------
    list of str
        One predicted grade per instance. An instance whose score is 0 for
        every grade gets the empty string ''.
    """
    grades = ('A+', 'A', 'B', 'C', 'D', 'F')
    predicted_set = []
    for instance in test_set:
        predicted_max = 0
        predicted_grade = ''
        for grade in grades:
            probability = 1
            # .get() instead of indexing: a grade absent from the training
            # data has no entry in `likelihood`, and indexing would raise
            # KeyError. Such a grade is scored by its prior alone.
            grade_likelihood = likelihood.get(grade)
            if grade_likelihood:
                for index, feature in enumerate(instance[:-1]):
                    label = data_label[index]
                    # unseen value for this grade -> epsilon instead of 0
                    probability *= grade_likelihood[label].get(feature, epsilon)
            probability *= prior[grade]
            # Strict '>' keeps the first grade (in the order above) on ties.
            if probability > predicted_max:
                predicted_max = probability
                predicted_grade = grade
        predicted_set.append(predicted_grade)
    return predicted_set


In [5]:
# This function evaluates a set of predictions in terms of accuracy
def evaluate(real_grades, predicted_set):
    """Return the accuracy of a set of predictions.

    Parameters
    ----------
    real_grades : sequence
        True grades, indexed in the same order as predicted_set.
    predicted_set : sequence
        Predicted grades; its length determines how many positions are
        compared.

    Returns
    -------
    float
        Number of positions where both grades are equal, divided by
        len(predicted_set).

    Raises
    ------
    ZeroDivisionError
        If predicted_set is empty.
    """
    n_predictions = len(predicted_set)
    hits = sum(
        1
        for position in range(n_predictions)
        if real_grades[position] == predicted_set[position]
    )
    return hits / n_predictions


In [6]:
# This code block evaluates the Naive Bayes model with the hold-out strategy,
# repeating the 90/10 split for several random_state seeds.

random_num_list = [1, 5, 33, 40, 100]        # random_state seeds, one hold-out split each
accuracy_list = []                           # per-seed accuracy, rounded to 4 decimals
accuracy_sum = 0
# The file content does not depend on the seed, so read it once instead of
# once per iteration; split_data only slices the list it is given.
datafields = load_data()
for num in random_num_list:
    data_label, train_set, test_set = split_data(datafields, num)
    prior, likelihood = train(data_label, train_set)
    predicted_set = predict(data_label, test_set, prior, likelihood)
    real_grades = test_set[:,-1]         # get the actual grade from testing data
    accuracy = round(evaluate(real_grades, predicted_set),4)
    accuracy_list.append(accuracy)
    # accumulate the rounded value so the printed average matches the printed list
    accuracy_sum += accuracy
print(accuracy_list)
print(accuracy_sum/len(accuracy_list))
print("Average accuracy : %.4f %%" % (accuracy_sum*100/len(accuracy_list)))

print("Basic accuracy for comparison: %.4f %%" % (accuracy_list[0]*100))  # following questions are based on this model (seed 1)


[0.3692, 0.3692, 0.3231, 0.3538, 0.4769]
0.37844
Average accuracy : 37.8440 %
Basic accuracy for comparison: 36.9200 %


In [7]:
# The code below is to manually inspect a few instances for which the classifier 
#  made correct and incorrect predictions, then observe any patterns. 

import pandas as pd
# Lift the column limit first so the wide table below is shown in full.
pd.set_option('display.max_columns', None)
# result.txt is parsed with a single space as the field separator.
file = pd.read_csv("result.txt", sep=" ")
# Last expression of the cell: rendered as the cell output.
pd.DataFrame(file)

# instances were randomly chosen 

Unnamed: 0,'school','sex','address','famsize','Pstatus','Medu','Fedu','Mjob','Fjob','reason','guardian','traveltime','studytime','failures','schoolsup','famsup','paid','activities','nursery','higher','internet','romantic','famrel','freetime','goout','Dalc','Walc','health','absences','Grade','PredictedGrade'
0,'GP','F','U','LE3','T','mid','mid','other','other','home','other','low','medium','none','no','no','no','yes','no','yes','yes','yes','4','3','3','1','1','2','none','C','C'
1,'GP','F','R','GT3','T','mid','low','other','other','reputation','mother','medium','medium','none','no','yes','no','no','yes','no','yes','yes','4','3','5','1','2','3','more_than_ten','D','D'
2,'MS','F','U','GT3','T','low','low','at_home','services','other','father','medium','low','low','no','no','no','no','yes','no','no','no','5','5','5','2','3','2','none','F','F'
3,'GP','F','U','GT3','T','mid','low','other','other','home','mother','low','medium','none','no','yes','no','no','yes','yes','yes','yes','4','2','5','1','2','1','more_than_ten','B','B'
4,'MS','F','R','LE3','T','mid','mid','other','services','course','father','low','medium','none','no','no','no','yes','yes','yes','no','yes','5','4','3','1','1','1','none','C','A'
5,'GP','M','U','GT3','T','high','mid','other','other','course','mother','low','medium','none','no','yes','yes','yes','yes','yes','yes','yes','5','2','3','1','1','2','four_to_six','C','B'
6,'GP','M','U','GT3','T','low','none','other','other','reputation','mother','medium','medium','none','no','yes','no','yes','yes','yes','yes','yes','4','3','2','1','1','3','none','A+','C'
7,'GP','F','R','GT3','T','low','low','at_home','other','course','mother','high','low','high','no','yes','no','yes','no','yes','no','no','5','2','5','1','5','4','four_to_six','D','F'


In [8]:
# This function calculates precision, recall and f1 score for the further analysis
def getMetrics(real_grades, predicted_set):
    """Print per-grade and macro-averaged precision, recall and F1.

    Parameters
    ----------
    real_grades : sequence
        True grades, indexed in the same order as predicted_set.
    predicted_set : sequence
        Predicted grades; its length determines how many positions are
        inspected.

    Returns
    -------
    None
        The metrics are only printed. Every value is rounded to 4 decimals;
        F1 is computed from the already-rounded precision and recall, and
        the macro averages from the already-rounded per-grade values.
    """
    grade_list = ['A+', 'A', 'B', 'C', 'D', 'F']
    # Every grade starts at 0 and stays 0 when it has no true positive,
    # which also avoids dividing by zero below.
    precision_dic = dict.fromkeys(grade_list, 0)
    recall_dic = dict.fromkeys(grade_list, 0)
    f1_dic = dict.fromkeys(grade_list, 0)
    precision_total, recall_total, f1_total = 0, 0, 0

    for grade in grade_list:
        # one-vs-rest counts: true positives, false positives, false negatives
        TP, FP, FN = 0, 0, 0
        for position in range(len(predicted_set)):
            predicted_hit = predicted_set[position] == grade
            actual_hit = real_grades[position] == grade
            if predicted_hit and actual_hit:
                TP += 1
            elif predicted_hit:
                FP += 1
            elif actual_hit:
                FN += 1

        if TP != 0:
            precision = round(TP/(TP + FP),4)
            recall = round(TP/(TP + FN),4)
            precision_dic[grade] = precision
            recall_dic[grade] = recall
            f1_dic[grade] = round(2*(precision * recall)/(precision + recall),4)

        precision_total += precision_dic[grade]
        recall_total += recall_dic[grade]
        f1_total += f1_dic[grade]

    # macro average: unweighted mean over the six grades
    n_grades = len(grade_list)
    avg_precision = round(precision_total/n_grades,4)
    avg_recall = round(recall_total/n_grades,4)
    avg_f1 = round(f1_total/n_grades,4)

    print("Precision_dic :", precision_dic)
    print("Recall_dic :", recall_dic)
    print("F1_dic :", f1_dic)
    print("Average precision :", avg_precision)
    print("Average recall :", avg_recall)
    print("Average f1 :", avg_f1)

# Run the full hold-out pipeline once (random_state 1) and report the metrics.
datafields = load_data()
# second argument is the random_state seed forwarded to the splitter
data_label, train_set, test_set = split_data(datafields, 1)
prior, likelihood = train(data_label, train_set)
predicted_set = predict(data_label, test_set, prior, likelihood)
real_grades = test_set[:,-1]      # last column of the test array holds the actual grades
accuracy = evaluate(real_grades, predicted_set)
# getMetrics prints its results itself and returns nothing
getMetrics(real_grades, predicted_set)
# unrounded accuracy, formatted to 4 decimals, for comparison with the metrics above
print("Accuracy for comparison: %.4f " % accuracy) 


Precision_dic : {'A+': 0, 'A': 0.1111, 'B': 0.25, 'C': 0.3333, 'D': 0.4118, 'F': 0.6154}
Recall_dic : {'A+': 0, 'A': 0.1667, 'B': 0.1538, 'C': 0.4615, 'D': 0.4375, 'F': 0.6154}
F1_dic : {'A+': 0, 'A': 0.1333, 'B': 0.1904, 'C': 0.3871, 'D': 0.4243, 'F': 0.6154}
Average precision : 0.2869
Average recall : 0.3058
Average f1 : 0.2918
Accuracy for comparison: 0.3692 


In [9]:
# This code block is to compare the outcomes of hold-out and cross-validation strategies
from sklearn.model_selection import train_test_split
import numpy as np

# This function splits a data into n groups for cross-validation
def split_n_data(datafields, random_number, n):
    """Partition the instances into n folds for cross-validation.

    Parameters
    ----------
    datafields : list
        Parsed rows; the first entry is treated as the column labels and
        the remaining entries as instances.
    random_number : int
        Passed as random_state to every train_test_split call.
    n : int
        Number of folds. With n == 1 no split is made and the returned list
        of sets is empty.

    Returns
    -------
    tuple
        (data_label, sets) where sets holds one [test_set, train_set] pair
        per fold; train_set is the concatenation of all other folds.
    """
    data_label, rows = datafields[0], datafields[1:]

    # Peel off one fold at a time: 1/n of what is left, then 1/(n-1), ...
    # so each fold is a roughly equal share of the original rows.
    folds = []
    while n != 1:
        rows, held_out = train_test_split(
            rows, test_size=1/n, random_state=random_number
        )
        folds.append(held_out)
        n -= 1
        if n == 1:
            # whatever remains after the final split is the last fold
            folds.append(rows)

    # Pair every fold (as test set) with the union of all other folds.
    sets = []
    for test_index, test_fold in enumerate(folds):
        training_rows = []
        for other_index, fold in enumerate(folds):
            if other_index != test_index:
                training_rows.extend(fold)
        sets.append([test_fold, training_rows])

    return data_label, sets          # return data lists

# The code below computes the accuracy with the cross-validation strategy:
# 10 folds, each used once as the test set while the other nine form the training set.
datafields = load_data()
data_label, sets = split_n_data(datafields, 1, 10)   # random_state 1, n = 10 folds
accuracy_list = []       # per-fold accuracy, rounded to 4 decimals
accuracy_sum = 0
for [test_set, train_set] in sets:
    # split_n_data returns plain lists; convert so the column slice below works
    test_set = np.array(test_set)
    train_set = np.array(train_set)
    prior, likelihood = train(data_label, train_set)
    predicted_set = predict(data_label, test_set, prior, likelihood)
    real_grades = test_set[:,-1]                # get the actual grade from testing data
    accuracy = round(evaluate(real_grades, predicted_set),4)
    accuracy_list.append(accuracy)
    accuracy_sum += accuracy

# The code below computes the accuracy with the hold-out strategy (random_state 1).
# NOTE(review): this overwrites `accuracy` (and the other pipeline variables) from
# the loop above, and the resulting hold-out accuracy is not printed in this cell.
data_label, train_set, test_set = split_data(datafields, 1)
prior, likelihood = train(data_label, train_set)
predicted_set = predict(data_label, test_set, prior, likelihood)
real_grades = test_set[:,-1]      # get the actual grade from testing data
accuracy = evaluate(real_grades, predicted_set)

# Report only the cross-validation results: per-fold accuracies and their mean.
print("10 iterations")
mean = (accuracy_sum/len(accuracy_list))   # mean of the already-rounded fold accuracies
print(accuracy_list)
print("Average of accuracy : %.4f" % mean)


10 iterations
[0.3692, 0.2923, 0.4308, 0.3538, 0.3846, 0.3077, 0.3385, 0.2769, 0.3385, 0.3438]
Average of accuracy : 0.3436
