## Native Bayes Classifer

In [1]:
import numpy as np
import pandas as pd
from math import log, exp
from collections import defaultdict

In [2]:
class NaiveBayesClassifier():
    def __init__(self):
        self.priors = None
        self.posteriors = None


    def train(self, X, y):
#         Fits training instances X and class labels y to a supervised Naive Bayes model.
#         Uses Laplace Smoothing to make all posterior probabilities non-zero.

        def calculate_priors(class_list):
#         Calculate the class priors and fits them to the classifier.
         
            prior_dd = defaultdict(int)
            # Count all class labels and add to dictionary
            for class_label in class_list:
                prior_dd[class_label] += 1
            self.priors = dict(prior_dd)


        def calculate_posteriors(priors_dict, instance_list, class_list):
#                 calculate_posteriors takes in a dictionary or prior probabilities for class labels,
#                  It calculates and fits posterior probabilities to the classifier.


            # Initialize a list of dictionaries, one for each attribute
            posteriors_dictlist = [dict() for x in range(len(instance_list))]

            # Initialize a default dict for each class label, for each attribute dictionary
            for attribute_dd in posteriors_dictlist:
                for class_label in priors_dict.keys():
                    # Start at 1 for Laplace smoothing
                    attribute_dd[class_label] = defaultdict(lambda:1)


            # Count the number of instances for each conditional probability P(Attribute=attr_instance | Class)
            for col in range(len(instance_list)):
                for row in range(len(instance_list[col])):
                    
                    posteriors_dictlist[col][class_list[row]][instance_list[col][row]] += 1
                
                # Keep track of all attribute possibilites
                attr_set = set()
                for label in posteriors_dictlist[col].keys():
                    for attr in posteriors_dictlist[col][label].keys():
                        attr_set.add(attr)
                
                # Add attributes with counts of 1 (Laplace Smoothing) when no occurances for a given class
                for label in posteriors_dictlist[col].keys():
                    for attr in attr_set:
                        if attr not in posteriors_dictlist[col][label].keys():
                            # Start at 1 for Laplace smoothing
                            posteriors_dictlist[col][label][attr] = 1
                            
            self.posteriors = posteriors_dictlist


        # Fit all prior and posterior probabilities to the model
        calculate_priors(y)
        calculate_posteriors(self.priors, X, y)


    def predict(self, test_set):
# predict the class of a set of instances using a trained supervised Naive Bayes model.
        
        if (self.priors is None or self.posteriors is None):
            raise ValueError("Naive Models model has not been fit.")

        predictions = []
        n_test_instances = len(test_set[0])

        # Make a prediction for every instance in the test set
        for test_row in range(n_test_instances):
            label_predict_probs = []
            
            # Calculate prediction probability for each class label
            for label in self.priors.keys():
                label_count = self.priors[label]
                
                # Prior log probability log(P(label))
                label_prob = log(label_count / n_test_instances)
                
                # Sum the prediction probability and log(posterior probabilities) to avoid underlow
                # Dividing by the number of labels + number of attribute values (Laplace Smoothing)
                for test_col in range(len(test_set)):
                    attr = test_set[test_col][test_row]
                    
                    posterior_prob = self.posteriors[test_col][label][attr] / \
                            (label_count + len(self.posteriors[test_col][label]))
                    
                    label_prob += log(posterior_prob)
                
                # Turn log probabilitiy back in probability
                label_prob = exp(label_prob)
                label_predict_probs.append((label_prob, label))

            # Sort the predictions from high-low and predict the label with the highest probability
            label_predict_probs.sort(reverse=True)
            predictions.append(label_predict_probs[0][1])
        
        return predictions  # return a list of class predictions

In [3]:
def preprocess(csv_path):
#     read the file
    df = pd.read_csv(csv_path, header=None)

    # Add a list of each instance for each attribute (the first N-1 columns in the DataFrame)
    instance_list = []
    if ((len(df.columns) > 1)):
        for attribute_index in range(0, (len(df.columns) - 2)):
            instance_list.append(df[attribute_index].tolist())
    
    # Make sure attribute instances are in String format
    for index in range (0, len(instance_list)):
        instance_list[index] = [str(i) for i in instance_list[index]]
        
    class_list = []
    if ((len(df.columns) > 0)):
        class_list = df[(len(df.columns) - 1)].tolist()
    class_list = [str(i) for i in class_list]
    
#     rethrn (<2D list of instances>, <list of class labels>, <number of unique labels>)
    n_classes = len(set(class_list))
    return instance_list, class_list, n_classes

In [4]:
def evaluate_model(predicted_classes, actual_classes):
#     Evaluates the number of correct predictions made by a Multinomial Naive Bayes classifier.
    n_correct = 0
    for test in range(len(predicted_classes)):
        if predicted_classes[test] == actual_classes[test]:
            n_correct += 1
            
    # Returns an accuracy score between [0,1].
    return n_correct / len(predicted_classes) 

In [5]:
def test_and_print_results(dataset_csv_path):
# Trains and evaluates a Multinomial Naive Bayes learner
    
    data = preprocess(dataset_csv_path)
    
    NB = NaiveBayesClassifier()
    NB.train(data[0], data[1])
    predicted_classes = NB.predict(data[0])
    accuracy = evaluate_model(predicted_classes, data[1])
    
#     print the accuracy score
    print('Accuracy: '+ '{0:.2f}'.format(accuracy * 100) + '% ')

In [6]:
test_and_print_results('Dataset.csv')

Accuracy: 85.25% 


### Prediction of Test Set

In [7]:
data = preprocess('Dataset.csv')
test = preprocess('test.csv')
NB = NaiveBayesClassifier()
NB.train(data[0], data[1])
predicted_test = NB.predict(test[0])

print(predicted_test, end = "")

['Class', '-1', '1', '-1', '1', '-1', '-1', '-1', '1', '-1', '-1', '-1', '-1', '1', '-1', '-1', '-1', '-1', '1', '-1', '-1', '-1', '1', '1', '-1', '-1', '1', '1', '-1', '-1', '-1', '-1', '1', '1', '-1', '1', '-1', '-1', '1', '-1', '-1', '1', '1', '1', '-1', '-1', '1', '-1', '-1', '-1', '-1', '0', '1', '-1', '1', '1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '1', '1', '-1', '-1', '1', '1', '-1', '1', '1', '-1', '-1', '1', '1', '1', '1', '-1', '-1', '-1', '1', '1', '1', '1', '1', '1', '-1', '1', '-1', '1', '-1', '1', '-1', '-1', '-1', '1', '1', '1', '1', '-1']