In [4]:
import pandas as pd 
import numpy as np
import math

In [5]:
class NaiveBayes:
    """Naive Bayes classifier mixing Gaussian and categorical ("Bernoulli") features.

    Typical usage::

        model = NaiveBayes()
        model.set_headers(column_names)   # names containing '_b' are categorical
        model.fit(X, Y, use_laplace_log=True)
        labels = model.predict(X_new)

    ``set_headers`` may be called before or after ``fit``; when it is called
    after, the per-class categorical tables are rebuilt for the new flags.

    With ``use_laplace_log=True`` scores are sums of log-probabilities and the
    categorical likelihoods use add-one smoothing, ``(count + 1) / (n_c + 2)``.
    Otherwise scores are plain products of unsmoothed probabilities/densities.
    """

    def __init__(self):
        # Start with "no feature flags" so that fit() and predict() can run
        # (and predict() can print its hint) before set_headers() is called,
        # instead of failing with AttributeError on self.headers.
        self.headers = []

    def fit(self, X, Y, use_laplace_log=False):
        """Store the training data and pre-compute per-class statistics.

        Args:
            X: 2-D array-like of numeric feature rows.
            Y: 1-D array-like of class labels, one per row of ``X``.
            use_laplace_log: if True, work in log space with add-one smoothing
                for categorical features.
        """
        self.use_laplace_log = use_laplace_log
        # The label is appended as the last column; every helper relies on that.
        self.dataset = np.append(X, np.asarray(Y).reshape(-1, 1), axis=1)
        self.total_rows = len(self.dataset)
        self.class_data = self.get_data_by_class(self.dataset)
        self.class_stats = self.get_stats_per_class(self.dataset)
        self.feature_stats = self.get_stats_per_feature(self.dataset)
        self.bernoulli_values = self.get_bernoulli_values()

    def get_data_by_class(self, dataset):
        """Group rows by their last element (the class label).

        Returns:
            dict mapping class label -> list of rows carrying that label.
        """
        separated = {}
        for row in dataset:
            separated.setdefault(row[-1], []).append(row)
        return separated

    def set_headers(self, headers):
        """Record which features are categorical.

        A feature is flagged 1 when its header contains the substring ``'_b'``
        and 0 otherwise (treated as Gaussian).

        Returns:
            The list of 0/1 flags, in header order.
        """
        self.headers = [1 if '_b' in header else 0 for header in headers]
        # class_stats is the last prerequisite fit() sets before building the
        # categorical tables, so its presence means the model is fitted and
        # the tables must be refreshed to match the new flags.
        if hasattr(self, "class_stats"):
            self.bernoulli_values = self.get_bernoulli_values()
        return self.headers

    def mean(self, numbers):
        """Arithmetic mean of ``numbers``."""
        return sum(numbers) / float(len(numbers))

    def stdev(self, numbers):
        """Sample standard deviation (n - 1 denominator) of ``numbers``.

        Raises:
            ZeroDivisionError: if fewer than two values are given.
        """
        centre = self.mean(numbers)
        variance = sum((x - centre) ** 2 for x in numbers) / float(len(numbers) - 1)
        return math.sqrt(variance)

    def get_bernoulli_values(self):
        """Build, per class, one ``{value: probability}`` table per feature.

        Gaussian features get an empty table. For categorical features the
        table has one entry for every distinct value found in that column of
        the full training set, so all classes share the same keys. Entries are
        log-probabilities when ``use_laplace_log`` is set.

        Returns:
            dict mapping class label -> list of tables, indexed like ``headers``.
        """
        if self.use_laplace_log:
            estimate = self.calculate_bernoulli_log_probability
        else:
            estimate = self.calculate_bernoulli_probability

        # Distinct values do not depend on the class, so compute them once.
        distinct_values = {
            index: np.unique(self.dataset[:, index])
            for index, is_bernoulli in enumerate(self.headers)
            if is_bernoulli == 1
        }

        self.bernoulli_values = {}
        for class_name in self.class_stats:
            tables = []
            for index in range(len(self.headers)):
                tables.append({
                    value: estimate(value, index, class_name)
                    for value in distinct_values.get(index, ())
                })
            self.bernoulli_values[class_name] = tables
        return self.bernoulli_values

    # [Mean, Stdev, Count] for each feature in dataset
    def get_stats_per_feature(self, dataset):
        """Return ``[mean, stdev, count]`` for every column except the last.

        The last column is the class label and is skipped.
        """
        feature_columns = list(zip(*dataset))[:-1]
        return [[self.mean(column), self.stdev(column), len(column)]
                for column in feature_columns]

    def get_stats_per_class(self, dataset):
        """Return a dict mapping class label -> per-feature ``[mean, stdev, count]``."""
        return {
            class_name: self.get_stats_per_feature(rows)
            for class_name, rows in self.get_data_by_class(dataset).items()
        }

    # Estimates value x from mean and stdev!
    def calculate_gaussian_probability(self, x, mean, stdev):
        """Normal density of ``x`` for the given ``mean`` and ``stdev``.

        Raises:
            ZeroDivisionError: if ``stdev`` is 0.
        """
        exponent = math.exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
        return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

    # Estimates log gaussian probability!
    def calculate_log_gaussian_probability(self, x, mean, stdev):
        """Natural log of the normal density of ``x``.

        Fails (division by zero / log of zero) when ``stdev`` is 0.
        """
        variance = stdev ** 2
        return (-0.5) * math.log(2 * math.pi * variance) - 0.5 * ((x - mean) ** 2 / variance)

    def _count_matches(self, f_val, f_index, class_name):
        """Number of training rows of ``class_name`` whose feature ``f_index`` equals ``f_val``."""
        return sum(1 for row in self.class_data[class_name] if row[f_index] == f_val)

    # Count (f_val on f_index and class_name)/Count (class_name)
    def calculate_bernoulli_probability(self, f_val, f_index, class_name):
        """Unsmoothed estimate of P(feature == f_val | class)."""
        class_count = self.class_stats[class_name][0][2]
        return self._count_matches(f_val, f_index, class_name) / class_count

    # Log (f_val on f_index and class_name+1)/Count (class_name)+2
    def calculate_bernoulli_log_probability(self, f_val, f_index, class_name):
        """Log of the add-one smoothed estimate, ``log((count + 1) / (n_c + 2))``."""
        class_count = self.class_stats[class_name][0][2]
        matches = self._count_matches(f_val, f_index, class_name)
        return math.log((matches + 1) / (class_count + 2))

    # Calculate probability a row is a given class
    # self.headers is 1 for all feature indexes that are bernoulli
    def predict_probabilities(self, row):
        """Score ``row`` against every class.

        Returns:
            dict mapping class label -> unnormalised score: P(c) * prod P(x_i | c)
            in plain mode, or log P(c) + sum log P(x_i | c) when
            ``use_laplace_log`` is set.
        """
        probabilities = {}
        for class_name, class_stats in self.class_stats.items():
            # The count stored with the first feature is the class size.
            class_count = class_stats[0][2]
            prior = class_count / float(self.total_rows)

            # First calculate P(c). In log mode this must be log(n_c / N);
            # dividing log(n_c) by N is not a log-probability.
            score = math.log(prior) if self.use_laplace_log else prior

            for i in range(len(class_stats)):
                if self.headers[i] == 1:
                    table = self.bernoulli_values[class_name][i]
                    if row[i] in table:
                        # Value was seen in training: use the stored estimate.
                        likelihood = table[row[i]]
                    elif self.use_laplace_log:
                        # Value never seen in training: smoothed estimate for
                        # a zero count, log(1 / (n_c + 2)).
                        likelihood = math.log(1 / (class_count + 2))
                    else:
                        # Value never seen in training. The tables share their
                        # keys across classes, so every class takes this branch
                        # and the constant factor leaves the winner unchanged.
                        likelihood = 1 / 2
                else:
                    mean, stdev = class_stats[i][0], class_stats[i][1]
                    if self.use_laplace_log:
                        likelihood = self.calculate_log_gaussian_probability(row[i], mean, stdev)
                    else:
                        likelihood = self.calculate_gaussian_probability(row[i], mean, stdev)

                # Accumulate P(x | c): add in log space, multiply otherwise.
                if self.use_laplace_log:
                    score += likelihood
                else:
                    score *= likelihood

            probabilities[class_name] = score

        # Return the (unnormalised) score of each class value
        return probabilities

    def predict(self, rows_to_predict):
        """Return the highest-scoring class label for each row.

        Returns:
            numpy array of labels, or ``None`` (after printing a hint) when
            ``set_headers`` has not been called yet.
        """
        if len(self.headers) == 0:
            print("Please input which headers are bernoulli using NaiveBayes().set_headers")
            return

        predictions = []
        for row in rows_to_predict:
            best_label, best_prob = None, None
            for class_name, probability in self.predict_probabilities(row).items():
                # Strict '>' keeps the first class on ties; log scores may be
                # negative, so the first candidate is always accepted.
                if best_label is None or probability > best_prob:
                    best_label, best_prob = class_name, probability
            predictions.append(best_label)
        return np.array(predictions)

In [6]:
# %store -r adult_data_headers
# %store -r adult_data_export

In [7]:
# NaiveBayes = NaiveBayes()
# NaiveBayes.fit(adult_data_export)
# print(NaiveBayes.set_headers(adult_data_headers))



In [8]:
# NaiveBayes.predict([[39,77516,13,0,2174,0,40,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0],
# [50,83311,13,0,0,0,13,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0],
# [38,215646,9,0,0,0,40,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0],
# [53,234721,7,0,0,0,40,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0],
# [28,338409,13,1,0,0,40,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
# [37,284582,14,1,0,0,40,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0],
# [49,160187,5,1,0,0,16,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
# [52,209642,9,0,0,0,45,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0],
# [31,45781,14,1,14084,0,50,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0],
# [42,159449,13,0,5178,0,40,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0],
# [37,280464,10,0,0,0,80,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0],
# [30,141297,13,0,0,0,40,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
# [23,122272,13,1,0,0,30,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0]])

In [9]:
# %run validation.ipynb

In [10]:
# evaluate_acc(adult_data_export[:-1],adult_data_export[-1])

In [11]:
# # Test summarizing a dataset
# dataset = [[3.393533211,2.331273381,0],
# [3.110073483,1.781539638,0],
# [1.343808831,3.368360954,0],
# [3.582294042,4.67917911,0],
# [2.280362439,2.866990263,0],
# [7.423436942,4.696522875,1],
# [5.745051997,3.533989803,1],
# [9.172168622,2.511101045,1],
# [7.792783481,3.424088941,1],
# [7.939820817,0.791637231,1]]

# NaiveBayes = NaiveBayes()
# NaiveBayes.fit(dataset)
# NaiveBayes.predict(dataset)
# print(NaiveBayes.get_stats_per_class(dataset))