# Naive Bayes Implementation

## Load the Dataset

In [1]:
import pandas as pd
import numpy as np
import math
# Base location and relative path of the raw data file; the two strings are
# concatenated to form the address that is downloaded below.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/"
dataset = "pima-indians-diabetes/pima-indians-diabetes.data"

# The file is read with ``header=None``, so the column names are supplied
# here. The last entry ("diabetic") is the column later used as the class.
columns = [
    "pregnant-time",
    "plasma-glucose",
    "diastolic",
    "triceps",
    "insulin",
    "BMI",
    "DPF",
    "Age",
    "diabetic",
]

diabetes = pd.read_csv(url + dataset, header=None, names=columns)

## Split the dataset into training and test sets

In [2]:
def split_data(dataset, split_ratio=0.7):
    """Randomly split ``dataset`` into a training part and a testing part.

    Args:
        dataset: DataFrame to split. Rows are selected by position, so any
            index is accepted; the original index labels are kept.
        split_ratio: Fraction of the rows that goes into the training part.

    Returns:
        A ``(train, test)`` tuple of DataFrames. Together they contain every
        row of ``dataset`` exactly once.
    """
    size = len(dataset)
    # Shuffle the row positions in place (uses NumPy's global random state).
    index = np.arange(size)
    np.random.shuffle(index)
    train_size = round(size * split_ratio)
    # Slice the frame that was passed in rather than a module-level global,
    # so the function works for any DataFrame.
    return dataset.iloc[index[:train_size], :], dataset.iloc[index[train_size:], :]

In [4]:
# Shuffle-split the full dataset, then renumber the rows of each part from 0
# while discarding the shuffled index labels left over from the split.
train, test = split_data(diabetes)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

## Fitting the model

In [5]:
def fit(className, data=None, feature_columns=None):
    """Summarise every feature by its per-class mean and standard deviation.

    Args:
        className: Name of the column holding the class label.
        data: DataFrame to fit on. Defaults to the module-level ``train``.
        feature_columns: Names of the feature columns, in the order they
            should appear in the summaries. Defaults to every entry of the
            module-level ``columns`` except the last one.

    Returns:
        A ``defaultdict(list)`` mapping each class label found in ``data`` to
        a list of ``(mean, std)`` tuples, one per feature column and in the
        same order as ``feature_columns``.
    """
    from collections import defaultdict

    if data is None:
        data = train
    if feature_columns is None:
        feature_columns = columns[:-1]
    feature_columns = list(feature_columns)

    # Aggregate once for all features instead of re-running the group-by for
    # every statistic of every column. ``std`` is the sample standard
    # deviation, the same statistic ``describe()`` reports.
    grouped = data.groupby(className)[feature_columns]
    means = grouped.mean()
    stds = grouped.std()

    summary_data = defaultdict(list)
    for klass in means.index:
        for col in feature_columns:
            summary_data[klass].append((means.at[klass, col],
                                        stds.at[klass, col]))
    return summary_data

In [6]:
# Per-class (mean, std) summaries of every feature, grouped by "diabetic".
fitter = fit("diabetic")

## Split the test set into features and response

In [7]:
# Every column except the last one is used as a feature.
test_features = test.iloc[:, :-1]
# The "diabetic" column of the test set holds the labels to compare against.
actual_response = test.diabetic

## Prediction and Accuracy

In [8]:
def gaussian_naive_bayes(x, mean, stdev):
    """Return the Gaussian probability density of ``x``.

    The density is that of a normal distribution with the given ``mean`` and
    standard deviation ``stdev``.
    """
    squared_deviation = math.pow(x - mean, 2)
    twice_variance = 2 * math.pow(stdev, 2)
    kernel = math.exp(-(squared_deviation / twice_variance))
    normaliser = 1 / (math.sqrt(2 * math.pi) * stdev)
    return normaliser * kernel

def classify(row, fitter):
    """Score one sample against every class summary.

    Args:
        row: Feature values of one sample, ordered like the per-class
            summaries. May be a pandas Series (as passed by
            ``DataFrame.apply``) or any positionally indexable sequence.
        fitter: Mapping of class label to a list of ``(mean, std)`` tuples,
            one tuple per feature.

    Returns:
        A dict mapping each class label to the product of a uniform prior
        (``1 / number of classes``) and the Gaussian densities of the
        sample's feature values under that class.
    """
    # Features are matched to summaries by position. A Series with string
    # labels must be read through ``.iloc``; plain ``row[i]`` relies on a
    # deprecated positional fallback of ``Series.__getitem__``.
    values = row.iloc if hasattr(row, "iloc") else row
    ret = {}
    for klass, summaries in fitter.items():
        prob = 1 / len(fitter)  # Prior probability (uniform over classes)
        for i, (mean, std) in enumerate(summaries):
            prob *= gaussian_naive_bayes(values[i], mean, std)
        ret[klass] = prob
    return ret

def predict(fitter, test):
    """Return the highest-scoring class for every row of ``test``.

    Each row is scored with ``classify``; the class whose score is largest
    is selected, and the selections are returned as a list in row order.
    """
    per_row_scores = test.apply(lambda sample: classify(sample, fitter), axis=1)
    # Picking the key with the largest value keeps the first class on ties.
    return [max(scores, key=scores.get) for scores in per_row_scores]

def score_accuracy(pred, actual):
    """Return the percentage of predictions that equal the actual labels.

    Args:
        pred: Iterable of predicted labels.
        actual: Sized iterable of true labels, paired with ``pred`` by
            position.

    Returns:
        The share of matching pairs relative to ``len(actual)``, as a
        percentage rounded to two decimals. ``0.0`` when ``actual`` is empty.
    """
    total = len(actual)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    corr = sum(1 for a, b in zip(pred, actual) if a == b)
    return round(corr / total * 100, 2)

In [9]:
# Predict a class for every test row, then report the percentage of
# predictions that match the actual labels.
pred = predict(fitter, test_features)
score_accuracy(pred, actual_response)

73.91