In [0]:
import pandas as pd
import random
import math
import statistics as st

In [0]:
def loadCsv(filename):
    df = pd.read_csv(filename)
    dataset = df.values.tolist()
    dataset = dataset[1:]
    return dataset

In [0]:
def splitDataset(dataset, ratio):
    trainSize = int(len(dataset) * ratio)
    trainSet = []
    testSet = dataset.copy()
    for i in range(trainSize):
        ind = random.randrange(len(testSet))
        trainSet.append(testSet.pop(ind))
        
    return trainSet, testSet
    # Uncomment to see the prediction
    # return trainSet, testSet[1: 3]

In [0]:
def mean(numbers):
    return st.mean(numbers)

def stdev(numbers):
    return st.stdev(numbers)

In [0]:
def summarize(dataset):
    summaries = [(mean(attri), stdev(attri)) for attri in zip(*dataset)]
    del summaries[-1]
    return summaries

In [0]:
# This fuction makes a dictionary of the dataset that has two 
# keys 0 and 1 and all the data is appended to those keys
def seprateByClass(dataset):
    seprated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        seprated.setdefault(dataset[i][-1], []).append(dataset[i])
    return seprated

In [0]:
def summarizeByClass(dataset):
    #seprate the data based on 0 and 1 output
    seprated = seprateByClass(dataset)
    summaries = {}
    #first it sends 0 with class 0 data and then 1 with class 1 data
    for cV, inst in seprated.items():
        summaries[cV] = summarize(inst)
    return summaries

In [0]:
def calculateProb(x, mean, stdev):
    exp = math.exp(-(math.pow(x-mean, 2)/ (2 * math.pow(stdev, 2))))
    return (1/(math.sqrt(2*math.pi) * stdev)) * exp

In [0]:
def calculateClassProb(summaries, input):
    prob = {}
    # cV will be 1 or 0 and cS will be the mean and stdev
    for cV, cS in summaries.items():
        prob[cV] = 1
        #class will have many cS 8 so for each we have to loop
        for i in range(len(cS)):
            #for each cS extract mean and stdev
            mean, stdev = cS[i]
            x = input[i]
            # for all the values in input it is classified that if its prob is 1 or 0. The weights. Uncomment print to see
            prob[cV] *= calculateProb(x, mean, stdev)
            # print(x, prob[cV])
    return prob


In [0]:
def predict(summaries, input):
    prob = calculateClassProb(summaries, input)
    #gets the prob of the input  can be of any class
    # print(prob)
    bestLabel, bestProb = None, -1
    for cV, proba in prob.items():
        # if no the prob of 0 is greater or 1 is greater
        if proba > bestProb:
            bestProb = proba
            bestLabel = cV
    #return the best label
    return bestLabel

In [0]:
def getPredictions(summaries, testSet):
    predictions = []
    # Takes each test input and then passes to the predict function for prediction
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)
    return predictions

In [0]:
def getAccuracy(testSet, predictions):
    correct = 0
    for i in range(len(testSet)):
        if testSet[i][-1] == predictions[i]:
            correct += 1
    return correct/float(len(testSet))

In [0]:
def main():
    splitRatio = 0.75
    dataset = loadCsv("diabetes-data.csv")
    trainSet, testSet = splitDataset(dataset, splitRatio)
    print("-------------------Output of naive bayes classifier-------------------")
    print('Splitting {} rows into training = {} and testing = {} rows'.format(len(dataset), len(trainSet), len(testSet)))
    summary = summarizeByClass(trainSet)
    predictions = getPredictions(summary, testSet)
    acc = getAccuracy(testSet, predictions)
    print(acc)

In [110]:
main()

-------------------Output of naive bayes classifier-------------------
Splitting 767 rows into training = 575 and testing = 192 rows
0.765625
