# Implementing a Gaussian Naive Bayes classifier on the Pima Indians Diabetes dataset



## Dataset description:
+ Number of times pregnant 
+ Plasma glucose concentration at 2 hours in an oral glucose tolerance test 
+ Diastolic blood pressure (mm Hg) 
+ Triceps skin fold thickness (mm) 
+ 2-Hour serum insulin (mu U/ml) 
+ Body mass index (weight in kg/(height in m)^2) 
+ Diabetes pedigree function 
+ Age (years) 
+ Class variable (0 or 1)    
[source](https://archive.ics.uci.edu/ml/datasets/pima+indians+diabetes)

### 1. import the required libraries and the loader function

In [1]:
import csv
import random
import math

def load_data(filename):
    """Load a purely numeric CSV file as a list of rows of floats.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per non-empty CSV row; each entry is the list
        of that row's fields converted with ``float``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to ``float``.
    """
    # The context manager closes the handle that a bare ``open`` call would
    # leak; ``newline=""`` is the mode the csv module expects for its input.
    with open(filename, "rt", newline="") as handle:
        # csv.reader yields an empty list for a blank line; dropping those
        # avoids feature-less rows that would break ``row[-1]`` lookups later.
        return [[float(x) for x in row] for row in csv.reader(handle) if row]

### 2. Split the data into training and testing sets

In [2]:
def splitDataset(dataset, splitRatio):
    """Randomly partition ``dataset`` into a training part and a remainder.

    Rows are drawn one at a time, without replacement, using the module-level
    ``random`` generator, so the result is reproducible after ``random.seed``.

    Args:
        dataset: Sequence of rows to split; it is copied, not modified.
        splitRatio: Fraction of the rows that should go to the training part.

    Returns:
        A two-element list ``[trainSet, remainder]`` where ``trainSet`` holds
        ``int(len(dataset) * splitRatio)`` rows and ``remainder`` holds the
        rows that were not drawn, in their original relative order.
    """
    remaining = list(dataset)  # shallow copy: the caller's list stays intact
    target = int(len(dataset) * splitRatio)
    picked = []
    for _ in range(target):
        chosen = random.randrange(len(remaining))
        picked.append(remaining.pop(chosen))
    return [picked, remaining]

### 3. Split the data class-wise to calculate the per class statistics

In [3]:
# We can do that by creating a map of each class value to a list of instances 
# that belong to that class and sort the entire dataset of instances into 
# the appropriate lists.
def separateByClass(dataset):
    """Group rows by class label, taken from the last element of each row.

    Args:
        dataset: Iterable-by-index collection of rows (sequences).

    Returns:
        A dict mapping each distinct label to the list of rows carrying it.
        Labels appear in order of first occurrence and rows keep their
        original relative order.
    """
    by_label = {}
    for row in dataset:
        # setdefault creates the bucket the first time a label is seen.
        by_label.setdefault(row[-1], []).append(row)
    return by_label

### 4. calculate mean and standard deviation

In [4]:
def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    Args:
        numbers: Sized collection of numeric values.

    Raises:
        ZeroDivisionError: If ``numbers`` is empty.
    """
    count = float(len(numbers))
    return sum(numbers) / count

def stdev(numbers):
    """Return the sample standard deviation of ``numbers``.

    The sum of squared deviations from the mean is divided by
    ``len(numbers) - 1`` before the square root is taken.

    Args:
        numbers: Sized collection of numeric values.

    Raises:
        ZeroDivisionError: If ``numbers`` has fewer than two elements.
    """
    centre = mean(numbers)
    squared_error = sum((value - centre) ** 2 for value in numbers)
    degrees_of_freedom = float(len(numbers) - 1)
    return math.sqrt(squared_error / degrees_of_freedom)

### 5. Summarize the dataset

In [5]:
def summarize(dataset):
    """Compute (mean, standard deviation) for every feature column.

    Args:
        dataset: List of equally long rows whose last element is the class
            label (the same position ``separateByClass`` reads).

    Returns:
        A list with one ``(mean, stdev)`` tuple per column, excluding the
        last column.
    """
    per_column = []
    # zip(*dataset) transposes rows into columns.
    for column in zip(*dataset):
        per_column.append((mean(column), stdev(column)))
    # Statistics were computed for every column; discard the label column's.
    per_column.pop()
    return per_column

### 6. Summarize the data by class, i.e. group the rows by class and then summarize the attributes of each group

In [6]:
def summarizeByClass(dataset):
    """Build per-class feature statistics for ``dataset``.

    Args:
        dataset: List of rows whose last element is the class label.

    Returns:
        A dict mapping each class label to the list of ``(mean, stdev)``
        tuples that ``summarize`` produces for the rows of that class.
    """
    grouped = separateByClass(dataset)
    return {label: summarize(rows) for label, rows in grouped.items()}

### 7. Calculate the probability of the attribute using the gaussian function, given the data, mean, and standard deviation

In [7]:
def calculateProbability(x, mean, stdev):
    """Evaluate the Gaussian probability density at ``x``.

    Args:
        x: Value at which to evaluate the density.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution.

    Returns:
        The density ``exp(-(x - mean)^2 / (2 * stdev^2)) / (sqrt(2*pi) * stdev)``.
        When ``stdev`` is zero the distribution is treated as a point mass:
        ``1.0`` if ``x`` equals ``mean``, otherwise ``0.0``.
    """
    if stdev == 0:
        # A feature that is constant within a class has zero spread; the
        # formula below would divide by zero, so fall back to an exact match.
        return 1.0 if x == mean else 0.0
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

### 8. combine the prob of all attributes to get the probability of the instance of that class

In [8]:
#Now that we can calculate the probability
# of an attribute belonging to a class,
# we can combine the probabilities
# of all of the attribute values for a data instance and come up
# with a probability of the entire data instance belonging to the class.
def calculateClassProbabilities(summaries, inputVector):
    """Score ``inputVector`` against every class.

    For each class the per-feature Gaussian densities are multiplied
    together; no class prior is applied to the product.

    Args:
        summaries: Dict mapping class label to a list of ``(mean, stdev)``
            tuples, one per feature.
        inputVector: Sequence of feature values, indexed in the same order as
            the per-class summaries. Elements beyond the number of summarized
            features are not read.

    Returns:
        A dict mapping each class label to the product of its feature
        densities.
    """
    likelihoods = {}
    for label, featureStats in summaries.items():
        product = 1
        for position, (mu, sigma) in enumerate(featureStats):
            product *= calculateProbability(inputVector[position], mu, sigma)
        likelihoods[label] = product
    return likelihoods

### 9. after finding the prob of the instance, find the largest prob and get its class

In [9]:
def predict(summaries, inputVector):
    """Return the class label with the highest score for ``inputVector``.

    Args:
        summaries: Per-class feature statistics, as accepted by
            ``calculateClassProbabilities``.
        inputVector: Sequence of feature values to classify.

    Returns:
        The label whose score is largest; on a tie the label encountered
        first wins. ``None`` when ``summaries`` contains no classes.
    """
    scores = calculateClassProbabilities(summaries, inputVector)
    # max() only replaces its current best on a strictly greater score, so
    # the earliest label is kept among equals.
    return max(scores, key=scores.get, default=None)

### 10. Get predictions for each instance and then return that list

In [10]:
def getPredictions(summaries, testSet):
    """Predict a class label for every row of ``testSet``.

    Args:
        summaries: Per-class feature statistics passed through to ``predict``.
        testSet: Collection of rows to classify.

    Returns:
        A list of predicted labels, one per row, in the order of ``testSet``.
    """
    return [predict(summaries, row) for row in testSet]



### 11. Performance measures:
+ Accuracy = |correct predictions| / |test instances|, reported as a percentage

In [11]:
def getAccuracy(testSet, predictions):
    """Return the percentage of rows whose label was predicted correctly.

    Args:
        testSet: Sized collection of rows whose last element is the true
            class label.
        predictions: Predicted labels, one per row of ``testSet`` and in the
            same order.

    Returns:
        ``100.0 * correct / len(testSet)`` as a float, or ``0.0`` when
        ``testSet`` is empty (instead of dividing by zero).
    """
    total = len(testSet)
    if total == 0:
        # Nothing to score; avoid the division by zero below.
        return 0.0
    correct = sum(
        1 for row, predicted in zip(testSet, predictions) if row[-1] == predicted
    )
    return (correct / float(total)) * 100.0

### 12. The main function
+ select dataset
+ set split ratio (75:25 :: train::test)
+ train and test
+ get accuracy

In [12]:
def main(filename="pima-indians-diabetes.data.csv", splitRatio=0.75):
    """Train and evaluate the Gaussian naive Bayes classifier on a CSV file.

    Loads the data, splits it randomly into training and test rows, builds
    the per-class summaries from the training rows, predicts the test rows
    and prints the split sizes and the resulting accuracy.

    Args:
        filename: Path of the numeric CSV file whose last column is the
            class label. Defaults to the file name the script always used.
        splitRatio: Fraction of rows used for training. Defaults to 0.75.
    """
    dataset = load_data(filename)
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    print('Split',len(dataset),'rows into train',len(trainingSet), 'test',len(testSet))
    # prepare model
    summaries = summarizeByClass(trainingSet)
    # test model
    predictions = getPredictions(summaries, testSet)
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy:is ',accuracy)


# Guarded so that importing this code does not trigger a training run; when
# executed as a script or notebook cell, __name__ is "__main__" and it runs.
if __name__ == "__main__":
    main()

('Split', 768, 'rows into train', 576, 'test', 192)
('Accuracy:is ', 72.91666666666666)
