In [1]:
# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Example of Naive Bayes implemented from Scratch in Python

# Features in the "pima-indians-diabetes.data.csv" dataset

- Pregnancies: Number of times pregnant
- Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test
- BloodPressure: Diastolic blood pressure (mm Hg)
- SkinThickness: Triceps skin fold thickness (mm)
- Insulin: 2-Hour serum insulin (mu U/ml)
- BMI: Body mass index (weight in kg/(height in m)^2)
- DiabetesPedigreeFunction: Diabetes pedigree function
- Age: Age (years)
- Outcome: Class variable (0 or 1)

In [3]:
import csv
import math
import random

In [4]:
# Load data from file

def loadCsv(filename):
    """Read a purely numeric CSV file into a list of rows of floats.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per non-empty CSV row; each entry is a list
        holding that row's fields converted with ``float``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to ``float``.
    """
    # The context manager closes the handle even when a conversion fails;
    # newline='' is what the csv module asks for when handed a file object.
    with open(filename, 'r', newline='') as handle:
        # csv.reader yields [] for a blank line; skip those so that no empty
        # row (which has no class column) is returned to the caller.
        return [[float(x) for x in row] for row in csv.reader(handle) if row]
# Module-level copy of the full dataset, used by the exploratory cells (d[0], zip(*d)).
d = loadCsv("data/pima-indians-diabetes.data.csv")

In [5]:
d[0]

[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0]

random.randrange(100) # returns one random integer from 0 to 99 (100 itself is excluded)

random.random() # returns one random float in the half-open range [0.0, 1.0)

l = ['f','gh','he','w','r','rt']
l.pop(2) # pop(i) removes the item at index i from the list and returns it
l
l.pop(3) # indices shifted after the first pop, so index 3 now holds 'r'
l

In [6]:
#split data into train and test
def splitDataset(dataset, splitRatio, seed=None):
    """Randomly partition ``dataset`` into a training part and a test part.

    Args:
        dataset: Sequence of rows. It is copied; the caller's sequence is
            never modified.
        splitRatio: Fraction of the rows, between 0 and 1 inclusive, that
            goes into the training set. The training size is
            ``int(len(dataset) * splitRatio)``.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            drives the sampling so the split is reproducible. When ``None``
            (the default) the module-level ``random`` generator is used.

    Returns:
        A two-element list ``[trainSet, testSet]``; every input row ends up
        in exactly one of the two lists.

    Raises:
        ValueError: If ``splitRatio`` is outside the range [0, 1].
    """
    # Reject bad ratios up front: a ratio above 1 would otherwise fail with
    # an unrelated "empty range" error once the pool of rows is exhausted.
    if not 0 <= splitRatio <= 1:
        raise ValueError("splitRatio must be between 0 and 1, got %r" % (splitRatio,))
    rng = random if seed is None else random.Random(seed)
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    remaining = list(dataset)
    # Draw without replacement: each chosen row is moved out of the pool.
    while len(trainSet) < trainSize:
        trainSet.append(remaining.pop(rng.randrange(len(remaining))))
    return [trainSet, remaining]

In [7]:
#group instances by class (+ve or -ve) in the form of dictionary
def separateByClass(dataset):
    """Bucket the rows of ``dataset`` by their last element (the class label).

    Returns:
        A dict mapping each label to the list of rows that carry it, with
        the rows kept in their original order.
    """
    groups = {}
    for row in dataset:
        # setdefault creates the bucket the first time a label is seen.
        groups.setdefault(row[-1], []).append(row)
    return groups

In [9]:
#compute mean of a vector
def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    ``numbers`` must be a non-empty sized collection; an empty one raises
    ZeroDivisionError.
    """
    count = float(len(numbers))
    return sum(numbers) / count

In [10]:
#compute the sample standard deviation (n-1 denominator) of a vector
def stdev(numbers):
    """Return the sample standard deviation (n - 1 denominator) of ``numbers``.

    At least two values are needed: with exactly one value the n - 1
    denominator is zero and ZeroDivisionError is raised.
    """
    center = mean(numbers)
    squaredDeviations = ((value - center) ** 2 for value in numbers)
    degreesOfFreedom = float(len(numbers) - 1)
    return math.sqrt(sum(squaredDeviations) / degreesOfFreedom)

In [14]:
zip(*d) # transposes the rows: each item it yields is one column (feature) of d
for attribute in zip(*d):
    print('a') # prints one 'a' per column

<zip at 0x7efe2b7ece48>

a
a
a
a
a
a
a
a
a


In [15]:
#compute summaries where summary is mean and std dev of each column in a dataset
def summarize(dataset):
    """Compute a ``(mean, stdev)`` pair for every attribute column of ``dataset``.

    The rows are transposed with ``zip(*dataset)`` so each column is
    summarised on its own. The pair for the last column is then discarded,
    because that column is the class label rather than an attribute.

    Returns:
        A list of ``(mean, stdev)`` tuples, one per attribute column.
    """
    summaries = []
    for column in zip(*dataset):
        summaries.append((mean(column), stdev(column)))
    summaries.pop()  # drop the entry computed for the class column
    return summaries

In [21]:
#now we want to compute mean and stdev of people who have diabetes and who haven't
def summarizeByClass(dataset):
    """Summarise the attributes of ``dataset`` separately for each class.

    Returns:
        A dict mapping each class label to the list of per-attribute
        ``(mean, stdev)`` pairs computed from that class's rows only.
    """
    return {
        label: summarize(rows)
        for label, rows in separateByClass(dataset).items()
    }

In [22]:
print("These all are the mean and stdev pair of all the attribute in each class label")
# Summaries here are computed on the whole file; main() computes them on the training split only.
dataset = loadCsv('data/pima-indians-diabetes.data.csv')
summarizeByClass(dataset) # returns {class label: [(mean, stdev), ...]} with one pair per attribute

These all are the mean and stdev pair of all the attribute in each class label


{1.0: [(4.865671641791045, 3.741239044041554),
  (141.25746268656715, 31.939622058007195),
  (70.82462686567165, 21.49181165060413),
  (22.16417910447761, 17.67971140046571),
  (100.33582089552239, 138.6891247315351),
  (35.14253731343278, 7.262967242346376),
  (0.5505, 0.372354483554611),
  (37.06716417910448, 10.968253652367915)],
 0.0: [(3.298, 3.01718458262189),
  (109.98, 26.14119975535359),
  (68.184, 18.063075413305828),
  (19.664, 14.889947113744254),
  (68.792, 98.86528929231767),
  (30.30419999999996, 7.689855011650112),
  (0.42973400000000017, 0.29908530435741093),
  (31.19, 11.667654791631156)]}

In [23]:
#compute prob using a Gaussian distribution
def calculateProbability(x, mean, stdev):
    """Evaluate the Gaussian probability density N(mean, stdev**2) at ``x``.

    Args:
        x: Value at which the density is evaluated.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution.

    Returns:
        The density as a float. When ``stdev`` is 0 the Gaussian is
        degenerate (all of its mass sits at ``mean``); instead of dividing
        by zero the function returns 1.0 if ``x == mean`` and 0.0 otherwise.
    """
    # A zero standard deviation arises when an attribute has the same value
    # in every row of a class; the formula below would divide by zero.
    if stdev == 0:
        return 1.0 if x == mean else 0.0
    exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent


In [24]:
#compute P(x|C)
def calculateClassProbabilities(summaries, inputVector):
    """Multiply, per class, the Gaussian densities of the input's attributes.

    Args:
        summaries: Dict mapping each class label to a list of
            ``(mean, stdev)`` pairs, one per attribute.
        inputVector: Row whose i-th element is scored against the i-th pair.

    Returns:
        A dict mapping each class label to the product of the per-attribute
        densities. No class prior is applied, so the values are likelihoods
        P(x|C) rather than posteriors.
    """
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        likelihood = 1
        for position, (mu, sigma) in enumerate(classSummaries):
            likelihood *= calculateProbability(inputVector[position], mu, sigma)
        probabilities[classValue] = likelihood
    return probabilities

In [25]:
#predict class label for an inputVector
def predict(summaries, inputVector):
    """Return the class label whose computed probability is the largest.

    Ties go to the label encountered first; ``None`` is returned when
    ``summaries`` contains no classes.
    """
    scores = calculateClassProbabilities(summaries, inputVector)
    bestLabel = None
    bestProb = -1
    for label in scores:
        score = scores[label]
        # Keep the current winner unless nothing has been chosen yet or the
        # new score is strictly higher.
        if bestLabel is not None and not score > bestProb:
            continue
        bestLabel, bestProb = label, score
    return bestLabel

In [26]:
#predict class label for a test dataset
def getPredictions(summaries, testSet):
    """Predict a class label for every row of ``testSet``.

    Returns:
        A list of predicted labels in the same order as the rows.
    """
    return [predict(summaries, row) for row in testSet]

In [27]:
#compute accuracy of predictions for the test dataset
def getAccuracy(testSet, predictions):
    """Return the percentage of rows whose label matches its prediction.

    The true label is the last element of each row, and ``predictions[i]``
    is compared with row ``i``. An empty ``testSet`` raises
    ZeroDivisionError.
    """
    hits = sum(
        1 for position, row in enumerate(testSet)
        if row[-1] == predictions[position]
    )
    total = float(len(testSet))
    return hits / total * 100.0

In [28]:
def main(filename='data/pima-indians-diabetes.data.csv', splitRatio=0.67):
    """Train and evaluate the Gaussian Naive Bayes model on one random split.

    Args:
        filename: Path of the numeric CSV file to load; the last column is
            the class label. Defaults to the Pima Indians diabetes file.
        splitRatio: Fraction of the rows used for training. Defaults to 0.67.

    Prints the sizes of the split and the accuracy (in percent) obtained on
    the test rows. Returns nothing.
    """
    dataset = loadCsv(filename)
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    print('Split',len(dataset),' rows into train=',len(trainingSet),' and test=', len(testSet),' rows')
    # prepare model: per-class (mean, stdev) of every attribute
    summaries = summarizeByClass(trainingSet)
    # test model on the held-out rows
    predictions = getPredictions(summaries, testSet)
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy',accuracy)

In [121]:
# The accuracy varies between runs: the split is random and no seed is set in the visible cells.
main()

Split 768  rows into train= 514  and test= 254  rows
Accuracy 80.31496062992126
