In [1]:
import csv
import random
import math

In [4]:
def loadcsv(filename):
    """Read a purely numeric CSV file into a list of rows of floats.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per non-blank line of the file; each entry is
        a list holding that line's fields converted with ``float``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If any field cannot be converted to ``float`` (for
            example when the file starts with a header line).
    """
    # The context manager guarantees the handle is closed even when a
    # conversion fails; newline="" is what the csv module expects so that it
    # can do its own line-ending handling.
    with open(filename, "r", newline="") as handle:
        # csv.reader yields [] for blank lines; skip those so that no empty
        # row ends up in the dataset.
        return [[float(x) for x in row] for row in csv.reader(handle) if row]

def splitDataset(dataset, splitRatio):
    """Cut ``dataset`` into a leading and a trailing part, preserving order.

    Args:
        dataset: Sliceable sequence of rows.
        splitRatio: Fraction of the rows that goes into the first part; the
            row count is ``int(len(dataset) * splitRatio)``, i.e. truncated.

    Returns:
        A two-element list ``[first_part, remainder]``.
    """
    cut = int(len(dataset) * splitRatio)
    head = dataset[:cut]
    tail = dataset[cut:]
    return [head, tail]

def mean(numbers):
    """Return the arithmetic mean of ``numbers``.

    Raises:
        ZeroDivisionError: If ``numbers`` is empty.
    """
    total = sum(numbers)
    count = len(numbers)
    return total / count

def stdev(numbers):
    """Return the sample standard deviation of ``numbers``.

    The sum of squared deviations from the mean is divided by
    ``len(numbers) - 1`` before taking the square root.

    Raises:
        ZeroDivisionError: If ``numbers`` has fewer than two elements.
    """
    centre = mean(numbers)
    squaredError = 0
    for value in numbers:
        deviation = value - centre
        squaredError += deviation ** 2

    degreesOfFreedom = len(numbers) - 1
    return math.sqrt(squaredError / degreesOfFreedom)

def summarizeByClass(trainingSet):
    """Compute per-class (mean, stdev) statistics for every feature column.

    Rows are grouped by their last element, which is used as the class
    label. Only ``trainingSet`` is read, never a module-level dataset, so
    rows outside the training split cannot influence the statistics.

    Args:
        trainingSet: Iterable of rows; every element except the last is a
            numeric feature, the last is the class label.

    Returns:
        A dict mapping each class label to a list of ``(mean, stdev)``
        tuples, one tuple per feature column in column order.

    Raises:
        ZeroDivisionError: Propagated from ``stdev`` when a class has only
            a single row.
    """
    seperated = {}
    for vector in trainingSet:
        seperated.setdefault(vector[-1], []).append(vector)

    summaries = {}
    for classValue, instances in seperated.items():
        # zip(*rows) transposes rows into columns; the trailing column is
        # the label itself, so drop it before computing any statistics.
        featureColumns = list(zip(*instances))[:-1]
        summaries[classValue] = [(mean(column), stdev(column)) for column in featureColumns]
    return summaries

def calculateProbablity(x, mean, stdev):
    """Evaluate the Gaussian probability density at ``x``.

    Args:
        x: Point at which the density is evaluated.
        mean: Mean of the normal distribution.
        stdev: Standard deviation of the normal distribution.

    Returns:
        The density value as a float.

    Raises:
        ZeroDivisionError: If ``stdev`` is zero.
    """
    variance = stdev ** 2
    squaredDistance = (x - mean) ** 2
    decay = math.exp(-squaredDistance / (2 * variance))
    normaliser = 1 / math.sqrt(2 * math.pi * variance)
    return normaliser * decay

def predict(summaries, testSetValue):
    """Pick the class whose per-feature densities multiply to the largest value.

    Args:
        summaries: Dict mapping class label to a list of ``(mean, stdev)``
            pairs, one pair per feature.
        testSetValue: Row to classify; element ``i`` is matched against the
            ``i``-th ``(mean, stdev)`` pair of every class.

    Returns:
        The winning class label, or ``None`` when ``summaries`` is empty.
        On a tie the class that appears first in ``summaries`` wins.
    """
    likelihoods = {}
    for label, featureStats in summaries.items():
        score = 1
        for index, (mu, sigma) in enumerate(featureStats):
            score *= calculateProbablity(testSetValue[index], mu, sigma)
        likelihoods[label] = score

    # max() keeps the first maximal key it meets, which matches a strict
    # "greater than" scan over the dict in insertion order.
    return max(likelihoods, key=likelihoods.get, default=None)

def getPredictions(summaries, testSet):
    """Classify every row of ``testSet`` with ``predict``.

    Args:
        summaries: Per-class feature statistics, passed through to
            ``predict`` unchanged.
        testSet: Sequence of rows to classify.

    Returns:
        A list of predicted labels in the same order as ``testSet``.
    """
    return [predict(summaries, row) for row in testSet]

def getAccuracy(testSet, predictions):
    """Return the percentage of rows whose label equals the prediction.

    Args:
        testSet: Sequence of rows; the last element of each row is the
            true class label.
        predictions: Predicted labels, positionally aligned with
            ``testSet``.

    Returns:
        Accuracy as a float in the range 0.0 to 100.0. An empty ``testSet``
        yields 0.0 instead of raising ``ZeroDivisionError``.

    Raises:
        IndexError: If ``predictions`` is shorter than ``testSet``.
    """
    total = len(testSet)
    if total == 0:
        # Nothing to score (e.g. the whole dataset went into training).
        return 0.0

    correct = sum(
        1 for index, row in enumerate(testSet) if row[-1] == predictions[index]
    )
    return (correct / total) * 100.0

# Run parameters: input file and the fraction of rows used for training.
filen = 'diabetes2.csv'
splitRatio = 0.9

# Load the rows and split them, in file order, into training and test parts.
dataset = loadcsv(filename=filen)
trainingSet, testSet = splitDataset(dataset, splitRatio)

# The last element of every test row is kept for display next to the
# predictions.
actual = [row[-1] for row in testSet]

rowCounts = (len(dataset), len(trainingSet), len(testSet))
print('Split {0} rows into train = {1} and test = {2} rows'.format(*rowCounts))

# Summarise the training rows per class, then classify every test row.
summaries = summarizeByClass(trainingSet)
predictions = getPredictions(summaries, testSet)

print("\nActual values : \n", actual)
print("\nPredictions :\n", predictions)

accuracy = getAccuracy(testSet, predictions)

print("Accuracy : \n", accuracy)



Split 768 rows into train = 691 and test = 77 rows


NameError: name 'v' is not defined