In [1]:
import csv
def loadCSV(filename):
    """Load a CSV file of numeric values into a list of rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per non-empty CSV record; each entry is a
        list of floats (a real list, so it can be indexed and iterated
        more than once).

    Raises:
        IOError/OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted with ``float``.
    """
    import sys  # local import: only needed to pick the file mode below

    # The csv module expects a binary file on Python 2 and a text file
    # opened with newline='' on Python 3.
    if sys.version_info[0] < 3:
        handle = open(filename, 'rb')
    else:
        handle = open(filename, 'r', newline='')
    # The with-block guarantees the file is closed even if float() raises.
    with handle:
        # csv.reader yields [] for blank lines; skip those so every
        # returned row has at least one value.
        return [[float(x) for x in row] for row in csv.reader(handle) if row]

# Load the data file and report how many rows were read.
filename = "pima-indians-diabetes.csv"
dataset = loadCSV(filename)
# .format() must be applied to the string inside print's parentheses;
# written as print('...').format(...) it only works when print is a
# Python 2 statement.
print('Loaded data file {0} with {1} rows'.format(filename, len(dataset)))

Loaded data file pima-indians-diabetes.csv with 768 rows


In [2]:
import random

def split(dataSet, splitratio):
    """Randomly partition ``dataSet`` into a training and a testing list.

    Args:
        dataSet: Sequence of rows; it is copied, not modified.
        splitratio: Fraction of the rows to move into the training list;
            the training size is ``int(len(dataSet) * splitratio)``.

    Returns:
        A ``(traindata, testdata)`` tuple. ``traindata`` holds the rows
        drawn at random, ``testdata`` holds the rows that were left over.
    """
    remaining = list(dataSet)
    wanted = int(len(dataSet) * splitratio)
    chosen = []
    for _ in range(wanted):
        # Draw one of the rows that is still available and move it over.
        chosen.append(remaining.pop(random.randrange(len(remaining))))
    return chosen, remaining
# Split the loaded rows in half and report the sizes. No random seed is set
# in the cells shown, so which rows land in each half varies between runs.
datatrain, datatest = split(dataset, 0.5)
# print(...) with a single argument behaves the same on Python 2 and 3.
print("split {0} rows for training data {1} and testing data {2}".format(len(dataset), len(datatrain), len(datatest)))
# split() works on a copy, so the original dataset keeps all of its rows.
print(len(dataset))

split 768 rows for training data 384 and testing data 384
768


In [3]:
def seperatedByClass(dataset):
    """Group the rows of ``dataset`` by their last element.

    Args:
        dataset: Sequence of rows; the last element of each row is used
            as the class value.

    Returns:
        A dict mapping each class value to the list of rows carrying it,
        in the order the rows appear in ``dataset``.
    """
    groups = {}
    for row in dataset:
        # setdefault creates the bucket the first time a class value is seen.
        groups.setdefault(row[-1], []).append(row)
    return groups
                                     
# Small sample rows; this cell only prints the sum of the first row.
dataset1 = [[1,20,1], [2,21,0], [3,22,1]]
# Parenthesised print works on both Python 2 and Python 3.
print(sum(dataset1[0]))
    

22


In [4]:
import math
def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    Raises:
        ZeroDivisionError: If ``numbers`` is empty.
    """
    total = sum(numbers)
    count = float(len(numbers))  # float so the division is never integer division
    return total / count
def stdev(numbers):
    """Return the sample standard deviation of ``numbers``.

    The sum of squared deviations from the mean is divided by
    ``len(numbers) - 1`` before taking the square root.

    Raises:
        ZeroDivisionError: If ``numbers`` has fewer than two elements.
    """
    center = mean(numbers)
    squared_deviations = [(value - center) ** 2 for value in numbers]
    sample_variance = sum(squared_deviations) / float(len(numbers) - 1)
    return math.sqrt(sample_variance)
# Quick check of mean() and stdev() on a small list.
numbers = [1,2,3,4,5]
# .format() belongs on the string, inside print's parentheses; the form
# print('...').format(...) only works when print is a Python 2 statement.
print('Summary of {0}: mean={1}, stdev={2}'.format(numbers, mean(numbers), stdev(numbers)))

Summary of [1, 2, 3, 4, 5]: mean=3.0, stdev=1.58113883008


In [5]:
def summarize(dataset):
    """Compute a ``(mean, stdev)`` pair for every attribute column.

    Each row of ``dataset`` is treated as attribute values followed by a
    class value in the last position; that last column is not summarised.

    Args:
        dataset: Sequence of equally long rows.

    Returns:
        A list with one ``(mean, stdev)`` tuple per attribute column, in
        column order. An empty dataset yields an empty list.

    Raises:
        ZeroDivisionError: If the dataset has a single row, because
            ``stdev`` divides by ``len - 1``.
    """
    # Transpose rows into columns and drop the trailing class column
    # *before* computing statistics, so class labels (which may be
    # non-numeric) are never passed to mean()/stdev().
    attribute_columns = list(zip(*dataset))[:-1]
    return [(mean(column), stdev(column)) for column in attribute_columns]
# Show the column-wise view produced by zip(*rows), then the summaries.
dataset2 = [[1,20,0], [2,21,1], [3,22,0]]
# list(...) makes the columns print as a list on Python 3 as well, where
# zip returns an iterator.
print(list(zip(*dataset2)))
summary = summarize(dataset2)
# .format() belongs on the string, inside print's parentheses.
print('Attribute summaries: {0}'.format(summary))

[(1, 2, 3), (20, 21, 22), (0, 1, 0)]
Attribute summaries: [(2.0, 1.0), (21.0, 1.0)]


In [6]:
def summarizeByClass(dataset):
    """Summarise the attributes of ``dataset`` separately for each class.

    Rows are grouped with ``seperatedByClass`` and each group is passed
    to ``summarize``.

    Args:
        dataset: Sequence of rows whose last element is the class value.

    Returns:
        A dict mapping each class value to the list returned by
        ``summarize`` for the rows of that class.
    """
    summaries = {}
    # .items() is available on both Python 2 and Python 3 dicts, whereas
    # .iteritems() exists only on Python 2.
    for classvalue, instances in seperatedByClass(dataset).items():
        summaries[classvalue] = summarize(instances)
    return summaries
# Per-class summaries for a small sample with two class values (0 and 1).
dataset3 = [[1,20,1], [2,21,0], [3,22,1], [4,22,0], [5,23,1], [6,24,0]]
summary = summarizeByClass(dataset3)
# .format() belongs on the string, inside print's parentheses; the form
# print('...').format(...) only works when print is a Python 2 statement.
print('Summary by class value: {0}'.format(summary))
    

Summary by class value: {0: [(4.0, 2.0), (22.333333333333332, 1.5275252316519465)], 1: [(3.0, 2.0), (21.666666666666668, 1.5275252316519465)]}


In [7]:
def calculateProbability(x,mean,stdev):
    """Evaluate the Gaussian density with the given mean and stdev at ``x``.

    Computes ``1 / (sqrt(2*pi) * stdev) * exp(-0.5 * ((x - mean) / stdev)**2)``.

    Raises:
        ZeroDivisionError: If ``stdev`` is zero.
    """
    normalizer = 1 / (math.sqrt(2 * math.pi) * stdev)
    z_score = (x - mean) / stdev
    return normalizer * math.exp(-0.5 * pow(z_score, 2))
# Evaluate the density for one sample value.
x = 71.5
# Use distinct names for the sample parameters: binding the module-level
# names `mean` and `stdev` here would replace the mean() and stdev()
# functions, and summarize() would then fail if called afterwards.
sample_mean = 73
sample_stdev = 6.2
probability = calculateProbability(x, sample_mean, sample_stdev)
# .format() belongs on the string, inside print's parentheses.
print('Probability of belonging to this class: {0}'.format(probability))

Probability of belonging to this class: 0.0624896575937


In [8]:
def calculateClassProbabilities(summaries, inputvector):
    """Score ``inputvector`` against every class in ``summaries``.

    For each class, the result is the product of ``calculateProbability``
    over that class's ``(mean, stdev)`` pairs, pairing the i-th summary
    with ``inputvector[i]``. Elements of ``inputvector`` beyond the number
    of summaries are ignored.

    Args:
        summaries: Dict mapping a class value to a list of
            ``(mean, stdev)`` tuples.
        inputvector: Sequence of attribute values to score.

    Returns:
        A dict mapping each class value to its product of densities
        (``1`` for a class with no attribute summaries).
    """
    probabilities = {}
    # .items() is available on both Python 2 and Python 3 dicts, whereas
    # .iteritems() exists only on Python 2.
    for classvalue, attribute_stats in summaries.items():
        likelihood = 1
        for i, (mu, sigma) in enumerate(attribute_stats):
            likelihood *= calculateProbability(inputvector[i], mu, sigma)
        probabilities[classvalue] = likelihood
    return probabilities
# One attribute summary per class; the trailing '?' in the input stands in
# for the unknown class value and is never read.
summaries1 = {0:[(1, 0.5)], 1:[(20, 5.0)]}
inputVector1 = [1.1, '?']
probabilities = calculateClassProbabilities(summaries1, inputVector1)
# .format() belongs on the string, inside print's parentheses; the form
# print('...').format(...) only works when print is a Python 2 statement.
print('Probabilities for each class: {0}'.format(probabilities))

Probabilities for each class: {0: 0.7820853879509118, 1: 6.298736258150438e-05}


In [9]:
def predict(summaries, inputvector):
    """Return the class value with the highest score for ``inputvector``.

    Scores come from ``calculateClassProbabilities``. When several classes
    tie for the highest score, the first one encountered while iterating
    over the scores is returned.

    Args:
        summaries: Dict mapping a class value to a list of
            ``(mean, stdev)`` tuples.
        inputvector: Sequence of attribute values to classify.

    Returns:
        The best-scoring class value, or ``None`` if ``summaries`` is empty.
    """
    probabilities = calculateClassProbabilities(summaries, inputvector)
    # Track "nothing chosen yet" through bestprob rather than bestlabel, so
    # a class whose label happens to be None is handled like any other.
    bestlabel, bestprob = None, None
    # .items() is available on both Python 2 and Python 3 dicts, whereas
    # .iteritems() exists only on Python 2.
    for classvalue, probability in probabilities.items():
        if bestprob is None or probability > bestprob:
            bestlabel, bestprob = classvalue, probability
    return bestlabel
# Classify one input against two classes labelled 'A' and 'B'.
summaries = {'A':[(1, 0.5)], 'B':[(20, 5.0)]}
inputVector = [1.1, '?']
result = predict(summaries, inputVector)
# .format() belongs on the string, inside print's parentheses; the form
# print('...').format(...) only works when print is a Python 2 statement.
print('Prediction: {0}'.format(result))

Prediction: A


In [10]:
def getPredictions(summaries, testdata):
    """Run ``predict`` on every row of ``testdata``.

    Args:
        summaries: Per-class attribute summaries, passed through to ``predict``.
        testdata: Sequence of rows to classify.

    Returns:
        A list of predicted class values, in the same order as ``testdata``.
    """
    return [predict(summaries, row) for row in testdata]
# Classify two inputs, one near each class's mean.
summaries = {'A':[(1, 0.5)], 'B':[(20, 5.0)]}
testSet = [[1.1, '?'], [19.1, '?']]
predictions = getPredictions(summaries, testSet)
# .format() belongs on the string, inside print's parentheses; the form
# print('...').format(...) only works when print is a Python 2 statement.
print('Predictions: {0}'.format(predictions))

Predictions: ['A', 'B']


In [11]:
def getAccuracy(testdata, predictions):
    """Return the percentage of rows whose last element matches its prediction.

    Args:
        testdata: Sequence of rows; the last element of each row is the
            actual class value.
        predictions: Sequence of predicted class values, indexed in
            parallel with ``testdata``.

    Returns:
        A float between 0 and 100.

    Raises:
        ZeroDivisionError: If ``testdata`` is empty.
        IndexError: If ``predictions`` is shorter than ``testdata``.
    """
    total = len(testdata)
    hits = sum(1 for i in range(total) if testdata[i][-1] == predictions[i])
    return (hits / float(total)) * 100
# Two of the three predictions match the last element of their row.
testdata = [[1,1,1,'a'], [2,2,2,'a'], [3,3,3,'b']]
predictions = ['a', 'a', 'a']
accuracy = getAccuracy(testdata, predictions)
# .format() belongs on the string, inside print's parentheses; the form
# print('...').format(...) only works when print is a Python 2 statement.
print('Accuracy: {0}'.format(accuracy))

Accuracy: 66.6666666667
