In [27]:
import csv
import math
import random
import pandas as pd
import numpy as np

def encodeData(DF):
    #print(DF['sex'].unique())

    for cnum,column in enumerate(DF.columns):
        lableDict = {}
        uniques = DF[column].unique()
        #print(column,uniques)
        for i, uniqueVal in enumerate(uniques):
            #print('into dict',column,uniqueVal,i)
            lableDict[uniqueVal] = i
        for key,value in lableDict.items():
            #print(key,value)
            DF.iloc[DF[column] == key,cnum] = value
            #print(DF[DF[column] == key])
    return DF.to_numpy()


def loadCsv():
    datasetInto = pd.read_csv('heart_disease_uci.csv')

    dataset = encodeData(datasetInto.iloc[:,1:])
    for i in range(1,len(dataset)):
        dataset[i] = [float(x) for x in dataset[i]]
    return dataset

loadCsv()

array([[0, 0, 0, ..., 0.0, 0, 0],
       [1.0, 0.0, 0.0, ..., 3.0, 1.0, 2.0],
       [1.0, 0.0, 0.0, ..., 2.0, 2.0, 2.0],
       ...,
       [23.0, 0.0, 3.0, ..., nan, 0.0, 2.0],
       [14.0, 0.0, 3.0, ..., nan, nan, 0.0],
       [5.0, 0.0, 3.0, ..., nan, nan, 2.0]], dtype=object)

In [28]:



def splitDataset(dataset, splitRatio):
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
    return [trainSet, copy]


def separateByClass(dataset):
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if (vector[-1] not in separated):
            separated[vector[-1]] = []
            separated[vector[-1]].append(vector)
    return separated

def mean(numbers):
    return np.sum(numbers)/float(len(numbers))

def stdev(numbers):
    #print(numbers,len(numbers))
    avg = np.mean(numbers)
    variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)
    return math.sqrt(variance)

def summarize(dataset):
    summaries = [(mean(numbers), stdev(numbers)) for numbers in zip(*dataset)]
    del summaries[-1]
    return summaries

def calculateProbability(x, mean, stdev):
    #main function for naiveBayes
    exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1/(math.sqrt(2*math.pi)*stdev))*exponent

def calculateClassProbabilities(summaries, inputVector):
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = inputVector[i]
        probabilities[classValue] *= calculateProbability(x, mean, stdev)
    return probabilities

def summarizeByClass(dataset):
    separated = separateByClass(dataset)
    summaries = {}
    for classValue, instances in separated.items():
        summaries[classValue] = summarize(instances)
    return summaries

def predict(summaries, inputVector):
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

def getPredictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)
    return predictions

def getAccuracy(testSet, predictions):
    correct = 0
    for x in range(len(testSet)):
        if testSet[x][-1] == predictions[x]:
            correct += 1
    return (correct/float(len(testSet)))*100.0



filename = 'heart_disease_uci.csv'
dataset = loadCsv()
trainSet = []
testSet = list(dataset)
while len(trainSet) < 800:
    index = random.randrange(len(testSet))
    trainSet.append(testSet.pop(index))

#prepare model
summaries = summarizeByClass(trainSet)
#test model
predictions = getPredictions(summaries, testSet)
accuracy = getAccuracy(testSet, predictions)
print('Accuracy: {0}%'.format(accuracy))

def getPrecisionRecall(actual,prediction):
    
    #tp,fp, fn, tn = 0
    a0,a1,a2,a3,a4 = 0,0,0,0,0
    p0,p1,p2,p3,p4 = 0,0,0,0,0
    t0,t1,t2,t3,t4 = 0,0,0,0,0
    correct = [a0,a1,a2,a3,a4]
    incorrect = [p0,p1,p2,p3,p4]
    totals = [t0,t1,t2,t3,t4]
    for i in range(0,len(actual)):
        for j in range(0,5):
            print(j,actual[i],prediction[i],totals[j])
            if actual[i] == j:
                totals[j] +=1
                if prediction[i] == j:
                    correct[j] += 1
                else:
                    incorrect[j] += 1

    #group 1 scores
    #recall = correct / all
    #precision = correct / (correct+incorrect)
    for j in range(0,5):
        recall = np.divide(correct[j], totals[j])
        precision = np.divide(correct[j],(correct[j] + incorrect[j]))
        print('Group %d Results: recall - %f, precision %f, total correct: %d, total incorrect - %d'%(j,recall,precision,correct[j],incorrect[j]))
actualValues = []
for num in testSet:
    actualValues.append(int(num[-1]))
predictions = np.array(predictions).astype(int)
print(actualValues)
getPrecisionRecall(actualValues,predictions)





Accuracy: 47.5%
[2, 0, 2, 0, 3, 0, 0, 2, 4, 0, 2, 0, 0, 3, 2, 0, 0, 0, 0, 0, 3, 0, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 0, 3, 2, 2, 2, 2, 0, 3, 2, 2, 0, 2, 0, 3, 3, 0, 0, 3, 3, 0, 2, 0, 4, 2, 3, 2, 2]
0 2 0 0
1 2 0 0
2 2 0 0
3 2 0 0
4 2 0 0
0 0 0 0
1 0 0 0
2 0 0 1
3 0 0 0
4 0 0 0
0 2 0 1
1 2 0 0
2 2 0 1
3 2 0 0
4 2 0 0
0 0 0 1
1 0 0 0
2 0 0 2
3 0 0 0
4 0 0 0
0 3 0 2
1 3 0 0
2 3 0 2
3 3 0 0
4 3 0 0
0 0 0 2
1 0 0 0
2 0 0 2
3 0 0 1
4 0 0 0
0 0 0 3
1 0 0 0
2 0 0 2
3 0 0 1
4 0 0 0
0 2 0 4
1 2 0 0
2 2 0 2
3 2 0 1
4 2 0 0
0 4 0 4
1 4 0 0
2 4 0 3
3 4 0 1
4 4 0 0
0 0 0 4
1 0 0 0
2 0 0 3
3 0 0 1
4 0 0 1
0 2 0 5
1 2 0 0
2 2 0 3
3 2 0 1
4 2 0 1
0 0 0 5
1 0 0 0
2 0 0 4
3 0 0 1
4 0 0 1
0 0 0 6
1 0 0 0
2 0 0 4
3 0 0 1
4 0 0 1
0 3 0 7
1 3 0 0
2 3 0 4
3 3 0 1
4 3 0 1
0 2 0 7
1 2 0 0
2 2 0 4
3 2 0 2
4 2 0 1
0 0 0 7
1 0 0 0
2 0 0 5

  variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)
  recall = np.divide(correct[j], totals[j])
  precision = np.divide(correct[j],(correct[j] + incorrect[j]))
