In [18]:
# k-nearest neighbors on the Iris Flowers Dataset
from random import seed
from random import randrange
from csv import reader
from math import sqrt
import pandas as pd
import numpy as np

def encodeData(DF):
    #print(DF['sex'].unique())

    for cnum,column in enumerate(DF.columns):
        lableDict = {}
        uniques = DF[column].unique()
        #print(column,uniques)
        for i, uniqueVal in enumerate(uniques):
            #print('into dict',column,uniqueVal,i)
            lableDict[uniqueVal] = i
        for key,value in lableDict.items():
            #print(key,value)
            DF.iloc[DF[column] == key,cnum] = value
            #print(DF[DF[column] == key])
    return DF.to_numpy()


def loadCsv():
	datasetInto = pd.read_csv('heart_disease_uci.csv')
	#datasetInto = datasetInto.dropna()
	dataset = encodeData(datasetInto.iloc[:,1:])
	for i in range(1,len(dataset)):
		dataset[i] = [float(x) for x in dataset[i]]
	return dataset


# Convert string column to float
def str_column_to_float(dataset, column):
	for row in dataset:
		row[column] = float(row[column].strip())

# Convert string column to integer
def str_column_to_int(dataset, column):
	class_values = [row[column] for row in dataset]
	unique = set(class_values)
	lookup = dict()
	for i, value in enumerate(unique):
		lookup[value] = i
	for row in dataset:
		row[column] = lookup[row[column]]
	return lookup

# Find the min and max values for each column
def dataset_minmax(dataset):
	minmax = list()
	for i in range(len(dataset[0])):
		col_values = [row[i] for row in dataset]
		value_min = min(col_values)
		value_max = max(col_values)
		minmax.append([value_min, value_max])
	return minmax

# Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
	for row in dataset:
		for i in range(len(row)):
			row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
	dataset_split = list()
	dataset_copy = list(dataset)
	fold_size = int(len(dataset) / n_folds)
	for _ in range(n_folds):
		fold = list()
		while len(fold) < fold_size:
			index = randrange(len(dataset_copy))
			fold.append(dataset_copy.pop(index))
		dataset_split.append(fold)
	return dataset_split

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
	correct = 0
	for i in range(len(actual)):
		if actual[i] == predicted[i]:
			correct += 1
	return correct / float(len(actual)) * 100.0

# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
	folds = cross_validation_split(dataset, n_folds)
	scores = list()
	for fold in folds:
		train_set = list(folds)
		train_set.remove(fold)
		train_set = sum(train_set, [])
		test_set = list()
		for row in fold:
			row_copy = list(row)
			test_set.append(row_copy)
			row_copy[-1] = None
		predicted = algorithm(train_set, test_set, *args)
		actual = [row[-1] for row in fold]
		accuracy = accuracy_metric(actual, predicted)
		scores.append(accuracy)
	return scores,actual, predicted

# Calculate the Euclidean distance between two vectors
def euclidean_distance(row1, row2):
	distance = 0.0
	for i in range(len(row1)-1):
		distance += (row1[i] - row2[i])**2
	return sqrt(distance)

# Locate the most similar neighbors
def get_neighbors(train, test_row, num_neighbors):
	distances = list()
	for train_row in train:
		dist = euclidean_distance(test_row, train_row)
		distances.append((train_row, dist))
	distances.sort(key=lambda tup: tup[1])
	neighbors = list()
	for i in range(num_neighbors):
		neighbors.append(distances[i][0])
	return neighbors

# Make a prediction with neighbors
def predict_classification(train, test_row, num_neighbors):
	neighbors = get_neighbors(train, test_row, num_neighbors)
	output_values = [row[-1] for row in neighbors]
	prediction = max(set(output_values), key=output_values.count)
	return prediction

# kNN Algorithm
def k_nearest_neighbors(train, test, num_neighbors):
	predictions = list()
	for row in test:
		output = predict_classification(train, row, num_neighbors)
		predictions.append(output)
	return(predictions)

def getPrecisionRecall(actual,prediction):
    
    #tp,fp, fn, tn = 0
    a0,a1,a2,a3,a4 = 0,0,0,0,0
    p0,p1,p2,p3,p4 = 0,0,0,0,0
    t0,t1,t2,t3,t4 = 0,0,0,0,0
    correct = [a0,a1,a2,a3,a4]
    incorrect = [p0,p1,p2,p3,p4]
    totals = [t0,t1,t2,t3,t4]
    for j in range(0,5):
        for i in range(0,len(actual)):
            #print(actual[i],j)
            print(j,actual[i],prediction[i],totals[j])
            if actual[i] == j:
                totals[j] +=1
                if prediction[i] == actual[i]:
                    correct[j] += 1
                else:
                    incorrect[j] += 1

    #group 1 scores
    #recall = correct / all
    #precision = correct / (correct+incorrect)
    for j in range(0,5):
        recall = np.divide(correct[j], totals[j])
        precision = np.divide(correct[j],(correct[j] + incorrect[j]))
        print('Group %d Results: recall - %f, precision %f, total correct - %d, total incorrect - %d, total - %d'%(j,recall,precision,correct[j],incorrect[j],totals[j]))
    print(totals)
    print(correct)
    print(incorrect)

# Test the kNN on the Iris Flowers dataset
seed(1)
filename = 'heart_disease_uci.csv'
dataset = loadCsv()
newDataset = []
for dataarr in dataset:
	newDataset.append(dataarr.tolist())

# evaluate algorithm
n_folds = 5
num_neighbors = 5
scores,actual, predicted = evaluate_algorithm(newDataset, k_nearest_neighbors, n_folds, num_neighbors)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))
getPrecisionRecall(actual,predicted)

Scores: [54.23728813559322, 47.45762711864407, 52.54237288135594, 54.23728813559322, 61.016949152542374]
Mean Accuracy: 53.898%
0 0.0 0.0 0
0 2.0 2.0 1
0 0.0 0.0 1
0 0.0 0.0 2
0 0.0 0.0 3
0 0.0 0.0 4
0 0.0 2.0 5
0 0.0 0.0 6
0 2.0 2.0 7
0 0.0 0.0 7
0 0.0 0.0 8
0 0.0 0.0 9
0 3.0 2.0 10
0 3.0 0.0 10
0 0.0 0.0 10
0 2.0 2.0 11
0 2.0 0.0 11
0 2.0 3.0 11
0 0.0 0.0 11
0 0.0 0.0 12
0 2.0 2.0 13
0 0.0 0.0 13
0 0.0 0.0 14
0 2.0 0.0 15
0 3.0 0.0 15
0 0.0 0.0 15
0 0.0 0.0 16
0 2.0 0.0 17
0 2.0 0.0 17
0 3.0 0.0 17
0 0.0 0.0 17
0 2.0 0.0 18
0 3.0 0.0 18
0 0.0 0.0 18
0 0.0 0.0 19
0 0.0 0.0 20
0 3.0 0.0 21
0 3.0 2.0 21
0 2.0 0.0 21
0 2.0 0.0 21
0 0.0 0.0 21
0 0.0 0.0 22
0 2.0 2.0 23
0 0.0 0.0 23
0 0.0 0.0 24
0 0.0 2.0 25
0 0.0 0.0 26
0 2.0 0.0 27
0 2.0 2.0 27
0 0.0 0.0 27
0 3.0 0.0 28
0 0.0 2.0 28
0 2.0 2.0 29
0 0.0 0.0 29
0 2.0 3.0 30
0 2.0 0.0 30
0 3.0 0.0 30
0 0.0 0.0 30
0 0.0 0.0 31
1 0.0 0.0 0
1 2.0 2.0 0
1 0.0 0.0 0
1 0.0 0.0 0
1 0.0 0.0 0
1 0.0 0.0 0
1 0.0 2.0 0
1 0.0 0.0 0
1 2.0 2.0 0
1 0.0 0.0

  recall = np.divide(correct[j], totals[j])
  precision = np.divide(correct[j],(correct[j] + incorrect[j]))
