In [1]:
import csv
import heapq
import numpy as np

In [2]:
def read_and_scale_points(dataset_name, label_col_number):
    points = []
    labels = []
    with open (dataset_name, 'r') as fin:
        csvReader = csv.reader(fin, delimiter=',')
        for row in csvReader:
            if csvReader.line_num != 1:
                float_row = []
                for i in range(0, len(row)):
                    if i != label_col_number:
                        float_row.append(float(row[i]))
                floar_array = np.array(float_row)
                points.append(np.interp(floar_array, (floar_array.min(), floar_array.max()), (0, 1)).tolist())
                labels.append(row[label_col_number])
    return points, labels

In [3]:
def kNN(points, labels, x, k):
    kMins = []
    heapq.heapify(kMins)
    for i in range(len(points)):
        point = points[i]
        sum = 0.0
        for j in range(len(point)):
            sum += (point[j] - x[j]) ** 2
        sum = sum ** (0.5)
        if len(kMins) < k:
            heapq.heappush(kMins, (-sum, labels[i]))
        else:
            heapq.heappushpop(kMins, (-sum, labels[i]))
    answer = []
    for i in range(k):
        lbls = {}
        count, lbl = 0, '' 
        for (_, label) in list(kMins):
            lbls[label] = lbls.get(label, 0) + 1
            if lbls[label] >= count:
                count, lbl = lbls[label], label
        answer.append(lbl)
        heapq.heappop(kMins)
    return answer

In [4]:
from tqdm import tqdm_notebook

In [5]:
def LOO(dataset_name, k, label_col_number):
    points, labels = read_and_scale_points(dataset_name, label_col_number)
    loo = [0] * k
    for i in tqdm_notebook(range(len(points))):
        point = points[i]
        left_labels = list(labels)
        del left_labels[points.index(point)]
        left_points = list(points)
        left_points.remove(point)
        
        a = kNN(left_points, left_labels, point, k)
        for j in range(k):
            if a[j] != labels[i]:
                loo[j] += 1
    for j in range(k):
        loo[j] = loo[j] * 1.0 / len(points)
    return loo
                    

In [9]:
loo = LOO('datasets/cancer.csv', 10, 0)
for i in range(len(loo)):
    print("k = " + str(i) + ", loo = " + str(loo[i]))

HBox(children=(IntProgress(value=0, max=569), HTML(value=u'')))

k = 0, loo = 0.0738137082601
k = 1, loo = 0.0755711775044
k = 2, loo = 0.0685413005272
k = 3, loo = 0.0738137082601
k = 4, loo = 0.079086115993
k = 5, loo = 0.0773286467487
k = 6, loo = 0.079086115993
k = 7, loo = 0.0826010544815
k = 8, loo = 0.0966608084359
k = 9, loo = 0.0966608084359


In [10]:
loo = LOO('datasets/spam.csv', 10, 57)
for i in range(len(loo)):
    print("k = " + str(i + 1) + ", loo = " + str(loo[i]))

HBox(children=(IntProgress(value=0, max=4601), HTML(value=u'')))

k = 1, loo = 0.16300804173
k = 2, loo = 0.163660073897
k = 3, loo = 0.152140838948
k = 4, loo = 0.156922408172
k = 5, loo = 0.148011301891
k = 6, loo = 0.148011301891
k = 7, loo = 0.138665507498
k = 8, loo = 0.149315366225
k = 9, loo = 0.125407520104
k = 10, loo = 0.125407520104
