In [1]:
from csv import reader
from math import sqrt
import numpy as np

In [16]:
 # Load a CSV file
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset
 
# Convert string column to float
def str_column_to_float(dataset, column):
	for row in dataset:
		row[column] = float(row[column].strip())
 
# Convert string column to integer
def str_column_to_int(dataset, column):
	class_values = [row[column] for row in dataset]
	unique = set(class_values)
	lookup = dict()
	for i, value in enumerate(unique):
		lookup[value] = i
		print('[%s] => %d' % (value, i))
	for row in dataset:
		row[column] = lookup[row[column]]
	return lookup
 
# Find the min and max values for each column
def dataset_minmax(dataset):
	minmax = list()
	for i in range(len(dataset[0])):
		col_values = [row[i] for row in dataset]
		value_min = min(col_values)
		value_max = max(col_values)
		minmax.append([value_min, value_max])
	return minmax
 
# Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
	for row in dataset:
		for i in range(len(row)):
			row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])
 
# Calculate the Euclidean distance between two vectors
def euclidean_distance(row1, row2):
	distance = 0.0
	for i in range(len(row1)-1):
		distance += (row1[i] - row2[i])**2
	return sqrt(distance)
 
# Locate the most similar neighbors
def get_neighbors(train, test_row, num_neighbors):
    distances = list()
    for train_row in train:
        dist = euclidean_distance(test_row, train_row)
        distances.append((train_row, dist))
    distances.sort(key=lambda tup: tup[1])
    neighbors = [distances[i][0] for i in range(num_neighbors)]
    return neighbors
 
    
from collections import Counter 
  
def majority(arr): 
  
    # convert array into dictionary 
    freqDict = Counter(arr) 
  
    # traverse dictionary and check majority element 
    size = len(arr) 
    for (key,val) in freqDict.items(): 
         if (val > (size/2)): 
             # print(key) 
             return key
    #print('None')
    return -1
    
# Make a prediction with neighbors
def predict_classification(train, num_neighbors, test_row, prediction_type='majority'):
    neighbors = get_neighbors(train, test_row, num_neighbors)
    output_values = [row[-1] for row in neighbors]
    
    print ("Closet datapoints: ")
    print (neighbors)
    
    print ("Closet datapoints classes: ", end =" ")
    print (output_values)
    
    if prediction_type == 'majority':
        prediction = majority(output_values)
    else:
        prediction = max(set(output_values), key=output_values.count)
    return prediction
 

In [18]:
def prepare_dataset(filename):
    dataset = load_csv(filename)
    #print (dataset)
    for i in range(len(dataset[0])-1):
        str_column_to_float(dataset, i)

    str_column_to_int(dataset, len(dataset[0])-1)
    return dataset

def run_knn(dataset, num_neighbors, input_data):
    label = predict_classification(dataset, num_neighbors, input_data)
    print('Input Data %s -> Prediction: %s' % (input_data, label))


dataset = prepare_dataset('example_knn.csv')
run_knn(dataset, num_neighbors=3, input_data=[3,4] )

[Good] => 0
[Bad] => 1
Closet datapoints: 
[[3.0, 7.0, 1], [3.0, 3.0, 0], [3.0, 2.0, 1]]
Closet datapoints classes:  [1, 0, 1]
Data [3, 4], Prediction: 1


In [19]:
run_knn(dataset, num_neighbors=5, input_data=[3,4] )

Closet datapoints: 
[[3.0, 7.0, 1], [3.0, 3.0, 0], [3.0, 2.0, 1], [2.0, 1.0, 0], [1.0, 1.0, 0]]
Closet datapoints classes:  [1, 0, 1, 0, 0]
Data [3, 4], Prediction: 0
