In [53]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import euclidean
from scipy.stats import mode

In [54]:
# read data

data = pd.read_csv('glass.data.txt', index_col = 'Id')
data.head(5)

Unnamed: 0_level_0,Refractive,Na,Mg,Al,Si,K,Ca,Ba,Fe,Class
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
2,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
4,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
5,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [55]:
# split into train and test datasets by ratio

ratio = 0.8

n_train = int(len(data) * ratio)
n_test = len(data) - n_train
indexes = np.full(len(data), 0)
indexes[[np.random.choice(len(data), n_train, replace=False)]] = 1

train_data = data[indexes == 1]
test_data = data[indexes == 0]

In [56]:
# convert to ndarray

train_x = np.array(train_data.values)[:,:-1]
train_y = np.array(train_data.values)[:,-1]

In [57]:
def count_dists(train_x, current):
    # count distances from current unknown point to other known points
    return [euclidean(x, current) for x in train_x]

In [58]:
def get_k_nearest(dists, k):
    # return k nearest neighbors 
    return np.argsort(dists)[:k]    

In [59]:
def get_classes(indexes, train_y):
    # return classes of given points
    return [train_y[index] for index in indexes]

In [60]:
def make_predictions(train_x, train_y, test_x, k):
    # make predictions about class of unknown points in test dataset
    predicted = []
    for current in test_x:
        dists = count_dists(train_x, current)
        nearest = get_k_nearest(dists, k)
        classes = get_classes(nearest, train_y)
        result = mode(classes)[0]
        predicted.append(result)
    return predicted

In [61]:
# convert to ndarray

test_x = np.array(test_data.values)[:,:-1]
test_y = np.array(test_data.values)[:,-1]

In [62]:
def accuracy(result):
    # count accuracy of made predictions
    return sum(result['Real'] == result['Predicted']) / len(result)

In [63]:
# for example make prediction when k = 20 (number of classes = 7)

k = 20
predictions = np.array(make_predictions(train_x, train_y, test_x, k))  
result_data = pd.DataFrame(test_x)
result_data['Predicted'] = predictions
result_data['Real'] = test_y
print (accuracy(result_data))
result_data

0.697674418605


Unnamed: 0,0,1,2,3,4,5,6,7,8,Predicted,Real
0,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,2.0,1.0
1,1.51743,13.3,3.6,1.14,73.09,0.58,8.17,0.0,0.0,1.0,1.0
2,1.51748,12.86,3.56,1.27,73.21,0.54,8.38,0.0,0.17,1.0,1.0
3,1.51784,12.68,3.67,1.16,73.11,0.61,8.7,0.0,0.0,1.0,1.0
4,1.51911,13.9,3.73,1.18,72.12,0.06,8.89,0.0,0.0,1.0,1.0
5,1.51783,12.69,3.54,1.34,72.95,0.57,8.75,0.0,0.0,1.0,1.0
6,1.51567,13.29,3.45,1.21,72.74,0.56,8.57,0.0,0.0,1.0,1.0
7,1.52213,14.21,3.82,0.47,71.77,0.11,9.57,0.0,0.0,1.0,1.0
8,1.51869,13.19,3.37,1.18,72.72,0.57,8.83,0.0,0.16,1.0,1.0
9,1.51905,13.6,3.62,1.11,72.64,0.14,8.76,0.0,0.0,1.0,1.0


In [64]:
# make predictions and count accuracy for different k

for k in range(1, 25):
    predictions = np.array(make_predictions(train_x, train_y, test_x, k))    
    result_data = pd.DataFrame(test_x)
    result_data['Predicted'] = predictions
    result_data['Real'] = test_y
    print ("k =", k, ", accuracy =", accuracy(result_data))

k = 1 , accuracy = 0.720930232558
k = 2 , accuracy = 0.674418604651
k = 3 , accuracy = 0.651162790698
k = 4 , accuracy = 0.674418604651
k = 5 , accuracy = 0.674418604651
k = 6 , accuracy = 0.627906976744
k = 7 , accuracy = 0.674418604651
k = 8 , accuracy = 0.651162790698
k = 9 , accuracy = 0.674418604651
k = 10 , accuracy = 0.697674418605
k = 11 , accuracy = 0.674418604651
k = 12 , accuracy = 0.651162790698
k = 13 , accuracy = 0.651162790698
k = 14 , accuracy = 0.674418604651
k = 15 , accuracy = 0.697674418605
k = 16 , accuracy = 0.697674418605
k = 17 , accuracy = 0.697674418605
k = 18 , accuracy = 0.674418604651
k = 19 , accuracy = 0.697674418605
k = 20 , accuracy = 0.697674418605
k = 21 , accuracy = 0.651162790698
k = 22 , accuracy = 0.651162790698
k = 23 , accuracy = 0.674418604651
k = 24 , accuracy = 0.651162790698


In [65]:
# It seems that all points in this dataset are "located" in one place in the space
# and it is difficult to divide them into separate classes.
# So knn doesn't work good with this dataset and shows the best result with k = 1