In [29]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import euclidean
from scipy.stats import mode

In [30]:
# read data

data = pd.read_csv('glass.data.txt', index_col = 'Id')
data.head(5)

Unnamed: 0_level_0,Refractive,Na,Mg,Al,Si,K,Ca,Ba,Fe,Class
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
2,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
4,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
5,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [31]:
# split into train and test datasets by ratio

ratio = 0.8

n_train = int(len(data) * ratio)
n_test = len(data) - n_train
indexes = np.full(len(data), 0)
indexes[[np.random.choice(len(data), n_train, replace=False)]] = 1

train_data = data[indexes == 1]
test_data = data[indexes == 0]

In [32]:
# convert to ndarray

train_x = np.array(train_data.values)[:,:-1]
train_y = np.array(train_data.values)[:,-1]

In [33]:
def count_dists(train_x, current):
    # count distances from current unknown point to other known points
    return [euclidean(x, current) for x in train_x]

In [34]:
def get_k_nearest(dists, k):
    # return k nearest neighbors 
    return np.argsort(dists)[:k]    

In [35]:
def get_classes(indexes, train_y):
    # return classes of given points
    return [train_y[index] for index in indexes]

In [36]:
def make_predictions(train_x, train_y, test_x, k):
    # make predictions about class of unknown points in test dataset
    predicted = []
    for current in test_x:
        dists = count_dists(train_x, current)
        nearest = get_k_nearest(dists, k)
        classes = get_classes(nearest, train_y)
        result = mode(classes)[0]
        predicted.append(result)
    return predicted

In [37]:
# convert to ndarray

test_x = np.array(test_data.values)[:,:-1]
test_y = np.array(test_data.values)[:,-1]

In [38]:
def accuracy(result):
    # count accuracy of made predictions
    return sum(result['Real'] == result['Predicted']) / len(result)

In [39]:
# for example make prediction when k = 20 (number of classes = 7)

k = 20
predictions = np.array(make_predictions(train_x, train_y, test_x, k))  
result_data = pd.DataFrame(test_x)
result_data['Predicted'] = predictions
result_data['Real'] = test_y
print (accuracy(result_data))
result_data

0.581395348837


Unnamed: 0,0,1,2,3,4,5,6,7,8,Predicted,Real
0,1.51755,13.0,3.6,1.36,72.99,0.57,8.4,0.0,0.11,1.0,1.0
1,1.51966,14.77,3.75,0.29,72.02,0.03,9.0,0.0,0.0,1.0,1.0
2,1.51764,12.98,3.54,1.21,73.0,0.65,8.53,0.0,0.0,1.0,1.0
3,1.51793,13.21,3.48,1.41,72.64,0.59,8.43,0.0,0.0,2.0,1.0
4,1.51768,12.56,3.52,1.43,73.15,0.57,8.54,0.0,0.0,1.0,1.0
5,1.51784,13.08,3.49,1.28,72.86,0.6,8.49,0.0,0.0,1.0,1.0
6,1.51775,12.85,3.48,1.23,72.97,0.61,8.56,0.09,0.22,1.0,1.0
7,1.51755,12.71,3.42,1.2,73.2,0.59,8.64,0.0,0.0,1.0,1.0
8,1.52223,13.21,3.77,0.79,71.99,0.13,10.02,0.0,0.0,1.0,1.0
9,1.51926,13.2,3.33,1.28,72.36,0.6,9.14,0.0,0.11,1.0,1.0


In [40]:
# make predictions and count accuracy for different k

for k in range(1, 25):
    predictions = np.array(make_predictions(train_x, train_y, test_x, k))    
    result_data = pd.DataFrame(test_x)
    result_data['Predicted'] = predictions
    result_data['Real'] = test_y
    print ("k =", k, ", accuracy =", accuracy(result_data))
    
predictions = np.array(make_predictions(train_x, train_y, test_x, 3))    
result_data = pd.DataFrame(test_x)
result_data['Predicted'] = predictions
result_data['Real'] = test_y

k = 1 , accuracy = 0.604651162791
k = 2 , accuracy = 0.627906976744
k = 3 , accuracy = 0.697674418605
k = 4 , accuracy = 0.651162790698
k = 5 , accuracy = 0.651162790698
k = 6 , accuracy = 0.674418604651
k = 7 , accuracy = 0.604651162791
k = 8 , accuracy = 0.674418604651
k = 9 , accuracy = 0.627906976744
k = 10 , accuracy = 0.674418604651
k = 11 , accuracy = 0.581395348837
k = 12 , accuracy = 0.627906976744
k = 13 , accuracy = 0.651162790698
k = 14 , accuracy = 0.627906976744
k = 15 , accuracy = 0.627906976744
k = 16 , accuracy = 0.627906976744
k = 17 , accuracy = 0.604651162791
k = 18 , accuracy = 0.604651162791
k = 19 , accuracy = 0.558139534884
k = 20 , accuracy = 0.581395348837
k = 21 , accuracy = 0.581395348837
k = 22 , accuracy = 0.604651162791
k = 23 , accuracy = 0.581395348837
k = 24 , accuracy = 0.558139534884


In [41]:
# It seems that all points in this dataset are "located" in one place in the space
# and it is difficult to divide them into separate classes.
# So knn doesn't work good with this dataset and shows the best result with k = 1

In [42]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(train_x, train_y) 
sklearn_y = neigh.predict(test_x)
result_data['Sklearn'] = sklearn_y
result_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,Predicted,Real,Sklearn
0,1.51755,13.0,3.6,1.36,72.99,0.57,8.4,0.0,0.11,1.0,1.0,1.0
1,1.51966,14.77,3.75,0.29,72.02,0.03,9.0,0.0,0.0,1.0,1.0,1.0
2,1.51764,12.98,3.54,1.21,73.0,0.65,8.53,0.0,0.0,1.0,1.0,1.0
3,1.51793,13.21,3.48,1.41,72.64,0.59,8.43,0.0,0.0,3.0,1.0,3.0
4,1.51768,12.56,3.52,1.43,73.15,0.57,8.54,0.0,0.0,1.0,1.0,1.0
5,1.51784,13.08,3.49,1.28,72.86,0.6,8.49,0.0,0.0,1.0,1.0,1.0
6,1.51775,12.85,3.48,1.23,72.97,0.61,8.56,0.09,0.22,1.0,1.0,1.0
7,1.51755,12.71,3.42,1.2,73.2,0.59,8.64,0.0,0.0,1.0,1.0,1.0
8,1.52223,13.21,3.77,0.79,71.99,0.13,10.02,0.0,0.0,1.0,1.0,1.0
9,1.51926,13.2,3.33,1.28,72.36,0.6,9.14,0.0,0.11,2.0,1.0,2.0
