In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from scipy.spatial import distance

In [2]:
train_data=pd.read_csv('trainKNN.txt',header=None)
train_data.columns = ['ID', 'RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type of glass']
train_data = train_data.drop('ID', axis=1) # Drop ID since irrelevant to predictions
test_data = pd.read_csv('testKNN.txt', header=None)
test_data.columns=['ID', 'RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type of glass']
test_data = test_data.drop('ID', axis=1)

In [3]:
train_data.head(5)

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type of glass
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [4]:
test_data.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type of glass
0,1.52152,13.05,3.65,0.87,72.32,0.19,9.85,0.0,0.17,1
1,1.52152,13.12,3.58,0.9,72.2,0.23,9.82,0.0,0.16,1
2,1.523,13.31,3.58,0.82,71.99,0.12,10.17,0.0,0.03,1
3,1.51709,13.0,3.47,1.79,72.72,0.66,8.18,0.0,0.0,2
4,1.5166,12.99,3.18,1.23,72.97,0.58,8.81,0.0,0.24,2


In [5]:
print(train_data.describe())
print(test_data.describe())

               RI          Na          Mg          Al          Si           K  \
count  196.000000  196.000000  196.000000  196.000000  196.000000  196.000000   
mean     1.518295   13.375204    2.758980    1.454337   72.635408    0.519388   
std      0.003055    0.783145    1.392641    0.491688    0.763578    0.672703   
min      1.511310   10.730000    0.000000    0.290000   69.810000    0.000000   
25%      1.516458   12.877500    2.362500    1.190000   72.317500    0.140000   
50%      1.517630   13.280000    3.480000    1.360000   72.810000    0.560000   
75%      1.518985   13.792500    3.610000    1.622500   73.080000    0.610000   
max      1.533930   15.790000    4.490000    3.500000   75.180000    6.210000   

               Ca          Ba          Fe  Type of glass  
count  196.000000  196.000000  196.000000     196.000000  
mean     8.910714    0.164235    0.050255       2.668367  
std      1.421490    0.485198    0.086359       2.062416  
min      5.430000    0.000000    0

In [6]:
def std(df):
    for col in df.columns:
        if col !='Type of glass':
            df[col] = (df[col] -df [col].mean()) / df[col].std()
    return df
train =std (train_data)
test= std(test_data)
train.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type of glass
0,0.888537,0.338119,1.242977,-0.720654,-1.120263,-0.682898,-0.11306,-0.33849,-0.581932,1
1,-0.224221,0.657345,0.603903,-0.191863,0.12388,-0.058551,-0.760269,-0.33849,-0.581932,1
2,-0.692234,0.197659,0.568,0.174223,0.464382,-0.19234,-0.795443,-0.33849,-0.581932,1
3,-0.207857,-0.21095,0.668529,-0.33423,-0.033275,0.075237,-0.485909,-0.33849,-0.581932,1
4,-0.286405,-0.134335,0.618265,-0.43592,0.582248,0.045506,-0.591432,-0.33849,-0.581932,1


In [7]:
euclid_model = KNeighborsClassifier(n_neighbors=8, metric=distance.sqeuclidean) 
manhattan_model = KNeighborsClassifier(n_neighbors=8, metric=distance.cityblock) 
x_train = train.drop(["Type of glass"], axis=1)
y_train = train["Type of glass"]
euclid_model.fit(x_train,y_train) # Train models
manhattan_model.fit(x_train, y_train)
x_test = test.drop("Type of glass", axis=1) 
y_test = test["Type of glass"]

In [9]:
manhattan_predictions =manhattan_model.predict(x_test)
euclid_predictions =euclid_model.predict(x_test)
df =pd.DataFrame ({'actual': y_test, 'manhattan': manhattan_predictions, 'euclid': euclid_predictions})
df.head()

Unnamed: 0,actual,manhattan,euclid
0,1,1,1
1,1,1,1
2,1,1,1
3,2,2,2
4,2,1,1


In [10]:
manhattan_count = len(df.loc[df['manhattan'] == df['actual']])
euclid_count = len(df.loc[df['euclid'] == df['actual']])
print('Manhattan Accuracy: {}%'.format(round(100*manhattan_count/len(df), 2)))
print(classification_report(y_test, manhattan_predictions, target_names=df['actual'].astype(str).unique()))
print ('\n')
print('Square Euclidean Accuracy: {}%'.format(round(100*euclid_count/len(df), 2)))
print(classification_report(y_test, euclid_predictions, target_names=df['actual'].astype(str).unique()))

Manhattan Accuracy: 66.67%
              precision    recall  f1-score   support

           1       0.50      1.00      0.67         3
           2       0.25      0.33      0.29         3
           3       0.00      0.00      0.00         3
           5       1.00      0.67      0.80         3
           6       1.00      1.00      1.00         3
           7       1.00      1.00      1.00         3

    accuracy                           0.67        18
   macro avg       0.62      0.67      0.63        18
weighted avg       0.62      0.67      0.63        18



Square Euclidean Accuracy: 61.11%
              precision    recall  f1-score   support

           1       0.60      1.00      0.75         3
           2       0.33      0.67      0.44         3
           3       0.00      0.00      0.00         3
           5       1.00      0.67      0.80         3
           6       1.00      0.33      0.50         3
           7       0.75      1.00      0.86         3

    accuracy  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
