In [1]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from scipy.spatial import distance

In [25]:
k_train = pd.read_csv("trainKNN.txt", header=None)
k_train.columns = ['ID', 'RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type of glass']

In [26]:
k_train = k_train.drop('ID', axis=1) # Drop ID since irrelevant to predictions

In [27]:
k_train.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type of glass
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [28]:
k_test = pd.read_csv('testKNN.txt', header=None)
k_test.columns=['ID', 'RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type of glass']

In [29]:
k_test = k_test.drop('ID', axis=1)

In [30]:
k_test.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type of glass
0,1.52152,13.05,3.65,0.87,72.32,0.19,9.85,0.0,0.17,1
1,1.52152,13.12,3.58,0.9,72.2,0.23,9.82,0.0,0.16,1
2,1.523,13.31,3.58,0.82,71.99,0.12,10.17,0.0,0.03,1
3,1.51709,13.0,3.47,1.79,72.72,0.66,8.18,0.0,0.0,2
4,1.5166,12.99,3.18,1.23,72.97,0.58,8.81,0.0,0.24,2


In [32]:
def standardize (df):
    for col in df.columns:
        if col != "Type of glass": # Don't standardize the categories
            df[col] = (df[col] - df[col].mean())/df[col].std()
    return df

In [35]:
# Standardize the data
k_train = standardize(k_train)
k_test = standardize(k_test)
k_train.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type of glass
0,0.888537,0.338119,1.242977,-0.720654,-1.120263,-0.682898,-0.11306,-0.33849,-0.581932,1
1,-0.224221,0.657345,0.603903,-0.191863,0.12388,-0.058551,-0.760269,-0.33849,-0.581932,1
2,-0.692234,0.197659,0.568,0.174223,0.464382,-0.19234,-0.795443,-0.33849,-0.581932,1
3,-0.207857,-0.21095,0.668529,-0.33423,-0.033275,0.075237,-0.485909,-0.33849,-0.581932,1
4,-0.286405,-0.134335,0.618265,-0.43592,0.582248,0.045506,-0.591432,-0.33849,-0.581932,1


### Square Euclidean distance model

In [38]:
# k = 8 neighbors
euclid_model = KNeighborsClassifier(n_neighbors=8, metric=distance.sqeuclidean)

### Manhattan distance model

In [37]:
manhattan_model = KNeighborsClassifier(n_neighbors=8, metric=distance.cityblock)

In [41]:
x_train = k_train.drop(["Type of glass"], axis=1)
y_train = k_train["Type of glass"]

#### Train the models

In [42]:
euclid_model.fit(x_train,y_train)
manhattan_model.fit(x_train, y_train)

KNeighborsClassifier(metric=<function cityblock at 0x7f9d99924280>,
                     n_neighbors=8)

In [43]:
x_test = k_test.drop("Type of glass", axis=1) 
y_test = k_test["Type of glass"]

#### Predictions

In [58]:
manhattan_predictions = manhattan_model.predict(x_test)
euclid_predictions = euclid_model.predict(x_test) 

In [60]:
df = pd.DataFrame({'actual': y_test, 'manhattan': manhattan_predictions, 'euclid': euclid_predictions})
df.head()

Unnamed: 0,actual,manhattan,euclid
0,1,1,1
1,1,1,1
2,1,1,1
3,2,2,2
4,2,1,1


In [61]:
manhattan_count = len(df.loc[df['manhattan'] == df['actual']])
print('Manhattan Accuracy: {}%'.format(round(100*manhattan_count/len(df), 2)))
print(classification_report(y_test, manhattan_predictions, target_names=df['actual'].astype(str).unique()))

Manhattan Accuracy: 66.67%
              precision    recall  f1-score   support

           1       0.50      1.00      0.67         3
           2       0.25      0.33      0.29         3
           3       0.00      0.00      0.00         3
           5       1.00      0.67      0.80         3
           6       1.00      1.00      1.00         3
           7       1.00      1.00      1.00         3

    accuracy                           0.67        18
   macro avg       0.62      0.67      0.63        18
weighted avg       0.62      0.67      0.63        18



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [57]:
euclid_count = len(df.loc[df['euclid'] == df['actual']])
print('Square Euclidean Accuracy: {}%'.format(round(100*euclid_count/len(df), 2)))
print(classification_report(y_test, euclid_predictions, target_names=df['actual'].astype(str).unique()))

Square Euclidean Accuracy: 61.11%
              precision    recall  f1-score   support

           1       0.60      1.00      0.75         3
           2       0.33      0.67      0.44         3
           3       0.00      0.00      0.00         3
           5       1.00      0.67      0.80         3
           6       1.00      0.33      0.50         3
           7       0.75      1.00      0.86         3

    accuracy                           0.61        18
   macro avg       0.61      0.61      0.56        18
weighted avg       0.61      0.61      0.56        18



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
