In [1]:
import numpy as np
import seaborn as sns
import warnings

import matplotlib.cm as cm
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt


from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, recall_score, precision_score, f1_score
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.manifold import TSNE
from umap import UMAP

warnings.filterwarnings("ignore")

%matplotlib inline

In [3]:
def read_data(filename: str, delimeter=';'):
    with open(filename, mode="r", newline='') as f:
        features = list()
        targets = list()
        for row in f:
            line = list()
            for s in row.split(delimeter):
                s = s.strip()
                if s:
                    line.append(s)
            
            t = int(line[-1])
            if t < 3:
                t -= 1
                features.append(list(map(float ,line[1:-1])))
                targets.append(t)
    return features, targets

In [4]:
def plot_embeddings(embedded_tsne, targets, predicted):
    labels = list(range(np.max(targets)+1))
    palette = np.array(sns.color_palette(n_colors=len(labels)))

    patchs = []
    for i, color in enumerate(palette):
        patchs.append(mpatches.Patch(color=color, label=i))


    plt.figure(figsize=(8, 4))
    plt.subplot(1, 2, 1)
    plt.scatter(embedded_tsne[:,0], embedded_tsne[:,1], c=palette[targets])
    plt.legend(handles=patchs, loc='upper right')
    plt.title("Targets")

    plt.subplot(1, 2, 2)
    plt.scatter(embedded_tsne[:,0], embedded_tsne[:,1], c=palette[predicted])
    plt.legend(handles=patchs, loc='upper right')
    plt.title("Predictions")

    plt.show()

In [5]:
features, targets = read_data('glass.data', delimeter=',')
features = np.array(features)
features = UMAP().fit_transform(features)
features = StandardScaler().fit_transform(features)

targets = np.array(targets)

In [6]:
parameters = {
    'n_neighbors': np.linspace(5, 15, 10, dtype=np.int32),
    'algorithm': ['ball_tree', 'kd_tree', 'brute'],
    'weights': ['distance', 'uniform'],
    'p': [1, 2]
}

clf = GridSearchCV(KNeighborsClassifier(), parameters)

In [8]:
clf.fit(features, targets)

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'algorithm': ['ball_tree', 'kd_tree', 'brute'],
                         'n_neighbors': array([ 5,  6,  7,  8,  9, 10, 11, 12, 13, 15]),
                         'p': [1, 2], 'weights': ['distance', 'uniform']})

In [9]:
clf.best_params_

{'algorithm': 'ball_tree', 'n_neighbors': 10, 'p': 2, 'weights': 'uniform'}

In [10]:
print(classification_report(targets, clf.predict(features), digits=3, zero_division=0))

              precision    recall  f1-score   support

           0      0.792     0.871     0.830        70
           1      0.870     0.789     0.828        76

    accuracy                          0.829       146
   macro avg      0.831     0.830     0.829       146
weighted avg      0.832     0.829     0.829       146

