In [1]:
import numpy as np
import random
import seaborn as sns
import warnings

import matplotlib.cm as cm
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, recall_score, precision_score, f1_score, accuracy_score
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.manifold import TSNE
from umap import UMAP


warnings.filterwarnings("ignore")

%matplotlib inline

In [2]:
def read_data(filename: str, delimeter=';'):
    with open(filename, mode="r", newline='') as f:
        features = list()
        targets = list()
        for row in f:
            line = list()
            for s in row.split(delimeter):
                s = s.strip()
                if s:
                    line.append(s)
            
            t = int(line[-1])
            if t < 3:
                features.append(list(map(float ,line[1:-1])))
                targets.append(t-1)
            if t == 6:
                features.append(list(map(float ,line[1:-1])))
                targets.append(2)
            if t == 7:
                features.append(list(map(float ,line[1:-1])))
                targets.append(3)
    return features, targets

In [3]:
def make_meshgrid(x, y, h=.02):
        x_min, x_max = x.min() - 1, x.max() + 1
        y_min, y_max = y.min() - 1, y.max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                            np.arange(y_min, y_max, h))
        return xx, yy

def plot_contours(model, xx, yy, ax, **params):
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out

def plot_surface(model, xx, yy, ax, **params):
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.plot_surface(xx, yy, Z, **params)
    return out

In [4]:
def plot_results(features, targets, model, figsize=(10, 5), axis='off', in3d=False, colormap='coolwarm', rot=30):
    xx, yy = make_meshgrid(features[:, 0], features[:, 1])

    fig = plt.figure(figsize=figsize)
    
    if in3d:
        fig = plt.figure()
        ax = fig.add_subplot(projection='3d')
    else:
        ax = fig.gca()
    
    colormap = colormap
    labels = np.unique(targets).tolist()
    palette = np.array(sns.color_palette(colormap, n_colors=len(labels)))
    cmap = sns.color_palette(colormap, as_cmap=True)


    if in3d:
        plot_surface(model, xx, yy, ax, cmap=cmap, alpha=0.8)
    else:
        plot_contours(model, xx, yy, ax, cmap=cmap, alpha=0.8)
    ax.scatter(features[:, 0], features[:, 1], c=targets, cmap=cmap, s=40, edgecolors='k')
    plt.axis(axis)
    if in3d:
        ax.view_init(40, rot)
    plt.show()

In [5]:
features, targets = read_data('glass.data', delimeter=',')
features = np.array(features)
targets = np.array(targets)

In [6]:
print(classification_report(targets, targets))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        70
           1       1.00      1.00      1.00        76
           2       1.00      1.00      1.00         9
           3       1.00      1.00      1.00        29

    accuracy                           1.00       184
   macro avg       1.00      1.00      1.00       184
weighted avg       1.00      1.00      1.00       184



In [7]:
np.unique(targets)

array([0, 1, 2, 3])

In [8]:
smote_features, smote_targets = SMOTE().fit_resample(features, targets)

In [9]:
print(classification_report(smote_targets, smote_targets))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        76
           1       1.00      1.00      1.00        76
           2       1.00      1.00      1.00        76
           3       1.00      1.00      1.00        76

    accuracy                           1.00       304
   macro avg       1.00      1.00      1.00       304
weighted avg       1.00      1.00      1.00       304



In [10]:
smote_features = UMAP().fit_transform(smote_features)
smote_features = StandardScaler().fit_transform(smote_features)

In [11]:
x_train, x_test, y_train, y_test = train_test_split(smote_features, smote_targets, test_size=0.3, stratify=smote_targets, shuffle=True)
x_train.shape, x_test.shape

((212, 2), (92, 2))

# SVM (SVC)

In [12]:
svc_parameters = {
    'kernel':['poly', 'rbf', 'sigmoid'],
    'C': np.linspace(1, 10, 11),
    'degree': np.linspace(3, 6, 4)
}

clf_svc = GridSearchCV(SVC(), svc_parameters)

In [13]:
clf_svc.fit(x_train, y_train)

In [14]:
clf_svc.best_params_

{'C': 1.9, 'degree': 3.0, 'kernel': 'rbf'}

In [15]:
print(classification_report(y_train, clf_svc.predict(x_train), digits=3, zero_division=0))

              precision    recall  f1-score   support

           0      0.768     0.811     0.789        53
           1      0.721     0.585     0.646        53
           2      0.742     0.925     0.824        53
           3      0.915     0.811     0.860        53

    accuracy                          0.783       212
   macro avg      0.787     0.783     0.780       212
weighted avg      0.787     0.783     0.780       212



In [16]:
print(classification_report(y_test, clf_svc.predict(x_test), digits=3, zero_division=0))

              precision    recall  f1-score   support

           0      0.750     0.783     0.766        23
           1      0.737     0.609     0.667        23
           2      0.733     0.957     0.830        23
           3      0.947     0.783     0.857        23

    accuracy                          0.783        92
   macro avg      0.792     0.783     0.780        92
weighted avg      0.792     0.783     0.780        92



In [17]:
best_svc = SVC(**clf_svc.best_params_)

In [18]:
best_svc.fit(x_train, y_train)

In [19]:
len(best_svc.support_vectors_)

149

In [None]:
plot_results(smote_features, smote_targets, best_svc, figsize=(15, 10))

In [None]:
plot_results(smote_features, smote_targets, best_svc, figsize=(13, 13), in3d=True, axis='on')

# KNN

In [None]:
knn_parameters = {
    'n_neighbors': np.linspace(5, 15, 10, dtype=np.int32),
    'algorithm': ['ball_tree', 'kd_tree', 'brute'],
    'weights': ['distance', 'uniform'],
    'p': [1, 2]
}

clf_knn = GridSearchCV(KNeighborsClassifier(), knn_parameters)

In [None]:
clf_knn.fit(x_train, y_train)

In [None]:
clf_knn.best_params_

In [None]:
print(classification_report(y_train, clf_knn.predict(x_train), digits=3, zero_division=0))

In [None]:
print(classification_report(y_test, clf_knn.predict(x_test), digits=3, zero_division=0))

In [None]:
plot_results(smote_features, smote_targets, clf_knn, figsize=(15, 10))

In [None]:
plot_results(smote_features, smote_targets, clf_knn, in3d=True, figsize=(13, 13), rot=45, axis='on')

# RandomForest

In [None]:
rf_parameters = {
    'n_estimators': np.linspace(5, 50, 10, dtype=np.int32),
    'criterion': ['gini', 'entropy'],
    'max_depth': np.linspace(5, 10, 5, dtype=np.int32),
}

clf_rf = GridSearchCV(RandomForestClassifier(), rf_parameters)

In [None]:
clf_rf.fit(x_train, y_train)

In [None]:
clf_rf.best_params_

In [None]:
print(classification_report(y_train, clf_rf.predict(x_train), digits=3, zero_division=0))

In [None]:
print(classification_report(y_test, clf_rf.predict(x_test), digits=3, zero_division=0))

In [None]:
plot_results(smote_features, smote_targets, clf_rf, figsize=(15, 10))

In [None]:
plot_results(smote_features, smote_targets, clf_rf, in3d=True, figsize=(13, 13), axis='on', rot=130)

# Compare

In [None]:
def plot_bars(features, targets):
    svc_accuracy = round(accuracy_score(targets, best_svc.predict(features)) * 100, 1)
    knn_accuracy = round(accuracy_score(targets, clf_knn.predict(features)) * 100, 1)
    rf_accuracy = round(accuracy_score(targets, clf_rf.predict(features)) * 100, 1)

    svc_precision = round(precision_score(targets, best_svc.predict(features), average='macro') * 100, 1)
    knn_precision = round(precision_score(targets, clf_knn.predict(features), average='macro') * 100, 1)
    rf_precision = round(precision_score(targets, clf_rf.predict(features), average='macro') * 100, 1)

    svc_f1 = round(f1_score(targets, best_svc.predict(features), average='macro') * 100, 1)
    knn_f1 = round(f1_score(targets, clf_knn.predict(features), average='macro') * 100, 1)
    rf_f1 = round(f1_score(targets, clf_rf.predict(features), average='macro') * 100, 1)

    svc_recall = round(recall_score(targets, best_svc.predict(features), average='macro') * 100, 1)
    knn_recall = round(recall_score(targets, clf_knn.predict(features), average='macro') * 100, 1)
    rf_recall = round(recall_score(targets, clf_rf.predict(features), average='macro') * 100, 1)

    labels = ['accuracy', 'precision', 'recall', 'f1']
    svc_metrics = [svc_accuracy, svc_precision, svc_recall, svc_f1]
    knn_metrics = [knn_accuracy, knn_precision, knn_recall, knn_f1]
    rf_metrics = [rf_accuracy, rf_precision, rf_recall, rf_f1]

    x = np.arange(len(labels))
    width = 0.2

    fig, ax = plt.subplots()

    fig.set_size_inches(12, 5)

    svc_bar = ax.bar(x-width-0.01, svc_metrics, width, label='SVC')
    knn_bar = ax.bar(x, knn_metrics, width, label='KNN')
    rf_bar = ax.bar(x+width+0.01, rf_metrics, width, label='RF')

    # Add some text for labels, title and custom x-axis tick labels, etc.
    ax.set_ylabel('Scores, %')
    ax.set_title('Scores for SVC, KNN, RF')
    ax.set_xticks(x)
    ax.set_xticklabels(labels)
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1),
            fancybox=True, shadow=True, ncol=3)

    ax.bar_label(svc_bar)
    ax.bar_label(knn_bar)
    ax.bar_label(rf_bar)

    fig.tight_layout()

    plt.show()

In [None]:
plot_bars(x_train, y_train)

In [None]:
plot_bars(x_test, y_test)