In [1]:
import numpy as np
import pandas as pd
import torch
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier, NeighborhoodComponentsAnalysis
from sklearn.pipeline import Pipeline
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text

from src.base.training.models.architectures.lenet import LeNet
from src.base.training.models.architectures.lenet_light import LeNetLight

In [2]:
DATA_PATH = "C:\\Users\\micdu\\Code\\pythonProject\\dmtl\\data"

def load_samples(dataset_fn, n_samples, train=True):
    dataset = dataset_fn(
        DATA_PATH,
        train=train,
        download=True,
        transform=ToTensor()
    )
    loader = DataLoader(dataset, batch_size=n_samples)
    return next(iter(loader))

def shuffle(x, y):
    shuffle_index = torch.randperm(x.shape[0])
    return x[shuffle_index], y[shuffle_index]

def load_model(model_fn, path):
    model = model_fn()
    model.load_state_dict(torch.load(path))
    model.eval()
    return model

def use_models(x, model_fn, paths):
    x_out = load_model(model_fn, paths[0])(x)
    for path in paths[1:]:
        model = load_model(model_fn, path)
        x_out = torch.cat((x_out, model(x)), dim=1)
    return x_out

def load_and_prepare(n_samples=100, train=True, model_fn=LeNet):
    mnist_x, mnist_y = load_samples(datasets.MNIST, n_samples, train=train)
    x, y = shuffle(mnist_x, mnist_y)
    x_out = use_models(x, model_fn, [
        "models/daeclust_07/6254e814cad034bdf8068d43f347bd6ac5195d7825c8a89469f5c3ecb00e6684/final_model.state",
        "models/daeclust_07/bc921caf11e52a378db890398a1362e799a8a6a96f8d7f31038bd2d03a3ab7d0/final_model.state",
        "models/daeclust_07/d018ae7db86692ad7df89f6b3d85848cb5277c2aae418b1270b96508b4cfdbf8/final_model.state"
    ])
    return x_out.detach().numpy(), y.detach().numpy()

In [3]:
x_test, y_test = load_and_prepare(model_fn=LeNet, n_samples=5000, train=False)

predictions_argmax = np.argmax(x_test, axis=1) % 10
classification_report(predictions_argmax, y_test, output_dict=True)

{'0': {'precision': 0.6934782608695652,
  'recall': 0.9300291545189504,
  'f1-score': 0.7945205479452054,
  'support': 343},
 '1': {'precision': 0.8774080560420315,
  'recall': 0.9728155339805825,
  'f1-score': 0.9226519337016574,
  'support': 515},
 '2': {'precision': 0.07735849056603773,
  'recall': 0.803921568627451,
  'f1-score': 0.14113597246127366,
  'support': 51},
 '3': {'precision': 0.992,
  'recall': 0.2848937392303274,
  'f1-score': 0.44265952699687633,
  'support': 1741},
 '4': {'precision': 1.0,
  'recall': 0.5015045135406219,
  'f1-score': 0.6680026720106881,
  'support': 997},
 '5': {'precision': 0.9824561403508771,
  'recall': 0.5108323831242874,
  'f1-score': 0.6721680420105026,
  'support': 877},
 '6': {'precision': 0.31601731601731603,
  'recall': 0.9733333333333334,
  'f1-score': 0.477124183006536,
  'support': 150},
 '7': {'precision': 0.181640625,
  'recall': 1.0,
  'f1-score': 0.3074380165289256,
  'support': 93},
 '8': {'precision': 0.081799591002045,
  'recall'

In [4]:
x_test[0]

array([-2.9567780e+00, -2.9694698e+00, -2.9771998e+00, -3.4705944e+00,
       -3.4944117e+00, -3.5562758e+00, -2.5571110e+00, -1.4094365e+00,
       -1.3593700e+00, -1.7310058e+00, -3.4170986e+01, -3.4157955e+01,
       -3.4088192e+01,  0.0000000e+00, -1.8760986e+01, -1.9427149e+01,
       -2.8975773e+01, -2.9635868e+01, -2.9866068e+01, -2.9558195e+01,
       -8.2453241e+00, -4.4450626e+00, -1.2474253e-02, -1.1553190e+01,
       -1.1525580e+01, -1.1462899e+01, -9.3638458e+00, -9.2379446e+00,
       -9.2419968e+00, -9.3391886e+00], dtype=float32)

In [5]:
y_test[0]

2

In [6]:
np.reshape(x_test, (-1, 3, 10))[0]

array([[-2.9567780e+00, -2.9694698e+00, -2.9771998e+00, -3.4705944e+00,
        -3.4944117e+00, -3.5562758e+00, -2.5571110e+00, -1.4094365e+00,
        -1.3593700e+00, -1.7310058e+00],
       [-3.4170986e+01, -3.4157955e+01, -3.4088192e+01,  0.0000000e+00,
        -1.8760986e+01, -1.9427149e+01, -2.8975773e+01, -2.9635868e+01,
        -2.9866068e+01, -2.9558195e+01],
       [-8.2453241e+00, -4.4450626e+00, -1.2474253e-02, -1.1553190e+01,
        -1.1525580e+01, -1.1462899e+01, -9.3638458e+00, -9.2379446e+00,
        -9.2419968e+00, -9.3391886e+00]], dtype=float32)

In [7]:
pred_sum = np.argmax(np.sum(np.reshape(x_test, (-1, 3, 10)), axis=1), axis=1)
classification_report(pred_sum, y_test, output_dict=True)

{'0': {'precision': 0.4282608695652174,
  'recall': 0.9704433497536946,
  'f1-score': 0.5942684766214178,
  'support': 203},
 '1': {'precision': 0.7740805604203153,
  'recall': 0.9888143176733781,
  'f1-score': 0.868369351669941,
  'support': 447},
 '2': {'precision': 0.013207547169811321,
  'recall': 0.875,
  'f1-score': 0.026022304832713755,
  'support': 8},
 '3': {'precision': 0.994,
  'recall': 0.27010869565217394,
  'f1-score': 0.42478632478632483,
  'support': 1840},
 '4': {'precision': 1.0,
  'recall': 0.4314063848144953,
  'f1-score': 0.6027727546714888,
  'support': 1159},
 '5': {'precision': 0.9868421052631579,
  'recall': 0.45546558704453444,
  'f1-score': 0.6232686980609419,
  'support': 988},
 '6': {'precision': 0.30735930735930733,
  'recall': 0.9793103448275862,
  'f1-score': 0.46787479406919275,
  'support': 145},
 '7': {'precision': 0.19921875,
  'recall': 0.9902912621359223,
  'f1-score': 0.3317073170731707,
  'support': 103},
 '8': {'precision': 0.0408997955010225,
 

In [8]:
def use_pca(x_train, y_train, x_test, y_test, train_test_fn, n_components=15):
    if isinstance(n_components, int) and n_components > 0:
        pca = PCA(n_components=n_components)
        x_train = pca.fit_transform(x_train, y_train)
        x_test = pca.transform(x_test)
    return train_test_fn(x_train, y_train, x_test, y_test)

In [9]:
def train_test_decision_tree(x_train, y_train, x_test, y_test):
    decision_tree = DecisionTreeClassifier(random_state=0, max_depth=25)
    decision_tree = decision_tree.fit(x_train, y_train)
    tree_pred = decision_tree.predict(x_test)
    return classification_report(tree_pred, y_test, output_dict=True)

In [10]:
def train_test_svm(x, y, x_test, y_test):
    # LinearSVC, ovo, ovr
    svm_clf = svm.SVC()
    svm_clf.fit(x, y)
    svm_pred = svm_clf.predict(x_test)
    return classification_report(svm_pred, y_test, output_dict=True)

In [11]:
# https://scikit-learn.org/stable/modules/naive_bayes.html
def train_test_gnb(x, y, x_test, y_test):
    gnb = GaussianNB()
    gnb = gnb.fit(x, y)
    gnb_pred = gnb.predict(x_test)
    return classification_report(gnb_pred, y_test, output_dict=True)

In [12]:
# https://scikit-learn.org/stable/modules/neighbors.html
def train_test_neighbors(x_train, y_train, x_test, y_test):
    nca = NeighborhoodComponentsAnalysis(random_state=42)
    knn = KNeighborsClassifier(n_neighbors=5)
    nca_pipe = Pipeline([('nca', nca), ('knn', knn)])
    nca_pipe.fit(x_train, y_train)
    nca_knn_preds = nca_pipe.predict(x_test)
    return classification_report(nca_knn_preds, y_test, output_dict=True)

In [13]:
def train_test_random_forest(x_train, y_train, x_test, y_test):
    rnd_forest = RandomForestClassifier(max_depth=10, random_state=0)
    rnd_forest.fit(x_train, y_train)
    forest_pred = rnd_forest.predict(x_test)
    return classification_report(forest_pred, y_test, output_dict=True)

In [None]:
def train_test(n_samples, train_test_fn):
    x_train, y_train = load_and_prepare(model_fn=LeNet, n_samples=n_samples, train=True)
    x_test, y_test = load_and_prepare(model_fn=LeNet, n_samples=5000, train=False)
    results = []
    result = train_test_fn(x_train, y_train, x_test, y_test)
    result["pca"] = 0
    result["n_samples"] = n_samples
    result["classifier"] = train_test_fn.__name__
    results.append(result)
    pca_components = [3, 6, 9, 12, 15, 18, 21]
    for n_components in pca_components:
        print("n_components", n_components)
        result = use_pca(x_train, y_train, x_test, y_test, train_test_fn, n_components=n_components)
        result["pca"] = n_components
        result["n_samples"] = n_samples
        result["classifier"] = train_test_fn.__name__
        results.append(result)
    return results

def train_test_all(n_samples):
    tree_results = train_test(n_samples, train_test_decision_tree)
    svm_results = train_test(n_samples, train_test_svm)
    gnb_results = train_test(n_samples, train_test_gnb)
    neighbors = train_test(n_samples, train_test_neighbors)
    rnd_results = train_test(n_samples, train_test_random_forest)
    return tree_results, svm_results, gnb_results, neighbors, rnd_results

def train_test_samples(samples):
    exps = []
    for n_samples in samples:
        tree_results, svm_results, gnb_results, neighbors, rnd_results = train_test_all(n_samples)
        exps += tree_results
        exps += svm_results
        exps += gnb_results
        exps += neighbors
        exps += rnd_results
    return exps

exps = train_test_samples([50, 100, 200, 500, 1000, 2000, 5000])



In [None]:
exps_flat = [{
    "n_components": exp.get("pca"),
    "n_samples": exp.get("n_samples"),
    "classifier": exp.get("classifier"),
    "accuracy": exp.get("accuracy")
} for exp in exps]

In [None]:
df = pd.DataFrame(exps_flat)

maxes = df.groupby(["classifier", "n_samples"])["accuracy"].max().reset_index()

In [None]:
maxes["classifier"] = maxes["classifier"].replace('train_test_decision_tree', "Decision Tree")
maxes["classifier"] = maxes["classifier"].replace('train_test_gnb', "Gaussian Naive Bayesian")
maxes["classifier"] = maxes["classifier"].replace('train_test_neighbors', "Nearest Neighbours")
maxes["classifier"] = maxes["classifier"].replace('train_test_random_forest', "Random Forest")
maxes["classifier"] = maxes["classifier"].replace('train_test_svm', "SVM")

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

ax = sns.lineplot(data=maxes, x="n_samples", y="accuracy", hue="classifier")
ax.set_title("Précision du modèle en fonction du nombre d'échantillon")
ax.set_ylabel("Précision")
ax.set_xlabel("Nombre d'échantillons")
plt.savefig("classification_acc_balanced.jpg")
plt.show()