In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# ============================
# 1) Load the data
# ============================

# Feature matrices are used exactly as stored on disk.
X_train = np.load("Document/classification/X_train.npy")
X_test = np.load("Document/classification/X_test.npy")

# Label arrays may have been saved as 2D columns of shape (n, 1), so each one
# is flattened to a 1D vector of shape (n,) right after loading.
y_train = np.load("Document/classification/y_train.npy").ravel()
y_test = np.load("Document/classification/y_test.npy").ravel()

# Print the shape of every array as a quick sanity check.
print("Shapes :")
for label, array in (
    ("X_train :", X_train),
    ("X_test  :", X_test),
    ("y_train :", y_train),
    ("y_test  :", y_test),
):
    print(label, array.shape)


# =====================================================
# 2) Define the models, pipelines and hyperparameter grids
# =====================================================

# Candidate classifiers, keyed by the display name used throughout the script.
# No kernel argument is passed to SVC, so it runs with the library default.
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=2000, random_state=0),
    "SVM RBF": SVC(random_state=0),
    "KNN": KNeighborsClassifier(),
}

# Every classifier is wrapped in its own pipeline that standardizes the
# features first. The step names ("scaler", "clf") are what the "clf__"
# prefixes in param_grids refer to.
models = {
    label: Pipeline([("scaler", StandardScaler()), ("clf", estimator)])
    for label, estimator in classifiers.items()
}

# Hyperparameter values to try for each model, keyed by the same display names.
param_grids = {
    "Logistic Regression": {"clf__C": [0.1, 1, 10]},
    "SVM RBF": {
        "clf__C": [0.1, 1, 10],
        "clf__gamma": ["scale", 0.01, 0.001],
    },
    "KNN": {"clf__n_neighbors": [3, 5, 7, 9]},
}


# =====================================
# 3) Cross-validation on the training set
# =====================================

# Best estimator and best cross-validation score found for each model name.
best_models, cv_results = {}, {}

for name, pipeline in models.items():
    print("\n=== Optimisation du modèle :", name, "===")

    # Grid search over this model's hyperparameters, scored by accuracy with
    # 5-fold cross-validation; only the training data is used here.
    grid = GridSearchCV(
        pipeline,
        param_grids[name],
        scoring="accuracy",
        cv=5,
        n_jobs=-1,
    ).fit(X_train, y_train)

    best_models[name], cv_results[name] = grid.best_estimator_, grid.best_score_

    print("→ Best CV accuracy :", cv_results[name])
    print("→ Best params :", grid.best_params_)


# ===================================
# 4) Pick the best model
# ===================================

# The winner is the entry with the highest cross-validation accuracy; on a
# tie, max() keeps the first such entry in insertion order.
best_model_name, best_cv_accuracy = max(cv_results.items(), key=lambda item: item[1])
best_model = best_models[best_model_name]

print("\n#############################")
print("Meilleur modèle :", best_model_name)
print("CV accuracy :", best_cv_accuracy)
print("#############################")


# ===================================
# 5) Final evaluation on the test set
# ===================================

# Predict with the selected model, then compare against the test labels.
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("\nAccuracy finale sur le test :", test_accuracy)



Shapes :
X_train : (500, 50)
X_test  : (500, 50)
y_train : (500,)
y_test  : (500,)

=== Optimisation du modèle : Logistic Regression ===
→ Best CV accuracy : 0.8379999999999999
→ Best params : {'clf__C': 0.1}

=== Optimisation du modèle : SVM RBF ===
→ Best CV accuracy : 0.844
→ Best params : {'clf__C': 10, 'clf__gamma': 0.001}

=== Optimisation du modèle : KNN ===
→ Best CV accuracy : 0.804
→ Best params : {'clf__n_neighbors': 9}

#############################
Meilleur modèle : SVM RBF
CV accuracy : 0.844
#############################

Accuracy finale sur le test : 0.866
