In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split


In [3]:
df = pd.read_csv("features.csv", sep=";")
df = df.drop('semanticobjscore', axis=1) 
df = df.drop('semanticsubjscore', axis=1) 
df = df.drop('URL', axis=1) 

In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


X = df.drop(['Label', 'TextID'], axis=1)  
y = df['Label']

y_encoded = y.map({'objective': 1.0, 'subjective': 0.0}).astype(float)


# Refaire la séparation train/test/val avec les labels encodés
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded,
    test_size=0.2,          
    random_state=3,         
    stratify=y_encoded      
)

La fonction MLPClassifier prend par défaut ReLu comme fonction d'activation et la fonction sigmoïde pour la couche de sortie (car on est dans un cas binaire). L'algorithme par défaut est l'algorithme Adam (avec taux d'apprentissage 0,001 et taux de régularisation 0,0001)

LÀ ON VA UTILISER RANDOMSEARCH POUR CHOISIR LES HYPERPARAMÈTRES AU MIEUX 

In [5]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from scipy.stats import loguniform, uniform, randint 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler



scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


model_base = MLPClassifier(max_iter=2000, random_state=3)



# Nous définissons l'espace de recherche (le "param_dist")
param_dist = {
    # 1. Architecture (hidden_layer_sizes) : C'est une liste de tuples à choisir
    'hidden_layer_sizes': [(50,), (100,), (50, 25), (100, 50), (128, 64)], 
    
    # 2. Régularisation (alpha) : Loguniform est idéal pour les paramètres de régularisation
    'alpha': loguniform(1e-4, 1e-0), 
    
    # 3. Optimisation (learning_rate_init) : Loguniform est aussi bon pour le taux d'apprentissage
    'learning_rate_init': loguniform(1e-5, 1e-1), # On utilise log pour augmenter le poids accordé aux valeurs entre 0.000001 et 0.1
    # 4. Fonction d'activation : Une liste de choix discrets
    'activation': ['tanh', 'relu'],
    
    # 5. Type de solveur : Une liste de choix discrets
    'solver': ['adam', 'sgd'] 
}


# Instanciation de RandomizedSearchCV
# n_iter=50 : Le nombre total d'itérations aléatoires que nous voulons tester (compromis temps/précision)
# scoring='f1' : La métrique sur laquelle nous arbitrons (maximiser le score de précision)
# cv=5 : Utilise la validation croisée K-Fold avec 5 plis (plus fiable qu'un simple split Train/Val)
# verbose=2 : Affiche le progrès détaillé
# n_jobs=-1 : Utilise tous les cœurs du processeur pour accélérer le calcul
random_search = RandomizedSearchCV(
    estimator=model_base,
    param_distributions=param_dist,
    n_iter=50, 
    scoring='precision', 
    cv=5, 
    random_state=3, 
    verbose=2,
    n_jobs=-1 
)

# X_train et y_train sont les données utilisées pour la recherche d'hyperparamètres.
# L'objet random_search effectue l'entraînement et la validation croisée en interne.
print("Début de la recherche aléatoire...")
random_search.fit(X_train_scaled, y_train) 
print("Recherche aléatoire terminée.")


# Récupération du meilleur jeu d'hyperparamètres (celui qui a maximisé le Score de précision CV)
best_params = random_search.best_params_

# Récupération du modèle qui a obtenu le meilleur score de validation croisée
best_model = random_search.best_estimator_

print("\n---------------------------------------------")
print("MEILLEURS HYPERPARAMÈTRES TROUVÉS (JUSTIFICATION) :")
print(best_params)
print(f"Meilleur score de validation croisée (Score de précision) : {random_search.best_score_:.4f}")
print("---------------------------------------------")

# Évaluation finale sur l'ensemble de TEST
y_pred_test = best_model.predict(X_test_scaled)

test_precision = precision_score(y_test, y_pred_test)
print(f"Performance finale sur la base de TEST : Score de précision = {test_precision:.4f}")

print("\nTableau de bord de performance :")
print("Test accuracy:", accuracy_score(y_test, y_pred_test))
print("Test precision:", precision_score(y_test, y_pred_test))
print("Test recall:", recall_score(y_test, y_pred_test))
print("Test F1:", f1_score(y_test, y_pred_test))




Début de la recherche aléatoire...
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[CV] END activation=relu, alpha=0.3730678440610188, hidden_layer_sizes=(50, 25), learning_rate_init=1.1884804293088614e-05, solver=sgd; total time=   2.2s
[CV] END activation=relu, alpha=0.3730678440610188, hidden_layer_sizes=(50, 25), learning_rate_init=1.1884804293088614e-05, solver=sgd; total time=   2.3s
[CV] END activation=relu, alpha=0.3730678440610188, hidden_layer_sizes=(50, 25), learning_rate_init=1.1884804293088614e-05, solver=sgd; total time=   2.3s
[CV] END activation=relu, alpha=0.3730678440610188, hidden_layer_sizes=(50, 25), learning_rate_init=1.1884804293088614e-05, solver=sgd; total time=   2.2s
[CV] END activation=relu, alpha=0.3730678440610188, hidden_layer_sizes=(50, 25), learning_rate_init=1.1884804293088614e-05, solver=sgd; total time=   2.4s
[CV] END activation=tanh, alpha=0.0009807384199410206, hidden_layer_sizes=(50,), learning_rate_init=0.00601581692082581, solver=sgd; total time=   2.5s
[CV] END activation=tanh, alpha=0.0009807384199410206, hidden_layer_sizes



[CV] END activation=relu, alpha=0.5565889749107682, hidden_layer_sizes=(128, 64), learning_rate_init=3.7227966704999966e-05, solver=adam; total time=  11.7s
[CV] END activation=relu, alpha=0.5565889749107682, hidden_layer_sizes=(128, 64), learning_rate_init=3.7227966704999966e-05, solver=adam; total time=  11.8s
[CV] END activation=tanh, alpha=0.0006250346661557744, hidden_layer_sizes=(50, 25), learning_rate_init=2.5143016381118958e-05, solver=adam; total time=   2.2s
[CV] END activation=tanh, alpha=0.0006250346661557744, hidden_layer_sizes=(50, 25), learning_rate_init=2.5143016381118958e-05, solver=adam; total time=   2.2s
[CV] END activation=tanh, alpha=0.0006250346661557744, hidden_layer_sizes=(50, 25), learning_rate_init=2.5143016381118958e-05, solver=adam; total time=   2.4s
[CV] END activation=relu, alpha=0.5565889749107682, hidden_layer_sizes=(128, 64), learning_rate_init=3.7227966704999966e-05, solver=adam; total time=  11.9s
[CV] END activation=tanh, alpha=0.000625034666155774



[CV] END activation=relu, alpha=0.5565889749107682, hidden_layer_sizes=(128, 64), learning_rate_init=3.7227966704999966e-05, solver=adam; total time=  12.1s
[CV] END activation=relu, alpha=0.01633469114296742, hidden_layer_sizes=(100, 50), learning_rate_init=2.2080193309853895e-05, solver=sgd; total time=   2.6s
[CV] END activation=relu, alpha=0.01633469114296742, hidden_layer_sizes=(100, 50), learning_rate_init=2.2080193309853895e-05, solver=sgd; total time=   2.8s
[CV] END activation=relu, alpha=0.01633469114296742, hidden_layer_sizes=(100, 50), learning_rate_init=2.2080193309853895e-05, solver=sgd; total time=   2.7s
[CV] END activation=relu, alpha=0.01633469114296742, hidden_layer_sizes=(100, 50), learning_rate_init=2.2080193309853895e-05, solver=sgd; total time=   2.7s
[CV] END activation=relu, alpha=0.01633469114296742, hidden_layer_sizes=(100, 50), learning_rate_init=2.2080193309853895e-05, solver=sgd; total time=   2.9s
[CV] END activation=tanh, alpha=0.0020810639294043276, hid



[CV] END activation=tanh, alpha=0.0017011198402837671, hidden_layer_sizes=(100,), learning_rate_init=0.047801516120269984, solver=adam; total time=   0.2s
[CV] END activation=tanh, alpha=0.0014571014263368891, hidden_layer_sizes=(128, 64), learning_rate_init=0.02459732059158962, solver=adam; total time=   0.4s
[CV] END activation=tanh, alpha=0.0017011198402837671, hidden_layer_sizes=(100,), learning_rate_init=0.047801516120269984, solver=adam; total time=   0.2s
[CV] END activation=relu, alpha=0.6813102700640399, hidden_layer_sizes=(128, 64), learning_rate_init=5.662437283389554e-05, solver=adam; total time=  11.8s
[CV] END activation=tanh, alpha=0.0017011198402837671, hidden_layer_sizes=(100,), learning_rate_init=0.047801516120269984, solver=adam; total time=   0.2s
[CV] END activation=tanh, alpha=0.0017011198402837671, hidden_layer_sizes=(100,), learning_rate_init=0.047801516120269984, solver=adam; total time=   0.4s
[CV] END activation=tanh, alpha=0.06756565957747941, hidden_layer_s



[CV] END activation=tanh, alpha=0.32273655269212315, hidden_layer_sizes=(100,), learning_rate_init=0.03315119293715507, solver=sgd; total time=   0.9s
[CV] END activation=relu, alpha=0.6813102700640399, hidden_layer_sizes=(128, 64), learning_rate_init=5.662437283389554e-05, solver=adam; total time=  12.2s
[CV] END activation=tanh, alpha=0.32273655269212315, hidden_layer_sizes=(100,), learning_rate_init=0.03315119293715507, solver=sgd; total time=   1.1s
[CV] END activation=tanh, alpha=0.32273655269212315, hidden_layer_sizes=(100,), learning_rate_init=0.03315119293715507, solver=sgd; total time=   1.2s




[CV] END activation=tanh, alpha=0.32273655269212315, hidden_layer_sizes=(100,), learning_rate_init=0.03315119293715507, solver=sgd; total time=   1.4s
[CV] END activation=tanh, alpha=0.32273655269212315, hidden_layer_sizes=(100,), learning_rate_init=0.03315119293715507, solver=sgd; total time=   1.3s
[CV] END activation=tanh, alpha=0.7700114591702354, hidden_layer_sizes=(100,), learning_rate_init=6.603270307144708e-05, solver=adam; total time=   2.8s
[CV] END activation=relu, alpha=0.07151846211082948, hidden_layer_sizes=(128, 64), learning_rate_init=0.030811868351166435, solver=adam; total time=   0.3s
[CV] END activation=tanh, alpha=0.7700114591702354, hidden_layer_sizes=(100,), learning_rate_init=6.603270307144708e-05, solver=adam; total time=   3.3s
[CV] END activation=tanh, alpha=0.7700114591702354, hidden_layer_sizes=(100,), learning_rate_init=6.603270307144708e-05, solver=adam; total time=   3.4s
[CV] END activation=relu, alpha=0.07151846211082948, hidden_layer_sizes=(128, 64), 



[CV] END activation=relu, alpha=0.20485718737070294, hidden_layer_sizes=(128, 64), learning_rate_init=2.6526925190445296e-05, solver=adam; total time=  12.1s
[CV] END activation=relu, alpha=0.20485718737070294, hidden_layer_sizes=(128, 64), learning_rate_init=2.6526925190445296e-05, solver=adam; total time=  12.1s
[CV] END activation=relu, alpha=0.20485718737070294, hidden_layer_sizes=(128, 64), learning_rate_init=2.6526925190445296e-05, solver=adam; total time=  11.9s
[CV] END activation=relu, alpha=0.20485718737070294, hidden_layer_sizes=(128, 64), learning_rate_init=2.6526925190445296e-05, solver=adam; total time=  12.0s
Recherche aléatoire terminée.

---------------------------------------------
MEILLEURS HYPERPARAMÈTRES TROUVÉS (JUSTIFICATION) :
{'activation': 'tanh', 'alpha': np.float64(0.05833242667368307), 'hidden_layer_sizes': (50,), 'learning_rate_init': np.float64(1.2708815559252943e-05), 'solver': 'adam'}
Meilleur score de validation croisée (Score de précision) : 0.8721
--