In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split


In [2]:
df = pd.read_csv("features.csv", sep=";")
df = df.drop('semanticobjscore', axis=1) 
df = df.drop('semanticsubjscore', axis=1) 
df = df.drop('URL', axis=1) 

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


X = df.drop(['Label', 'TextID'], axis=1)  
y = df['Label']

# Convertir les labels en nombres
le = LabelEncoder()
y_encoded = le.fit_transform(y)

print("Classes:", le.classes_)  # Pour voir la correspondance

# Refaire la séparation train/test/val avec les labels encodés
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded,
    test_size=0.2,          
    random_state=3,         
    stratify=y_encoded      
)

Classes: ['objective' 'subjective']


La fonction MLPClassifier prend par défaut ReLu comme fonction d'activation et la fonction sigmoïde pour la couche de sortie (car on est dans un cas binaire). L'algorithme par défaut est l'algorithme Adam (avec taux d'apprentissage 0,001 et taux de régularisation 0,0001)

LÀ ON VA UTILISER RANDOMSEARCH POUR CHOISIR LES HYPERPARAMÈTRES AU MIEUX 

In [4]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from scipy.stats import loguniform, uniform, randint # Pour définir des distributions
from sklearn.metrics import precision_score 
from sklearn.preprocessing import StandardScaler



scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


model_base = MLPClassifier(max_iter=2000, random_state=3)



# Nous définissons l'espace de recherche (le "param_dist")
param_dist = {
    # 1. Architecture (hidden_layer_sizes) : C'est une liste de tuples à choisir
    'hidden_layer_sizes': [(50,), (100,), (50, 25), (100, 50), (128, 64)], 
    
    # 2. Régularisation (alpha) : Loguniform est idéal pour les paramètres de régularisation
    'alpha': loguniform(1e-5, 1e-2), # Exemple: choisir une valeur entre 0.00001 et 0.01
    
    # 3. Optimisation (learning_rate_init) : Loguniform est aussi bon pour le taux d'apprentissage
    'learning_rate_init': loguniform(1e-4, 1e-2), # Exemple: choisir une valeur entre 0.0001 et 0.01
    
    # 4. Fonction d'activation : Une liste de choix discrets
    'activation': ['tanh', 'relu'],
    
    # 5. Type de solveur : Une liste de choix discrets
    'solver': ['adam', 'sgd'] 
}


# Instanciation de RandomizedSearchCV
# n_iter=50 : Le nombre total d'itérations aléatoires que nous voulons tester (compromis temps/précision)
# scoring='f1' : La métrique sur laquelle nous arbitrons (maximiser le F1-score)
# cv=5 : Utilise la validation croisée K-Fold avec 5 plis (plus fiable qu'un simple split Train/Val)
# verbose=2 : Affiche le progrès détaillé
# n_jobs=-1 : Utilise tous les cœurs du processeur pour accélérer le calcul
random_search = RandomizedSearchCV(
    estimator=model_base,
    param_distributions=param_dist,
    n_iter=50, 
    scoring='precision', 
    cv=5, 
    random_state=3, 
    verbose=2,
    n_jobs=-1 
)

# X_train et y_train sont les données utilisées pour la recherche d'hyperparamètres.
# L'objet random_search effectue l'entraînement et la validation croisée en interne.
print("Début de la recherche aléatoire...")
random_search.fit(X_train_scaled, y_train) 
print("Recherche aléatoire terminée.")


# Récupération du meilleur jeu d'hyperparamètres (celui qui a maximisé le Score de précision CV)
best_params = random_search.best_params_

# Récupération du modèle qui a obtenu le meilleur score de validation croisée
best_model = random_search.best_estimator_

print("\n---------------------------------------------")
print("MEILLEURS HYPERPARAMÈTRES TROUVÉS (JUSTIFICATION) :")
print(best_params)
print(f"Meilleur score de validation croisée (Score de précision) : {random_search.best_score_:.4f}")
print("---------------------------------------------")

# Évaluation finale sur l'ensemble de TEST
y_pred_test = best_model.predict(X_test_scaled)
test_precision = precision_score(y_test, y_pred_test)
print(f"Performance finale sur la base de TEST : Score de précision = {test_precision:.4f}")

Début de la recherche aléatoire...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END activation=tanh, alpha=1.6299513426714383e-05, hidden_layer_sizes=(100, 50), learning_rate_init=0.0003817767511117906, solver=adam; total time=   1.3s
[CV] END activation=relu, alpha=0.004773543263868112, hidden_layer_sizes=(50, 25), learning_rate_init=0.00010901744948900903, solver=sgd; total time=   1.1s
[CV] END activation=relu, alpha=0.004773543263868112, hidden_layer_sizes=(50, 25), learning_rate_init=0.00010901744948900903, solver=sgd; total time=   1.2s
[CV] END activation=tanh, alpha=1.6299513426714383e-05, hidden_layer_sizes=(100, 50), learning_rate_init=0.0003817767511117906, solver=adam; total time=   1.6s
[CV] END activation=tanh, alpha=1.6299513426714383e-05, hidden_layer_sizes=(100, 50), learning_rate_init=0.0003817767511117906, solver=adam; total time=   1.4s
[CV] END activation=relu, alpha=0.004773543263868112, hidden_layer_sizes=(50, 25), learning_rate_init=0.00010



[CV] END activation=tanh, alpha=5.541979202687864e-05, hidden_layer_sizes=(50,), learning_rate_init=0.002452716233245465, solver=sgd; total time=   2.1s
[CV] END activation=tanh, alpha=0.0005923637206170249, hidden_layer_sizes=(128, 64), learning_rate_init=0.0034013612281362745, solver=adam; total time=   0.4s
[CV] END activation=tanh, alpha=0.0005923637206170249, hidden_layer_sizes=(128, 64), learning_rate_init=0.0034013612281362745, solver=adam; total time=   0.5s
[CV] END activation=tanh, alpha=5.541979202687864e-05, hidden_layer_sizes=(50,), learning_rate_init=0.002452716233245465, solver=sgd; total time=   2.2s
[CV] END activation=relu, alpha=0.00022883207745802753, hidden_layer_sizes=(50, 25), learning_rate_init=0.00036055752614611096, solver=adam; total time=   1.7s
[CV] END activation=relu, alpha=0.00022883207745802753, hidden_layer_sizes=(50, 25), learning_rate_init=0.00036055752614611096, solver=adam; total time=   1.5s
[CV] END activation=relu, alpha=0.00022883207745802753, 



[CV] END activation=tanh, alpha=0.001186950295330506, hidden_layer_sizes=(50,), learning_rate_init=0.00011273338263022603, solver=adam; total time=   2.4s
[CV] END activation=relu, alpha=0.00045691266410131633, hidden_layer_sizes=(100, 50), learning_rate_init=0.0001485940554324228, solver=sgd; total time=   1.6s
[CV] END activation=tanh, alpha=3.953011510085378e-05, hidden_layer_sizes=(50, 25), learning_rate_init=0.00015856549555662785, solver=adam; total time=   2.2s
[CV] END activation=tanh, alpha=9.74347928844921e-05, hidden_layer_sizes=(100,), learning_rate_init=0.0011277738038812947, solver=adam; total time=   1.3s
[CV] END activation=tanh, alpha=3.953011510085378e-05, hidden_layer_sizes=(50, 25), learning_rate_init=0.00015856549555662785, solver=adam; total time=   2.2s
[CV] END activation=relu, alpha=0.00045691266410131633, hidden_layer_sizes=(100, 50), learning_rate_init=0.0001485940554324228, solver=sgd; total time=   1.8s
[CV] END activation=relu, alpha=0.00045691266410131633



[CV] END activation=tanh, alpha=0.00287966644290629, hidden_layer_sizes=(50,), learning_rate_init=0.002835561015503015, solver=sgd; total time=   1.7s
[CV] END activation=tanh, alpha=0.0022760982798711803, hidden_layer_sizes=(50, 25), learning_rate_init=0.000539363023503478, solver=adam; total time=   1.2s
[CV] END activation=tanh, alpha=0.00287966644290629, hidden_layer_sizes=(50,), learning_rate_init=0.002835561015503015, solver=sgd; total time=   1.6s
[CV] END activation=tanh, alpha=0.0022760982798711803, hidden_layer_sizes=(50, 25), learning_rate_init=0.000539363023503478, solver=adam; total time=   0.9s
[CV] END activation=tanh, alpha=0.0033597700754194526, hidden_layer_sizes=(50,), learning_rate_init=0.0035113762739951353, solver=sgd; total time=   1.6s
[CV] END activation=tanh, alpha=0.0022760982798711803, hidden_layer_sizes=(50, 25), learning_rate_init=0.000539363023503478, solver=adam; total time=   1.3s
[CV] END activation=tanh, alpha=0.0033597700754194526, hidden_layer_sizes



[CV] END activation=relu, alpha=0.0030450077636692075, hidden_layer_sizes=(128, 64), learning_rate_init=0.0001628708850299689, solver=adam; total time=   3.0s
[CV] END activation=tanh, alpha=4.444760996394969e-05, hidden_layer_sizes=(50, 25), learning_rate_init=0.002748600334889226, solver=sgd; total time=   1.6s
[CV] END activation=relu, alpha=0.0030450077636692075, hidden_layer_sizes=(128, 64), learning_rate_init=0.0001628708850299689, solver=adam; total time=   2.9s
[CV] END activation=relu, alpha=0.0030450077636692075, hidden_layer_sizes=(128, 64), learning_rate_init=0.0001628708850299689, solver=adam; total time=   3.1s
[CV] END activation=tanh, alpha=4.444760996394969e-05, hidden_layer_sizes=(50, 25), learning_rate_init=0.002748600334889226, solver=sgd; total time=   1.7s
[CV] END activation=relu, alpha=0.0030450077636692075, hidden_layer_sizes=(128, 64), learning_rate_init=0.0001628708850299689, solver=adam; total time=   3.6s
[CV] END activation=tanh, alpha=4.444760996394969e-0



[CV] END activation=tanh, alpha=4.94746906016514e-05, hidden_layer_sizes=(50, 25), learning_rate_init=0.0019326354891336735, solver=sgd; total time=   3.2s
[CV] END activation=tanh, alpha=4.94746906016514e-05, hidden_layer_sizes=(50, 25), learning_rate_init=0.0019326354891336735, solver=sgd; total time=   2.9s
[CV] END activation=tanh, alpha=0.007868284475487186, hidden_layer_sizes=(128, 64), learning_rate_init=0.00511047525478675, solver=sgd; total time=   1.8s
[CV] END activation=relu, alpha=0.0024059511865841708, hidden_layer_sizes=(128, 64), learning_rate_init=0.0035476080367868048, solver=sgd; total time=   2.7s
[CV] END activation=relu, alpha=0.0024059511865841708, hidden_layer_sizes=(128, 64), learning_rate_init=0.0035476080367868048, solver=sgd; total time=   2.7s
[CV] END activation=relu, alpha=0.0024059511865841708, hidden_layer_sizes=(128, 64), learning_rate_init=0.0035476080367868048, solver=sgd; total time=   2.9s
[CV] END activation=relu, alpha=0.0024059511865841708, hidd



[CV] END activation=tanh, alpha=0.007868284475487186, hidden_layer_sizes=(128, 64), learning_rate_init=0.00511047525478675, solver=sgd; total time=   3.0s
[CV] END activation=tanh, alpha=0.0014469506116938247, hidden_layer_sizes=(50,), learning_rate_init=0.00012920611913355058, solver=adam; total time=   2.3s
[CV] END activation=tanh, alpha=0.0014469506116938247, hidden_layer_sizes=(50,), learning_rate_init=0.00012920611913355058, solver=adam; total time=   2.5s




[CV] END activation=tanh, alpha=0.0014469506116938247, hidden_layer_sizes=(50,), learning_rate_init=0.00012920611913355058, solver=adam; total time=   2.0s
[CV] END activation=tanh, alpha=0.0014469506116938247, hidden_layer_sizes=(50,), learning_rate_init=0.00012920611913355058, solver=adam; total time=   2.3s
Recherche aléatoire terminée.

---------------------------------------------
MEILLEURS HYPERPARAMÈTRES TROUVÉS (JUSTIFICATION) :
{'activation': 'tanh', 'alpha': np.float64(0.0027144588822510247), 'hidden_layer_sizes': (50,), 'learning_rate_init': np.float64(0.0008114170909427846), 'solver': 'sgd'}
Meilleur score de validation croisée (Score de précision) : 0.8161
---------------------------------------------
Performance finale sur la base de TEST : Score de précision = 0.7969
