In [1]:
from sklearn.datasets import fetch_20newsgroups
# Fetch the 20 Newsgroups dataset using the loader's default arguments.
# NOTE(review): `groups` is not referenced by any later cell visible here — confirm it is still needed.
groups = fetch_20newsgroups()

In [2]:
# Load the 'train' and 'test' subsets with the same random_state, then keep
# their integer class labels in separate variables.
data_train = fetch_20newsgroups(subset='train', random_state=21)
data_test = fetch_20newsgroups(subset='test', random_state=21)
train_label, test_label = data_train.target, data_test.target

# Size check: (training documents, test documents, test labels).
tuple(len(part) for part in (data_train.data, data_test.data, test_label))

(11314, 7532, 7532)

In [3]:
import numpy as np
# List the distinct label values present in the test split.
np.unique(test_label)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [4]:
import nltk
from collections import defaultdict
from nltk.stem import WordNetLemmatizer
from nltk.corpus import names

# Download the NLTK resources used by this notebook: the `names` corpus (read
# just below through names.words()) and the WordNet data, presumably required
# by WordNetLemmatizer.
nltk.download('names')
nltk.download('wordnet')
nltk.download('omw-1.4')  # OMW stands for "Open Multilingual Wordnet"

# Set of entries from the `names` corpus; clean() drops tokens found in it.
all_names = set(names.words())
# Single lemmatizer instance shared by every call to clean().
WNL = WordNetLemmatizer()

def clean(data, excluded_words=None, lemmatizer=None):
    """Turn raw documents into space-joined strings of lemmatized tokens.

    Each document is split on whitespace. A token is kept only if it is purely
    alphabetic and, in its original casing, is not in ``excluded_words``. Kept
    tokens are lower-cased and then lemmatized.

    Args:
        data: Iterable of document strings.
        excluded_words: Container of tokens to drop. Defaults to the
            notebook-level ``all_names`` set.
        lemmatizer: Object exposing ``lemmatize(word)``. Defaults to the
            notebook-level ``WNL`` lemmatizer.

    Returns:
        A list with one cleaned string per input document, in input order.
        A document in which no token survives yields an empty string.
    """
    # Resolve the defaults at call time so the notebook globals are only
    # required when the caller does not supply replacements.
    if excluded_words is None:
        excluded_words = all_names
    if lemmatizer is None:
        lemmatizer = WNL

    cleaned = []
    for document in data:
        tokens = [
            lemmatizer.lemmatize(token.lower())
            for token in document.split()
            # The exclusion test runs on the original casing, before lower-casing.
            if token.isalpha() and token not in excluded_words
        ]
        cleaned.append(' '.join(tokens))
    return cleaned

[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [5]:
# Clean the training documents and display the first cleaned one as a spot check.
x_train = clean(data_train.data)
x_train[0]

'bouncing lymenet lehigh university the following address are on the lymenet mailing but are rejecting since the list server originally accepted these address i assume these address have since been improperly functioning mail gateway might also be if you are listed here and would still like to remain on the please write to i will remove these address from the list before the next newsletter go a a general please remember to from all your mailing list before your account is this will save the listserv maintainer from many box lehigh university'

In [6]:
# Clean the test documents the same way; the displayed length should equal the
# number of test labels.
x_test = clean(data_test.data)
len(x_test)

7532

In [8]:
# Décorateur chrono pour mesurer les temps d'exécution
import functools
import time

def chrono(f):
    """Decorator that prints the elapsed time of each call to ``f``.

    Args:
        f: The callable to time.

    Returns:
        A wrapper that forwards all positional and keyword arguments to ``f``,
        prints the elapsed time in seconds, and returns ``f``'s result
        unchanged. The wrapper keeps ``f``'s name and docstring. If ``f``
        raises, the exception propagates and nothing is printed.
    """
    @functools.wraps(f)
    def g(*a, **k):
        # perf_counter is a monotonic clock meant for measuring durations;
        # unlike time.time() it is not affected by system clock adjustments.
        t = time.perf_counter()
        r = f(*a, **k)
        print(f"Temps d'exécution : {time.perf_counter() - t:.4f} secondes")
        return r
    return g

In [9]:
pip install optuna -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/404.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m399.4/404.7 kB[0m [31m18.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25h

## **Le choix de l'optimiseur d'hyperparamètres : GridSearch ou Optuna ?**


### **GridSearch**
**[`GridSearchCV`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)** effectue une recherche ***exhaustive*** : il teste toutes les combinaisons possibles dans des grilles d'hyperparamètres prédéfinies (force brute).
Bien qu'on ait la garantie de trouver l'optimum global dans l'espace discret prédéfini, le coût de calcul croît exponentiellement avec le nombre d'hyperparamètres : le nombre de combinaisons à évaluer est le produit des tailles de chaque grille.


### **Optuna**
**[`Optuna`](https://pypi.org/project/optuna/)** utilise une recherche ***bayésienne*** (algorithme TPE - Tree-structured Parzen Estimator) : il échantillonne l'espace des hyperparamètres en privilégiant les régions prometteuses identifiées par les évaluations précédentes. Cela permet d'obtenir de bonnes solutions avec moins d'itérations. Ce n'est pas absolument nécessaire ici, mais lorsque l'espace des hyperparamètres à explorer est continu et de très grande dimension, cette approche est plus rapide et plus efficace.

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
import optuna

def create_objective_large(X, y, cv=3):
    """Build an Optuna objective that cross-validates a TF-IDF + LinearSVC pipeline.

    Args:
        X: Samples forwarded to ``cross_val_score``.
        y: Targets forwarded to ``cross_val_score``.
        cv: Cross-validation setting forwarded to ``cross_val_score``.

    Returns:
        A function that takes an Optuna ``trial``, samples the hyperparameters
        ``max_features``, ``ngram_max``, ``max_df`` and ``C`` from it, and
        returns the mean cross-validation score of the resulting pipeline.
    """
    def objective(trial):
        # Search space: one suggestion per hyperparameter, always in this order.
        vocab_size = trial.suggest_int('max_features', 8000, 26000, step=2000)
        longest_ngram = trial.suggest_int('ngram_max', 1, 2)
        doc_freq_cap = trial.suggest_float('max_df', 0.5, 1.0)
        penalty = trial.suggest_float('C', 0.1, 10, log=True)

        vectorizer = TfidfVectorizer(
            stop_words='english',
            max_features=vocab_size,
            ngram_range=(1, longest_ngram),
            max_df=doc_freq_cap,
        )
        classifier = LinearSVC(C=penalty, dual=True, max_iter=3000)
        model = Pipeline([('tf_idf', vectorizer), ('svc', classifier)])

        # The trial's value is the average score over the folds.
        fold_scores = cross_val_score(model, X, y, cv=cv, n_jobs=1)
        return fold_scores.mean()

    return objective

@chrono
def run_optuna_large(X, y, n_trials, cv=3, seed=None, n_jobs=-1):
    """Run an Optuna study that maximizes the objective from ``create_objective_large``.

    Optuna explores the hyperparameter space using only ``n_trials``
    evaluations instead of an exhaustive grid.

    Args:
        X: Samples forwarded to ``create_objective_large``.
        y: Targets forwarded to ``create_objective_large``.
        n_trials: Number of trials to run.
        cv: Cross-validation setting forwarded to ``create_objective_large``.
        seed: Optional seed for a ``TPESampler``. With the default ``None`` the
            study is created without an explicit sampler, as before.
        n_jobs: Forwarded to ``study.optimize``. Defaults to ``-1`` as before.
            NOTE: per the Optuna documentation, a seeded study is only
            reproducible when trials run sequentially (``n_jobs=1``).

    Returns:
        The ``optuna`` study after optimization.
    """
    objective = create_objective_large(X, y, cv)
    # Only build an explicit sampler when a seed is requested, so the default
    # call goes through exactly the same create_study path as before.
    sampler = optuna.samplers.TPESampler(seed=seed) if seed is not None else None
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(objective, n_trials=n_trials, n_jobs=n_jobs, show_progress_bar=True)
    return study

# Run the search with 50 trials (default cv=3) and report the best
# hyperparameters and the best mean cross-validation score found.
print("\n=== Exploration d'un grand espace d'hyperparamètres avec Optuna ===")
study_large = run_optuna_large(x_train, train_label, n_trials=50)
print(f"Meilleurs paramètres : {study_large.best_params}")
print(f"Meilleur score CV : {study_large.best_value:.4f}")

[I 2025-12-11 11:56:56,769] A new study created in memory with name: no-name-5f3f6278-b3be-46ee-84e9-c6c81412d50c



=== Exploration d'un grand espace d'hyperparamètres avec Optuna ===


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-12-11 11:57:04,062] Trial 1 finished with value: 0.8618534838110307 and parameters: {'max_features': 14000, 'ngram_max': 2, 'max_df': 0.664428817788087, 'C': 16.522810237287217}. Best is trial 0 with value: 0.864592920859166.
[I 2025-12-11 11:57:19,374] Trial 0 finished with value: 0.8572572362297937 and parameters: {'max_features': 8000, 'ngram_max': 1, 'max_df': 0.6569393414160447, 'C': 1.7464995167328052}. Best is trial 0 with value: 0.8572572362297937.
[I 2025-12-11 11:57:36,976] Trial 1 finished with value: 0.8773208432682714 and parameters: {'max_features': 20000, 'ngram_max': 2, 'max_df': 0.6946592364323516, 'C': 1.1001040903392973}. Best is trial 1 with value: 0.8773208432682714.
[I 2025-12-11 11:57:55,742] Trial 2 finished with value: 0.8767904565351903 and parameters: {'max_features': 18000, 'ngram_max': 2, 'max_df': 0.6166131583617969, 'C': 0.8047851547061107}. Best is trial 1 with value: 0.8773208432682714.
[I 2025-12-11 11:58:13,397] Trial 3 finished with value: 0.