# 4. Mod√©lisation : Machine Learning

## Variables d'environnement

In [2]:
TRAIN_TWEETS_PATH = 'data/train_tweets.parquet'
TEST_TWEETS_PATH = 'data/test_tweets.parquet'
VALIDATION_TWEETS_PATH = 'data/validation_tweets.parquet'

## Imports des d√©pendances

In [3]:
from IPython.display import display
import pandas as pd
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score, roc_curve, auc
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn

In [4]:
mlflow.set_experiment("Sentiments_Tweets")
mlflow.tensorflow.autolog(disable=True)

2026/01/15 17:01:41 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2026/01/15 17:01:41 INFO mlflow.store.db.utils: Updating database tables
2026/01/15 17:01:41 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/15 17:01:41 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2026/01/15 17:01:41 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/15 17:01:41 INFO alembic.runtime.migration: Will assume non-transactional DDL.
I0000 00:00:1768492902.302517     771 port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
I0000 00:00:1768492902.887550     771 cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512

## Imports des jeux de donn√©es

In [5]:
train_df = pd.read_parquet(TRAIN_TWEETS_PATH).fillna('')
test_df = pd.read_parquet(TEST_TWEETS_PATH).fillna('')
validation_df = pd.read_parquet(VALIDATION_TWEETS_PATH).fillna('')

In [6]:
# --- JEU DE DONN√âES : STEMMED ---

# Train
X_train_stemmed = train_df['text_stemmed']
y_train_stemmed = train_df['target']

# Validation
X_val_stemmed = validation_df['text_stemmed']
y_val_stemmed = validation_df['target']

# Test
X_test_stemmed = test_df['text_stemmed']
y_test_stemmed = test_df['target']


# --- JEU DE DONN√âES : LEMMATIZED ---

# Train
X_train_lemmatized = train_df['text_lemmatized']
y_train_lemmatized = train_df['target']

# Validation
X_val_lemmatized = validation_df['text_lemmatized']
y_val_lemmatized = validation_df['target']

# Test
X_test_lemmatized = test_df['text_lemmatized']
y_test_lemmatized = test_df['target']

## TF-IDF avec N-grams
Nous allons tester 3 mod√®les :
- Naive Bayes (Multinomial)
- R√©gression Logistique
- SVM Lin√©aire (LinearSVC)

In [7]:
def display_roc_auc_graph_sklearn(model, X_test, y_test, model_name="model"):
    plt.figure(figsize=(8, 6))

    # Gestion des mod√®les qui n'ont pas predict_proba (comme SVM standard)
    if hasattr(model, "predict_proba"):
        y_pred_proba = model.predict_proba(X_test)[:, 1] # On prend la proba de la classe 1
    else:
        # Pour SVM LinearSVC, on utilise decision_function
        y_pred_proba = model.decision_function(X_test)

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'Mod√®le (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Taux de Faux Positifs')
    plt.ylabel('Taux de Vrais Positifs')
    plt.title(f'Courbe ROC - {model_name}')
    plt.legend(loc="lower right")
    plt.grid(True, alpha=0.3)

    filename = f"{model_name}_ROC.png"
    plt.savefig(filename)

    if mlflow.active_run():
        print(f"   -> Envoi du graphique {filename} vers MLflow...")
        mlflow.log_artifact(filename, artifact_path="graphs")

    plt.close() # On ferme pour ne pas surcharger la m√©moire
    os.remove(filename)

In [8]:
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2)  # On prend les mots seuls (1) ET les duos (2)
)

In [9]:
print("Vectorisation en cours...")
X_train_stemmed_vect = vectorizer.fit_transform(X_train_stemmed)
X_val_stemmed_vect = vectorizer.transform(X_val_stemmed)
X_test_stemmed_vect = vectorizer.transform(X_test_stemmed)
X_train_lemmatized_vect = vectorizer.fit_transform(X_train_lemmatized)
X_val_lemmatized_vect = vectorizer.transform(X_val_lemmatized)
X_test_lemmatized_vect = vectorizer.transform(X_test_lemmatized)
print("Termin√© !")

Vectorisation en cours...
Termin√© !


In [10]:
models = [
    {
        "name": "Naive Bayes (Multinomial)",
        "model": MultinomialNB()
    },
    {
        "name": "R√©gression Logistique",
        "model": LogisticRegression(random_state=42)
    },
    {
        "name": "SVM Lin√©aire (LinearSVC)",
        "model": LinearSVC(dual=False, random_state=42)
    }
]

In [12]:
preprocesses = {
    "stemmed": [
        X_train_stemmed, X_val_stemmed, X_test_stemmed,
        y_train_stemmed, y_val_stemmed, y_test_stemmed,
        X_train_stemmed_vect, X_val_stemmed_vect, X_test_stemmed_vect
    ],
    "lemmatized": [
        X_train_lemmatized, X_val_lemmatized, X_test_lemmatized,
        y_train_lemmatized, y_val_lemmatized, y_test_lemmatized,
        X_train_lemmatized_vect, X_val_lemmatized_vect, X_test_lemmatized_vect
    ],
}
train_results = []
test_results = []

print(f"Lancement du comparatif sur {X_train_stemmed_vect.shape[0]} tweets...\n")

# Entrainement
for preprocess in preprocesses.keys():
    # D√©ballage des donn√©es pour ce preprocessing
    X_train, X_val, X_test, y_train, y_val, y_test, X_train_vect, X_val_vect, X_test_vect = preprocesses[preprocess]

    for m in models:
        # Cr√©ation d'un nom unique : ex "Naive Bayes (stemmed)"
        run_name_unique = f"{m['name']} ({preprocess})"

        print(f"ü§ñ Entra√Ænement de : {run_name_unique}...")

        # --- D√âBUT DE LA RUN MLFLOW ---
        # On d√©marre une session pour ce couple Mod√®le + Preprocessing
        with mlflow.start_run(run_name=run_name_unique):

            # 1. Log des Param√®tres (Carte d'identit√©)
            mlflow.log_param("model_name", m['name'])
            mlflow.log_param("preprocessing", preprocess)
            mlflow.log_param("vectorizer_ngrams", str(vectorizer.ngram_range))

            # Entra√Ænement
            start_time = time.time()
            m['model'].fit(X_train_vect, y_train)
            execution_time = time.time() - start_time

            # --- CALCUL DES M√âTRIQUES (TEST) ---
            # Pour Sklearn, on doit souvent calculer les probas manuellement pour l'AUC
            if hasattr(m['model'], "predict_proba"):
                y_score_test = m['model'].predict_proba(X_test_vect)[:, 1]
            else:
                y_score_test = m['model'].decision_function(X_test_vect)

            y_pred_test = m['model'].predict(X_test_vect)

            # Calcul des scores
            test_acc = accuracy_score(y_test, y_pred_test)
            test_f1 = f1_score(y_test, y_pred_test)
            test_auc = roc_auc_score(y_test, y_score_test)
            test_precision = precision_score(y_test, y_pred_test)
            test_recall = recall_score(y_test, y_pred_test)

            # 2. Log des M√©triques dans MLflow
            mlflow.log_metric("accuracy", test_acc)
            mlflow.log_metric("f1_score", test_f1)
            mlflow.log_metric("auc", test_auc)
            mlflow.log_metric("precision", test_precision)
            mlflow.log_metric("recall", test_recall)
            mlflow.log_metric("training_time", execution_time)

            print(f"   -> Accuracy: {test_acc:.4f} | AUC: {test_auc:.4f}")

            # 3. Log du Mod√®le (Le fichier .pkl)
            clean_name = f"model_{m['name']}_{preprocess}"
            clean_name = clean_name.replace(" ", "_").replace("(", "").replace(")", "")

            mlflow.sklearn.log_model(
                m['model'],
                artifact_path=clean_name,
                registered_model_name=f"{m['name']}"
            )

            # 4. Log des Graphiques (Comme dans le notebook Deep Learning)
            display_roc_auc_graph_sklearn(m['model'], X_test_vect, y_test, model_name=run_name_unique)

            print("   -> Run MLflow termin√©e et sauvegard√©e. ‚úÖ\n")

Lancement du comparatif sur 1120000 tweets...

ü§ñ Entra√Ænement de : Naive Bayes (Multinomial) (stemmed)...




   -> Accuracy: 0.7867 | AUC: 0.8709


2026/01/15 17:11:40 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2026/01/15 17:11:40 INFO mlflow.store.db.utils: Updating database tables
2026/01/15 17:11:40 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/15 17:11:40 INFO alembic.runtime.migration: Will assume non-transactional DDL.
Successfully registered model 'Naive Bayes (Multinomial)'.
Created version '1' of model 'Naive Bayes (Multinomial)'.


   -> Envoi du graphique Naive Bayes (Multinomial) (stemmed)_ROC.png vers MLflow...
   -> Run MLflow termin√©e et sauvegard√©e. ‚úÖ

ü§ñ Entra√Ænement de : R√©gression Logistique (stemmed)...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


   -> Accuracy: 0.8032 | AUC: 0.8825


Successfully registered model 'R√©gression Logistique'.
Created version '1' of model 'R√©gression Logistique'.


   -> Envoi du graphique R√©gression Logistique (stemmed)_ROC.png vers MLflow...
   -> Run MLflow termin√©e et sauvegard√©e. ‚úÖ

ü§ñ Entra√Ænement de : SVM Lin√©aire (LinearSVC) (stemmed)...




   -> Accuracy: 0.7967 | AUC: 0.8766


Successfully registered model 'SVM Lin√©aire (LinearSVC)'.
Created version '1' of model 'SVM Lin√©aire (LinearSVC)'.


   -> Envoi du graphique SVM Lin√©aire (LinearSVC) (stemmed)_ROC.png vers MLflow...
   -> Run MLflow termin√©e et sauvegard√©e. ‚úÖ

ü§ñ Entra√Ænement de : Naive Bayes (Multinomial) (lemmatized)...




   -> Accuracy: 0.7887 | AUC: 0.8734


Registered model 'Naive Bayes (Multinomial)' already exists. Creating a new version of this model...
Created version '2' of model 'Naive Bayes (Multinomial)'.


   -> Envoi du graphique Naive Bayes (Multinomial) (lemmatized)_ROC.png vers MLflow...
   -> Run MLflow termin√©e et sauvegard√©e. ‚úÖ

ü§ñ Entra√Ænement de : R√©gression Logistique (lemmatized)...




   -> Accuracy: 0.8060 | AUC: 0.8852


Registered model 'R√©gression Logistique' already exists. Creating a new version of this model...
Created version '2' of model 'R√©gression Logistique'.


   -> Envoi du graphique R√©gression Logistique (lemmatized)_ROC.png vers MLflow...
   -> Run MLflow termin√©e et sauvegard√©e. ‚úÖ

ü§ñ Entra√Ænement de : SVM Lin√©aire (LinearSVC) (lemmatized)...




   -> Accuracy: 0.7995 | AUC: 0.8794
   -> Envoi du graphique SVM Lin√©aire (LinearSVC) (lemmatized)_ROC.png vers MLflow...
   -> Run MLflow termin√©e et sauvegard√©e. ‚úÖ



Registered model 'SVM Lin√©aire (LinearSVC)' already exists. Creating a new version of this model...
Created version '2' of model 'SVM Lin√©aire (LinearSVC)'.
