# Start

In [16]:
# START

import os
import re
from typing import List, Tuple, Optional

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_hub as hub

import mlflow
import mlflow.tensorflow
import optuna

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
    roc_curve,
    auc,
)


MLFLOW_EXPERIMENT_NAME = "Model_3_USE"
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

print("TensorFlow version:", tf.__version__)
print("GPUs:", tf.config.list_physical_devices("GPU"))
print("CUDA built:", tf.test.is_built_with_cuda())


2025/11/19 11:35:05 INFO mlflow.tracking.fluent: Experiment with name 'Model_3_USE' does not exist. Creating a new experiment.


TensorFlow version: 2.20.0
GPUs: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
CUDA built: True


# DATA READING

In [3]:
# DATA READING
df = pd.read_csv(
    "../../sentiment140/training.1600000.processed.noemoticon.csv",
    encoding="latin-1",
    header=None,
    names=["sentiment", "id", "date", "query", "user", "tweet"],
)

df.head()

Unnamed: 0,sentiment,id,date,query,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
# Keep only sentiment + tweet

df = df[["sentiment", "tweet"]]

# Stratified sampling to have tweets of the two sentiments
df_negatifs = df[df["sentiment"] == 0].sample(8000, random_state=42)
df_positifs = df[df["sentiment"] == 4].sample(8000, random_state=42)

# So we get a sample of 10% of the original dataset
df = pd.concat([df_negatifs, df_positifs]).reset_index(drop=True)

# Map labels 4 -> 1 for binary classification
df["sentiment"] = df["sentiment"].replace(4, 1)

df.head()


Unnamed: 0,sentiment,tweet
0,0,@xnausikaax oh no! where did u order from? tha...
1,0,A great hard training weekend is over. a coup...
2,0,"Right, off to work Only 5 hours to go until I..."
3,0,I am craving for japanese food
4,0,Jean Michel Jarre concert tomorrow gotta work...


In [5]:
# Class balance check
df["sentiment"].value_counts()

sentiment
0    8000
1    8000
Name: count, dtype: int64

# CLEANING TEXT

## Preprocessing

✅ Preprocessing steps(preparing for TF-IDF + Logistic Regression)

Compared to the preprocessing of tweets made during the application of logistic regression, the steps written in red have been removed, and the steps in green are repeated or added for this time.


<span style="color:green">- Lowercase</span>

<span style="color:green">- Convert emoticons → words</span>

<span style="color:green">- Convert emojis → words</span>

<span style="color:green">- Remove URLs, mentions, hashtags</span>

<span style="color:red">- Tokenize</span>


In [6]:
# CLEANING TEXT – light, meaning-preserving normalization

try:
    # optional but nice: converts emojis → words like ":smiling_face:" -> "smiling face"
    from emoji import demojize
    _HAS_EMOJI = True
except Exception:
    _HAS_EMOJI = False

# ---------- regexes & mappings ----------

_EMOTICONS = {
    r":-\)|:\)|=\)|:\]": "smile",
    r":-D|:D|=D": "laugh",
    r":-\(|:\(|=\(|:\[": "sad",
    r":'\(|:'-\(": "cry",
    r";-\)|;\)": "wink",
    r":-P|:P": "playful",
    r":/|:-/": "skeptical",
    r":\*": "kiss",
    r">:\(|>:-\(": "angry",
    r"XD|xD": "laugh",
}

_EMOTICON_REGEXES = [(re.compile(p), w) for p, w in _EMOTICONS.items()]

_URL_RE = re.compile(r"(https?://\S+|www\.\S+)")
_USER_RE = re.compile(r"@\w+")
_NUM_RE = re.compile(r"\b\d+\b")
# Hashtags: keep the hashtag and add its content as a separate token
_HASHTAG_RE = re.compile(r"#(\w+)")
# compress character repetitions to max 3 (so "sooooo" -> "sooo")
_REPEAT_RE = re.compile(r"(.)\1{3,}")


def _emoticons_to_words(text: str) -> str:
    for rgx, word in _EMOTICON_REGEXES:
        text = rgx.sub(f" {word} ", text)
    return text


def _emojis_to_words(text: str) -> str:
    if not _HAS_EMOJI:
        return text
    text = demojize(text, language="en")
    # demojize yields ":grinning_face_with_big_eyes:" → turn to words
    text = re.sub(
        r":([a-zA-Z0-9_]+):",
        lambda m: " " + m.group(1).replace("_", " ") + " ",
        text,
    )
    return text


def normalize_tweet(t: str) -> str:
    t = t.strip().lower()
    t = _URL_RE.sub(" <URL> ", t)
    t = _USER_RE.sub(" <USER> ", t)
    t = _NUM_RE.sub(" <NUM> ", t)
    t = _emoticons_to_words(t)
    t = _emojis_to_words(t)
    # keep hashtag token, also add its de-hashed word
    t = _HASHTAG_RE.sub(lambda m: f" #{m.group(1)} {m.group(1)} ", t)
    # compress extreme elongations but keep emphasis
    t = _REPEAT_RE.sub(r"\1\1\1", t)
    # normalize whitespace
    t = re.sub(r"\s+", " ", t).strip()
    return t


## Spliting

In [7]:
# Spliting – same logic as in Model 2

texts = df["tweet"].astype(str).tolist()
labels = df["sentiment"].astype("float32").to_numpy()

X_train, X_tmp, y_train, y_tmp = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

X_val, X_test, y_val, y_test = train_test_split(
    X_tmp,
    y_tmp,
    test_size=0.5,
    random_state=42,
    stratify=y_tmp,
)

print("Train size:", len(X_train), "Val size:", len(X_val), "Test size:", len(X_test))


Train size: 12800 Val size: 1600 Test size: 1600


## Precompute normalized texts for USE

In [9]:
# Precompute normalized texts for USE (Applying the preprocessing functions)

X_train_norm = [normalize_tweet(t) for t in X_train]
X_val_norm = [normalize_tweet(t) for t in X_val]
X_test_norm = [normalize_tweet(t) for t in X_test]

# Quick sanity check
X_train_norm[:5]

['exam return day, overall average of <NUM> . <NUM> % work harder next time crystal and no slack. astronomy night tonight and team dinner tomorrow!',
 "out of school. i'm going to miss everyone so much! &lt; <NUM>",
 '<USER>',
 "i still hear a kitten meowing! i hope someone helps the poor thing. (i can't find it, it's probably on the other side of the fence.)",
 "stuffed peppers w/ spanish rice, beef, mushrooms, tomaters, acorn squash, &amp; onions - topped w/ cheese. if i'm eating leftovers all week.."]

In [14]:
# Encode normalized texts with USE once to get dense features

USE_URL = "https://tfhub.dev/google/universal-sentence-encoder/4"
print("Loading USE encoder...")
use_encoder = hub.load(USE_URL)

def encode_with_use(texts, batch_size=256):
    vectors = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        embeddings = use_encoder(batch_texts)
        vectors.append(embeddings.numpy())
    return np.vstack(vectors)

X_train_vec = encode_with_use(X_train_norm)
X_val_vec = encode_with_use(X_val_norm)
X_test_vec = encode_with_use(X_test_norm)

USE_EMBED_DIM = X_train_vec.shape[1]
print("USE embedding dimension:", USE_EMBED_DIM)


Loading USE encoder...
USE embedding dimension: 512


# Optuna + USE

In [17]:
# USE-based classifier with Optuna + MLflow
# (replacing the BiLSTM + Word2Vec part from Model 2)

def build_use_model(params, input_dim=USE_EMBED_DIM) -> tf.keras.Model:
    tf.keras.backend.clear_session()

    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(input_dim,), dtype=tf.float32, name="use_embedding"),
        tf.keras.layers.Dropout(params["dropout_rate"]),
        tf.keras.layers.Dense(params["dense_units"], activation="relu"),
        tf.keras.layers.Dropout(params["dropout_rate"]),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ])

    optimizer = tf.keras.optimizers.Adam(learning_rate=params["learning_rate"])
    model.compile(
        optimizer=optimizer,
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model


class ValMetricsCallback(tf.keras.callbacks.Callback):
    def __init__(self, X_val_vec, y_val):
        super().__init__()
        self.X_val_vec = X_val_vec
        self.y_val = y_val

    def on_epoch_end(self, epoch, logs=None):
        y_pred_prob = self.model.predict(self.X_val_vec, verbose=0).ravel()
        y_pred = (y_pred_prob >= 0.5).astype("int32")

        acc = accuracy_score(self.y_val, y_pred)
        prec, rec, f1, _ = precision_recall_fscore_support(
            self.y_val,
            y_pred,
            average="binary",
            zero_division=0,
        )

        mlflow.log_metric("val_accuracy_trace", acc, step=epoch)
        mlflow.log_metric("val_precision_trace", prec, step=epoch)
        mlflow.log_metric("val_recall_trace", rec, step=epoch)
        mlflow.log_metric("val_f1_trace", f1, step=epoch)


def objective(trial):
    tf.keras.backend.clear_session()

    params = {
        "dense_units": trial.suggest_int("dense_units", 64, 256, step=64),
        "dropout_rate": trial.suggest_float("dropout_rate", 0.1, 0.5),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 5e-3, log=True),
        "batch_size": trial.suggest_categorical("batch_size", [32, 64, 128]),
        "epochs": trial.suggest_int("epochs", 3, 8),
    }

    with mlflow.start_run(run_name=f"use_trial_{trial.number}", nested=True):
        mlflow.log_params(params)

        train_ds = (
            tf.data.Dataset.from_tensor_slices((X_train_vec, y_train))
            .shuffle(10000)
            .batch(params["batch_size"])
        )

        val_ds = tf.data.Dataset.from_tensor_slices((X_val_vec, y_val)).batch(
            params["batch_size"]
        )

        model = build_use_model(params)

        model.fit(
            train_ds,
            validation_data=val_ds,
            epochs=params["epochs"],
            verbose=0,
        )

        y_val_pred_prob = model.predict(X_val_vec, verbose=0).ravel()
        y_val_pred = (y_val_pred_prob >= 0.5).astype("int32")

        acc = accuracy_score(y_val, y_val_pred)
        prec, rec, f1, _ = precision_recall_fscore_support(
            y_val,
            y_val_pred,
            average="binary",
            zero_division=0,
        )

        mlflow.log_metric("val_accuracy", acc)
        mlflow.log_metric("val_precision", prec)
        mlflow.log_metric("val_recall", rec)
        mlflow.log_metric("val_f1", f1)

        return f1


# Parent run: Optuna + final training + evaluation, mirroring Model 2 logic
with mlflow.start_run(run_name="optuna_use_parent"):
    # Hyperparameter search
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=4)

    # Optional Optuna visualizations
    try:
        import plotly  # noqa: F401

        fig1 = optuna.visualization.plot_optimization_history(study)
        mlflow.log_figure(fig1, "optimization_history_use.html")

        fig2 = optuna.visualization.plot_param_importances(study)
        mlflow.log_figure(fig2, "param_importance_use.html")
    except ImportError:
        print("⚠️ Plotly non installé — visualisations sautées.")

    best_trial = study.best_trial
    best_params = best_trial.params

    print("Best val F1:", best_trial.value)
    print("Best params:", best_params)

    mlflow.log_metric("best_val_f1", best_trial.value)
    mlflow.log_params({f"best_{k}": v for k, v in best_params.items()})

    # Re-train on full train + val
    X_train_full_vec = np.concatenate([X_train_vec, X_val_vec], axis=0)
    y_train_full = np.concatenate([y_train, y_val])

    model_best = build_use_model(best_params)

    cb = ValMetricsCallback(X_val_vec=X_test_vec, y_val=y_test)

    train_full_ds = (
        tf.data.Dataset.from_tensor_slices((X_train_full_vec, y_train_full))
        .shuffle(10000)
        .batch(best_params["batch_size"])
    )

    model_best.fit(
        train_full_ds,
        epochs=best_params["epochs"],
        callbacks=[cb],
        verbose=0,
    )

    # Final test evaluation
    y_test_pred_prob = model_best.predict(
        X_test_vec,
        verbose=0,
    ).ravel()
    y_test_pred = (y_test_pred_prob >= 0.5).astype("int32")

    test_acc = accuracy_score(y_test, y_test_pred)
    test_prec, test_rec, test_f1, _ = precision_recall_fscore_support(
        y_test,
        y_test_pred,
        average="binary",
        zero_division=0,
    )

    print("Test accuracy:", test_acc)
    print("Test precision:", test_prec)
    print("Test recall:", test_rec)
    print("Test F1:", test_f1)

    mlflow.log_metric("test_accuracy", test_acc)
    mlflow.log_metric("test_precision", test_prec)
    mlflow.log_metric("test_recall", test_rec)
    mlflow.log_metric("test_f1", test_f1)

    # Confusion matrix
    cm = confusion_matrix(y_test, y_test_pred)
    fig_cm, ax_cm = plt.subplots(figsize=(4, 4))
    im = ax_cm.imshow(cm, interpolation="nearest", cmap="Blues")
    ax_cm.figure.colorbar(im, ax=ax_cm)
    ax_cm.set(
        xticks=[0, 1],
        yticks=[0, 1],
        xticklabels=["Prédit négatif", "Prédit positif"],
        yticklabels=["Réel négatif", "Réel positif"],
        ylabel="Réel",
        xlabel="Prédit",
        title="Matrice de confusion - USE",
    )
    labels_cm = [["TN", "FP"], ["FN", "TP"]]
    for i in range(2):
        for j in range(2):
            ax_cm.text(
                j,
                i,
                f"{labels_cm[i][j]} = {cm[i, j]}",
                ha="center",
                va="center",
                color="black",
            )
    fig_cm.tight_layout()
    mlflow.log_figure(fig_cm, "confusion_matrix_use.png")
    plt.close(fig_cm)

    # ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_test_pred_prob)
    auc_score = auc(fpr, tpr)

    fig_auc, ax_auc = plt.subplots(figsize=(4, 4))
    ax_auc.plot(fpr, tpr, label=f"AUC = {auc_score:.3f}")
    ax_auc.plot([0, 1], [0, 1], "k--", label="Aléatoire")
    ax_auc.set_xlabel("False Positive Rate")
    ax_auc.set_ylabel("True Positive Rate")
    ax_auc.set_title("Courbe ROC - USE")
    ax_auc.legend(loc="lower right")
    fig_auc.tight_layout()
    mlflow.log_figure(fig_auc, "roc_curve_use.png")
    plt.close(fig_auc)

    mlflow.log_metric("test_auc", auc_score)

    # Log model artifact
    input_example = X_test_vec[:1]
    mlflow.tensorflow.log_model(
        model=model_best,
        artifact_path="best_model_use",
        input_example=input_example,
    )


[I 2025-11-19 11:35:15,931] A new study created in memory with name: no-name-99cb70b6-beed-41b3-947e-242aba1c47df


[I 2025-11-19 11:35:20,046] Trial 0 finished with value: 0.766390833863781 and parameters: {'dense_units': 192, 'dropout_rate': 0.23284310393964414, 'learning_rate': 0.00012221993428715677, 'batch_size': 128, 'epochs': 5}. Best is trial 0 with value: 0.766390833863781.
2025-11-19 11:35:20.925262: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.

[I 2025-11-19 11:35:27,321] Trial 1 finished with value: 0.7793840351979887 and parameters: {'dense_units': 128, 'dropout_rate': 0.4109096958621382, 'learning_rate': 0.00018504668787497987, 'batch_size': 32, 'epochs': 5}. Best is trial 1 with value: 0.7793840351979887.
2025-11-19 11:38:40.326056: E external/local_xla/xla/service/slow_operation_alarm.cc:73] 
********************************
[Compiling modul

Best val F1: 0.7909319899244333
Best params: {'dense_units': 128, 'dropout_rate': 0.3182215029817129, 'learning_rate': 0.003098714884149096, 'batch_size': 64, 'epochs': 5}




Test accuracy: 0.79
Test precision: 0.7857142857142857
Test recall: 0.7975
Test F1: 0.7915632754342432
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step


In [18]:
model_best.summary()

# End