# START

In [8]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import re
import string
import nltk
import emoji
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from autocorrect import Speller
import swifter
import mlflow
import mlflow.sklearn
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [7]:
import tensorflow as tf
# Environment report: query the visible GPUs once and reuse the result for
# both the raw listing and the labelled line.
gpus = tf.config.list_physical_devices('GPU')
print(gpus)
print("Version TF :", tf.__version__)
print("CUDA dispo dans la build :", tf.test.is_built_with_cuda())
print("GPUs vus par TensorFlow :", gpus)


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Version TF : 2.20.0
CUDA dispo dans la build : True
GPUs vus par TensorFlow : [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# DATA READING

In [9]:
# Read the raw Sentiment140 CSV. The file is read without a header row, so
# the column names are supplied explicitly.
df = pd.read_csv(
    "../../sentiment140/training.1600000.processed.noemoticon.csv",
    names=['sentiment', 'id', 'date', 'query', 'user', 'tweet'],
    header=None,
    encoding='latin-1',
)

df.head()

Unnamed: 0,sentiment,id,date,query,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [10]:
# Keep only the label and the tweet text columns.
df = df[['sentiment','tweet']]
df.head()

Unnamed: 0,sentiment,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [11]:
# Balanced sampling: draw 8,000 tweets from each sentiment class
# (label 0 -> df_negatifs, label 4 -> df_positifs), with a fixed seed.
df_negatifs = df[df['sentiment'] == 0].sample(8000, random_state=42)
df_positifs = df[df['sentiment'] == 4].sample(8000, random_state=42)

# 16,000 tweets in total, i.e. 1% of the 1,600,000-tweet training file.
df = pd.concat([df_negatifs, df_positifs]).reset_index(drop=True)
# Recode the positive label from 4 to 1 so the target is binary 0/1.
# NOTE(review): this cell overwrites `df`, so it cannot be re-run without
# reloading the data first (no rows with label 4 remain afterwards).
df['sentiment'] = df['sentiment'].replace(4, 1)
df.head()

Unnamed: 0,sentiment,tweet
0,0,@xnausikaax oh no! where did u order from? tha...
1,0,A great hard training weekend is over. a coup...
2,0,"Right, off to work Only 5 hours to go until I..."
3,0,I am craving for japanese food
4,0,Jean Michel Jarre concert tomorrow gotta work...


# CLEANING TEXT

## Preprocessing

✅ Preprocessing steps (preparing for Embedding + BiLSTM)

Compared with the preprocessing used for the logistic-regression model, the steps shown in red have been dropped, and the steps shown in green are either kept or newly added for this model.


<span style="color:green"></span>

<span style="color:green">- Lowercase</span>

<span style="color:red">- Expand contractions</span>

<span style="color:green">- Convert emoticons → words</span>

<span style="color:green">- Convert emojis → words</span>

<span style="color:green">- Replace URLs, mentions and numbers with placeholder tokens; keep each hashtag and add its bare word as an extra token</span>

<span style="color:red">- Remove special chars and punctuation</span>

<span style="color:green">- Tokenize</span>

<span style="color:red">- Remove stopwords</span>

<span style="color:red">- Lemmatize</span>

<span style="color:red">- Join tokens back</span>

<span style="color:green">- Build vocab (+ OOV token), convert to integer sequences.</span>

<span style="color:green">- Pad/trim to a fixed max length; create attention masks if you use masking.</span>

<span style="color:green">- Initialize an Embedding layer (trainable or with pretrained vectors like GloVe-Twitter/fastText); then LSTM/biLSTM → classifier.</span>

In [None]:
import re
from typing import List, Tuple, Optional

try:
    # optional but nice: converts emojis â†’ words like ":smiling_face:" -> "smiling face"
    from emoji import demojize
    _HAS_EMOJI = True
except Exception:
    _HAS_EMOJI = False

# ---------- light, meaning-preserving normalization ----------

_EMOTICONS = {
    r":-\)|:\)|=\)|:\]": "smile",
    r":-D|:D|=D": "laugh",
    r":-\(|:\(|=\(|:\[": "sad",
    r":'\(|:'-\(": "cry",
    r";-\)|;\)": "wink",
    r":-P|:P": "playful",
    r":/|:-/": "skeptical",
    r":\*": "kiss",
    r">:\(|>:-\(": "angry",
    r"XD|xD": "laugh",
}

_EMOTICON_REGEXES = [(re.compile(p), w) for p, w in _EMOTICONS.items()]

_URL_RE   = re.compile(r"(https?://\S+|www\.\S+)")
_USER_RE  = re.compile(r"@\w+")
_NUM_RE   = re.compile(r"\b\d+\b")
# keep ! and ?; drop most other punctuation later if you want (we keep them)
# Hashtags: keep the hashtag and add its content as a separate token
_HASHTAG_RE = re.compile(r"#(\w+)")
# compress character repetitions to max 3 (so "sooooo" -> "sooo")
_REPEAT_RE  = re.compile(r"(.)\1{3,}")

def _emoticons_to_words(text: str) -> str:
    """Replace every known emoticon in ``text`` with its space-padded word.

    The compiled patterns in ``_EMOTICON_REGEXES`` are applied one after the
    other, in list order, each on the output of the previous one.
    """
    converted = text
    for pattern, label in _EMOTICON_REGEXES:
        converted = pattern.sub(" " + label + " ", converted)
    return converted

def _emojis_to_words(text: str) -> str:
    """Spell out emojis as plain words; a no-op when ``emoji`` is unavailable.

    ``demojize`` first rewrites each emoji as a ``:name_with_underscores:``
    code; every such code is then replaced by its name with underscores turned
    into spaces, padded with one space on each side.
    """
    if _HAS_EMOJI:
        named = demojize(text, language="en")
        return re.sub(
            r":([a-zA-Z0-9_]+):",
            lambda match: " " + match.group(1).replace("_", " ") + " ",
            named,
        )
    return text

def normalize_tweet(t: str) -> str:
    """Apply light, meaning-preserving normalization to one raw tweet.

    Steps, in order:
      1. strip surrounding whitespace and lowercase;
      2. replace URLs, @mentions and standalone numbers with the placeholder
         tokens ``<url>``, ``<user>`` and ``<num>``;
      3. convert emoticons, then emojis, to words;
      4. keep each hashtag and append its bare word (``#fun`` -> ``#fun fun``);
      5. squeeze runs of four or more identical characters down to three;
      6. collapse all whitespace runs to single spaces.

    The placeholders are deliberately lowercase. The Keras ``Tokenizer``
    lowercases its input by default, so an uppercase ``<URL>`` would be stored
    as ``<url>`` in ``word_index`` while the Word2Vec corpus (built from
    ``normalize_tweet(t).split()``) would still contain ``<URL>``; the
    embedding lookup would then never find a vector for these tokens.

    Args:
        t: Raw tweet text.

    Returns:
        The normalized tweet as a single space-separated string.
    """
    t = t.strip().lower()
    # URLs must be replaced before emoticons: "http://" contains ":/".
    t = _URL_RE.sub(" <url> ", t)
    t = _USER_RE.sub(" <user> ", t)
    t = _NUM_RE.sub(" <num> ", t)
    t = _emoticons_to_words(t)
    t = _emojis_to_words(t)
    # keep hashtag token, also add its de-hashed word
    t = _HASHTAG_RE.sub(lambda m: f" #{m.group(1)} {m.group(1)} ", t)
    # compress extreme elongations but keep emphasis
    t = _REPEAT_RE.sub(r"\1\1\1", t)
    # normalize whitespace
    t = re.sub(r"\s+", " ", t).strip()
    return t

# ---------- training-time helpers ----------
def build_tokenizer(train_texts: List[str],
                    vocab_size: int = 20000,
                    oov_token: str = "<OOV>") -> Tokenizer:
    """Create a Keras tokenizer and fit it on the normalized training tweets.

    Args:
        train_texts: Raw training tweets; each one goes through
            ``normalize_tweet`` before fitting.
        vocab_size: Passed to the tokenizer as ``num_words``.
        oov_token: Token used for out-of-vocabulary words.

    Returns:
        The fitted tokenizer.
    """
    # filters="" disables the tokenizer's own character filtering, so
    # punctuation such as ! and ? stays in the tokens.
    tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token, filters="")
    tokenizer.fit_on_texts(list(map(normalize_tweet, train_texts)))
    return tokenizer

def preprocess_train(train_texts: List[str],
                     tokenizer: Tokenizer,
                     max_len: int = 50) -> Tuple[List[List[int]], List[List[int]]]:
    """Turn raw training tweets into padded id sequences plus attention masks.

    Each tweet is normalized, mapped to integer ids with the already-fitted
    ``tokenizer``, then padded/truncated at the end to ``max_len``.

    Returns:
        ``(padded, masks)``: ``padded`` is the output of ``pad_sequences``;
        ``masks`` is a nested list with 1 where ``padded`` is non-zero and 0
        on padding.
    """
    cleaned = list(map(normalize_tweet, train_texts))
    padded = pad_sequences(
        tokenizer.texts_to_sequences(cleaned),
        maxlen=max_len,
        padding="post",
        truncating="post",
    )
    # Id 0 is the padding value, so "non-zero" marks the real tokens.
    masks = (padded != 0).astype("int32").tolist()
    return padded, masks

def preprocess_test(test_texts: List[str],
                    tokenizer: Tokenizer,
                    max_len: int) -> Tuple[List[List[int]], List[List[int]]]:
    """Encode held-out tweets with a tokenizer that was fitted elsewhere.

    The tweets go through the same pipeline as the training data
    (``normalize_tweet`` -> ``texts_to_sequences`` -> ``pad_sequences`` with
    post-padding and post-truncation to ``max_len``); the tokenizer itself is
    not refitted here.

    Returns:
        ``(padded_sequences, attention_masks)``, where the mask holds 1 for
        non-zero ids and 0 for padding.
    """
    id_sequences = tokenizer.texts_to_sequences(
        [normalize_tweet(tweet) for tweet in test_texts]
    )
    padded = pad_sequences(id_sequences, maxlen=max_len, padding="post", truncating="post")
    return padded, (padded != 0).astype("int32").tolist()



In [None]:
# Count the tweets per sentiment label.
df['sentiment'].value_counts()

## Splitting

In [None]:
import numpy as np
import tensorflow as tf
import optuna
import mlflow

from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from gensim.models import Word2Vec

# ---------- constants ----------
MAX_LEN    = 80      # length every token sequence is padded/truncated to
VOCAB_SIZE = 20000   # tokenizer + embedding vocab size

# ---------- data ----------
texts = df["tweet"].astype(str).tolist()
labels = df["sentiment"].to_numpy(dtype="float32")

# Two-stage stratified split: hold out 20% first, then cut that hold-out in
# half to obtain validation and test sets (80 / 10 / 10 overall).
X_train, X_tmp, y_train, y_tmp = train_test_split(
    texts, labels, test_size=0.2, stratify=labels, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_tmp, y_tmp, test_size=0.5, stratify=y_tmp, random_state=42
)

print("Train size:", len(X_train), "Val size:", len(X_val), "Test size:", len(X_test))


## Optuna hyperparameter search + Word2Vec

In [None]:

# ---------- MLflow experiment ----------
# Select the MLflow experiment that the runs started below are logged under.
mlflow.set_experiment("tweet_bilstm_w2v_optuna")

In [None]:
import numpy as np
from collections import Counter

# Show the label distribution of the full sample, then of each split, to
# confirm that stratification kept the classes balanced.
print("Unique labels in full data:", np.unique(labels, return_counts=True))

for split_name, split_labels in (("Train:", y_train), ("Val:  ", y_val), ("Test: ", y_test)):
    print(split_name, np.unique(split_labels, return_counts=True))


In [None]:
def build_model_with_params(params, tokenizer, w2v, max_len=MAX_LEN):
    """Build and compile a BiLSTM classifier initialised from Word2Vec vectors.

    Args:
        params: Dict with the keys ``embed_dim``, ``lstm_units``,
            ``dense_units``, ``dropout_rate``, ``learning_rate`` and
            ``trainable_embedding``.
        tokenizer: Fitted tokenizer; its ``word_index`` maps words to rows of
            the embedding matrix.
        w2v: Trained gensim Word2Vec model; vectors are read from ``w2v.wv``.
        max_len: Accepted for signature compatibility; not used in the body.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output.
    """
    dim           = params["embed_dim"]
    n_lstm        = params["lstm_units"]
    n_dense       = params["dense_units"]
    drop          = params["dropout_rate"]
    lr            = params["learning_rate"]
    fine_tune_emb = params["trainable_embedding"]

    # Number of embedding rows: follow the tokenizer's own cap when it has
    # one, otherwise fall back to the vocabulary size bounded by VOCAB_SIZE.
    vocab_rows = tokenizer.num_words
    if vocab_rows is None:
        vocab_rows = min(VOCAB_SIZE, len(tokenizer.word_index) + 1)

    # Rows stay all-zero for words that are outside the cap, unknown to
    # Word2Vec, or whose vector has an unexpected size.
    pretrained = np.zeros((vocab_rows, dim), dtype="float32")
    for token, row in tokenizer.word_index.items():
        if row < vocab_rows and token in w2v.wv:
            vector = w2v.wv[token]
            if vector.shape[0] == dim:
                pretrained[row] = vector

    embedding = Embedding(
        input_dim=vocab_rows,
        output_dim=dim,
        weights=[pretrained],
        mask_zero=True,
        trainable=fine_tune_emb,
    )
    model = Sequential([
        embedding,
        Bidirectional(LSTM(n_lstm)),
        Dropout(drop),
        Dense(n_dense, activation="relu"),
        Dropout(drop),
        Dense(1, activation="sigmoid"),  # binary sentiment
    ])

    model.compile(
        optimizer=Adam(learning_rate=lr),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model


In [None]:
def objective(trial):
    """Optuna objective: train one Word2Vec + BiLSTM model, return its val F1.

    Each call samples a hyperparameter set, opens a nested MLflow run named
    after the trial, trains Word2Vec and the classifier on ``X_train``,
    scores the model on ``X_val`` and logs accuracy / precision / recall / F1.
    """
    tf.keras.backend.clear_session()

    # ---- sample params ----
    params = {
        "embed_dim": trial.suggest_categorical("embed_dim", [100, 200, 300]),
        "w2v_window": trial.suggest_int("w2v_window", 3, 7),
        "w2v_min_count": trial.suggest_int("w2v_min_count", 1, 5),
        "w2v_sg": trial.suggest_categorical("w2v_sg", [0, 1]),
        "w2v_epochs": trial.suggest_int("w2v_epochs", 5, 15),
        "lstm_units": trial.suggest_int("lstm_units", 64, 256, step=64),
        "dense_units": trial.suggest_int("dense_units", 32, 128, step=32),
        "dropout_rate": trial.suggest_float("dropout_rate", 0.1, 0.5),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 5e-3, log=True),
        "trainable_embedding": trial.suggest_categorical("trainable_embedding", [False, True]),
        "batch_size": trial.suggest_categorical("batch_size", [32, 64, 128]),
        "epochs": trial.suggest_int("epochs", 3, 8),
    }

    # ---- one nested MLflow run per trial ----
    with mlflow.start_run(run_name=f"trial_{trial.number}", nested=True):
        mlflow.log_params(params)

        # Vocabulary is fitted on the training tweets only.
        tokenizer = build_tokenizer(X_train, vocab_size=VOCAB_SIZE, oov_token="<OOV>")

        # Word2Vec is trained on the same normalized training tweets.
        w2v_corpus = [normalize_tweet(tweet).split() for tweet in X_train]
        w2v_model = Word2Vec(
            sentences=w2v_corpus,
            vector_size=params["embed_dim"],
            window=params["w2v_window"],
            min_count=params["w2v_min_count"],
            workers=4,
            sg=params["w2v_sg"],
            negative=10,
            epochs=params["w2v_epochs"],
        )

        # Padded id sequences; the attention masks are not needed here.
        train_ids, _ = preprocess_train(X_train, tokenizer=tokenizer, max_len=MAX_LEN)
        val_ids, _ = preprocess_test(X_val, tokenizer=tokenizer, max_len=MAX_LEN)

        model = build_model_with_params(params, tokenizer=tokenizer, w2v=w2v_model, max_len=MAX_LEN)

        batch = params["batch_size"]
        train_ds = tf.data.Dataset.from_tensor_slices((train_ids, y_train)).shuffle(10000).batch(batch)
        val_ds = tf.data.Dataset.from_tensor_slices((val_ids, y_val)).batch(batch)

        model.fit(train_ds, validation_data=val_ds, epochs=params["epochs"], verbose=0)

        # ---- score on the validation split ----
        val_prob = model.predict(val_ids, verbose=0).ravel()
        val_pred = (val_prob >= 0.5).astype("int32")

        accuracy = accuracy_score(y_val, val_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_val, val_pred, average="binary", zero_division=0
        )

        for metric_name, metric_value in (
            ("val_accuracy", accuracy),
            ("val_precision", precision),
            ("val_recall", recall),
            ("val_f1", f1),
        ):
            mlflow.log_metric(metric_name, metric_value)

        # F1 is the value the study optimizes.
        return f1


In [None]:
class ValMetricsCallback(tf.keras.callbacks.Callback):
    """Log accuracy, precision, recall and F1 on a held-out set after each epoch.

    The metrics are written to the active MLflow run as ``val_*_trace`` with
    the epoch index as step.

    Args:
        X_val: Model inputs for the held-out set (passed to ``model.predict``).
        y_val: Ground-truth binary labels for ``X_val``.
    """

    def __init__(self, X_val, y_val):
        # Let the Keras base class set up its own state before adding ours.
        super().__init__()
        self.X_val = X_val
        self.y_val = y_val

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the held-out set, threshold at 0.5 and log the metrics."""
        y_pred_prob = self.model.predict(self.X_val, verbose=0).ravel()
        y_pred = (y_pred_prob >= 0.5).astype("int32")

        acc = accuracy_score(self.y_val, y_pred)
        prec, rec, f1, _ = precision_recall_fscore_support(self.y_val, y_pred, average="binary", zero_division=0)

        mlflow.log_metric("val_accuracy_trace", acc, step=epoch)
        mlflow.log_metric("val_precision_trace", prec, step=epoch)
        mlflow.log_metric("val_recall_trace", rec, step=epoch)
        mlflow.log_metric("val_f1_trace", f1, step=epoch)


In [None]:
with mlflow.start_run(run_name="optuna_bilstm_w2v_parent"):

    # ---- 1) Run Optuna (each trial opens a nested child run) ----
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=10)

    best_trial = study.best_trial
    best_params = best_trial.params

    # The objective returns the validation F1, so that is what the study value is.
    print("Best val F1:", best_trial.value)
    print("Best params:", best_params)

    # Log best params at parent level
    mlflow.log_params({f"best_{k}": v for k, v in best_params.items()})

    # ---- 2) Retrain on train + val with the best hyperparameters ----
    X_train_full = X_train + X_val
    y_train_full = np.concatenate([y_train, y_val])

    tok_full = build_tokenizer(X_train_full, vocab_size=VOCAB_SIZE, oov_token="<OOV>")
    tokens_full = [normalize_tweet(t).split() for t in X_train_full]

    w2v_full = Word2Vec(
        sentences=tokens_full,
        vector_size=best_params["embed_dim"],
        window=best_params["w2v_window"],
        min_count=best_params["w2v_min_count"],
        workers=4,
        sg=best_params["w2v_sg"],
        negative=10,
        epochs=best_params["w2v_epochs"],
    )

    X_train_full_pad, _ = preprocess_train(X_train_full, tokenizer=tok_full, max_len=MAX_LEN)
    X_test_pad, _ = preprocess_test(X_test, tokenizer=tok_full, max_len=MAX_LEN)

    model_best = build_model_with_params(best_params, tokenizer=tok_full, w2v=w2v_full)

    # The validation split is now part of the training data, so the per-epoch
    # trace is computed on the test set. The callback still logs it under the
    # "val_*_trace" metric names; these values are for monitoring only and
    # must not be used to pick a model or an epoch.
    cb = ValMetricsCallback(X_val=X_test_pad, y_val=y_test)

    train_full_ds = tf.data.Dataset.from_tensor_slices((X_train_full_pad, y_train_full)).shuffle(10000).batch(best_params["batch_size"])

    model_best.fit(train_full_ds, epochs=best_params["epochs"], callbacks=[cb], verbose=0)

    # ---- 3) Final test metrics ----
    y_test_pred_prob = model_best.predict(X_test_pad).ravel()
    y_test_pred = (y_test_pred_prob >= 0.5).astype("int32")

    test_acc = accuracy_score(y_test, y_test_pred)
    # zero_division=0 matches every other metric computation in this notebook
    # and avoids a warning when no positive is predicted.
    test_prec, test_rec, test_f1, _ = precision_recall_fscore_support(
        y_test, y_test_pred, average="binary", zero_division=0
    )

    mlflow.log_metric("test_accuracy", test_acc)
    mlflow.log_metric("test_precision", test_prec)
    mlflow.log_metric("test_recall", test_rec)
    mlflow.log_metric("test_f1", test_f1)

## Optuna hyperparameter search + GloVe

In [None]:
# =============================================================================
# IMPORTS
# =============================================================================
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import mlflow
import optuna

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping

# =============================================================================
# PREPROCESSING (your existing functions)
# =============================================================================

# normalize_tweet(), build_tokenizer(), preprocess_train() and
# preprocess_test() are reused from the "CLEANING TEXT" section above;
# that cell must be run before this one.

# =============================================================================
# LOAD GLOVE EMBEDDINGS
# =============================================================================

def load_glove_embeddings(glove_path):
    """Load a GloVe text file into a ``{word: vector}`` dictionary.

    Each line is expected to be ``word v1 v2 ... vN`` separated by single
    spaces. Blank lines, lines whose values are not numeric, and lines whose
    vector length differs from that of the first valid line are skipped
    instead of raising or producing a corrupted entry.

    Args:
        glove_path: Path to the GloVe ``.txt`` file (read as UTF-8).

    Returns:
        Dict mapping each word to a 1-D ``float32`` numpy array.
    """
    embeddings = {}
    expected_dim = None
    skipped = 0
    with open(glove_path, "r", encoding="utf8") as f:
        for line in f:
            # Split on the literal space only. A bare str.split() also splits
            # on Unicode whitespace (e.g. U+0085), so a token made of such a
            # character would vanish and the first number would be taken as
            # the word.
            values = line.rstrip(" \r\n").split(" ")
            word = values[0]
            numbers = [v for v in values[1:] if v]
            if not word or not numbers:
                skipped += 1
                continue
            try:
                vector = np.asarray(numbers, dtype="float32")
            except ValueError:
                skipped += 1
                continue
            if expected_dim is None:
                expected_dim = vector.shape[0]
            elif vector.shape[0] != expected_dim:
                skipped += 1
                continue
            embeddings[word] = vector

    print(f"Loaded {len(embeddings)} GloVe word vectors.")
    if skipped:
        print(f"Skipped {skipped} blank or malformed lines.")
    return embeddings


def build_glove_embedding_matrix(tokenizer, glove_vectors, embed_dim, vocab_size):
    """Assemble the embedding weight matrix from pretrained GloVe vectors.

    Args:
        tokenizer: Object exposing ``word_index`` (word -> integer id).
        glove_vectors: Dict mapping words to their pretrained vectors.
        embed_dim: Width of each embedding row.
        vocab_size: Upper bound on the number of rows.

    Returns:
        ``float32`` array of shape ``(rows, embed_dim)`` where
        ``rows = min(vocab_size, len(word_index) + 1)``. Row ``i`` holds the
        vector of the word with id ``i``; rows without a GloVe vector stay zero.
    """
    rows = min(vocab_size, len(tokenizer.word_index) + 1)
    matrix = np.zeros((rows, embed_dim), dtype="float32")

    # Only ids that fit inside the matrix can receive a vector.
    in_range = ((token, row) for token, row in tokenizer.word_index.items() if row < rows)
    for token, row in in_range:
        vector = glove_vectors.get(token)
        if vector is not None:
            matrix[row] = vector

    return matrix


# =============================================================================
# MODEL BUILDER (uses GloVe embedding matrix)
# =============================================================================

def build_model_with_params(params, tokenizer, embedding_matrix, max_len, vocab_size):
    """Build and compile a BiLSTM classifier on top of a GloVe embedding matrix.

    NOTE: this definition replaces the Word2Vec ``build_model_with_params``
    defined earlier in the notebook and has a different signature.

    Args:
        params: Dict with the keys ``embed_dim``, ``lstm_units``,
            ``dense_units``, ``dropout_rate``, ``learning_rate`` and
            ``trainable_embedding``.
        tokenizer: Fitted tokenizer; the size of its ``word_index`` bounds the
            number of embedding rows.
        embedding_matrix: Initial embedding weights.
        max_len: Kept for backward compatibility; no longer forwarded to the
            ``Embedding`` layer (see below).
        vocab_size: Upper bound on the number of embedding rows.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output.
    """
    model = Sequential([
        # `input_length` is intentionally not passed: Keras 3 deprecates the
        # argument and only emits a warning for it, and the LSTM below does
        # not need a fixed sequence length.
        Embedding(
            input_dim=min(vocab_size, len(tokenizer.word_index) + 1),
            output_dim=params["embed_dim"],
            weights=[embedding_matrix],
            mask_zero=True,
            trainable=params["trainable_embedding"]
        ),
        SpatialDropout1D(params["dropout_rate"]),
        Bidirectional(LSTM(params["lstm_units"], return_sequences=False)),
        Dense(params["dense_units"], activation="relu"),
        Dropout(params["dropout_rate"]),
        Dense(1, activation="sigmoid")
    ])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(params["learning_rate"]),
        loss="binary_crossentropy",
        metrics=["accuracy"]
    )

    return model


# =============================================================================
# CALLBACK FOR VAL METRIC TRACES (parent run)
# =============================================================================

class ValMetricsCallback(tf.keras.callbacks.Callback):
    """Keras callback that traces classification metrics to MLflow per epoch.

    After every epoch the model predicts on the data given at construction
    time, predictions are thresholded at 0.5, and accuracy / precision /
    recall / F1 are logged under the ``val_*_trace`` metric names with the
    epoch index as step.
    """

    def __init__(self, X_val, y_val):
        super().__init__()
        self.X_val, self.y_val = X_val, y_val

    def on_epoch_end(self, epoch, logs=None):
        """Score the current model on the stored data and log the four metrics."""
        probabilities = self.model.predict(self.X_val, verbose=0).ravel()
        predictions = (probabilities >= 0.5).astype("int32")

        accuracy = accuracy_score(self.y_val, predictions)
        precision, recall, f1, _ = precision_recall_fscore_support(
            self.y_val, predictions, average="binary", zero_division=0
        )

        for metric_name, metric_value in (
            ("val_accuracy_trace", accuracy),
            ("val_precision_trace", precision),
            ("val_recall_trace", recall),
            ("val_f1_trace", f1),
        ):
            mlflow.log_metric(metric_name, metric_value, step=epoch)


# =============================================================================
# OPTUNA OBJECTIVE (child runs)
# =============================================================================

VOCAB_SIZE = 20000
# NOTE(review): the splitting cell earlier in the notebook sets MAX_LEN = 80;
# from this point on the value is 50.
MAX_LEN = 50

def objective(trial):
    """Optuna objective: train one GloVe + BiLSTM model, return its val F1.

    Each call samples a hyperparameter set, opens an MLflow run named after
    the trial (``nested=True``), builds the embedding matrix from the GloVe
    file matching the sampled dimension, trains on ``X_train`` and scores on
    ``X_val``.
    """
    tf.keras.backend.clear_session()

    # ---- hyperparameters sampled by Optuna ----
    params = {
        "embed_dim": trial.suggest_categorical("embed_dim", [50, 100, 200]),
        "lstm_units": trial.suggest_int("lstm_units", 64, 256, step=64),
        "dense_units": trial.suggest_int("dense_units", 32, 128, step=32),
        "dropout_rate": trial.suggest_float("dropout_rate", 0.1, 0.5),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 5e-3, log=True),
        "trainable_embedding": trial.suggest_categorical("trainable_embedding", [False, True]),
        "batch_size": trial.suggest_categorical("batch_size", [32, 64, 128]),
        "epochs": trial.suggest_int("epochs", 3, 8),
    }

    with mlflow.start_run(run_name=f"trial_{trial.number}", nested=True):
        mlflow.log_params(params)

        # Vocabulary is fitted on the training tweets only.
        tokenizer = build_tokenizer(X_train, VOCAB_SIZE, "<OOV>")

        # NOTE(review): the GloVe file is parsed again on every trial;
        # consider caching per embedding dimension if trials become slow.
        dim = params["embed_dim"]
        glove_path = f"embeddings/glove/glove.twitter.27B.{params['embed_dim']}d.txt"
        glove_vectors = load_glove_embeddings(glove_path)
        embedding_matrix = build_glove_embedding_matrix(tokenizer, glove_vectors, dim, VOCAB_SIZE)

        # Padded id sequences; the attention masks are not needed here.
        train_ids, _ = preprocess_train(X_train, tokenizer, MAX_LEN)
        val_ids, _ = preprocess_test(X_val, tokenizer, MAX_LEN)

        model = build_model_with_params(params, tokenizer, embedding_matrix, MAX_LEN, VOCAB_SIZE)

        batch = params["batch_size"]
        train_ds = (
            tf.data.Dataset.from_tensor_slices((train_ids, y_train))
            .shuffle(10000)
            .batch(batch)
        )
        val_ds = tf.data.Dataset.from_tensor_slices((val_ids, y_val)).batch(batch)

        model.fit(train_ds, validation_data=val_ds, epochs=params["epochs"], verbose=0)

        # ---- score on the validation split ----
        val_prob = model.predict(val_ids, verbose=0).ravel()
        val_pred = (val_prob >= 0.5).astype("int32")

        accuracy = accuracy_score(y_val, val_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_val, val_pred, average="binary", zero_division=0
        )

        for metric_name, metric_value in (
            ("val_accuracy", accuracy),
            ("val_precision", precision),
            ("val_recall", recall),
            ("val_f1", f1),
        ):
            mlflow.log_metric(metric_name, metric_value)

        return f1  # maximize val F1


# =============================================================================
# RUN OPTUNA + PARENT MLFLOW LOGGING
# =============================================================================

# The study runs INSIDE the parent run so that the per-trial runs opened by
# objective() with nested=True are attached to it as children.
with mlflow.start_run(run_name="optuna_bilstm_GloVe_parent"):

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=10)

    print("Best params:", study.best_trial.params)
    print("Best val F1:", study.best_trial.value)

    # =========================================================================
    # PARENT RUN FINAL MODEL (train on train+val, test on test)
    # =========================================================================

    best_params = study.best_trial.params
    mlflow.log_params({f"best_{k}": v for k, v in best_params.items()})

    # ---- Combine Train + Val ----
    X_train_full = X_train + X_val
    y_train_full = np.concatenate([y_train, y_val])

    # ---- Tokenizer ----
    tok_full = build_tokenizer(X_train_full, VOCAB_SIZE, "<OOV>")

    # ---- Load GloVe ----
    # Same location the objective reads from; the trials above can only have
    # completed if the files live there.
    glove_path = f"embeddings/glove/glove.twitter.27B.{best_params['embed_dim']}d.txt"
    glove_vectors = load_glove_embeddings(glove_path)

    # ---- Embedding Matrix ----
    embedding_matrix_full = build_glove_embedding_matrix(tok_full, glove_vectors,
                                                         best_params["embed_dim"], VOCAB_SIZE)

    # ---- Sequences ----
    X_train_full_pad, _ = preprocess_train(X_train_full, tok_full, MAX_LEN)
    X_test_pad, _ = preprocess_test(X_test, tok_full, MAX_LEN)

    # ---- Final Model ----
    model_best = build_model_with_params(best_params, tok_full,
                                         embedding_matrix_full, MAX_LEN, VOCAB_SIZE)

    # ---- Callback ----
    # The validation split is part of the training data here, so the per-epoch
    # trace is computed on the test set (still logged as "val_*_trace").
    # It is for monitoring only and must not drive any model choice.
    cb = ValMetricsCallback(X_test_pad, y_test)

    train_full_ds = tf.data.Dataset.from_tensor_slices((X_train_full_pad, y_train_full)) \
        .shuffle(10000).batch(best_params["batch_size"])

    model_best.fit(train_full_ds, epochs=best_params["epochs"],
                   callbacks=[cb], verbose=0)

    # ---- Test Metrics ----
    y_pred_prob = model_best.predict(X_test_pad).ravel()
    y_pred = (y_pred_prob >= 0.5).astype("int32")

    test_acc = accuracy_score(y_test, y_pred)
    test_prec, test_rec, test_f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average="binary", zero_division=0
    )

    mlflow.log_metric("test_accuracy", test_acc)
    mlflow.log_metric("test_precision", test_prec)
    mlflow.log_metric("test_recall", test_rec)
    mlflow.log_metric("test_f1", test_f1)

    print("\nFinal Test results:")
    print("Accuracy:", test_acc)
    print("Precision:", test_prec)
    print("Recall:", test_rec)
    print("F1:", test_f1)

# End