# <a id='toc1_'></a>[Projet 7 : Réalisez une analyse de sentiments grâce au Deep Learning](#toc0_)
# <a id='toc2_'></a>[Modèle sur mesure avancé](#toc0_)

[Lien OpenClassroom](https://openclassrooms.com/fr/paths/795/projects/1516/1578-mission)

---

**Table of contents**<a id='toc0_'></a>    
- [Projet 7 : Réalisez une analyse de sentiments grâce au Deep Learning](#toc1_)    
- [Modèle sur mesure simple](#toc2_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

---
---

## <a id='toc2_1_'></a>[Imports](#toc0_)

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import (
    Embedding,
    LSTM,
    Dense,
    Dropout,
    Bidirectional,
    Input,
    SpatialDropout1D,
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix,
)
import mlflow
import mlflow.tensorflow  # Essential for autologging
import pickle  # For saving the tokenizer
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")
tf.get_logger().setLevel("ERROR")

---
---

## <a id='toc2_2_'></a>[Chargement des données](#toc0_)

In [3]:
TRAIN_DATA_PATH = "./train_data.csv"
VAL_DATA_PATH = "./validation_data.csv"
TEST_DATA_PATH = "./test_data.csv"

train_df = pd.read_csv(TRAIN_DATA_PATH)
val_df = pd.read_csv(VAL_DATA_PATH)
test_df = pd.read_csv(TEST_DATA_PATH)

# Handle potential NaN values in 'cleaned_text' that might result from preprocessing
train_df["cleaned_text"].fillna("", inplace=True)
val_df["cleaned_text"].fillna("", inplace=True)
test_df["cleaned_text"].fillna("", inplace=True)


X_train = train_df["cleaned_text"]
y_train = train_df["sentiment"]
X_val = val_df["cleaned_text"]
y_val = val_df["sentiment"]
X_test = test_df["cleaned_text"]
y_test = test_df["sentiment"]

print("Data loaded successfully:")
print(f"Train samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")
print(f"Test samples: {len(X_test)}")

Data loaded successfully:
Train samples: 1113546
Validation samples: 238617
Test samples: 238618


---
---

## Préparation pour Deep Leanring

---

### Création d'un Tokenizer

In [None]:
VOCAB_SIZE = 10000
MAX_SEQUENCE_LENGTH = 100

tokenizer = Tokenizer(
    num_words=VOCAB_SIZE, oov_token="<OOV>"
)  # OOV token for out-of-vocabulary words

# Fit the tokenizer ONLY on the training data
tokenizer.fit_on_texts(X_train)

# Convert text data to sequences of integers
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences to ensure uniform length
X_train_pad = pad_sequences(
    X_train_seq, maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post"
)
X_val_pad = pad_sequences(
    X_val_seq, maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post"
)
X_test_pad = pad_sequences(
    X_test_seq, maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post"
)

# Vocabulary size for the embedding layer (add 1 for the padding token 0)
# Use min to handle cases where actual vocab is smaller than VOCAB_SIZE
actual_vocab_size = min(VOCAB_SIZE, len(tokenizer.word_index) + 1)
print(f"Actual vocabulary size used: {actual_vocab_size}")
print(f"Shape of padded training sequences: {X_train_pad.shape}")
print(f"Shape of padded validation sequences: {X_val_pad.shape}")
print(f"Shape of padded test sequences: {X_test_pad.shape}")

Actual vocabulary size used: 10000
Shape of padded training sequences: (1113546, 100)
Shape of padded validation sequences: (238617, 100)
Shape of padded test sequences: (238618, 100)


---

### Sauvegarde du Tokenizer

In [None]:
with open("keras_tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

Tokenizer saved locally to keras_tokenizer.pkl 


---

### MLFlow Setup

In [8]:
EXPERIMENT_NAME = "Tweet Sentiment Analysis - Advanced DL"
mlflow.set_experiment(EXPERIMENT_NAME)
print(f"MLflow experiment set to: '{EXPERIMENT_NAME}'")

2025/04/25 16:14:51 INFO mlflow.tracking.fluent: Experiment with name 'Tweet Sentiment Analysis - Advanced DL' does not exist. Creating a new experiment.


MLflow experiment set to: 'Tweet Sentiment Analysis - Advanced DL'


---
---

## Experiment 1: LSTM avec GloVe Embeddings

---

### Chargement de GloVe Embeddings

In [None]:
GLOVE_PATH = "./glove.6B.300d.txt"
EMBEDDING_DIM = 300

embeddings_index = {}
try:
    with open(GLOVE_PATH, encoding="utf-8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype="float32")
            embeddings_index[word] = coefs
    print(f"Found {len(embeddings_index)} word vectors in {GLOVE_PATH}.")
except FileNotFoundError:
    print(f"Error: GloVe file not found at {GLOVE_PATH}")
    print("Skipping GloVe experiment.")
    embeddings_index = None  # Ensure variable exists but is None
except Exception as e:
    print(f"An error occurred loading GloVe file: {e}")
    embeddings_index = None

embedding_matrix = None
if embeddings_index:
    print("Creating embedding matrix...")
    # Initialize matrix with zeros
    embedding_matrix = np.zeros((actual_vocab_size, EMBEDDING_DIM))
    hits = 0
    misses = 0
    # Populate the matrix with GloVe vectors for words in our tokenizer's vocabulary
    for word, i in tokenizer.word_index.items():
        if i >= actual_vocab_size:  # Skip words beyond our vocab size limit
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # Words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
            hits += 1
        else:
            misses += 1
    print(f"Converted {hits} words ({misses} misses)")
    print(f"Shape of embedding matrix: {embedding_matrix.shape}")

Found 400001 word vectors in ./glove.6B.300d.txt.
Creating embedding matrix...
Converted 9226 words (773 misses)
Shape of embedding matrix: (10000, 300)


---

### Création du modèle

In [None]:
def build_lstm_model(
    vocab_size,
    embedding_dim,
    max_length,
    lstm_units,
    dropout_rate,
    spatial_dropout_rate,
    learning_rate,
    embedding_matrix=None,
    is_embedding_trainable=False,
):
    """Builds a Keras LSTM model."""
    model = Sequential()
    model.add(Input(shape=(max_length,)))  # Explicit Input layer

    # Embedding Layer
    if embedding_matrix is not None:
        print("Using pre-trained embedding matrix.")
        model.add(
            Embedding(
                input_dim=vocab_size,
                output_dim=embedding_dim,
                weights=[embedding_matrix],
                input_length=max_length,
                trainable=is_embedding_trainable,  # Typically False for pre-trained
            )
        )
    else:
        print("Using trainable embedding layer.")
        model.add(
            Embedding(
                input_dim=vocab_size,
                output_dim=embedding_dim,
                input_length=max_length,
                trainable=is_embedding_trainable,  # Typically True if learning from scratch
            )
        )

    model.add(
        SpatialDropout1D(spatial_dropout_rate)
    )  # Helps prevent overfitting in NLP

    # Using Bidirectional LSTM for potentially better context capture
    model.add(
        Bidirectional(
            LSTM(lstm_units, dropout=dropout_rate, recurrent_dropout=dropout_rate)
        )
    )
    # Or standard LSTM: model.add(LSTM(lstm_units, dropout=dropout_rate, recurrent_dropout=dropout_rate))

    # Optional Dense layer before output
    # model.add(Dense(32, activation='relu'))
    # model.add(Dropout(dropout_rate))

    model.add(Dense(1, activation="sigmoid"))  # Output layer for binary classification

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(
        loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"]
    )  # Add more metrics like Precision, Recall if needed during training

    print("\nModel Summary:")
    model.summary()
    return model

---

### Entrainement du modèle avec MLFlow

In [None]:
MAX_SEQUENCE_LENGTH = 100
LSTM_UNITS = 64
DROPOUT_RATE = 0.3
SPATIAL_DROPOUT_RATE = 0.3

# Training Parameters
EPOCHS = 10  # Max number of epochs
BATCH_SIZE = 64  # Batch size for training
LEARNING_RATE = 0.001

TOKENIZER_ARTIFACT_PATH = "tokenizer"
MODEL_ARTIFACT_PATH = "model"

run_name_glove = "LSTM_GloVe_Embeddings"
print(f"\n--- Starting MLflow Run for: {run_name_glove} ---")

# Enable MLflow autologging for TensorFlow/Keras
# This automatically logs parameters, metrics per epoch, the model, etc.
mlflow.tensorflow.autolog(
    log_models=True, disable=False, registered_model_name=None
)  # Disable registration via autolog for now

with mlflow.start_run(run_name=run_name_glove) as run_glove:
    run_id_glove = run_glove.info.run_id
    print(f"MLflow Run ID (GloVe): {run_id_glove}")

    # --- Log additional parameters manually (autolog might miss some) ---
    mlflow.log_param("embedding_type", "GloVe (Not Trainable)")
    mlflow.log_param("vocab_size", actual_vocab_size)
    mlflow.log_param("max_sequence_length", MAX_SEQUENCE_LENGTH)
    mlflow.log_param("embedding_dim", EMBEDDING_DIM)
    mlflow.log_param("lstm_units", LSTM_UNITS)
    mlflow.log_param("dropout_rate", DROPOUT_RATE)
    mlflow.log_param("spatial_dropout_rate", SPATIAL_DROPOUT_RATE)
    mlflow.log_param("learning_rate", LEARNING_RATE)
    mlflow.log_param("epochs", EPOCHS)
    mlflow.log_param("batch_size", BATCH_SIZE)
    mlflow.log_param("architecture", "Input-Embedding-SpatialDropout-BiLSTM-Dense")

    # --- Build the model ---
    model_glove = build_lstm_model(
        vocab_size=actual_vocab_size,
        embedding_dim=EMBEDDING_DIM,
        max_length=MAX_SEQUENCE_LENGTH,
        lstm_units=LSTM_UNITS,
        dropout_rate=DROPOUT_RATE,
        spatial_dropout_rate=SPATIAL_DROPOUT_RATE,
        learning_rate=LEARNING_RATE,
        embedding_matrix=embedding_matrix,
        is_embedding_trainable=False,  # Crucial for using pre-trained static embeddings
    )

    # --- Callbacks ---
    early_stopping = EarlyStopping(
        monitor="val_loss", patience=3, restore_best_weights=True
    )
    # MLflow callback is handled by autolog()

    # --- Train the model ---
    print("\nTraining LSTM model with GloVe embeddings...")
    history_glove = model_glove.fit(
        X_train_pad,
        y_train,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        validation_data=(X_val_pad, y_val),
        callbacks=[early_stopping],  # Autolog handles MLflow logging callback
        verbose=1,  # Set to 1 or 2 for progress updates
    )
    print("GloVe Model Training Finished.")

    # --- Manually log the tokenizer artifact ---
    # Autolog doesn't handle custom artifacts like the tokenizer pickle file
    if os.path.exists("keras_tokenizer.pkl"):
        mlflow.log_artifact(
            "keras_tokenizer.pkl", artifact_path=TOKENIZER_ARTIFACT_PATH
        )
        print(f"Tokenizer logged as artifact to MLflow run {run_id_glove}.")
    else:
        print(
            "Warning: Tokenizer file keras_tokenizer.pkl not found, could not log artifact."
        )

    # Autologging should have logged the model automatically at the end of training
    print(f"--- MLflow Run {run_id_glove} finished ---")


--- Starting MLflow Run for: LSTM_GloVe_Embeddings ---
MLflow Run ID (GloVe): dcabf68fdc9b49f38831717b72983354
Using pre-trained embedding matrix.

Model Summary:





Training LSTM model with GloVe embeddings...


ValueError: Invalid dtype: object