In [1]:
# Manipulation de données et calculs
import pandas as pd
import numpy as np
import re
import time

# Outils pour la gestion des ensembles de données et l'évaluation des modèles
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Bibliothèques pour la construction et l'entraînement des modèles
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import create_optimizer
import tensorflow as tf
import tensorflow_hub as hub

# Suivi et enregistrement des expérimentations avec MLFlow
import mlflow
import mlflow.keras
from mlflow.models.signature import infer_signature




In [None]:
# Charger les données
data_path = 'data/training.1600000.processed.noemoticon.csv'
df = pd.read_csv(data_path, encoding='ISO-8859-1', header=None)

df.columns = ['sentiment', 'id', 'date', 'query', 'user', 'text']

# Mapper les sentiments à des valeurs binaires
df['sentiment'] = df['sentiment'].map({0: 0, 4: 1})

# Garder uniquement les colonnes utiles
df = df[['sentiment', 'text']]

# Échantillonnage équilibré
sample_size = 200_000  # Nombre de tweets par classe
df_positive = df[df['sentiment'] == 1].sample(n=sample_size, random_state=42)
df_negative = df[df['sentiment'] == 0].sample(n=sample_size, random_state=42)

# Combiner les deux échantillons
df_sampled = pd.concat([df_positive, df_negative]).sample(frac=1, random_state=42)  # Mélanger les tweets

# Vérifier la répartition
print(df_sampled['sentiment'].value_counts())

sentiment
0    1000
1    1000
Name: count, dtype: int64


In [3]:
# Appliquer un nettoyage simple
def preprocess_tweet_for_sentiment(text):
    # Supprimer les mentions @pseudo
    text = re.sub(r'@\w+', '', text)
    # Supprimer les espaces superflus
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df_sampled['text'] = df_sampled['text'].apply(preprocess_tweet_for_sentiment)

In [4]:
# Charger le tokenizer Bert
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenizer les tweets échantillonnés
X_tokenized = tokenizer(
    list(df_sampled['text']),  # Utiliser les données échantillonnées
    padding=True,
    truncation=True,
    max_length=100,
    return_tensors="tf"
)

# Préparer les labels
y = tf.convert_to_tensor(df_sampled['sentiment'].values)

# Diviser en jeu d'entraînement et de test
X_train = {
    key: value[:int(0.8 * len(value))] for key, value in X_tokenized.items()
}
X_test = {
    key: value[int(0.8 * len(value)):] for key, value in X_tokenized.items()
}
y_train = y[:int(0.8 * len(y))]
y_test = y[int(0.8 * len(y)):]

# Vérifier les dimensions
print(f"X_train: {X_train['input_ids'].shape}, X_test: {X_test['input_ids'].shape}")
print(f"y_train: {y_train.shape}, y_test: {y_test.shape}")

X_train: (1600, 59), X_test: (400, 59)
y_train: (1600,), y_test: (400,)


In [5]:
# Charger le modèle Bert pour la classification
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Définir l'optimiseur et la fonction de perte
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)




All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Calcul des étapes totales et warm-up steps
batch_size = 32
epochs = 3
steps_per_epoch = len(y_train) // batch_size
total_training_steps = steps_per_epoch * epochs
warmup_steps = int(0.1 * total_training_steps)

# Créer un optimiseur compatible avec Transformers
optimizer, schedule = create_optimizer(
    init_lr=2e-5,  # Taux d'apprentissage initial
    num_train_steps=total_training_steps,  # Étapes totales d'entraînement
    num_warmup_steps=warmup_steps,  # Étapes de warm-up
)

In [7]:
# Définir l'expérience MLFlow
mlflow.set_experiment("Sentiment_Analysis_BERT_Model")

# Entraîner BERT
with mlflow.start_run():
    start_time = time.time()

    # Compiler le modèle avec l'optimiseur compatible
    model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

    # Entraîner le modèle
    history = model.fit(
        X_train,
        y_train,
        validation_split=0.2,
        epochs=epochs,
        batch_size=batch_size,
    )

    elapsed_time = time.time() - start_time

    # Évaluer les performances
    val_accuracy = history.history['val_accuracy'][-1]
    val_loss = history.history['val_loss'][-1]
    y_pred = model.predict(X_test).logits
    roc_auc = roc_auc_score(y_test.numpy(), tf.nn.softmax(y_pred)[:, 1].numpy())

    # Logger les paramètres et métriques dans MLFlow
    mlflow.log_param("model", "BERT")
    mlflow.log_param("batch_size", batch_size)
    mlflow.log_param("epochs", epochs)
    mlflow.log_metric("val_accuracy", val_accuracy)
    mlflow.log_metric("val_loss", val_loss)
    mlflow.log_metric("roc_auc", roc_auc)
    mlflow.log_metric("training_time_seconds", elapsed_time)

    # Préparer les données de test pour la signature
    X_test_combined = np.hstack([
        X_test["input_ids"].numpy(),
        X_test["attention_mask"].numpy(),
        X_test["token_type_ids"].numpy(),
    ])

    # Définir une signature pour le modèle
    signature = infer_signature(X_test_combined, y_pred)

    # Enregistrer le modèle avec signature dans MLFlow
    mlflow.keras.log_model(
        model=model,
        artifact_path="model",
        signature=signature,
        pip_requirements="requirements.txt"
    )

    print(f"BERT - Validation Accuracy: {val_accuracy:.4f}, Loss: {val_loss:.4f}, ROC-AUC: {roc_auc:.4f}, Training Time: {elapsed_time:.2f}s")

Epoch 1/3


Epoch 2/3
Epoch 3/3




BERT - Validation Accuracy: 0.7781, Loss: 0.4786, ROC-AUC: 0.8646, Training Time: 202.03s
