In [29]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix

# 1. CHARGEMENT DES DONNÉES
data = pd.read_csv('./data/train.csv')
texts = data['comment_text'].astype(str).values
labels = data.iloc[:, 2:].values  # Colonnes de toxicité
label_names = data.columns[2:]  # Noms des classes

In [30]:
# 2. TOKENISATION ET VECTORISATION
max_words = 20000  # Vocabulaire max
max_len = 100  # Longueur max des séquences
embedding_dim = 100  # Taille des embeddings

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

In [31]:
# 3. CHARGEMENT DES EMBEDDINGS PRÉ-ENTRAÎNÉS (ex: GloVe)
embeddings_index = {}
with open('./data/weights/glove.6B.100d.txt', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = vectors

# Création de la matrice d'embedding
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        vector = embeddings_index.get(word)
        if vector is not None:
            embedding_matrix[i] = vector

In [32]:
# 4. SPLIT DES DONNÉES
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# 5. CONSTRUCTION DU MODÈLE
model = Sequential([
    Embedding(len(word_index) + 1, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(64)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(6, activation='sigmoid')  # 6 classes multi-label (toxic, severe_toxic, obscene, etc.)
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [33]:
# 6. ENTRAÎNEMENT DU MODÈLE
history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/5
[1m1995/1995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m430s[0m 215ms/step - accuracy: 0.7198 - loss: 0.1115 - val_accuracy: 0.9941 - val_loss: 0.0573
Epoch 2/5
[1m  46/1995[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m6:11[0m 190ms/step - accuracy: 0.9840 - loss: 0.0614

KeyboardInterrupt: 

In [None]:
# 7. PRÉDICTIONS ET ÉVALUATION
y_pred_proba = model.predict(X_test)
y_pred = (y_pred_proba > 0.5).astype(int)

In [None]:
# F1-Score par classe
f1_scores = f1_score(y_test, y_pred, average=None)
for i, label in enumerate(label_names):
    print(f"F1-Score for {label}: {f1_scores[i]:.4f}")

# Score F1 global (moyenne macro)
f1_macro = f1_score(y_test, y_pred, average='macro')
print(f"\nF1-Score Macro (moyenne des classes): {f1_macro:.4f}")

# 8. MATRICES DE CONFUSION
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for i, label in enumerate(label_names):
    cm = confusion_matrix(y_test[:, i], y_pred[:, i])
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Non", "Oui"], yticklabels=["Non", "Oui"], ax=axes[i])
    axes[i].set_title(f"Matrice de confusion - {label}")
    axes[i].set_xlabel("Prédit")
    axes[i].set_ylabel("Réel")

plt.tight_layout()
plt.show()


identity_hate_input = 