
# **EDF R&D Digital Innovation – NLP Test (Détection de Toxicité)**
_Auteur : <votre nom> — Généré le 2025-08-08 11:18 UTC_

Ce notebook propose une solution reproductible et auto‑contenue pour classifier des commentaires en toxiques / non toxiques à partir du jeu de données `mteb/toxic_conversations_50k`.

**Objectifs pédagogiques** :
- Exploration et analyse de données
- Pipelines de pré/post-traitement
- Méthodologie d’évaluation (métriques & visualisations)
- Construction et optimisation d’un modèle NLP (baseline + Transformer)
- Analyse d’erreurs et interprétabilité
- Organisation et lisibilité du code

> Le dataset est exigeant. On optimise le processus plus que le score final.


## 1) Librairies

In [None]:

# Si vous êtes sur Colab, décommentez la ligne suivante :
# %pip install -q -U datasets transformers accelerate evaluate scikit-learn imbalanced-learn nltk matplotlib


In [None]:

import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch

from datasets import load_dataset, Dataset, DatasetDict
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (classification_report, confusion_matrix, roc_auc_score,
                             average_precision_score, precision_recall_curve, roc_curve, f1_score)
from sklearn.utils.class_weight import compute_class_weight

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import RandomOverSampler

import nltk
nltk.download('punkt', quiet=True)

RANDOM_SEED = 42

# Seed the standard-library, NumPy and PyTorch generators with the same value.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print(f"Device: {DEVICE}")


## 2) Données — chargement & exploration

In [None]:

# Load the two published splits; the hub's 'test' split is used as validation here.
dataset_train, dataset_val = (
    load_dataset('mteb/toxic_conversations_50k', split=split_name)
    for split_name in ('train', 'test')
)

# Bundle both splits under explicit names.
ds = DatasetDict({"train": dataset_train, "validation": dataset_val})
ds


In [None]:

# Convert both splits to pandas DataFrames for quick exploratory analysis.
df_train, df_val = (
    ds[split_name].to_pandas() for split_name in ("train", "validation")
)

print(df_train.head(3))
print(df_train.describe(include='all'))

# Relative class frequencies, with integer labels replaced by readable names.
print("\nRépartition de classes (train) :")
print(
    df_train['label']
    .value_counts(normalize=True)
    .rename({0:'non-toxique',1:'toxique'})
)
print("\nRépartition de classes (val) :")
print(
    df_val['label']
    .value_counts(normalize=True)
    .rename({0:'non-toxique',1:'toxique'})
)

# Comment length, measured in whitespace-separated tokens.
df_train['len'] = df_train['text'].str.split().map(len)
df_val['len'] = df_val['text'].str.split().map(len)

print("\nLongueurs (mots) — train :", df_train['len'].describe())
print("Longueurs (mots) — val   :", df_val['len'].describe())


In [None]:

# Histogram of comment lengths on the train split.
# Relies on the 'len' column (word count per comment) added to df_train earlier.
plt.figure()  # open a fresh figure so the histogram does not draw onto a previous plot
df_train['len'].hist(bins=50)
plt.xlabel("Longueur du commentaire (mots)")
plt.ylabel("Fréquence")
plt.title("Distribution des longueurs (train)")
plt.show()


## 3) Pré‑traitement minimal

In [None]:

import re

# Compiled once: basic_clean is applied to every comment of both splits.
_URL_RE = re.compile(r"https?://\S+|www\.\S+")
_WHITESPACE_RE = re.compile(r"\s+")


def basic_clean(text: str) -> str:
    """Lightly normalise a comment for the classical TF-IDF baseline.

    The text is lower-cased, URLs (``http(s)://...`` or ``www....``) are
    replaced by a space, and every run of whitespace is collapsed to a single
    space with leading/trailing whitespace removed.

    Args:
        text: Raw comment. ``None`` and float NaN (how pandas represents a
            missing value in an object column) are treated as an empty
            comment instead of raising; any other non-string value is
            converted with ``str()``.

    Returns:
        The cleaned comment, possibly the empty string.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    text = _URL_RE.sub(" ", text)
    return _WHITESPACE_RE.sub(" ", text).strip()

# Cleaned text feeds the TF-IDF baseline only; the raw 'text' column is kept
# untouched for the Transformer.
df_train['text_clean'] = df_train['text'].map(basic_clean)
df_val['text_clean'] = df_val['text'].map(basic_clean)

# Feature / target arrays consumed by the scikit-learn pipeline.
X_train_clf, y_train = df_train['text_clean'].values, df_train['label'].values
X_val_clf, y_val = df_val['text_clean'].values, df_val['label'].values

print("Exemple (nettoyé):", X_train_clf[0][:200], "...")


## 4) Modèle 1 — Baseline (TF‑IDF + Logistic Regression)

In [None]:

# Word uni-gram and bi-gram TF-IDF features with sub-linear term frequency.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.99,
    strip_accents='unicode',
    sublinear_tf=True,
)

# 'balanced' class weights computed from the training labels.
classes = np.array([0, 1])
class_weights = compute_class_weight(
    class_weight='balanced', classes=classes, y=y_train
)
# Plain Python int/float keys and values keep the printed dict readable.
cw = {int(cls): float(w) for cls, w in zip(classes, class_weights)}
print("Class weights:", cw)

logreg = LogisticRegression(
    max_iter=200,
    class_weight=cw,
    solver='liblinear',
    C=1.0,
)

# The class imbalance is handled exactly once, through the class weights above.
# Do not add a RandomOverSampler step on top: resampling to a 1:1 ratio and
# then applying weights computed on the original label distribution would
# correct the imbalance twice and push probabilities towards the toxic class.
pipeline = ImbPipeline(steps=[
    ('vectorizer', vectorizer),
    ('clf', logreg),
])

pipeline.fit(X_train_clf, y_train)

# Probability of the positive (toxic) class and hard predictions at the
# default 0.5 threshold.
proba_val = pipeline.predict_proba(X_val_clf)[:, 1]
pred_val_05 = (proba_val >= 0.5).astype(int)

print(classification_report(y_val, pred_val_05, digits=3))
print("ROC‑AUC :", roc_auc_score(y_val, proba_val))
print("PR‑AUC  :", average_precision_score(y_val, proba_val))

cm = confusion_matrix(y_val, pred_val_05)
print("\nMatrice de confusion (seuil 0.5):\n", cm)


In [None]:

# PR and ROC curves for the baseline, plus selection of the threshold that
# maximises F1.
# NOTE(review): the threshold is selected on (y_val, proba_val) and the
# "optimised threshold" report below is computed on the same data, so those
# scores are optimistic — confirm whether a separate held-out split is wanted.
precision, recall, thr = precision_recall_curve(y_val, proba_val)
# F1 at each threshold; the [:-1] slices drop the final precision/recall point
# so both arrays line up with thr, and 1e-9 guards against division by zero.
f1s = 2*precision[:-1]*recall[:-1] / (precision[:-1]+recall[:-1] + 1e-9)
best_idx = f1s.argmax()
best_thr = thr[best_idx]
print(f"Seuil optimal (F1 max) : {best_thr:.3f} — F1={f1s[best_idx]:.3f}")

# Precision-recall curve.
plt.figure()
plt.plot(recall, precision)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Courbe Precision‑Recall (baseline)")
plt.show()

# ROC curve (roc_curve is re-imported here; it is already imported at the top of the notebook).
from sklearn.metrics import roc_curve
fpr, tpr, _ = roc_curve(y_val, proba_val)
plt.figure()
plt.plot(fpr, tpr)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("Courbe ROC (baseline)")
plt.show()

# Evaluation at the F1-optimal threshold selected above.
pred_val_best = (proba_val >= best_thr).astype(int)
print("\n== Rapport au seuil optimisé ==")
print(classification_report(y_val, pred_val_best, digits=3))
print("Matrice de confusion (seuil optimisé):\n", confusion_matrix(y_val, pred_val_best))


In [None]:

# Interpretability: inspect the largest logistic-regression weights.
tfidf = pipeline.named_steps['vectorizer']
clf = pipeline.named_steps['clf']

coefs = clf.coef_[0]
feature_names = np.array(tfidf.get_feature_names_out())

# 20 largest coefficients in descending order, 20 smallest in ascending order.
top_pos_idx = np.argsort(coefs)[::-1][:20]
top_neg_idx = np.argsort(coefs)[:20]

print("\nTop 20 n‑grams associés à la classe TOXIQUE:")
for i in top_pos_idx:
    print(f"{feature_names[i]} \t {coefs[i]:.3f}")

print("\nTop 20 n‑grams associés à la classe NON‑TOXIQUE:")
for i in top_neg_idx:
    print(f"{feature_names[i]} \t {coefs[i]:.3f}")


## 5) Modèle 2 — Fine‑tuning Transformer (DistilBERT)

In [None]:

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score

MODEL_NAME = "distilbert-base-uncased"
MAX_LEN = 256

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_batch(batch):
    """Tokenize the ``text`` field of a batch with the notebook-level tokenizer.

    Sequences are truncated and padded with ``padding="max_length"`` and
    ``max_length=MAX_LEN``.
    """
    tokenizer_kwargs = {
        "truncation": True,
        "padding": "max_length",
        "max_length": MAX_LEN,
    }
    return tokenizer(batch["text"], **tokenizer_kwargs)

from datasets import DatasetDict, Dataset
# Tokenize the raw (uncleaned) text of both splits and expose them as torch tensors.
tokenized_ds = DatasetDict({
    split_name: Dataset.from_pandas(frame[['text','label']]).map(tokenize_batch, batched=True)
    for split_name, frame in (("train", df_train), ("validation", df_val))
}).with_format("torch")

# Binary classification head on top of the pretrained encoder.
num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
)
model = model.to(DEVICE)

def compute_metrics(eval_pred):
    """Compute ranking and threshold metrics for the toxic class.

    Args:
        eval_pred: A ``(logits, labels)`` pair as unpacked below. ``logits``
            is expected to have one column per class (column 1 = toxic).

    Returns:
        A dict with ``roc_auc`` and ``pr_auc`` computed on the toxic-class
        probability, and ``f1`` computed on predictions thresholded at 0.5.
    """
    logits, labels = eval_pred
    # NOTE(review): predictions may arrive as a tuple when a model returns
    # extra outputs; the classification logits are assumed to come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    probs = torch.softmax(torch.as_tensor(logits), dim=1)[:, 1].numpy()
    preds = (probs >= 0.5).astype(int)
    return {
        "roc_auc": roc_auc_score(labels, probs),
        "pr_auc": average_precision_score(labels, probs),
        "f1": f1_score(labels, preds),
    }

import inspect

# Two keyword arguments were renamed across transformers releases
# (`evaluation_strategy` -> `eval_strategy`, `tokenizer` -> `processing_class`).
# Pick whichever name the installed version accepts so the cell runs on both
# older and newer releases.
_training_args_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_args_params else "evaluation_strategy"
)
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

training_args = TrainingArguments(
    output_dir="./outputs",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=50,
    # Keep the checkpoint with the best validation PR-AUC (higher is better).
    load_best_model_at_end=True,
    metric_for_best_model="pr_auc",
    greater_is_better=True,
    report_to="none",
    # Evaluate once per epoch, matching save_strategy.
    **{_eval_strategy_key: "epoch"},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

trainer.train()
eval_res = trainer.evaluate()
print(eval_res)


In [None]:

# Toxic-class probabilities on the validation split + F1-optimal threshold.
# NOTE(review): this cell rebinds precision, recall, thr, f1s, best_idx,
# best_thr, fpr and tpr, which the baseline cell also assigns; after running
# it those names refer to the Transformer results.
import torch, numpy as np
model.eval()  # inference mode for the forward passes below
with torch.no_grad():
    logits = []
    # Manual batching: slices of 128 examples from the tokenized validation split.
    for i in range(0, len(tokenized_ds["validation"]), 128):
        batch = tokenized_ds["validation"][i:i+128]
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        out = model(input_ids=input_ids, attention_mask=attention_mask)
        # Move each batch of logits to the CPU before concatenating.
        logits.append(out.logits.cpu())
    logits = torch.cat(logits, dim=0)

# Column 1 of the softmax is used as the toxic-class probability.
probs = torch.softmax(logits, dim=1)[:,1].numpy()
y_true = df_val['label'].values

from sklearn.metrics import precision_recall_curve, roc_auc_score, average_precision_score, classification_report, confusion_matrix, roc_curve
precision, recall, thr = precision_recall_curve(y_true, probs)
# F1 at each threshold; [:-1] aligns precision/recall with thr, 1e-9 avoids 0/0.
f1s = 2*precision[:-1]*recall[:-1] / (precision[:-1]+recall[:-1] + 1e-9)
best_idx = f1s.argmax()
best_thr = thr[best_idx]
print(f"Seuil optimal (Transformer, F1 max) : {best_thr:.3f} — F1={f1s[best_idx]:.3f}")
print("ROC‑AUC :", roc_auc_score(y_true, probs))
print("PR‑AUC  :", average_precision_score(y_true, probs))

# Precision-recall curve.
plt.figure()
plt.plot(recall, precision)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Courbe Precision‑Recall (Transformer)")
plt.show()

# ROC curve.
fpr, tpr, _ = roc_curve(y_true, probs)
plt.figure()
plt.plot(fpr, tpr)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("Courbe ROC (Transformer)")
plt.show()

# Report at the threshold selected above.
# NOTE(review): the threshold was chosen on this same split, so the report is
# optimistic — confirm whether a separate held-out split is wanted.
preds = (probs >= best_thr).astype(int)
print("\n== Rapport (Transformer, seuil optimisé) ==")
print(classification_report(y_true, preds, digits=3))
print("Matrice de confusion :\n", confusion_matrix(y_true, preds))


## 6) Analyse d'erreurs

In [None]:

import numpy as np
def top_errors(y_true, y_prob, k=10, threshold=0.5):
    """Return the indices of the most confident misclassifications.

    Args:
        y_true: 1-D array-like of binary ground-truth labels (0 or 1).
        y_prob: 1-D array-like of positive-class probabilities, same length
            as ``y_true``.
        k: Maximum number of indices returned for each error type.
        threshold: Decision threshold; a sample is predicted positive when
            its probability is ``>= threshold``. Defaults to 0.5.

    Returns:
        A ``(fp_sorted, fn_sorted)`` pair of integer index arrays: false
        positives ordered by decreasing probability and false negatives
        ordered by increasing probability, each truncated to ``k`` entries.

    Raises:
        ValueError: If ``y_true`` and ``y_prob`` do not have the same shape.
    """
    # Accept lists/Series as well as arrays; comparing a plain list with `==`
    # would yield a single bool instead of an element-wise mask.
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    if y_true.shape != y_prob.shape:
        raise ValueError(
            f"y_true and y_prob must have the same shape, got {y_true.shape} and {y_prob.shape}"
        )

    preds = (y_prob >= threshold).astype(int)
    fp_idx = np.flatnonzero((preds == 1) & (y_true == 0))
    fn_idx = np.flatnonzero((preds == 0) & (y_true == 1))
    # Stable sorts keep the original order among equal probabilities.
    fp_sorted = fp_idx[np.argsort(-y_prob[fp_idx], kind="stable")][:k]
    fn_sorted = fn_idx[np.argsort(y_prob[fn_idx], kind="stable")][:k]
    return fp_sorted, fn_sorted

fp_ids, fn_ids = top_errors(y_true, probs, k=10)

# Print each error group under its banner: probability first, then the first
# 300 characters of the original comment.
for banner, error_ids in (
    ("\n--- Faux positifs (prédits toxiques, réalité non toxique) ---", fp_ids),
    ("\n--- Faux négatifs (prédits non toxiques, réalité toxique) ---", fn_ids),
):
    print(banner)
    for i in error_ids:
        print(f"[p={probs[i]:.3f}] {df_val['text'].iloc[i][:300]}")


## 7) Export & inférence

In [None]:

import joblib, os, torch
# Make sure the output directory exists before writing into it.
os.makedirs("artifacts", exist_ok=True)

# Persist the fitted baseline pipeline.
joblib.dump(pipeline, "artifacts/baseline_tfidf_logreg.joblib")

# Persist the fine-tuned model and its tokenizer side by side in one directory.
for hf_object in (model, tokenizer):
    hf_object.save_pretrained("artifacts/distilbert_toxic")

print("Artifacts sauvegardés dans ./artifacts")


In [None]:

# Inference example on two hand-written comments.
sample_texts = [
    "I love this! Wonderful job, thanks for sharing.",
    "You're an idiot. Get lost."
]

# Baseline: apply the same cleaning that was used on the training text.
X_sample = [basic_clean(t) for t in sample_texts]
proba_base = pipeline.predict_proba(X_sample)[:, 1]
print("Baseline (probas toxiques):", proba_base)

# Transformer: force eval mode so the result does not depend on which cell
# ran last, and reuse MAX_LEN so inference truncates like training did.
model.eval()
inputs = tokenizer(
    sample_texts,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=MAX_LEN,
).to(DEVICE)
with torch.no_grad():
    out = model(**inputs)
proba_transf = torch.softmax(out.logits, dim=1)[:, 1].cpu().numpy()
print("Transformer (probas toxiques):", proba_transf)



## 8) Conclusions (à adapter)
- La baseline TF‑IDF + LogReg est rapide, interprétable et robuste.
- Le Transformer (DistilBERT) capture mieux le contexte et améliore souvent PR‑AUC / ROC‑AUC.
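- Le seuil dépend du cas d’usage (prioriser précision ou rappel). Attention : il est ici optimisé sur le jeu de validation qui sert aussi à l’évaluation ; les scores « au seuil optimisé » sont donc optimistes, et un jeu distinct serait nécessaire pour une estimation non biaisée.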
- Pistes : perte focale (focal loss), augmentation de données, nettoyage avancé, MiniLM/BERT‑base selon budget, audit des biais (fairness).
