In [None]:
# ============================================================
# 0. IMPORTS & CONFIGURATION DE BASE
# ============================================================

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW 

import matplotlib.pyplot as plt
import time
import random
import re
import emoji

# Reproducibility: a single seed is fed to every random number generator the
# notebook relies on (Python's `random`, NumPy and PyTorch).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Hardware check: use the GPU when CUDA is available, otherwise fall back to CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device utilis√© :", device)


üñ•Ô∏è Device utilis√© : cpu


In [None]:
# ============================================================
# 1. CHARGEMENT ET PRÉPARATION DU DATASET
# ============================================================

# Path to the Jigsaw training CSV. It is kept in a single variable so that the
# path announced here is the path that is actually read below.
path = '../../jigsaw-toxic-comment-classification-challenge/train.csv/train.csv'

# NOTE(review): on_bad_lines='skip' drops malformed rows without reporting them;
# compare len(df) with the expected row count if completeness matters.
df = pd.read_csv(path, encoding='UTF-8', on_bad_lines='skip')

# Label columns (one binary target per toxicity category)
label_cols = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

# Quick sanity report on what was loaded
print("Nombre total de lignes :", len(df))
print("Colonnes :", list(df.columns))
# .iloc[0] takes the first row by position, independent of the index labels.
print("Exemple de texte :", df['comment_text'].iloc[0][:200], "...")
print("Distribution des labels :")
print(df[label_cols].sum())


üìä Nombre total de lignes : 159571
üîñ Colonnes : ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
üí¨ Exemple de texte : Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove th ...
üßæ Distribution des labels :
toxic            15294
severe_toxic      1595
obscene           8449
threat             478
insult            7877
identity_hate     1405
dtype: int64


In [None]:
# ============================================================
# 2. NETTOYAGE DU TEXTE POUR TRANSFORMERS
# ============================================================

# Patterns used by clean_text_for_transformer, compiled once instead of on every call.
_URL_RE = re.compile(r"http\S+|www\S+")
_IPV4_RE = re.compile(r"\b\d{1,3}(?:\.\d{1,3}){3}\b")
_DISALLOWED_RE = re.compile(r"[^a-zA-Z0-9\s\.,!?']")
_WHITESPACE_RE = re.compile(r"\s+")

# Typographic quotes and backticks mapped to ASCII equivalents. Unicode escapes
# are used so the mapping survives any re-encoding of the notebook file.
_QUOTE_TABLE = str.maketrans({
    "\u2019": "'",   # right single quotation mark / curly apostrophe
    "\u201c": '"',   # left double quotation mark
    "\u201d": '"',   # right double quotation mark
    "`": "'",
})


def clean_text_for_transformer(text):
    """Normalise a raw comment before tokenisation.

    Steps, in order:
      1. map curly quotes and backticks to ASCII quotes;
      2. replace URLs with the token ``url`` and drop IPv4-looking addresses;
      3. pass the text through ``emoji.demojize``;
      4. replace every character outside ``a-z A-Z 0-9 . , ! ? '`` and
         whitespace with a space;
      5. collapse whitespace runs, strip the ends and lowercase.

    Args:
        text: The raw comment. Any non-``str`` value (e.g. NaN) is accepted.

    Returns:
        The cleaned, lowercased string; ``""`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""

    # Must run before the character filter below, otherwise a curly apostrophe
    # would be turned into a space ("don't" -> "don t").
    text = text.translate(_QUOTE_TABLE)

    # URLs become a placeholder token; IP addresses are dropped.
    text = _URL_RE.sub(" url ", text)
    text = _IPV4_RE.sub(" ", text)

    # Emojis are turned into text (presumably ":name:" shortcodes - confirm
    # against the emoji package); ':' and '_' are then removed by the filter.
    text = emoji.demojize(text)

    # Keep letters, digits, whitespace and basic punctuation (. , ? ! ') only.
    text = _DISALLOWED_RE.sub(" ", text)

    # Collapse repeated whitespace.
    text = _WHITESPACE_RE.sub(" ", text).strip()

    return text.lower()

# Store the cleaned text next to the raw comments, then preview both columns
# side by side for the first five rows.
df["clean_text"] = df["comment_text"].map(clean_text_for_transformer)
df.loc[:, ["comment_text", "clean_text"]].head()


Unnamed: 0,comment_text,clean_text
0,Explanation\nWhy the edits made under my usern...,explanation why the edits made under my userna...
1,D'aww! He matches this background colour I'm s...,d'aww! he matches this background colour i'm s...
2,"Hey man, I'm really not trying to edit war. It...","hey man, i'm really not trying to edit war. it..."
3,"""\nMore\nI can't make any real suggestions on ...",more i can't make any real suggestions on impr...
4,"You, sir, are my hero. Any chance you remember...","you, sir, are my hero. any chance you remember..."


In [None]:
# ============================================================
# OPTION DEV RAPIDE : ÉCHANTILLONNAGE DU DATASET
# ============================================================

# Development shortcut: work on a random subset so that training stays
# tractable on CPU. Raise N_SAMPLES to use more of the data.
N_SAMPLES = 10000

# The request is capped at the number of rows available, because sampling more
# rows than exist (without replacement) raises a ValueError. The notebook-wide
# SEED is reused so every random step is controlled from one place.
df_sample = df.sample(min(N_SAMPLES, len(df)), random_state=SEED).reset_index(drop=True)

print("Taille du dataset r√©duit :", len(df_sample))



Taille du dataset r√©duit : 10000
                                        comment_text  \
0  Geez, are you forgetful!  We've already discus...   
1  Carioca RFA \n\nThanks for your support on my ...   

                                          clean_text  
0  geez, are you forgetful! we've already discuss...  
1  carioca rfa thanks for your support on my requ...  


In [None]:
# ============================================================
# 3. TOKENISATION ET CRÉATION DU DATASET PYTORCH
# ============================================================

from transformers import AutoTokenizer

# Main pretrained checkpoint; the tokenizer is loaded from the same name as the
# model that is fine-tuned afterwards.
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Train / validation split (90 % / 10 %).
# Targets are selected through `label_cols` rather than a second hand-written
# list, so the label set is defined in exactly one place; SEED keeps the split
# tied to the notebook-wide seed.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_sample["clean_text"].tolist(),
    df_sample[label_cols].values,
    test_size=0.1,
    random_state=SEED,
)

# Tokenisation: identical settings for both splits, declared once.
# Sequences longer than MAX_LENGTH tokens are truncated.
MAX_LENGTH = 128
tokenizer_kwargs = dict(truncation=True, padding=True, max_length=MAX_LENGTH)

train_encodings = tokenizer(train_texts, **tokenizer_kwargs)
val_encodings = tokenizer(val_texts, **tokenizer_kwargs)

# PyTorch dataset pairing tokenizer output with the multi-label targets
class ToxicCommentsDataset(torch.utils.data.Dataset):
    """Map-style dataset built from tokenizer encodings and label rows.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a
            per-sample indexable collection.
        labels: Indexable collection holding one label row per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples, i.e. the number of label rows."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        Every encoding field is converted with ``torch.tensor``; the label row
        is added under ``"labels"`` as a float tensor.
        """
        sample = {}
        for field_name, column in self.encodings.items():
            sample[field_name] = torch.tensor(column[idx])
        target = torch.tensor(self.labels[idx])
        sample["labels"] = target.float()
        return sample

# Wrap each split in the dataset class, then report the split sizes.
train_dataset, val_dataset = (
    ToxicCommentsDataset(train_encodings, train_labels),
    ToxicCommentsDataset(val_encodings, val_labels),
)

print(f"Taille du train set : {len(train_dataset)}")
print(f"Taille du validation set : {len(val_dataset)}")



Taille du train set : 9000
Taille du validation set : 1000
Tokenisation et Dataset pr√™ts !


In [None]:
# ============================================================
# 4. ENTRAÎNEMENT DU MODÈLE RoBERTa-base
# ============================================================

from transformers import AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW

# Load the RoBERTa model with a sequence-classification head.
# `model_name` is the checkpoint selected in the tokenisation cell; it is reused
# here (not re-declared) so model and tokenizer cannot drift apart.
# The head size follows `label_cols`, so it always matches the label matrix.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_cols),
    # NOTE(review): expected to select a per-label sigmoid/BCE loss - confirm
    # against the Transformers documentation for `problem_type`.
    problem_type="multi_label_classification",
)
model.to(device)

# DataLoaders: only the training data is shuffled.
BATCH_SIZE = 8
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Optimizer and learning-rate schedule
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 2  # CPU -> 2 epochs max
# One scheduler step is taken per batch, so the schedule spans every batch of
# every epoch.
num_training_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Training function (one epoch)
def train_model(model, loader, optimizer, scheduler=None, device=None):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Module called as ``model(**batch)`` whose output exposes ``.loss``.
        loader: Iterable of batches, each a dict of tensors. It does not need
            to support ``len()``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Optional learning-rate scheduler stepped once per batch,
            after the optimizer. ``None`` disables scheduling.
        device: Device the batches are moved to. Defaults to the device of the
            model's first parameter, so the function does not rely on a
            notebook-level global.

    Returns:
        float: Sum of the batch losses divided by the number of batches.

    Raises:
        ValueError: If ``loader`` yields no batch.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        loss = model(**batch).loss
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        total_loss += loss.item()
        num_batches += 1

    # Batches are counted while iterating, so loaders without __len__ work and
    # an empty loader yields a clear error instead of a bare division by zero.
    if num_batches == 0:
        raise ValueError("train_model received an empty loader: no batch to train on")
    return total_loss / num_batches

# Training loop: one pass over the training data per epoch, reporting the mean
# loss returned by train_model each time.
for epoch in range(num_epochs):
    avg_loss = train_model(
        model=model,
        loader=train_loader,
        optimizer=optimizer,
        scheduler=scheduler,
    )
    print(f" Epoch {epoch+1}/{num_epochs} - Loss moyenne : {avg_loss:.4f}")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


üìâ Epoch 1/2 - Loss moyenne : 0.0815
üìâ Epoch 2/2 - Loss moyenne : 0.0440


Le modèle RoBERTa-base a été initialisé avec des poids pré-entraînés ;
la tête de classification a été ajoutée, puis entraînée à partir de zéro sur notre dataset Jigsaw.

In [17]:
# ============================================================
# 5. ÉVALUATION DU MODÈLE RoBERTa-base
# ============================================================

from sklearn.metrics import f1_score
import numpy as np

# Put the model in evaluation mode before scoring the validation set.
model.eval()

# Per-batch sigmoid probabilities and matching ground-truth label rows.
batch_probs, batch_targets = [], []

with torch.no_grad():  # no gradients are needed for inference
    for batch in val_loader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        logits = model(**batch).logits
        batch_probs.append(torch.sigmoid(logits).cpu().numpy())
        batch_targets.append(batch["labels"].cpu().numpy())

# Flatten the per-batch arrays into one row per validation sample.
preds = np.array([row for chunk in batch_probs for row in chunk])
truths = np.array([row for chunk in batch_targets for row in chunk])

# Binarise the probabilities (threshold 0.5)
preds_binary = (preds > 0.5).astype(int)

# Macro-averaged F1 score across the label columns
f1 = f1_score(truths, preds_binary, average="macro")
print(f" F1-score macro sur le set de validation : {f1:.4f}")


 F1-score macro sur le set de validation : 0.4661
