In [2]:
# 1. 📥 Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from tqdm.notebook import tqdm

# Pour BERT
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

# 2. 📄 Load the dataset
# Read the cleaned corpus, then discard rows whose processed text is missing
# or contains nothing but whitespace.
df = (
    pd.read_csv("data/clean_dataset.csv")
    .dropna(subset=["text_processed"])
    .loc[lambda frame: frame["text_processed"].str.strip().astype(bool)]
)

# Features are the processed texts; the target is the `humor` column.
X, y = df["text_processed"], df["humor"]

# Stratified 80/20 split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 3. 🔠 Vectorizers and models
# Both vectorizers are capped at the 5000 most frequent terms.
vectorizers = {
    "tfidf": TfidfVectorizer(max_features=5000),
    "count": CountVectorizer(max_features=5000)
}

# RandomForest and LinearSVC use randomness internally; they are seeded with
# the same value as the train/test split so the benchmark table is repeatable
# from one run to the next.
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(random_state=42),
    "NaiveBayes": MultinomialNB(),
    "SVM": LinearSVC(random_state=42)
}

# 4. 🧪 Benchmark every vectorizer/model pair, with progress bars
results = []
vectorizer_bar = tqdm(vectorizers.items(), desc="Vectorisations")
for vec_name, vectorizer in vectorizer_bar:
    # The vectorizer is fitted on the training texts only, then applied to the test texts.
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    model_bar = tqdm(models.items(), desc=f"Modèles ({vec_name})", leave=False)
    for model_name, model in model_bar:
        model.fit(X_train_vec, y_train)
        y_pred = model.predict(X_test_vec)
        report = classification_report(y_test, y_pred, output_dict=True)

        # One summary row per (vectorizer, model) pair, rounded to 4 decimals.
        row = {"Vectorizer": vec_name, "Model": model_name}
        row["F1 (macro)"] = round(report['macro avg']['f1-score'], 4)
        row["Accuracy"] = round(report['accuracy'], 4)
        results.append(row)

# Collect all rows into a single table and show it.
results_df = pd.DataFrame(results)
print("📊 Résultats des modèles classiques :")
display(results_df)


Vectorisations:   0%|          | 0/2 [00:00<?, ?it/s]

Modèles (tfidf):   0%|          | 0/4 [00:00<?, ?it/s]

Modèles (count):   0%|          | 0/4 [00:00<?, ?it/s]

📊 Résultats des modèles classiques :


Unnamed: 0,Vectorizer,Model,F1 (macro),Accuracy
0,tfidf,LogisticRegression,0.8752,0.8752
1,tfidf,RandomForest,0.855,0.855
2,tfidf,NaiveBayes,0.8676,0.8676
3,tfidf,SVM,0.8756,0.8756
4,count,LogisticRegression,0.8755,0.8755
5,count,RandomForest,0.844,0.8441
6,count,NaiveBayes,0.8708,0.8708
7,count,SVM,0.8754,0.8754


In [18]:
# 📥 Imports (already done in the first cell — nothing to re-import if that cell has been run)

# ⚙️ Device: use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("🖥️ Device utilisé :", device)

# 🔡 Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# 🧪 Draw a sample of (at most) 5000 training examples for a quick training run.
# The size is clamped to the number of available rows because `sample` raises
# a ValueError when asked for more rows than exist (sampling without replacement).
sample_size = min(5000, len(X_train))
sample_df = pd.concat([X_train, y_train], axis=1).sample(sample_size, random_state=42)
X_sample = sample_df['text_processed']
y_sample = sample_df['humor']

# 📂 Dataset
class HumorDataset(Dataset):
    """Dataset that tokenizes every text once, up front, and serves one example per index.

    Args:
        texts: object exposing ``tolist()`` that yields the raw texts.
        labels: object exposing ``tolist()`` that yields the integer labels.
        tokenizer: callable invoked once on the full list of texts; it is asked
            for truncation, padding and PyTorch tensors.
        max_len: value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        raw_texts = texts.tolist()
        self.encodings = tokenizer(
            raw_texts,
            truncation=True,
            padding=True,
            max_length=max_len,
            return_tensors="pt",
        )
        self.labels = torch.tensor(labels.tolist(), dtype=torch.long)

    def __len__(self):
        """Return the number of examples (one label per example)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the encoded fields of example ``idx`` plus its label under ``'labels'``."""
        item = {}
        for key, val in self.encodings.items():
            item[key] = val[idx]
        item['labels'] = self.labels[idx]
        return item

# 🧪 Build the datasets and loaders
train_dataset = HumorDataset(X_sample, y_sample, tokenizer)
test_dataset = HumorDataset(X_test, y_test, tokenizer)

# `DataLoader` is reached through the already-imported `torch` package so this
# cell does not depend on a separate `DataLoader` import being in scope.
# Only the training loader shuffles; the test loader keeps the dataset order.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64)

# 🧠 BERT model
# Seed torch first: the classification head is newly initialised and the
# training loader shuffles, so both would otherwise differ from run to run.
torch.manual_seed(42)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(device)

# ⚙️ Optimizer
# `AdamW` is taken from `torch.optim` so the cell does not depend on a
# separate `AdamW` import being in scope.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# 🔁 Training
model.train()
epochs = 1  # kept at 1 to stay within the time budget

for epoch in range(epochs):
    loop = tqdm(train_loader, desc=f"🔁 Epoch {epoch+1}")
    for batch in loop:
        # Move every tensor of the batch to the model's device.
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch includes 'labels', so the model output carries the loss.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        # Show the current batch loss in the progress bar.
        loop.set_postfix(loss=loss.item())

# 📈 Evaluation
model.eval()
all_preds, all_labels = [], []

eval_bar = tqdm(test_loader, desc="🧪 Évaluation")
with torch.no_grad():  # no gradients are needed for inference
    for batch in eval_bar:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        # Predicted class = index of the largest logit of each row.
        preds = logits.argmax(dim=1).cpu().numpy()
        labels = batch['labels'].cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels)

# Text report over the accumulated labels and predictions.
print("📊 Rapport de classification — BERT (échantillon 5000)")
print(classification_report(all_labels, all_preds))


🖥️ Device utilisé : cpu


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🔁 Epoch 1:   0%|          | 0/313 [00:00<?, ?it/s]

🧪 Évaluation:   0%|          | 0/625 [00:00<?, ?it/s]

📊 Rapport de classification — BERT (échantillon 5000)
              precision    recall  f1-score   support

           0       0.90      0.81      0.85     19999
           1       0.83      0.91      0.87     20000

    accuracy                           0.86     39999
   macro avg       0.86      0.86      0.86     39999
weighted avg       0.86      0.86      0.86     39999



In [None]:
# 📦 Réinstallation propre
!pip install -q --upgrade pip
!pip install -q transformers datasets scikit-learn pandas seaborn matplotlib

# 🧹 Redémarre cette cellule juste après avoir redémarré le runtime !

import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, get_scheduler
from datasets import Dataset as HFDataset
from sklearn.metrics import f1_score, confusion_matrix, classification_report, roc_curve, precision_recall_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# 📂 Load the data
df = pd.read_csv('dataset.csv')
df['label'] = df['humor'].astype(int)

# Work on a 5% random sample of the rows (fixed seed).
df_sampled = df.sample(frac=0.05, random_state=42).reset_index(drop=True)

# Shuffle once, then cut at 80% and 90% of the rows to get train/val/test.
# Plain positional slicing replaces `np.split` on a DataFrame, which relies on
# the deprecated `DataFrame.swapaxes`; the cut points are the same.
shuffled_df = df_sampled.sample(frac=1, random_state=42)
n_rows = len(shuffled_df)
train_end, val_end = int(.8 * n_rows), int(.9 * n_rows)
train_df = shuffled_df.iloc[:train_end]
val_df = shuffled_df.iloc[train_end:val_end]  # keeps the original 'label' column name
test_df = shuffled_df.iloc[val_end:]

# The train and test frames expose the target under the name 'labels'.
train_df = train_df.rename(columns={'label': 'labels'})
test_df = test_df.rename(columns={'label': 'labels'})

# 🧼 Tokenization
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')


def tokenize(example):
    """Tokenize the "text" field, padding/truncating every sequence to 50 tokens."""
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=50,
    )


# Only the text and label columns are handed to the Hugging Face datasets.
train_ds = HFDataset.from_pandas(train_df[['text', 'labels']])
train_ds = train_ds.map(tokenize, batched=True)
test_ds = HFDataset.from_pandas(test_df[['text', 'labels']])
test_ds = test_ds.map(tokenize, batched=True)

# Expose just the model inputs and the labels, as torch tensors.
for split_ds in (train_ds, test_ds):
    split_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# ⚙️ Model
# Seed torch so the loader shuffling (and any random weight initialisation)
# is repeatable between runs.
torch.manual_seed(42)
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 🚂 Training
# Single source of truth for the epoch count: the scheduler length and the
# training loop must agree, otherwise the linear decay ends too early or too late.
NUM_EPOCHS = 3
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader) * NUM_EPOCHS,
)

model.train()
for epoch in range(NUM_EPOCHS):
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for batch in loop:
        # Move every tensor of the batch to the model's device.
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch includes 'labels', so the model output carries the loss.
        loss = model(**batch).loss
        loss.backward()
        optimizer.step()
        scheduler.step()  # the learning-rate schedule advances once per batch
        optimizer.zero_grad()
        loop.set_postfix(loss=loss.item())

# 🧪 Evaluation
test_loader = DataLoader(test_ds, batch_size=32)
model.eval()
# preds: hard class decisions; probs: class-1 probabilities; labels: ground truth.
preds, labels, probs = [], [], []
with torch.no_grad():  # no gradients are needed for inference
    for batch in tqdm(test_loader, desc="Evaluation"):
        batch = {k: v.to(device) for k, v in batch.items()}
        out = model(**batch).logits
        preds.extend(torch.argmax(out, dim=-1).cpu().numpy())
        # Keep the softmax probability of class 1: ROC and PR curves need a
        # continuous score to sweep thresholds over, not a 0/1 decision.
        probs.extend(torch.softmax(out, dim=-1)[:, 1].cpu().numpy())
        labels.extend(batch['labels'].cpu().numpy())

print(f"\n✅ F1 score on test set: {f1_score(labels, preds):.4f}")

# 📊 Confusion matrix (built from the hard predictions)
cm = confusion_matrix(labels, preds)
plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Non-humor", "Humor"], yticklabels=["Non-humor", "Humor"])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

# 📈 ROC (computed from the class-1 probabilities)
fpr, tpr, _ = roc_curve(labels, probs)
plt.figure()
plt.plot(fpr, tpr, label=f'ROC curve (AUC = {auc(fpr, tpr):.2f})')
plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC Curve")
plt.legend()
plt.show()

# 📉 PR curve (computed from the class-1 probabilities)
precision, recall, _ = precision_recall_curve(labels, probs)
plt.figure()
plt.plot(recall, precision, label="Precision-Recall curve")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("PR Curve")
plt.legend()
plt.show()

# 🧾 Classification report
print("\n📋 Classification Report:")
print(classification_report(labels, preds))

# 🔍 Typical errors
# Gather (position, true label, predicted label) for every misclassified example.
errors = []
for position, (truth, guess) in enumerate(zip(labels, preds)):
    if truth != guess:
        errors.append((position, truth, guess))

# Show the text of the first five mistakes; positions index into `test_df`.
for idx, true, pred in errors[:5]:
    text = test_df['text'].iloc[idx]
    print(f"\n❌ Mal classé : {text}")
    print(f"✔️ Vrai label : {true} | 🚫 Prédit : {pred}")
