In [None]:

import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from datasets import Dataset as HFDataset
from tqdm import tqdm

# 📂 Load the data
# Read the raw CSV and derive an integer `label` column from the `humor` column.
df = pd.read_csv('dataset.csv')
df['label'] = df['humor'].astype(int)

# Work on a reproducible 5% sample of the full dataset.
df_sampled = df.sample(frac=0.05, random_state=42).reset_index(drop=True)

# Shuffle the sample with a fixed seed and keep the last 10% as the test split
# (the first 80% / next 10% are the train / validation parts and are unused here).
# Positional slicing is used instead of `np.split`, which on a DataFrame goes
# through the deprecated `DataFrame.swapaxes`; the selected rows are the same.
df_shuffled = df_sampled.sample(frac=1, random_state=42)
test_start = int(.9 * len(df_sampled))
test_df = df_shuffled.iloc[test_start:].rename(columns={'label': 'labels'})

# 🧼 Tokenization
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')


def tokenize(example):
    """Tokenize the "text" field, padding and truncating each sequence to 50 tokens."""
    return tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=50,
    )


# Build the HF dataset from the two columns we need, tokenize it in batches,
# then expose only the model inputs and labels as torch tensors.
test_ds = HFDataset.from_pandas(test_df[['text', 'labels']])
test_ds = test_ds.map(tokenize, batched=True)
test_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# ⚙️ Load the model
# NOTE(review): "distilbert-base-uncased" is the pretrained base checkpoint, so the
# 2-class classification head created here is freshly (randomly) initialised and
# the predictions below are not those of a trained humor classifier. Point
# MODEL_CHECKPOINT at the fine-tuned checkpoint to evaluate the real model.
MODEL_CHECKPOINT = "distilbert-base-uncased"
model = DistilBertForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=2)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # evaluation mode (e.g. dropout disabled)

# 🔍 Quick evaluation pass to collect predictions for the error listing
test_loader = DataLoader(test_ds, batch_size=32)
preds, labels = [], []

# No gradients are needed for inference, so the whole loop runs under no_grad.
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Évaluation"):
        # Move every tensor of the batch to the model's device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        logits = model(**batch).logits
        # Predicted class = index of the highest logit.
        preds.extend(logits.argmax(dim=-1).cpu().numpy())
        labels.extend(batch['labels'].cpu().numpy())

# 🔎 Show a few typical misclassifications
# Each entry is (position in the test set, true label, predicted label).
errors = []
for position, (expected, predicted) in enumerate(zip(labels, preds)):
    if expected != predicted:
        errors.append((position, expected, predicted))

# Print the first five errors; positions are used to look up the text in test_df.
for idx, true, pred in errors[:5]:
    text = test_df['text'].iloc[idx]
    print(f"\n❌ Mal classé : {text}")
    print(f"✔️ Vrai label : {true} | 🚫 Prédit : {pred}")


In [None]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
import torch
import numpy as np

# Load the pretrained GPT-2 tokenizer and language model.
model_name = "gpt2"
tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
# .eval() returns the module itself, so it can be chained onto loading.
model = GPT2LMHeadModel.from_pretrained(model_name).eval()

def get_surprisal(text, model=model, tokenizer=tokenizer):
    """Return the language-model loss of ``text`` as a surprisal score.

    The text is tokenized and fed to ``model`` with its own token ids as
    labels; the scalar loss reported by the model is returned. Higher values
    mean the model found the text less predictable.

    Args:
        text: The string to score.
        model: Causal language model to score with. Defaults to the model
            bound in the notebook when this function was defined.
        tokenizer: Tokenizer matching ``model``. Defaults to the tokenizer
            bound in the notebook when this function was defined.

    Returns:
        The loss as a Python float, or ``nan`` when the text yields fewer
        than two tokens (there is then no next-token prediction to score).
    """
    # Truncate so over-long texts are scored on their leading tokens instead
    # of overflowing the model's context window.
    tokens = tokenizer(text, return_tensors='pt', truncation=True)

    # With zero or one token there is nothing to predict: an empty input
    # would crash the model and a single token would yield NaN anyway.
    if tokens["input_ids"].shape[-1] < 2:
        return float("nan")

    # Keep the inputs on the same device as the model.
    inputs = {name: tensor.to(model.device) for name, tensor in tokens.items()}
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
    return outputs.loss.item()

# Two example sentences to compare.
text1 = "Why don't skeletons fight each other? They don't have the guts."
text2 = "The cat sat on the mat."

# Score each sentence and print its surprisal with four decimals.
surprisal_1 = get_surprisal(text1)
print(f"Surprise 1: {surprisal_1:.4f}")
surprisal_2 = get_surprisal(text2)
print(f"Surprise 2: {surprisal_2:.4f}")


In [None]:
from transformers import MarianMTModel, MarianTokenizer

# English -> French translation with a pretrained MarianMT model.
# NOTE(review): this rebinds the notebook-level `model` and `tokenizer` names
# that earlier cells also assign.
model_name = "Helsinki-NLP/opus-mt-en-fr"
src_text = "Why did the chicken cross the road? To get to the other side!"

tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Encode the source sentence, generate the translation, then decode the
# first generated sequence without special tokens.
tokens = tokenizer(src_text, return_tensors="pt", padding=True)
translated = model.generate(**tokens)
fr_text = tokenizer.decode(translated[0], skip_special_tokens=True)
print("FR:", fr_text)
