In [None]:
%pip install transformers torch datasets evaluate accelerate timm kagglehub pandas seaborn

In [None]:
import pandas as pd
import numpy as np
import torch
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset

# Read the processed CSV, wrap it in a Hugging Face Dataset and hold out 20% of the rows
# as the 'test' split (fixed seed so the split is reproducible).
df = pd.read_csv('data/processed/colbert_humor.csv')
dataset = Dataset.from_pandas(df).train_test_split(test_size=0.2, seed=42)
print(dataset)

# Load tokenizer and model
model_name = "openai-community/gpt2-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# GPT-2 style tokenizers may come without a pad_token, while the preprocessing in this
# notebook pads every example (padding='max_length'). Padding without a pad_token makes
# the tokenizer raise a ValueError, so one is guaranteed here.
if tokenizer.pad_token is None:
    # Reuse the existing eos_token as the pad token.
    tokenizer.pad_token = tokenizer.eos_token
# Classification head with two output classes.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# The model config does not necessarily pick up the tokenizer's pad id; copy it when unset.
if getattr(model.config, 'pad_token_id', None) is None:
    model.config.pad_token_id = tokenizer.pad_token_id
def preprocess_function(examples):
    """Tokenize a batch of examples for the classifier.

    Reads the 'text' field of `examples` and returns the tokenizer output, truncated and
    padded to a fixed length of 128 tokens.
    """
    texts = examples['text']
    return tokenizer(texts, max_length=128, padding='max_length', truncation=True)

# Tokenize both splits in batches.
encoded_dataset = dataset.map(preprocess_function, batched=True)

# Drop bookkeeping columns, but only the ones actually present (removing a missing column
# raises ValueError). 'text' is kept on purpose; set_format below hides it from the Trainer.
cols_to_try = ['__index_level_0__']
cols_present = [c for c in cols_to_try if c in encoded_dataset['train'].column_names]
if cols_present:
    encoded_dataset = encoded_dataset.remove_columns(cols_present)

# Trainer expects the target column to be called 'labels'; derive it from 'humor' or
# 'label' (in that order of preference) when it is missing.
if 'labels' not in encoded_dataset['train'].column_names:
    train_columns = encoded_dataset['train'].column_names
    label_candidate = next((name for name in ('humor', 'label') if name in train_columns), None)
    if label_candidate is None:
        raise ValueError(f"No label column found. Available columns: {encoded_dataset['train'].column_names}")

    def _add_labels(batch):
        """Copy the source label column into 'labels' as Python ints.

        With batched=True the column arrives as a list. Integer labels are what makes the
        model use CrossEntropyLoss rather than BCEWithLogits.
        """
        batch['labels'] = [int(l) for l in batch[label_candidate]]
        return batch

    encoded_dataset = encoded_dataset.map(_add_labels, batched=True)
    # The original label column is redundant once 'labels' exists.
    if label_candidate in encoded_dataset['train'].column_names:
        encoded_dataset = encoded_dataset.remove_columns([label_candidate])

# Best-effort: also record an integer dtype in the dataset features. A failure here is not
# fatal (labels were already converted to int above), but it is reported instead of being
# swallowed silently.
try:
    from datasets import Value
    encoded_dataset = encoded_dataset.cast_column('labels', Value('int64'))
except Exception as cast_err:
    print(f"Warning: could not cast 'labels' to int64 ({cast_err}); keeping the existing dtype.")

# Expose only the model inputs and the target, as torch tensors.
encoded_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Hyper-parameters for fine-tuning. Mixed precision is only requested when a CUDA device
# is visible: with fp16=True on a CPU-only machine TrainingArguments rejects the
# configuration instead of falling back to full precision.
training_args = TrainingArguments(
    output_dir='./finetuned_gpt2large',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,
    fp16=torch.cuda.is_available(),
)

def compute_metrics(p):
    """Compute accuracy, F1, precision and recall for a binary classifier.

    Args:
        p: Object exposing `predictions` (class logits, or a tuple whose first element is
            the logits) and `label_ids` (the true class ids).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    # Some models return extra outputs next to the logits; the logits come first.
    logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(logits, axis=1)
    # zero_division=0 keeps the value sklearn would report anyway when no positive is
    # predicted, without emitting the ill-defined-metric warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        p.label_ids, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(p.label_ids, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Bundle the model, the training configuration, the train/test splits of the encoded
# dataset and the metric callback into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['test'],
    compute_metrics=compute_metrics,
)


In [None]:
# Fine-tune, then evaluate on the held-out split. The metrics dict is captured and printed
# explicitly: a notebook only displays a cell's last expression, and evaluate() is not last.
trainer.train()
eval_metrics = trainer.evaluate()
print(eval_metrics)
# Save the model and tokenizer into the same directory
model.save_pretrained('./finetuned_gpt2large')
tokenizer.save_pretrained('./finetuned_gpt2large')

In [None]:
# Génération + filtrage (mode CPU pour débogage)
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
import os
import json

device = torch.device('cpu')  # Start on CPU to avoid CUDA errors; switch to 'cuda' once things are stable

# -------- generation (replace if you have a local LM) ----------
gen_name = 'openai-community/gpt2-large'  # replace with your French generative model if you have one
tok_gen = AutoTokenizer.from_pretrained(gen_name)
# Fall back to eos_token when the tokenizer defines no pad token.
if tok_gen.pad_token is None:
    tok_gen.pad_token = tok_gen.eos_token
model_gen = AutoModelForCausalLM.from_pretrained(gen_name).to(device)

def generate_jokes(prompt, n=8, max_len=80, temp=0.9, top_p=0.95):
    """Sample `n` continuations of `prompt` from the generative model.

    Args:
        prompt: Text the model continues.
        n: Number of sequences to sample.
        max_len: Passed to generate() as `max_length`, so it bounds prompt plus continuation.
        temp: Sampling temperature.
        top_p: Nucleus-sampling threshold.

    Returns:
        List of `n` stripped continuation strings, without the prompt.
    """
    inputs = {k: v.to(device) for k, v in tok_gen(prompt, return_tensors='pt').items()}
    prompt_len = inputs['input_ids'].shape[1]
    out = model_gen.generate(
        **inputs,
        do_sample=True,
        temperature=temp,
        top_p=top_p,
        max_length=max_len,
        num_return_sequences=n,
        pad_token_id=tok_gen.eos_token_id,
    )
    # Each returned sequence starts with the prompt tokens. Cut them off by token count
    # before decoding: slicing the decoded text by len(prompt) is only correct when
    # decoding reproduces the prompt character for character.
    return [tok_gen.decode(seq[prompt_len:], skip_special_tokens=True).strip() for seq in out]

# -------- charger vos classifieurs (ils n'ont que config+poids, on utilise fallback tokenizer) ----------
from transformers import AutoTokenizer

def choose_tokenizer_candidate_for_vocab(vocab_size, cfg_archs=None):
    """Suggest fallback tokenizer names for a classifier, guessed from its vocabulary size.

    Args:
        vocab_size: Vocabulary size taken from the model config, or None when unknown.
        cfg_archs: Accepted for the caller's convenience; the heuristic does not consult it.

    Returns:
        A new list of tokenizer identifiers, preferred candidate first.
    """
    multilingual_first = ['distilbert-base-multilingual-cased', 'bert-base-multilingual-cased']
    if vocab_size is None:
        return multilingual_first
    if vocab_size == 30522:
        # Size this heuristic maps to the English BERT-family tokenizers.
        return ['bert-base-uncased', 'distilbert-base-uncased', 'bert-base-cased']
    if vocab_size >= 100000:
        # Very large vocabularies: same multilingual pair, full BERT tried first.
        return multilingual_first[::-1]
    # Anything else: multilingual candidates, then English uncased as a last option.
    return multilingual_first + ['bert-base-uncased']


def load_model_and_tokenizer_classif(path):
    """Load a sequence-classification model from a local folder, plus a tokenizer for it.

    The tokenizer stored in the folder is preferred. When it cannot be loaded, candidates
    chosen from the model's vocabulary size are tried in order, and
    'bert-base-multilingual-cased' is used if none of them loads.

    Args:
        path: Folder holding the saved model (relative paths are made absolute).

    Returns:
        (model, tokenizer) with the model moved to `device` and set to eval mode, or
        (None, None) when the folder is missing or the model cannot be loaded.
    """
    path = os.path.abspath(path)
    if not os.path.exists(path):
        print(f"Le chemin {path} n'existe pas")
        return None, None

    # The model is loaded first: its config drives the tokenizer fallback below.
    try:
        classifier = AutoModelForSequenceClassification.from_pretrained(path)
    except Exception as e:
        print(f"Erreur chargement modèle depuis {path}: {e}")
        return None, None

    try:
        tok = AutoTokenizer.from_pretrained(path)
        print(f'Tokenizer local chargé depuis {path}')
    except Exception:
        # No loadable tokenizer next to the weights: pick one from the vocabulary size.
        try:
            with open(os.path.join(path, 'config.json'), 'r', encoding='utf-8') as f:
                cfg = json.load(f)
        except Exception:
            cfg = {}
        vocab_size = getattr(classifier.config, 'vocab_size', None)
        for cand in choose_tokenizer_candidate_for_vocab(vocab_size, cfg.get('architectures', None)):
            try:
                tok = AutoTokenizer.from_pretrained(cand)
            except Exception:
                continue
            print(f'Utilisation du tokenizer de fallback: {cand} pour model vocab_size={vocab_size}')
            break
        else:
            # Last resort: every candidate failed to load.
            tok = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
            print('Utilisation du tokenizer fallback final: bert-base-multilingual-cased')

    classifier.to(device)
    classifier.eval()
    return classifier, tok

# Load the two humor classifiers from their local folders. Each call yields (None, None)
# when the folder is missing or the model cannot be loaded.
m1, t1 = load_model_and_tokenizer_classif('./humor_detection_model01')
m2, t2 = load_model_and_tokenizer_classif('./humor_model_multilingual')

# -------- scoring manuel (sans pipeline) ----------
import torch.nn.functional as F

def safe_fix_input_ids_for_model(input_ids, model_vocab_size, unk_id=0):
    """Replace token ids the model's embedding table cannot index.

    Args:
        input_ids: Integer tensor of token ids.
        model_vocab_size: Vocabulary size of the model; None disables the check.
        unk_id: Id written in place of every id >= model_vocab_size.

    Returns:
        `input_ids` itself when nothing needs fixing, otherwise a new tensor with the
        out-of-range ids replaced. The input tensor is never modified.
    """
    if model_vocab_size is None:
        return input_ids
    out_of_range = input_ids.ge(model_vocab_size)
    if not out_of_range.any():
        return input_ids
    # masked_fill returns a copy, so the caller's tensor stays untouched.
    return input_ids.masked_fill(out_of_range, unk_id)


# Label prefixes that mark the *negative* class even though the label mentions humor
# (e.g. 'not_humor', 'non-humour', 'no humor').
_NEGATED_LABEL_PREFIXES = ('not', 'non', 'no_', 'no-', 'no ')


def _humor_probability(probs, id2label):
    """Pick the probability of the humorous class from one row of class probabilities."""
    n = len(probs)
    if id2label:
        labels = [id2label[i] for i in range(n)]
    else:
        labels = [f'LABEL_{i}' for i in range(n)]

    negated_idx = None
    for i, label in enumerate(labels):
        low = label.lower()
        if 'humor' in low or 'humour' in low:
            if low.startswith(_NEGATED_LABEL_PREFIXES):
                # 'not_humor' also contains 'humor' but names the opposite class.
                if negated_idx is None:
                    negated_idx = i
                continue
            return probs[i]
    if negated_idx is not None:
        # Only the negative class is named: humor is everything else.
        return 1.0 - probs[negated_idx]
    if 'LABEL_1' in labels:
        return probs[labels.index('LABEL_1')]
    # Unrecognised label names: assume index 1 is the positive class (same convention as
    # LABEL_1). The top probability is not used here, because it is high for confident
    # non-humorous predictions too and would defeat the ranking.
    return probs[1] if n > 1 else probs[0]


def score_with_model(texts, model, tokenizer, max_length=128):
    """Return one humor probability per text, computed without a pipeline.

    Args:
        texts: Iterable of strings to score.
        model: Sequence-classification model, or None.
        tokenizer: Tokenizer used to encode the texts, or None.
        max_length: Length every text is truncated/padded to.

    Returns:
        List of floats, one per text. All scores are 0.0 when `model` or `tokenizer` is
        None, and a text whose scoring raises gets 0.0 (the error is printed).
    """
    if model is None or tokenizer is None:
        return [0.0] * len(texts)

    model_vocab = getattr(model.config, 'vocab_size', None)
    unk_id = tokenizer.unk_token_id if tokenizer.unk_token_id is not None else tokenizer.pad_token_id or 0
    id2label = getattr(model.config, 'id2label', None)

    scores = []
    for t in texts:
        try:
            enc = tokenizer(t, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt', return_token_type_ids=False)
            # The tokenizer may be a fallback with a larger vocabulary than the model:
            # clamp out-of-range ids before the tensors reach the embedding layer.
            enc['input_ids'] = safe_fix_input_ids_for_model(enc['input_ids'], model_vocab, unk_id)
            enc = {k: v.to(device) for k, v in enc.items()}
            with torch.no_grad():
                logits = model(**enc).logits
            probs = F.softmax(logits, dim=-1)[0].tolist()
            h = _humor_probability(probs, id2label)
        except Exception as e:
            # Best effort: one failing text must not abort the whole batch.
            print('Erreur scoring:', e)
            h = 0.0
        scores.append(float(h))
    return scores

# -------- usage ----------
# Generate candidates from a French prompt, score them with both classifiers and print
# them from the highest to the lowest mean score.
prompt = "Génère une blague courte en français :\n"
candidates = generate_jokes(prompt, n=8)
scores1 = score_with_model(candidates, m1, t1)
scores2 = score_with_model(candidates, m2, t2)
# Unweighted mean of both scores; a classifier that failed to load contributes 0.0 for every text.
combined = [(a + b) / 2.0 for a, b in zip(scores1, scores2)]
# argsort is ascending, so it is reversed to rank the best candidate first.
order = np.argsort(combined)[::-1]
for i in order:
    print("SCORE", combined[i])
    print(candidates[i])
    print("-----")


In [None]:
# Sauvegarder un tokenizer de fallback dans les dossiers de classifieurs (rend le chargement déterministe)
from transformers import AutoTokenizer
import os

def ensure_tokenizer_saved(folder, tokenizer_name='distilbert-base-multilingual-cased'):
    """Save a fallback tokenizer into `folder` unless it already holds tokenizer files.

    Args:
        folder: Classifier folder to complete (relative paths are made absolute).
        tokenizer_name: Tokenizer to download and save when the folder has none.

    Returns:
        None. The function prints what it did and skips paths that are not an existing
        directory.
    """
    folder = os.path.abspath(folder)
    # isdir rather than exists: a path pointing at a regular file would otherwise reach
    # os.listdir below and raise.
    if not os.path.isdir(folder):
        print(f'Folder {folder} nexiste pas, skip')
        return
    # File names that indicate a tokenizer is already stored here. The BPE/SentencePiece
    # names are included so such a tokenizer is never overwritten by the fallback.
    known_files = ('vocab.txt', 'vocab.json', 'merges.txt', 'spiece.model', 'sentencepiece.bpe.model')
    files = os.listdir(folder)
    if any(f.startswith('tokenizer') or f in known_files for f in files):
        print(f'Tokenizer déjà présent dans {folder}, skip')
        return
    print(f'Téléchargement et sauvegarde du tokenizer {tokenizer_name} dans {folder}')
    tok = AutoTokenizer.from_pretrained(tokenizer_name)
    tok.save_pretrained(folder)

# Make sure both classifier folders contain tokenizer files.
# NOTE(review): the scoring cell earlier in this notebook already loads tokenizers from
# these folders, so on a fresh kernel this cell only benefits later runs. Consider running
# it before the scoring cell.
ensure_tokenizer_saved('./humor_detection_model01')
ensure_tokenizer_saved('./humor_model_multilingual')
print('Tokenizers saved (ou déjà présents).')

## Diagnostic GPU / CUDA
Si vous obtenez encore `AcceleratorError: CUDA error: device-side assert triggered`, exécutez la cellule de diagnostic suivante pour collecter l'état de CUDA et PyTorch.
Cela fournit des informations utiles (CUDA_VISIBLE_DEVICES, torch.cuda.is_available(), nombre de GPUs, versions) et les commandes recommandées à lancer depuis un terminal pour un debug plus profond.

In [None]:
# Cellule de diagnostic pour CUDA / PyTorch
import os, sys, subprocess, json
def run_cmd(cmd):
    """Run `cmd` through the shell and return its output as text.

    stderr is merged into stdout. A non-zero exit status is not treated as an error:
    whatever the command printed is returned in that case as well.
    """
    completed = subprocess.run(
        cmd,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
    return completed.stdout

# Show the CUDA-related environment variables as seen by this kernel (None when unset).
print('Environment variables relevant to CUDA:')
for v in ('CUDA_VISIBLE_DEVICES','CUDA_LAUNCH_BLOCKING','TORCH_USE_CUDA_DSA'):
    print(v, '=', os.environ.get(v))

# Report what PyTorch sees. The device queries are wrapped separately so that a machine
# without a usable GPU still prints the version and availability lines.
try:
    import torch
    print('torch version:', torch.__version__)
    print('cuda available:', torch.cuda.is_available())
    try:
        print('cuda device count:', torch.cuda.device_count())
        print('current device:', torch.cuda.current_device())
        print('device name:', torch.cuda.get_device_name(torch.cuda.current_device()))
    except Exception as e:
        print('cuda info error:', e)
except Exception as e:
    print('Could not import torch:', e)

# Recommended terminal commands to run if CUDA errors persist
print('Recommended terminal commands (run in a shell):')
print('1) Restart Jupyter / kernel to clear CUDA state')
print('2) Run with synchronous CUDA errors to get a proper backtrace:')
print('   CUDA_LAUNCH_BLOCKING=1 jupyter lab')
# The python -c example was printed without its command string; spell it out.
print('   or to test availability: CUDA_LAUNCH_BLOCKING=1 python -c "import torch; print(torch.cuda.is_available())"')
print('3) Optionally enable device-side assertions for debug:')
print('   export TORCH_USE_CUDA_DSA=1')
print('4) If the crash happens during Trainer creation, try running the fine-tuning cell with force_cpu=True and restart the kernel')

# Quick check: list the GPUs through nvidia-smi, or say so explicitly when that is not
# possible (the fallback used to echo an empty line).
out = run_cmd("which nvidia-smi >/dev/null 2>&1 && nvidia-smi -L || echo 'nvidia-smi not found or failed'")
print(out)

print('Done. If you want, paste the above output here and I will suggest next steps.')