In [None]:
%pip install transformers torch datasets evaluate accelerate timm kagglehub pandas seaborn

In [None]:
# Fine-tuning minimal de GPT2-Large sur le dataset de blagues (colbert_humor.csv)
# Pré-requis: avoir installé `transformers`, `datasets` et (optionnel) `accelerate`.
# IMPORTANT: GPT2-large est gourmand en GPU/mémoire. Préférer lancer sur GPU/accelerate.

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import os

# Paths and hyperparameters (tune them to fit your GPU memory)
csv_path = 'data/processed/colbert_humor.csv'
model_name = 'openai-community/gpt2-large'  # or 'gpt2-large', depending on availability
output_dir = './fine_tuned_gpt2_large'
block_size = 128  # number of tokens per training block (read by group_texts)
num_epochs = 3
per_device_train_batch_size = 2

# Load the dataset from the CSV file and keep its 'train' split.
# A 'text' or 'text_clean' column is assumed to be present.
raw = load_dataset('csv', data_files=csv_path)['train']
print('Taille dataset:', len(raw))
# Normalise the text column
def pick_text(x):
    """Return the first usable text found in a dataset row.

    Columns are tried in priority order ('text_clean', then 'text'). A column
    is skipped when it is absent, None, or contains only whitespace, so an
    empty 'text_clean' does not hide a populated 'text'. Non-string values are
    converted with str() so the result always supports .strip().

    Args:
        x: Mapping-like row (column name -> value).

    Returns:
        The selected text as a str, or '' when no column holds usable text.
    """
    for c in ('text_clean', 'text'):
        if c not in x:
            continue
        value = x[c]
        if value is None:
            continue
        text = value if isinstance(value, str) else str(value)
        # Whitespace-only content counts as missing: fall through to the next column.
        if text.strip():
            return text
    return ''
# Store the chosen text in a single 'text' column, then drop rows whose text is missing or blank.
raw = raw.map(lambda x: {'text': pick_text(x)})
raw = raw.filter(lambda x: x['text'] is not None and x['text'].strip() != '')

# Optional CPU-only switch: uncomment the next line to hide every CUDA device from this process
# (leave it commented to train on GPU).
# os.environ['CUDA_VISIBLE_DEVICES'] = ''  # force CPU-only (keeps trainer safe for debugging)
print('Chargement du tokenizer et modèle (cela peut télécharger des centaines de MB)')
tokenizer = AutoTokenizer.from_pretrained(model_name)
# GPT-2 has no pad token by default, so the EOS token is reused for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)

# Tokenization (line-by-line)
def tokenize_function(examples):
    """Tokenize a batch of rows with the module-level ``tokenizer``.

    Args:
        examples: Batch mapping whose 'text' entry holds the raw strings.

    Returns:
        The tokenizer output for those strings, requested with the
        special-tokens mask included.
    """
    texts = examples['text']
    encoded = tokenizer(texts, return_special_tokens_mask=True)
    return encoded

# Tokenize in batches; the original columns are removed so only the tokenizer outputs remain.
tokenized = raw.map(tokenize_function, batched=True, remove_columns=raw.column_names)

# Grouper en blocks pour la LM (concat puis split)
def group_texts(examples):
    """Pack a batch of token-id lists into fixed-length training blocks.

    Every list in examples['input_ids'] is joined end to end (no separator
    token is inserted between consecutive texts), the tail that does not fill
    a whole block of ``block_size`` tokens is discarded, and the remainder is
    cut into consecutive blocks. 'labels' holds an independent copy of each
    block.

    Args:
        examples: Batch mapping; examples['input_ids'] is a list of id lists.

    Returns:
        Dict with 'input_ids' and 'labels', each a list of blocks. Both are
        empty when the batch holds fewer than ``block_size`` tokens.
    """
    flat = [token for ids in examples['input_ids'] for token in ids]
    # Largest multiple of block_size that fits in the flattened batch.
    usable = len(flat) - len(flat) % block_size
    blocks = []
    labels = []
    for start in range(0, usable, block_size):
        chunk = flat[start:start + block_size]
        blocks.append(chunk)
        labels.append(list(chunk))
    return {'input_ids': blocks, 'labels': labels}

# IMPORTANT: remove the unused columns to avoid length mismatches when the result is written to Arrow.
# Lower batch_size if the dataset is small or memory is tight. Each batch of rows is packed on its
# own by group_texts, so the tokens left over at the end of every batch are dropped.
lm_datasets = tokenized.map(group_texts, batched=True, batch_size=200, remove_columns=tokenized.column_names)

# Data collator for causal language modelling (mlm=False turns off masked-LM masking)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments, kept as a dict so the CPU fallback below can reuse them unchanged.
training_kwargs = dict(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    save_strategy='epoch',
    logging_steps=50,
    fp16=False,
)
training_args = TrainingArguments(**training_kwargs)

# Build the Trainer; if that fails (typically because of a CUDA error), retry on CPU.
# Setting CUDA_VISIBLE_DEVICES at this point would not help: the variable is only
# read when CUDA is first initialised, which has already happened once a CUDA error
# is raised. The fallback therefore asks for CPU explicitly through `use_cpu=True`.
trainer = None
try:
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=lm_datasets,
        data_collator=data_collator,
    )
except Exception as e:
    print('Erreur lors de la création du Trainer (probablement liée à CUDA).')
    print('Exception:', e)
    print('Réessai sur CPU: recréation du Trainer avec use_cpu=True.')
    # NOTE: after a CUDA device-side assert the CUDA context may remain unusable for
    # the rest of the process. If this retry fails too, restart the kernel and set
    # CUDA_VISIBLE_DEVICES='' before torch is imported.
    training_args = TrainingArguments(**training_kwargs, use_cpu=True)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=lm_datasets,
        data_collator=data_collator,
    )

print("Démarrage de l'entraînement. Sur CPU ce sera très lent — préférez GPU/accelerate.")
# Training is started manually: uncomment the line below to run it from this cell.
# trainer.train()

print("Cellule de fine-tuning prête. Décommentez trainer.train() pour lancer l'entraînement.")

In [None]:
# Génération + filtrage (mode CPU pour débogage)
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
import os
import json

device = torch.device('cpu')  # start on CPU to avoid CUDA errors; change to 'cuda' once the GPU setup is stable

# -------- generation (swap in a local LM if you have one) ----------
# NOTE(review): this loads the base checkpoint from the hub, not the weights written to
# './fine_tuned_gpt2_large' by the fine-tuning cell — confirm this is intended.
gen_name = 'openai-community/gpt2-large'  # replace with a French generative model if you have one
tok_gen = AutoTokenizer.from_pretrained(gen_name)
# Reuse EOS as the pad token when the tokenizer defines none.
if tok_gen.pad_token is None:
    tok_gen.pad_token = tok_gen.eos_token
model_gen = AutoModelForCausalLM.from_pretrained(gen_name).to(device)

def generate_jokes(prompt, n=8, max_len=80, temp=0.9, top_p=0.95):
    """Sample ``n`` continuations of ``prompt`` from the generation model.

    Relies on the module-level ``tok_gen``, ``model_gen`` and ``device``.

    Args:
        prompt: Text the model has to continue.
        n: Number of sequences to sample.
        max_len: Maximum total length in tokens, prompt included (forwarded
            as ``max_length``).
        temp: Sampling temperature.
        top_p: Nucleus-sampling probability mass.

    Returns:
        List of ``n`` stripped strings holding only the generated continuation.
    """
    encoded = tok_gen(prompt, return_tensors='pt')
    model_inputs = {k: v.to(device) for k, v in encoded.items()}
    prompt_len = model_inputs['input_ids'].shape[1]
    out = model_gen.generate(
        **model_inputs,
        do_sample=True,
        temperature=temp,
        top_p=top_p,
        max_length=max_len,
        num_return_sequences=n,
        pad_token_id=tok_gen.eos_token_id,
    )
    # Each returned sequence starts with the prompt tokens. Remove them by token
    # count: slicing the decoded string by len(prompt) cuts at the wrong place
    # whenever the decoded prompt is not character-for-character the raw prompt.
    return [tok_gen.decode(seq[prompt_len:], skip_special_tokens=True).strip() for seq in out]

# -------- charger vos classifieurs (ils n'ont que config+poids, on utilise fallback tokenizer) ----------
from transformers import AutoTokenizer

def choose_tokenizer_candidate_for_vocab(vocab_size, cfg_archs=None):
    """Suggest fallback tokenizer names for a model, based on its vocabulary size.

    Args:
        vocab_size: Vocabulary size of the model, or None when unknown.
        cfg_archs: Accepted for interface compatibility; not used by the heuristic.

    Returns:
        List of tokenizer identifiers to try, in order of preference.
    """
    distil_multilingual = 'distilbert-base-multilingual-cased'
    bert_multilingual = 'bert-base-multilingual-cased'

    if vocab_size is None:
        # Unknown size: try the multilingual tokenizers.
        picks = [distil_multilingual, bert_multilingual]
    elif vocab_size == 30522:
        picks = ['bert-base-uncased', 'distilbert-base-uncased', 'bert-base-cased']
    elif vocab_size >= 100000:
        # Very large vocabularies are treated as multilingual.
        picks = [bert_multilingual, distil_multilingual]
    else:
        # Generic fallback list.
        picks = [distil_multilingual, bert_multilingual, 'bert-base-uncased']
    return picks


def load_model_and_tokenizer_classif(path):
    """Load a sequence-classification model and a tokenizer to use with it.

    The tokenizer stored in ``path`` is preferred. When it cannot be loaded,
    candidates from ``choose_tokenizer_candidate_for_vocab`` are tried in
    order, and 'bert-base-multilingual-cased' is the last resort. The model is
    moved to the module-level ``device`` and switched to eval mode.

    Args:
        path: Folder containing the classifier.

    Returns:
        ``(model, tokenizer)``, or ``(None, None)`` when the folder does not
        exist or the model cannot be loaded.
    """
    path = os.path.abspath(path)
    if not os.path.exists(path):
        print(f"Le chemin {path} n'existe pas")
        return None, None

    # The model comes first: its config drives the tokenizer fallback.
    try:
        model = AutoModelForSequenceClassification.from_pretrained(path)
    except Exception as e:
        print(f"Erreur chargement modèle depuis {path}: {e}")
        return None, None

    # Preferred source: a tokenizer saved alongside the model.
    try:
        tok = AutoTokenizer.from_pretrained(path)
    except Exception:
        tok = None
    else:
        print(f'Tokenizer local chargé depuis {path}')

    if tok is None:
        # No local tokenizer: pick candidates from the model's vocabulary size.
        try:
            with open(os.path.join(path, 'config.json'), 'r', encoding='utf-8') as f:
                cfg = json.load(f)
        except Exception:
            cfg = {}
        vocab_size = getattr(model.config, 'vocab_size', None)
        candidates = choose_tokenizer_candidate_for_vocab(vocab_size, cfg.get('architectures', None))
        for name in candidates:
            try:
                tok = AutoTokenizer.from_pretrained(name)
            except Exception:
                tok = None
                continue
            print(f'Utilisation du tokenizer de fallback: {name} pour model vocab_size={vocab_size}')
            break
        if tok is None:
            # Last resort.
            tok = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
            print('Utilisation du tokenizer fallback final: bert-base-multilingual-cased')

    model.to(device)
    model.eval()
    return model, tok

# Load both humour classifiers. Each call yields (None, None) when the folder is missing
# or the model cannot be loaded.
m1, t1 = load_model_and_tokenizer_classif('./humor_detection_model01')
m2, t2 = load_model_and_tokenizer_classif('./humor_model_multilingual')

# -------- scoring manuel (sans pipeline) ----------
import torch.nn.functional as F

def safe_fix_input_ids_for_model(input_ids, model_vocab_size, unk_id=0):
    """Replace token ids the model's vocabulary cannot represent.

    Args:
        input_ids: Tensor of token ids.
        model_vocab_size: Vocabulary size of the model, or None to skip the check.
        unk_id: Id written in place of every id >= ``model_vocab_size``.

    Returns:
        ``input_ids`` itself when nothing has to change, otherwise a new
        tensor with the out-of-range ids replaced (the input is not modified).
    """
    if model_vocab_size is None:
        return input_ids
    out_of_range = input_ids >= model_vocab_size
    if not out_of_range.any():
        return input_ids
    # masked_fill is out-of-place, so the caller's tensor stays untouched.
    return input_ids.masked_fill(out_of_range, unk_id)


def _is_negated_label(name):
    """Tell whether a lower-cased label denotes the absence of humour (e.g. 'not_humor')."""
    for prefix in ('not', 'non', 'no', 'sans', 'pas'):
        if name.startswith(prefix):
            rest = name[len(prefix):]
            # Accept 'not_humor' / 'non-humour' (separator) and 'nothumor' (glued).
            if not rest[:1].isalpha() or rest.startswith('humo'):
                return True
    return False


def _humor_probability(probs_dict):
    """Pick the probability of the humorous class from a label -> probability dict.

    Order of preference: a label mentioning humour that is not negated, the
    complement of a negated humour label, 'LABEL_1', and finally the highest
    probability.
    """
    negated_p = None
    for label, p in probs_dict.items():
        name = str(label).lower()
        if 'humor' not in name and 'humour' not in name:
            continue
        if _is_negated_label(name):
            # 'NOT_HUMOR' also contains 'humor'; its probability must not be
            # taken as the humour score.
            if negated_p is None:
                negated_p = p
            continue
        return p
    if negated_p is not None:
        return 1.0 - negated_p
    return probs_dict.get('LABEL_1', max(probs_dict.values()))


def score_with_model(texts, model, tokenizer, max_length=128):
    """Score each text with a humour classifier, without using a pipeline.

    Relies on the module-level ``device`` and ``safe_fix_input_ids_for_model``.

    Args:
        texts: List of strings to score.
        model: Sequence-classification model, or None.
        tokenizer: Tokenizer to use with ``model``, or None.
        max_length: Length the inputs are truncated and padded to.

    Returns:
        One float per text: the probability assigned to the humorous class.
        Every score is 0.0 when ``model`` or ``tokenizer`` is None, and a text
        whose scoring raises gets 0.0 (the error is printed).
    """
    if model is None or tokenizer is None:
        return [0.0] * len(texts)
    model_vocab = getattr(model.config, 'vocab_size', None)
    unk_id = tokenizer.unk_token_id if tokenizer.unk_token_id is not None else tokenizer.pad_token_id or 0
    id2label = getattr(model.config, 'id2label', None)
    scores = []
    for t in texts:
        try:
            enc = tokenizer(t, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt', return_token_type_ids=False)
            # A fallback tokenizer may emit ids the model has no embedding for;
            # replace them before the forward pass.
            enc['input_ids'] = safe_fix_input_ids_for_model(enc['input_ids'], model_vocab, unk_id)
            enc = {k: v.to(device) for k, v in enc.items()}
            with torch.no_grad():
                logits = model(**enc).logits
            probs = F.softmax(logits, dim=-1)[0].tolist()
            if id2label:
                labels = [id2label.get(i, f'LABEL_{i}') for i in range(len(probs))]
            else:
                labels = [f'LABEL_{i}' for i in range(len(probs))]
            probs_dict = {labels[i]: float(probs[i]) for i in range(len(probs))}
            h = _humor_probability(probs_dict)
        except Exception as e:
            # Best effort: one failing text must not abort the whole batch.
            print('Erreur scoring:', e)
            h = 0.0
        scores.append(float(h))
    return scores

# -------- usage ----------
# Sample candidate jokes, score them with both classifiers and print them best-first.
prompt = "Génère une blague courte en français :\n"
candidates = generate_jokes(prompt, n=8)
scores1 = score_with_model(candidates, m1, t1)
scores2 = score_with_model(candidates, m2, t2)
# Plain average of the two scores; a classifier that failed to load contributes 0.0 for every text.
combined = [(a + b) / 2.0 for a, b in zip(scores1, scores2)]
# argsort is ascending, so reverse it to list the highest score first.
order = np.argsort(combined)[::-1]
for i in order:
    print("SCORE", combined[i])
    print(candidates[i])
    print("-----")


In [None]:
# Sauvegarder un tokenizer de fallback dans les dossiers de classifieurs (rend le chargement déterministe)
from transformers import AutoTokenizer
import os

def ensure_tokenizer_saved(folder, tokenizer_name='distilbert-base-multilingual-cased'):
    """Save a fallback tokenizer into a classifier folder that has none.

    Nothing is written when the folder does not exist or already contains
    tokenizer files. The tokenizer is also not saved when the folder's
    config.json declares a ``vocab_size`` smaller than the tokenizer's
    vocabulary: such a tokenizer can emit ids the model has no embedding for.

    Args:
        folder: Classifier folder to complete.
        tokenizer_name: Identifier of the tokenizer to download and save.

    Returns:
        None.
    """
    import json  # local import: this cell only imports os and AutoTokenizer

    folder = os.path.abspath(folder)
    if not os.path.exists(folder):
        print(f'Folder {folder} nexiste pas, skip')
        return
    # File names that indicate a tokenizer is already stored in the folder.
    known_files = ('vocab.txt', 'vocab.json', 'spiece.model', 'sentencepiece.bpe.model')
    files = os.listdir(folder)
    if any(f.startswith('tokenizer') or f in known_files for f in files):
        print(f'Tokenizer déjà présent dans {folder}, skip')
        return

    # Read the model's vocabulary size when available; a missing or unreadable
    # config simply disables the compatibility check.
    model_vocab = None
    try:
        with open(os.path.join(folder, 'config.json'), 'r', encoding='utf-8') as fh:
            cfg = json.load(fh)
        if isinstance(cfg, dict):
            model_vocab = cfg.get('vocab_size')
    except (OSError, ValueError):
        model_vocab = None

    print(f'Téléchargement et sauvegarde du tokenizer {tokenizer_name} dans {folder}')
    tok = AutoTokenizer.from_pretrained(tokenizer_name)
    if isinstance(model_vocab, int) and len(tok) > model_vocab:
        print(f'Tokenizer {tokenizer_name} (vocab={len(tok)}) incompatible avec le modèle '
              f'(vocab_size={model_vocab}), non sauvegardé')
        return
    tok.save_pretrained(folder)

# Make sure both classifier folders hold a tokenizer (the default fallback is used for each).
ensure_tokenizer_saved('./humor_detection_model01')
ensure_tokenizer_saved('./humor_model_multilingual')
print('Tokenizers saved (ou déjà présents).')