In [None]:
%pip install transformers torch datasets evaluate accelerate timm kagglehub pandas seaborn

In [None]:
# Génération + filtrage (mode CPU pour débogage)
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
import os
import json

device = torch.device('cpu')  # start on CPU to avoid CUDA errors; switch to 'cuda' once the setup is stable

# -------- generation (swap in a local LM if you have one) ----------
gen_name = 'openai-community/gpt2-large'  # replace with a French generative model if you have one
tok_gen = AutoTokenizer.from_pretrained(gen_name)
# When the tokenizer defines no pad token, reuse its EOS token as the pad token.
if tok_gen.pad_token is None:
    tok_gen.pad_token = tok_gen.eos_token
# Load the causal LM and place it on the device selected above.
model_gen = AutoModelForCausalLM.from_pretrained(gen_name).to(device)

def generate_jokes(prompt, n=8, max_len=80, temp=0.9, top_p=0.95):
    """Sample ``n`` continuations of ``prompt`` with the module-level generator.

    Uses the file-level ``tok_gen`` / ``model_gen`` pair and ``device``.

    Args:
        prompt: Text the generator is asked to continue.
        n: Number of sequences to sample (``num_return_sequences``).
        max_len: Forwarded as ``max_length`` to ``generate``.
            NOTE(review): per the transformers docs this bounds prompt tokens
            plus generated tokens, so a long prompt leaves little room for
            the continuation -- confirm this is the intended budget.
        temp: Sampling temperature.
        top_p: Nucleus-sampling threshold.

    Returns:
        A list with one whitespace-stripped string per sampled sequence,
        containing only the generated continuation (the prompt is removed).
    """
    encoded = tok_gen(prompt, return_tensors='pt')
    model_inputs = {name: tensor.to(device) for name, tensor in encoded.items()}
    # Number of prompt tokens; a causal LM returns them at the start of every
    # sampled sequence.
    prompt_len = model_inputs['input_ids'].shape[1]

    out = model_gen.generate(
        **model_inputs,
        do_sample=True,
        temperature=temp,
        top_p=top_p,
        max_length=max_len,
        num_return_sequences=n,
        pad_token_id=tok_gen.eos_token_id,
    )

    # Strip the prompt by token count instead of slicing the decoded string
    # with len(prompt): decoding is not guaranteed to reproduce the prompt
    # character-for-character, so a character offset can cut into the joke or
    # leave prompt residue behind.
    return [
        tok_gen.decode(seq[prompt_len:], skip_special_tokens=True).strip()
        for seq in out
    ]

# -------- charger vos classifieurs (ils n'ont que config+poids, on utilise fallback tokenizer) ----------
from transformers import AutoTokenizer

def choose_tokenizer_candidate_for_vocab(vocab_size, cfg_archs=None):
    """Return an ordered list of fallback tokenizer names for a vocabulary size.

    Simple heuristics keyed on ``vocab_size`` only:

    * ``None``      -> multilingual DistilBERT first, then multilingual BERT
    * ``30522``     -> uncased BERT, uncased DistilBERT, cased BERT
    * ``>= 100000`` -> multilingual BERT first, then multilingual DistilBERT
    * anything else -> both multilingual checkpoints, then uncased BERT

    Args:
        vocab_size: Vocabulary size reported by the model config, or ``None``.
        cfg_archs: Accepted for the caller's convenience; not consulted.

    Returns:
        A new list of tokenizer identifiers, most preferred first.
    """
    multilingual_distil = 'distilbert-base-multilingual-cased'
    multilingual_bert = 'bert-base-multilingual-cased'
    uncased_bert = 'bert-base-uncased'

    if vocab_size is None:
        picks = [multilingual_distil, multilingual_bert]
    elif vocab_size == 30522:
        picks = [uncased_bert, 'distilbert-base-uncased', 'bert-base-cased']
    elif vocab_size >= 100000:
        picks = [multilingual_bert, multilingual_distil]
    else:
        # Generic fallback when no rule above applies.
        picks = [multilingual_distil, multilingual_bert, uncased_bert]
    return picks


def load_model_and_tokenizer_classif(path):
    """Load a sequence-classification model from ``path`` plus a tokenizer.

    The tokenizer stored next to the model is preferred. When it cannot be
    loaded, fallback tokenizers chosen from the model's ``vocab_size`` are
    tried in order, followed by ``bert-base-multilingual-cased`` as a last
    resort.

    Args:
        path: Directory containing the saved model (relative or absolute).

    Returns:
        ``(model, tokenizer)`` with the model moved to the file-level
        ``device`` and switched to eval mode, or ``(None, None)`` when the
        path does not exist, the model cannot be loaded, or no tokenizer at
        all can be loaded.
    """
    path = os.path.abspath(path)
    if not os.path.exists(path):
        print(f"Le chemin {path} n'existe pas")
        return None, None

    # The model is loaded first: its config drives the tokenizer fallback.
    try:
        model = AutoModelForSequenceClassification.from_pretrained(path)
    except Exception as e:
        print(f"Erreur chargement modèle depuis {path}: {e}")
        return None, None

    tok = None
    try:
        tok = AutoTokenizer.from_pretrained(path)
        print(f'Tokenizer local chargé depuis {path}')
    except Exception as e_tok:
        # Report why the local tokenizer was rejected instead of hiding it.
        print(f'Tokenizer local indisponible dans {path}: {e_tok}')
        # Read both values from the loaded config rather than re-parsing
        # config.json from disk.
        vocab_size = getattr(model.config, 'vocab_size', None)
        archs = getattr(model.config, 'architectures', None)
        for cand in choose_tokenizer_candidate_for_vocab(vocab_size, archs):
            try:
                tok = AutoTokenizer.from_pretrained(cand)
            except Exception:
                # Best effort: move on to the next candidate.
                tok = None
                continue
            print(f'Utilisation du tokenizer de fallback: {cand} pour model vocab_size={vocab_size}')
            break
        if tok is None:
            # Last resort. Guarded so that this function keeps reporting
            # failure through (None, None) instead of raising here only.
            try:
                tok = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
            except Exception as e_last:
                print(f'Aucun tokenizer disponible pour {path}: {e_last}')
                return None, None
            print('Utilisation du tokenizer fallback final: bert-base-multilingual-cased')

    model.to(device)
    model.eval()
    return model, tok

# Load the two humor classifiers from local directories. Each call returns
# (None, None) when the directory is missing or the model cannot be loaded.
m1, t1 = load_model_and_tokenizer_classif('./humor_detection_model01')
m2, t2 = load_model_and_tokenizer_classif('./humor_model_multilingual')

# -------- scoring manuel (sans pipeline) ----------
import torch.nn.functional as F

def safe_fix_input_ids_for_model(input_ids, model_vocab_size, unk_id=0):
    """Replace token ids the model cannot embed with a safe replacement id.

    Args:
        input_ids: ``torch.LongTensor`` of token ids.
        model_vocab_size: Size of the model vocabulary, or ``None`` when it is
            unknown (the tensor is then returned untouched).
        unk_id: Id written in place of every id ``>= model_vocab_size``. When
            this id is itself ``None`` or outside ``[0, model_vocab_size)``
            (e.g. it comes from a fallback tokenizer with a larger
            vocabulary), ``0`` is used so the result really is in range.

    Returns:
        ``input_ids`` itself when nothing needs fixing, otherwise a patched
        copy; the caller's tensor is never modified in place.
    """
    if model_vocab_size is None:
        return input_ids

    out_of_range = input_ids >= model_vocab_size
    if not out_of_range.any():
        return input_ids

    # The replacement must itself be a valid id, otherwise the "fixed" tensor
    # would still contain an id the model cannot embed.
    if unk_id is None or not 0 <= unk_id < model_vocab_size:
        unk_id = 0

    fixed = input_ids.clone()
    fixed[out_of_range] = unk_id
    return fixed


def _humor_probability(probs, id2label):
    """Pick, among class probabilities, the one that stands for "humorous".

    Selection order:
      1. the first label mentioning humor/humour that is not negated;
      2. if only a negated humor label exists (e.g. ``NOT_HUMOR``), one minus
         its probability;
      3. the ``LABEL_1`` entry when present;
      4. the largest probability (original last-resort behaviour).
    """
    if id2label:
        labels = [str(id2label[i]) for i in range(len(probs))]
    else:
        labels = [f'LABEL_{i}' for i in range(len(probs))]
    by_label = {label: float(p) for label, p in zip(labels, probs)}

    # Labels such as "NOT_HUMOR" / "non-humor" contain "humor" too; treating
    # them as the humor class would invert the score.
    negation_prefixes = ('not', 'non', 'no_', 'no-', 'no ')
    negated_prob = None
    for label, prob in by_label.items():
        lowered = label.lower()
        if 'humor' not in lowered and 'humour' not in lowered:
            continue
        if not lowered.startswith(negation_prefixes):
            return prob
        if negated_prob is None:
            negated_prob = prob

    if negated_prob is not None:
        return 1.0 - negated_prob
    return by_label.get('LABEL_1', max(by_label.values()))


def score_with_model(texts, model, tokenizer, max_length=128):
    """Score each text with a sequence classifier, without using a pipeline.

    Args:
        texts: Sequence of strings to score.
        model: Classifier exposing ``config`` and returning ``.logits``, or
            ``None``.
        tokenizer: Tokenizer matching (or standing in for) the model, or
            ``None``.
        max_length: Length every text is truncated / padded to.

    Returns:
        A list with one float per text: the probability selected as the humor
        score. Every score is ``0.0`` when ``model`` or ``tokenizer`` is
        ``None``, and a single text scores ``0.0`` when scoring it raises.
    """
    if model is None or tokenizer is None:
        return [0.0] * len(texts)

    # Loop-invariant lookups.
    model_vocab = getattr(model.config, 'vocab_size', None)
    id2label = getattr(model.config, 'id2label', None)
    unk_id = tokenizer.unk_token_id
    if unk_id is None:
        unk_id = tokenizer.pad_token_id or 0

    scores = []
    for text in texts:
        try:
            enc = tokenizer(
                text,
                truncation=True,
                padding='max_length',
                max_length=max_length,
                return_tensors='pt',
                return_token_type_ids=False,
            )
            # Fix out-of-range token ids before the tensors go to the device.
            enc['input_ids'] = safe_fix_input_ids_for_model(enc['input_ids'], model_vocab, unk_id)
            enc = {k: v.to(device) for k, v in enc.items()}
            with torch.no_grad():
                logits = model(**enc).logits
                probs = F.softmax(logits, dim=-1).cpu().numpy()[0]
            h = _humor_probability(probs, id2label)
        except Exception as e:
            # Deliberate best effort: one failing text must not abort the batch.
            print('Erreur scoring:', e)
            h = 0.0
        scores.append(float(h))
    return scores

# -------- usage ----------
# French prompt ("Generate a short joke in French:") handed to the generator.
prompt = "Génère une blague courte en français :\n"
candidates = generate_jokes(prompt, n=8)
# Score every candidate with both classifiers, then average the two scores.
scores1 = score_with_model(candidates, m1, t1)
scores2 = score_with_model(candidates, m2, t2)
combined = [(a + b) / 2.0 for a, b in zip(scores1, scores2)]
# Candidate indices ordered from highest to lowest combined score.
order = np.argsort(combined)[::-1]
for i in order:
    print("SCORE", combined[i])
    print(candidates[i])
    print("-----")
