In [1]:
from pathlib import Path
from datasets import Dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch, os, tqdm

CACHE = Path("data/stackexchange/translated_dataset_fr")
if CACHE.exists():
    dataset_tr = load_from_disk(CACHE)
    print("✓ dataset FR rechargé :", CACHE)
else:
    print("→ traduction en cours… (~10 min sur RTX 3060)")
    tok_mt = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")
    mod_mt = (AutoModelForSeq2SeqLM
              .from_pretrained("Helsinki-NLP/opus-mt-en-fr")
              .to("cuda", dtype=torch.float16))

    def translate(texts, max_len=128):
        with torch.no_grad():
            enc = tok_mt(texts, return_tensors="pt",
                         padding=True, truncation=True,
                         max_length=max_len).to("cuda")
            out = mod_mt.generate(**enc, max_length=max_len,
                                  num_beams=4, early_stopping=True)
        return tok_mt.batch_decode(out, skip_special_tokens=True)

    def pipeline(batch):
        batch["q_fr"] = translate(batch["q_en"])
        batch["a_fr"] = translate(batch["a_en"])
        return batch

    raw_ds = Dataset.from_pandas(posts_df)
    dataset_tr = raw_ds.map(pipeline, batched=True, batch_size=64,
                            remove_columns=["q_en", "a_en"])
    dataset_tr.save_to_disk(CACHE)
    print("✓ dataset FR sauvegardé :", CACHE)


✓ dataset FR rechargé : data/stackexchange/translated_dataset_fr


In [2]:
from datasets import load_from_disk
ds = load_from_disk("data/stackexchange/translated_dataset_fr")

tmp = ds.train_test_split(test_size=0.1, seed=42)
test_ds = tmp["test"]
tmp2 = tmp["train"].train_test_split(test_size=0.1, seed=42)
train_ds, val_ds = tmp2["train"], tmp2["test"]

print(len(train_ds), "train |", len(val_ds), "val |", len(test_ds), "test")


3855 train | 429 val | 477 test


In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import get_peft_model, LoraConfig

# 1) Tokenizer
tok = AutoTokenizer.from_pretrained("google/mt5-small")

# 2) Charger en 8-bit
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    "google/mt5-small",
    load_in_8bit=True,
    device_map="auto"
)

# 3) Configurer LoRA
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)

# 4) Enrober le modèle
model = get_peft_model(base_model, lora_config)



You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [4]:
from transformers import MT5Tokenizer

tok = MT5Tokenizer.from_pretrained("google/mt5-small", model_max_length=512)

MAX_IN, MAX_OUT = 128, 64
def preprocess(batch):
    src = ["question: " + q for q in batch["q_fr"]]
    tgt = ["answer: "   + a for a in batch["a_fr"]]
    model_in  = tok(src, padding="max_length", truncation=True,
                    max_length=MAX_IN)
    model_out = tok(tgt, padding="max_length", truncation=True,
                    max_length=MAX_OUT)
    model_in["labels"] = model_out["input_ids"]
    return model_in

train_ds = train_ds.map(preprocess, batched=True, remove_columns=["q_fr","a_fr"])
val_ds   = val_ds.map(preprocess,   batched=True, remove_columns=["q_fr","a_fr"])
test_ds  = test_ds.map(preprocess,  batched=True, remove_columns=["q_fr","a_fr"])


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'MT5Tokenizer'.
You are using the default legacy behaviour of the <class 'transformers.models.mt5.tokenization_mt5.MT5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/3855 [00:00<?, ? examples/s]

Map:   0%|          | 0/429 [00:00<?, ? examples/s]

Map:   0%|          | 0/477 [00:00<?, ? examples/s]

In [6]:
# ─── Cellule 4.5 – Calcul du nombre de pas par époque ───
# batch_size = 1  et gradient_accumulation_steps = 8
steps_per_epoch = len(train_ds) // (1 * 8)
print("≈", steps_per_epoch, "pas par époque")


≈ 481 pas par époque


In [7]:
from transformers import MT5ForConditionalGeneration, DataCollatorForSeq2Seq, Seq2SeqTrainer, TrainingArguments
import torch

# 1) Charger & configurer le modèle
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")
model.config.use_cache = False
model.gradient_checkpointing_enable()

# 2) Vider la mémoire CUDA
torch.cuda.empty_cache()

# 3) Collator
collator = DataCollatorForSeq2Seq(tokenizer=tok, model=model, label_pad_token_id=-100)

# 4) Arguments d’entraînement (sans predict_with_generate ni generation_max_length)
args = TrainingArguments(
    output_dir="mt5_fitness_ckpt",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    fp16=True,
    learning_rate=5e-5,
    num_train_epochs=5,
    eval_steps=steps_per_epoch,
    save_steps=steps_per_epoch,
    save_total_limit=2,
    logging_steps=50,
    report_to=[],
)

# 5) Instancier le Trainer (sans passer predict_with_generate ici)
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=collator,
    tokenizer=tok,
    compute_metrics=perplexity,   # votre fn qui fera generate→perplexity
)

# 6) Lancer l’entraînement
trainer.train()

# 7) Pour évaluer avec génération :
#    soit en une passe brute :
metrics = trainer.evaluate()
#    soit en generation explicite :
preds = trainer.predict(test_ds, max_length=MAX_OUT, num_beams=4)




NameError: name 'perplexity' is not defined

In [None]:
trainer.train()


In [None]:
from safetensors.torch import save_file
save_dir = Path("mt5_fitness_final")
save_dir.mkdir(exist_ok=True)

# 7-1  poids LoRA seulement (2 × ~30 Mo)
model.save_pretrained(save_dir, safe_serialization=True)
tok.save_pretrained(save_dir)
print("✓ modèle LoRA sauvegardé :", save_dir)


In [None]:
import torch
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
from peft import PeftModel

ckpt = "mt5_fitness_final"
tok = MT5Tokenizer.from_pretrained(ckpt)
base = MT5ForConditionalGeneration.from_pretrained("google/mt5-small",
                                                   torch_dtype=torch.float16).to("cuda")
model = PeftModel.from_pretrained(base, ckpt).to("cuda").eval()

def ask(q, max_new=64):
    prompt = "question: " + q
    enc = tok(prompt, return_tensors="pt").to("cuda")
    with torch.no_grad():
        gen = model.generate(**enc, max_length=max_new,
                             num_beams=4,
                             temperature=0.7,
                             decoder_start_token_id=tok.pad_token_id)
    return tok.decode(gen[0], skip_special_tokens=True)

print(ask("Comment améliorer mon endurance pour la course à pied ?"))
print("----")
print(ask("Quels étirements faire après une séance de squat ?"))
