Nouvelle version

In [1]:
import os, xml.etree.ElementTree as ET
from pathlib import Path
import pandas as pd

print("Working directory:", os.getcwd())

# Deux options : depuis la racine ou depuis notebook/
candidates = [
    Path("data/stackexchange/Posts.xml"),
    Path("../data/stackexchange/Posts.xml"),
]
xml_path = next((p for p in candidates if p.exists()), None)
if xml_path is None:
    raise FileNotFoundError("Posts.xml introuvable (chemins testés : {})"
                            .format([str(p) for p in candidates]))
print("Lecture :", xml_path)

tree = ET.parse(xml_path)

# — Dictionnaire réponses
answers_map = {
    int(r.attrib["Id"]): r.attrib["Body"]
    for r in tree.iter("row") if r.attrib.get("PostTypeId") == "2"
}

# — Questions + réponse acceptée
questions = []
for q in tree.iter("row"):
    if q.attrib.get("PostTypeId") == "1" and "AcceptedAnswerId" in q.attrib:
        aid = int(q.attrib["AcceptedAnswerId"])
        if aid in answers_map:
            questions.append({
                "title":  q.attrib.get("Title", ""),
                "body":   q.attrib.get("Body", ""),
                "tags":   q.attrib.get("Tags", ""),
                "answer": answers_map[aid]
            })

posts_df = pd.DataFrame(questions)
print(f"→ {len(posts_df)} paires Q/R chargées")
posts_df.head(2)


Working directory: /home/maxime/DataDevIA/chatbotcoach_project/notebook
Lecture : ../data/stackexchange/Posts.xml
→ 4761 paires Q/R chargées


Unnamed: 0,title,body,tags,answer
0,What's the difference between Whey Isolate and...,<p>What's the difference? I'm looking at shake...,<protein><nutrition>,"<p>The main difference is in the ""purity"", how..."
1,Breakfast before or after jogging?,"<p>Provided that I'm not hungry, should I eat ...",<nutrition><food><jogging>,"<p>I'd say this depends on your fitness, the i..."


In [2]:
import re, html
from bs4 import BeautifulSoup

def strip_html(text: str) -> str:
    raw = BeautifulSoup(text, "lxml").get_text(" ")
    return re.sub(r"\s+", " ", html.unescape(raw)).strip()

posts_df["q_en"] = posts_df["title"].apply(strip_html) + " " + posts_df["body"].apply(strip_html)
posts_df["a_en"] = posts_df["answer"].apply(strip_html)
posts_df = posts_df[["q_en", "a_en"]]
posts_df.head(2)


Unnamed: 0,q_en,a_en
0,What's the difference between Whey Isolate and...,"The main difference is in the ""purity"", how mu..."
1,Breakfast before or after jogging? Provided th...,"I'd say this depends on your fitness, the inte..."


Etape de traduction, demande plusieurs heures de calculs sur GPU, bonus.

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from datasets import Dataset

model_name = "Helsinki-NLP/opus-mt-en-fr"
tok_mt  = AutoTokenizer.from_pretrained(model_name)
mod_mt  = AutoModelForSeq2SeqLM.from_pretrained(model_name).to("cuda", dtype=torch.float16)

def translate_list(texts, max_tokens=128):
    # Tronque avant tokenisation pour éviter >512
    encoded = tok_mt(
        texts, padding=True, truncation=True,
        max_length=max_tokens, return_tensors="pt"
    ).to("cuda")
    with torch.no_grad():
        out = mod_mt.generate(
            **encoded,
            max_length=max_tokens,
            num_beams=4,
            early_stopping=True
        )
    return tok_mt.batch_decode(out, skip_special_tokens=True)

def translate_batch(batch):
    batch["q_fr"] = translate_list(batch["q_en"])
    batch["a_fr"] = translate_list(batch["a_en"])
    return batch

dataset_raw = Dataset.from_pandas(posts_df)
dataset_tr  = dataset_raw.map(
    translate_batch,
    batched=True,
    batch_size=64,         # GPU → batch 64 passe sur 6 Go en FP16
    remove_columns=["q_en", "a_en"]
)



  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 4761/4761 [08:24<00:00,  9.43 examples/s]


sauvegarde du dataset traduit en francais 

In [4]:
# 1) Sauvegarde du dataset HF sur disque
dataset_tr.save_to_disk("data/stackexchange/translated_dataset_fr")

# 2) Plus tard, pour recharger sans retraduire :
from datasets import load_from_disk
dataset_tr = load_from_disk("data/stackexchange/translated_dataset_fr")


Saving the dataset (1/1 shards): 100%|██████████| 4761/4761 [00:00<00:00, 537236.52 examples/s]


In [1]:
from datasets import load_from_disk

# 1) Charger le dataset traduit
dataset_tr = load_from_disk("data/stackexchange/translated_dataset_fr")

# 2) Split train / val / test sur dataset_tr
split1  = dataset_tr.train_test_split(test_size=0.1, seed=42)
test_ds = split1["test"]
tmp     = split1["train"].train_test_split(test_size=0.1, seed=42)
train_ds, val_ds = tmp["train"], tmp["test"]

# 3) Vérification
print(len(train_ds), "train |", len(val_ds), "val |", len(test_ds), "test")



3855 train | 429 val | 477 test


In [2]:
print("Colonnes du train_ds :", train_ds.column_names)
print("Exemple :", train_ds[0])


Colonnes du train_ds : ['q_fr', 'a_fr']
Exemple : {'q_fr': "J'aimerais faire du ski de fond cet hiver pour mon prochain Marathon. Ma question est de savoir si je peux skier pendant un certain nombre d'heures et cela aura un impact sur mon entraînement normal? Mon dernier marathon j'ai couru 50-60 miles par semaine mais j'aimerais ajouter d'autres types d'exercices à mon prochain cycle d'entraînement. Ai-je besoin de plus d'une journée de repos complète ou puis-je courir une longue journée puis skier pendant un certain nombre d'heures?", 'a_fr': "Oui, le ski aura un impact sur votre entraînement normal (et vice versa). Si vous allez skier pendant un certain nombre d'heures, vous ferez des choses biomécaniques différentes, mais vous insisterez sur les mêmes systèmes. Ne faites pas deux longs efforts en deux jours. Fait amusant: élite 5K et 10K coureur Ben True était un skieur nordique avant de se concentrer entièrement sur la course: http://www.runnersworld.com/tag/ben-true"}


In [3]:
# ─── Cellule 4 (version traduite FR) – Tokenisation & préparation des features ───

from transformers import MT5Tokenizer

# 1) Charger le tokenizer mT5 (slow) sans erreur protobuf
tok = MT5Tokenizer.from_pretrained("google/mt5-small", model_max_length=512)

# 2) Vérifier que train_ds contient bien les colonnes FR
print("Avant map, colonnes de train_ds :", train_ds.column_names)
# → vous devriez voir ['q_fr','a_fr']

# 3) Utiliser les colonnes FR pour l'entraînement
input_col, target_col = "q_fr", "a_fr"
MAX_IN, MAX_OUT       = 192, 128

def preprocess(batch):
    # On explicite le format question→réponse
    inputs  = ["question: " + q for q in batch[input_col]]
    targets = ["answer:   " + a for a in batch[target_col]]

    model_in  = tok(
        inputs,
        truncation=True,
        padding="max_length",
        max_length=MAX_IN
    )
    model_out = tok(
        targets,
        truncation=True,
        padding="max_length",
        max_length=MAX_OUT
    )
    model_in["labels"] = model_out["input_ids"]
    return model_in

# 4) Appliquer le preprocessing sur chaque split et supprimer les colonnes brutes q_fr/a_fr
train_ds = train_ds.map(
    preprocess,
    batched=True,
    batch_size=32,
    remove_columns=[input_col, target_col]
)
val_ds = val_ds.map(
    preprocess,
    batched=True,
    batch_size=32,
    remove_columns=[input_col, target_col]
)
test_ds = test_ds.map(
    preprocess,
    batched=True,
    batch_size=32,
    remove_columns=[input_col, target_col]
)

# 5) Vérification finale
print("Après map, colonnes :", train_ds.column_names)
print("Exemple de clés :", train_ds[0].keys())


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'MT5Tokenizer'.
You are using the default legacy behaviour of the <class 'transformers.models.mt5.tokenization_mt5.MT5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Avant map, colonnes de train_ds : ['q_fr', 'a_fr']
Après map, colonnes : ['input_ids', 'attention_mask', 'labels']
Exemple de clés : dict_keys(['input_ids', 'attention_mask', 'labels'])


Test Debug (optionnel)


In [5]:
# ─── Cellule Debug – Vérification des labels et de la loss sur un batch ───

import torch
from torch.utils.data import DataLoader

# 1. Créer un DataLoader minimal avec votre collator
dl = DataLoader(train_ds, batch_size=4, collate_fn=collator)

# 2. Récupérer le premier batch
batch = next(iter(dl))

# 3. Afficher quelques labels
labels = batch["labels"][0]             # premier exemple
unique = set(labels.tolist())
print("Labels uniques dans le batch :", unique)

# 4. Calculer la loss manuellement
batch = {k: v.to(model.device) for k, v in batch.items()}
with torch.no_grad():
    loss = model(**batch).loss
print("Loss calculée sur ce batch :", loss.item())


Labels uniques dans le batch : {1, 520, 3082, 188426, 1052, 557, 138801, 577, 3654, 7241, 21070, 9295, 10832, 66142, 1124, 613, 4203, 1648, 1143, 22147, 20104, 12424, 1169, 21151, 3760, 6321, 38583, 77507, 22737, 11477, 1754, 750, 1273, 3840, 7938, 259, 260, 261, 263, 7434, 267, 21259, 8461, 270, 269, 32522, 274, 59154, 277, 283, 16161, 289, 295, 299, 300, 303, 57147, 210238, 331, 335, 340, 82268, 865, 37734, 360, 41325, 3953, 886, 383, 9095, 2952, 391, 47504, 139679, 18338, 429, 430, 1465, 1468, 91587, 26572, 5582, 20954, 475, 110047, 483, 498, 5119}
Loss calculée sur ce batch : 33.819705963134766


In [4]:
import transformers, inspect, os, sys
from transformers import TrainingArguments

print("Transformers version :", transformers.__version__)
print("TrainingArguments vient de :", inspect.getfile(TrainingArguments))


Transformers version : 4.52.4
TrainingArguments vient de : /home/maxime/DataDevIA/chatbotcoach_project/.venv/lib/python3.12/site-packages/transformers/training_args.py


definition des metrics

In [4]:
# ─── Cellule 4.5 – Définition de compute_metrics ───

import evaluate
import numpy as np

# On charge la métrique ROUGE (vous pouvez en ajouter d'autres, ex. sacrebleu)
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """
    Prend en entrée un tuple (predictions, labels) où :
      - predictions est un np.ndarray de shape (batch_size, max_len)
      - labels       est un np.ndarray de shape (batch_size, max_len)
    Retourne un dict { 'rougeL': float, ... }
    """
    preds, labels = eval_pred

    # Si votre modèle renvoie un tuple, prenez preds[0]
    if isinstance(preds, tuple):
        preds = preds[0]

    # Décodage en texte
    decoded_preds  = tok.batch_decode(preds,  skip_special_tokens=True)
    decoded_labels = tok.batch_decode(labels, skip_special_tokens=True)

    # ROUGE veut des listes de strings
    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        rouge_types=["rougeL"]
    )
    # On prend la fmeasure médiane
    rougeL_f = result["rougeL"].mid.fmeasure

    # Vous pouvez renvoyer plusieurs métriques si besoin
    return {
        "rougeL": rougeL_f,
    }


In [6]:
# ─── Cellule 5 corrigée – Configuration de l'entraînement ───

from transformers import Seq2SeqTrainingArguments  # importe la bonne classe

# 0) Calcul du nombre de pas par époque
steps_per_epoch = len(train_ds) // (2 * 4)
print("≈", steps_per_epoch, "pas par époque")

# 1) Charger & configurer le modèle
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")
model.config.use_cache = False
model.gradient_checkpointing_enable()

# 2) Collator
collator = DataCollatorForSeq2Seq(
    tokenizer=tok,
    model=model,
    label_pad_token_id=-100
)

# 3) Arguments d'entraînement (Seq2SeqTrainingArguments gère generate & compute_metrics)
args = Seq2SeqTrainingArguments(
    output_dir="mt5_fitness_ckpt",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    fp16=True,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    lr_scheduler_type="linear",
    num_train_epochs=5,

    # évaluation & sauvegarde en nombre de pas
    eval_steps=steps_per_epoch,
    save_steps=steps_per_epoch,
    save_total_limit=2,

    logging_steps=50,
    report_to=["tensorboard"],

    # options de génération pour compute_metrics
    predict_with_generate=True,
    generation_max_length=MAX_OUT,

    # pour sauver en safetensors
    save_safetensors=True,
)

# 4) Instanciation du Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=collator,
    tokenizer=tok,                   # nécessaire pour la génération
    compute_metrics=compute_metrics, # si vous l'avez défini plus haut
)





≈ 481 pas par époque


  trainer = Seq2SeqTrainer(


Entrainement

In [7]:
# Cellule 6 – Entraînement
trainer.train()


Step,Training Loss
50,0.0
100,0.0
150,0.0
200,0.0
250,0.0
300,0.0
350,0.0
400,0.0
450,0.0
500,0.0


TrainOutput(global_step=2410, training_loss=0.0, metrics={'train_runtime': 1130.5496, 'train_samples_per_second': 17.049, 'train_steps_per_second': 2.132, 'total_flos': 3821868859392000.0, 'train_loss': 0.0, 'epoch': 5.0})

In [10]:

# après trainer.train()
trainer.save_model("mt5_fitness_ckpt")  # ceci enregistre les weights dans mt5_fitness_ckpt
tok.save_pretrained("mt5_fitness_ckpt") # ceci enregistre le tokenizer (fichiers .model, config…)



('mt5_fitness_ckpt/tokenizer_config.json',
 'mt5_fitness_ckpt/special_tokens_map.json',
 'mt5_fitness_ckpt/spiece.model',
 'mt5_fitness_ckpt/added_tokens.json')

chargement du modele



In [11]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
import torch

checkpoint_dir = "mt5_fitness_ckpt"

# 2.1 Charger le tokenizer qui est maintenant dans mt5_fitness_ckpt
tok = MT5Tokenizer.from_pretrained(checkpoint_dir)

# 2.2 Charger le modèle fine-tuné
model = MT5ForConditionalGeneration.from_pretrained(checkpoint_dir)

# 2.3 Mettre sur GPU si dispo
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)



MT5ForConditionalGeneration(
  (shared): Embedding(250112, 512)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 512)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
          

In [12]:
# ─── Cellule 7 – Inference rapide ───
from transformers import Text2TextGenerationPipeline

# 1) Créer un pipeline de génération
pipe = Text2TextGenerationPipeline(
    model=model,
    tokenizer=tok,
    device=0 if torch.cuda.is_available() else -1,
    framework="pt",
)

# 2) Quelques questions d’exemple
questions = [
    "question: Comment améliorer mon endurance pour la course à pied ?",
    "question: Quel est le meilleur programme d'entraînement pour perdre du poids ?",
    "question: Quels étirements faire après une séance de squat ?",
]

# 3) Générer les réponses
for q in questions:
    out = pipe(q, max_length=128, num_beams=4, early_stopping=True)
    print(f"> {q}\n→ {out[0]['generated_text']}\n")


Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


> question: Comment améliorer mon endurance pour la course à pied ?
→ <extra_id_0>.

> question: Quel est le meilleur programme d'entraînement pour perdre du poids ?
→ <extra_id_0>.

> question: Quels étirements faire après une séance de squat ?
→ <extra_id_0>

