# %% [markdown]
# # Chatbot Fitness (Français) – Notebook de Fine-Tuning
#
# Ce notebook vous guide pas-à-pas pour :
#
# 1. Télécharger/assembler un petit jeu de données Q-R sur le fitness en français.  
# 2. Pré-traiter et (optionnellement) traduire des données anglophones.  
# 3. Fine-tuner **`dbddv01/gpt2-french-small`** à l’aide de LoRA + 8-bit sur GPU ≤ 6 Go VRAM.  
# 4. Sauvegarder et tester le modèle sous forme de chatbot.
#
# > ⚠️ Matériel testé : RTX 3060 Laptop 6 Go VRAM + Ryzen 7 5800H + 16 Go RAM.

# %%
# Installation des dépendances principales

In [2]:
%pip install -qU transformers datasets accelerate peft bitsandbytes sentencepiece kaggle sacrebleu

Note: you may need to restart the kernel to use updated packages.


# ## (Facultatif) Configurer l’API Kaggle
#
# Si vous utilisez ce notebook en local/Colab et souhaitez télécharger le jeu *Fitness AI Prompt-Completion Dataset* depuis Kaggle :
#
# ```bash
# mkdir -p ~/.kaggle
# echo '{"username":"___","key":"___"}' > ~/.kaggle/kaggle.json
# chmod 600 ~/.kaggle/kaggle.json
# kaggle datasets download -d chibss/fitness-ai-prompt-completion-dataset -p data/ \
#        && unzip data/fitness-ai-prompt-completion-dataset.zip -d data/
# ```



In [None]:
# Télécharger / charger les jeux de données
from datasets import load_dataset, Dataset, concatenate_datasets, DatasetDict
import json, os, pandas as pd

# Dataset 1 : FQuAD v1.1 (FR)
from datasets import load_dataset, Dataset, concatenate_datasets

# Dataset 1 : FQuAD v1.1 (FR)
# => on active `trust_remote_code` pour autoriser le script
fquad = load_dataset(
    "illuin/fquad",
    "plain_text",
    split="train",
    trust_remote_code=True
)

# Dataset 2 : Fitness Q-R anglophone (Kaggle) – optionnel
#fit_path = "data/fitaidataset.json"          # adapte le chemin si nécessaire
#fitness_ds = None
if os.path.isfile(fit_path):
    with open(fit_path, "r", encoding="utf8") as f:
        raw = json.load(f)                   # attend des champs 'prompt', 'completion'
    df = pd.DataFrame(raw)
    fitness_ds = Dataset.from_pandas(df)     # simple split 'train'

# Concaténer ou fallback
dataset = fquad
if fitness_ds:
    dataset = concatenate_datasets([fquad, fitness_ds])

print(dataset[:3])

  from .autonotebook import tqdm as notebook_tqdm


ValueError: Loading illuin/fquad requires you to execute the dataset script in that repo on your local machine. Make sure you have read the code there to avoid malicious use, then set the option `trust_remote_code=True` to remove this error.

In [None]:
# (Optionnel) Traduire le jeu Fitness EN→FR
from transformers import pipeline

if fitness_ds:
    translator = pipeline(
        "translation_en_to_fr",
        model="Helsinki-NLP/opus-mt-en-fr",
        device_map="auto"
    )

    def translate_example(ex):
        # ex comporte 'prompt' / 'completion'
        ex["prompt_fr"] = translator(ex["prompt"])[0]["translation_text"]
        ex["completion_fr"] = translator(ex["completion"])[0]["translation_text"]
        return ex

    fitness_ds = fitness_ds.map(translate_example, batched=False)
    # Remplace la version EN par FR
    dataset = concatenate_datasets([fquad, fitness_ds])

In [None]:
# Mise en forme « conversation »
def make_convo(ex):
    if "question" in ex:                                   # FQuAD
        ex["text"] = (
            f"Utilisateur: {ex['question']}\n"
            f"Assistant: {ex['answers']['text'][0]}"
        )
    elif "prompt_fr" in ex:                                # Fitness traduit
        ex["text"] = (
            f"Utilisateur: {ex['prompt_fr']}\n"
            f"Assistant: {ex['completion_fr']}"
        )
    return ex

dataset = dataset.map(make_convo, remove_columns=dataset.column_names)

In [None]:
# Tokenisation
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("dbddv01/gpt2-french-small")

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, max_length=256)

tokenized = dataset.map(tokenize, batched=True, remove_columns=["text"])

In [None]:

# Préparation LoRA + 8-bit et entraînement
import torch
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model

model = AutoModelForCausalLM.from_pretrained(
    "dbddv01/gpt2-french-small",
    load_in_8bit=True,
    device_map="auto"
)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["c_attn", "c_proj", "c_fc"]
)
model = get_peft_model(model, lora_config)

training_args = TrainingArguments(
    output_dir="gpt2-fitness-fr",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    lr_scheduler_type="cosine",
    learning_rate=2e-4,
    fp16=True,
    save_total_limit=2,
    logging_steps=50,
    evaluation_strategy="no"
)

def data_collator(features):
    return {
        "input_ids":  torch.stack([f["input_ids"] for f in features]),
        "attention_mask": torch.stack([f["attention_mask"] for f in features]),
        "labels": torch.stack([f["input_ids"] for f in features]),
    }

trainer = Trainer(
    model=model,
    train_dataset=tokenized,
    args=training_args,
    data_collator=data_collator
)

trainer.train()
trainer.save_model("gpt2-fitness-fr")

In [None]:

# Test rapide du chatbot
from transformers import pipeline

chat = pipeline(
    "text-generation",
    model="gpt2-fitness-fr",
    tokenizer=tokenizer,
    device_map="auto"
)

prompt = "Utilisateur: Comment améliorer mon endurance ?\nAssistant:"
print(
    chat(prompt, max_new_tokens=80, do_sample=True, top_p=0.9, temperature=0.8)[0]["generated_text"]
)


In [13]:
# %% [markdown]
# # Chatbot Fitness – Fine-Tuning GPT-2 French
#
# Ce notebook vous guide, étape par étape, pour :
# 1. Repérer la racine du projet (quel que soit le cwd initial).  
# 2. Installer les dépendances.  
# 3. Charger FQuAD depuis `data/fquad/train.json`.  
# 4. Extraire les paires Q/A depuis `data/stackexchange/Posts.xml`.  
# 5. Formater en conversation “Utilisateur / Assistant”.  
# 6. Tokeniser avec GPT-2 FR.  
# 7. Fine-tuner en 8-bit + LoRA.  
# 8. Tester votre chatbot.

# %%
# 1. Détection robuste de la racine du projet
from pathlib import Path
import os

cur = Path().resolve()
if cur.name == "notebook":
    root = cur.parent
elif (cur / "data" / "fquad").exists():
    root = cur
elif (cur.parent / "data" / "fquad").exists():
    root = cur.parent
else:
    raise RuntimeError(f"Impossible de trouver data/fquad depuis {cur}")

os.chdir(root)
print("Racine détectée :", root)
print("Contenu racine    :", list(root.iterdir()))

# %%
# 2. Installer les dépendances
import sys, subprocess
subprocess.check_call([
    sys.executable, "-m", "pip", "install", "--upgrade",
    "transformers", "datasets", "accelerate",
    "peft", "bitsandbytes", "sentencepiece", "sacrebleu"
])

# %% [markdown]
# ## 3. Charger FQuAD (train.json)
#  
# Assurez-vous d’avoir dézippé `download-form-fquad1.0.zip` dans `data/fquad/`.

# %%
import json
from datasets import Dataset

fquad_dir = root / "data" / "fquad"
print("Contenu data/fquad:", list(fquad_dir.iterdir()))

with open(fquad_dir / "train.json", encoding="utf-8") as f:
    raw = json.load(f)

qa_list = []
for art in raw["data"]:
    for para in art["paragraphs"]:
        for qa in para["qas"]:
            if qa.get("answers"):
                qa_list.append({
                    "question": qa["question"],
                    "answer":   qa["answers"][0]["text"]
                })

fquad = Dataset.from_list(qa_list)
print(f"✅ FQuAD chargé : {len(fquad)} exemples")
print(fquad[0])

# %%
# 4. Extraire paires Q/A depuis StackExchange
import xml.etree.ElementTree as ET
import pandas as pd
import re
from datasets import Dataset

stack_dir = root / "data" / "stackexchange"
print("Contenu data/stackexchange:", list(stack_dir.iterdir()))

xml_path = stack_dir / "Posts.xml"

# 4.1 Map {AcceptedAnswerId → question_body}
qmap = {}
for _, elem in ET.iterparse(xml_path, events=("end",)):
    if elem.tag == "row" and elem.attrib.get("PostTypeId") == "1":
        acc = elem.attrib.get("AcceptedAnswerId")
        if acc:
            qmap[acc] = elem.attrib.get("Body", "")
    elem.clear()

# 4.2 Construire paires prompt/completion
pairs = []
for _, elem in ET.iterparse(xml_path, events=("end",)):
    if elem.tag == "row" and elem.attrib.get("PostTypeId") == "2":
        aid = elem.attrib.get("Id")
        if aid in qmap:
            pairs.append({
                "prompt":     qmap[aid],
                "completion": elem.attrib.get("Body", "")
            })
    elem.clear()

# 4.3 Nettoyage HTML
def clean_html(text):
    return re.sub(r"<[^>]+>", "", text)

for p in pairs:
    p["prompt"]     = clean_html(p["prompt"])
    p["completion"] = clean_html(p["completion"])

# 4.4 Conversion en Dataset HF
df = pd.DataFrame(pairs)
fitness_ds = Dataset.from_pandas(df)
print(f"✅ Extrait {len(fitness_ds)} paires Q/A fitness")
print(fitness_ds[0])

# %%
# 5. Formater en conversation “Utilisateur / Assistant”
def to_convo(ex, inp, out):
    ex["text"] = f"Utilisateur: {ex[inp]}\nAssistant: {ex[out]}"
    return ex

fquad_convo = fquad.map(
    lambda ex: to_convo(ex, "question", "answer"),
    remove_columns=[c for c in fquad.column_names if c != "text"]
)
fitness_convo = fitness_ds.map(
    lambda ex: to_convo(ex, "prompt", "completion"),
    remove_columns=["prompt", "completion"]
)

from datasets import concatenate_datasets
dataset = concatenate_datasets([fquad_convo, fitness_convo])
print(f"✅ Dataset total : {len(dataset)} exemples")
print(dataset[0]["text"])

# %%
# 6. Tokenisation pour GPT-2 FR
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("dbddv01/gpt2-french-small")
def tokenize_fn(batch):
    return tokenizer(batch["text"], truncation=True, max_length=256)

tokenized = dataset.map(
    tokenize_fn, batched=True, remove_columns=["text"]
)
print("✅ Tokenisation OK — colonnes :", tokenized.column_names)

# %%
# 7. Fine-tuning (8-bit + LoRA)
import torch
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model

model = AutoModelForCausalLM.from_pretrained(
    "dbddv01/gpt2-french-small",
    load_in_8bit=True,
    device_map="auto"
)

lora_cfg = LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.1,
    target_modules=["c_attn", "c_proj", "c_fc"]
)
model = get_peft_model(model, lora_cfg)

training_args = TrainingArguments(
    output_dir=str(root / "gpt2-fitness-fr"),
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    fp16=True,
    logging_steps=50,
    save_total_limit=2,
    evaluation_strategy="no"
)

def data_collator(features):
    return {
        "input_ids":      torch.stack([f["input_ids"] for f in features]),
        "attention_mask": torch.stack([f["attention_mask"] for f in features]),
        "labels":         torch.stack([f["input_ids"] for f in features]),
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    data_collator=data_collator
)
trainer.train()
trainer.save_model(root / "gpt2-fitness-fr")

# %%
# 8. Test rapide du chatbot
from transformers import pipeline

chat = pipeline(
    "text-generation",
    model=str(root / "gpt2-fitness-fr"),
    tokenizer=tokenizer,
    device_map="auto"
)
prompt = "Utilisateur: Quels exercices pour les épaules ?\nAssistant:"
out = chat(prompt, max_new_tokens=60, do_sample=True, top_p=0.9, temperature=0.8)
print(out[0]["generated_text"])


RuntimeError: Impossible de trouver data/fquad depuis /home

In [1]:
# %% [markdown]
# # Chatbot Fitness – Fine-Tuning GPT-2 French
#
# Ce notebook vous guide, étape par étape, pour :
# 1. Vérifier et positionner le répertoire de travail (cwd).  
# 2. Installer les dépendances.  
# 3. Charger FQuAD depuis `data/fquad/train.json`.  
# 4. Extraire les paires Q/A depuis `data/stackexchange/Posts.xml`.  
# 5. Formater en conversation “Utilisateur / Assistant”.  
# 6. Tokeniser avec GPT-2 FR.  
# 7. Fine-tuner en 8-bit + LoRA.  
# 8. Tester votre chatbot.

# %%
import os

# 1. Si on travaille depuis le dossier 'notebook', on remonte d'un niveau
print("CWD initial :", os.getcwd())
if os.path.basename(os.getcwd()) == "notebook":
    os.chdir("..")
print("CWD final   :", os.getcwd())

# Chemins relatifs simples
fquad_dir = "data/fquad"
posts_xml = "data/stackexchange/Posts.xml"

# Vérification rapide
print("→ data/fquad existe :", os.path.isdir(fquad_dir))
print("→ Posts.xml existe  :", os.path.isfile(posts_xml))

# %%
# 2. Installer les dépendances (exécuter une fois)
import sys, subprocess
subprocess.check_call([
    sys.executable, "-m", "pip", "install", "--upgrade",
    "transformers", "datasets", "accelerate",
    "peft", "bitsandbytes", "sentencepiece", "sacrebleu"
])

# %% [markdown]
# ## 3. Charger FQuAD manuellement  
# Assurez-vous d’avoir dézippé `download-form-fquad1.0.zip` dans `data/fquad/` (fichiers `train.json` et `valid.json`).

# %%
import json
from datasets import Dataset

with open(os.path.join(fquad_dir, "train.json"), encoding="utf-8") as f:
    raw = json.load(f)

qa_list = []
for art in raw["data"]:
    for para in art["paragraphs"]:
        for qa in para["qas"]:
            if qa.get("answers"):
                qa_list.append({
                    "question": qa["question"],
                    "answer":   qa["answers"][0]["text"]
                })

fquad = Dataset.from_list(qa_list)
print(f"✅ FQuAD chargé : {len(fquad)} exemples")
print(fquad[0])

# %% [markdown]
# ## 4. Extraire les paires Q/A depuis StackExchange

# %%
import xml.etree.ElementTree as ET
import pandas as pd
import re
from datasets import Dataset

# 4.1 Construire le mapping question → réponse acceptée
qmap = {}
for _, elem in ET.iterparse(posts_xml, events=("end",)):
    if elem.tag == "row" and elem.attrib.get("PostTypeId") == "1":
        acc = elem.attrib.get("AcceptedAnswerId")
        if acc:
            qmap[acc] = elem.attrib.get("Body", "")
    elem.clear()

# 4.2 Recuperer les paires prompt/completion
pairs = []
for _, elem in ET.iterparse(posts_xml, events=("end",)):
    if elem.tag == "row" and elem.attrib.get("PostTypeId") == "2":
        aid = elem.attrib.get("Id")
        if aid in qmap:
            pairs.append({
                "prompt":     qmap[aid],
                "completion": elem.attrib.get("Body", "")
            })
    elem.clear()

# 4.3 Nettoyage HTML très simple
def clean_html(text):
    return re.sub(r"<[^>]+>", "", text)

for p in pairs:
    p["prompt"]     = clean_html(p["prompt"])
    p["completion"] = clean_html(p["completion"])

# 4.4 Conversion en Dataset HF
df = pd.DataFrame(pairs)
fitness_ds = Dataset.from_pandas(df)
print(f"✅ Extrait {len(fitness_ds)} paires Q/A fitness")
print(fitness_ds[0])

# %% [markdown]
# ## 5. Formater en conversation “Utilisateur / Assistant”

# %%
def to_convo(ex, inp, out):
    ex["text"] = f"Utilisateur: {ex[inp]}\nAssistant: {ex[out]}"
    return ex

fquad_convo = fquad.map(
    lambda ex: to_convo(ex, "question", "answer"),
    remove_columns=[c for c in fquad.column_names if c != "text"]
)
fitness_convo = fitness_ds.map(
    lambda ex: to_convo(ex, "prompt", "completion"),
    remove_columns=["prompt", "completion"]
)

from datasets import concatenate_datasets
dataset = concatenate_datasets([fquad_convo, fitness_convo])
print(f"✅ Dataset total : {len(dataset)} exemples")
print(dataset[0]["text"])

# %% [markdown]
# ## 6. Tokenisation pour GPT-2 FR

# %%
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("dbddv01/gpt2-french-small")
def tokenize_fn(batch):
    return tokenizer(batch["text"], truncation=True, max_length=256)

tokenized = dataset.map(
    tokenize_fn, batched=True, remove_columns=["text"]
)
print("✅ Tokenisation OK — colonnes :", tokenized.column_names)

# %% [markdown]
# ## 7. Fine-tuning (8-bit + LoRA)

# %%
import torch
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model

model = AutoModelForCausalLM.from_pretrained(
    "dbddv01/gpt2-french-small",
    load_in_8bit=True,
    device_map="auto"
)

lora_cfg = LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.1,
    target_modules=["c_attn", "c_proj", "c_fc"]
)
model = get_peft_model(model, lora_cfg)

training_args = TrainingArguments(
    output_dir="gpt2-fitness-fr",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    fp16=True,
    logging_steps=50,
    save_total_limit=2,
    evaluation_strategy="no"
)

def data_collator(features):
    return {
        "input_ids":      torch.stack([f["input_ids"] for f in features]),
        "attention_mask": torch.stack([f["attention_mask"] for f in features]),
        "labels":         torch.stack([f["input_ids"] for f in features]),
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    data_collator=data_collator
)
trainer.train()
trainer.save_model("gpt2-fitness-fr")

# %% [markdown]
# ## 8. Test rapide du chatbot

# %%
from transformers import pipeline

chat = pipeline(
    "text-generation",
    model="gpt2-fitness-fr",
    tokenizer=tokenizer,
    device_map="auto"
)
prompt = "Utilisateur: Quels exercices pour les épaules ?\nAssistant:"
out = chat(prompt, max_new_tokens=60, do_sample=True, top_p=0.9, temperature=0.8)
print(out[0]["generated_text"])


CWD initial : /home/maxime/DataDevIA/chatbotcoach_project/notebook
CWD final   : /home/maxime/DataDevIA/chatbotcoach_project
→ data/fquad existe : True
→ Posts.xml existe  : True


  from .autonotebook import tqdm as notebook_tqdm


✅ FQuAD chargé : 20731 exemples
{'question': "Quel astronome a émit l'idée en premier d'une planète entre les orbites de Mars et Jupiter ?", 'answer': 'Johann Elert Bode'}
✅ Extrait 4761 paires Q/A fitness
{'prompt': "What's the difference? I'm looking at shake options and some contain whey isolate, some contain whey concentrate and some both.\n", 'completion': 'The main difference is in the "purity", how much lactose and fat is left with the protein after filtering. Whey isolate usually contains around 90% protein and whey concentrate is more like 70-85%.\n\nIf you have trouble digesting the lactose or are trying to minimize carbohydrate content, then whey isolate would be a good choice. Otherwise, it probably doesn\'t matter; just pick the concentrate since it\'s cheaper in terms of protein grams/dollar. \n'}


Map: 100%|██████████| 20731/20731 [00:00<00:00, 28329.46 examples/s]
Map: 100%|██████████| 4761/4761 [00:00<00:00, 27118.97 examples/s]


✅ Dataset total : 25492 exemples
Utilisateur: Quel astronome a émit l'idée en premier d'une planète entre les orbites de Mars et Jupiter ?
Assistant: Johann Elert Bode


Map: 100%|██████████| 25492/25492 [00:01<00:00, 15398.91 examples/s]


✅ Tokenisation OK — colonnes : ['input_ids', 'attention_mask']


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [3]:
# %% [markdown]
# # Chatbot Fitness – Fine-Tuning GPT-2 French avec Évaluation
#
# Ce notebook vous guide, étape par étape, pour :
# 1. Vérifier et positionner le répertoire de travail.  
# 2. Installer les dépendances.  
# 3. Charger FQuAD depuis `data/fquad/train.json`.  
# 4. Extraire les paires Q/A depuis `data/stackexchange/Posts.xml`.  
# 5. Formater en conversation “Utilisateur / Assistant”.  
# 6. Fractionner en train / eval.  
# 7. Tokeniser pour GPT-2 FR.  
# 8. Fine-tuner en 8-bit + LoRA avec évaluation par époque.  
# 9. Évaluer et calculer la perplexité.  
# 10. Test rapide du chatbot.

# %%
# 1. Positionner le cwd si nécessaire
import os

print("CWD initial :", os.getcwd())
if os.path.basename(os.getcwd()) == "notebook":
    os.chdir("..")
print("CWD final   :", os.getcwd())

# Vérification simple des chemins
print("data/fquad       exists:", os.path.isdir("data/fquad"))
print("data/stackexchange exists:", os.path.isdir("data/stackexchange"))

fquad_dir = "data/fquad"
posts_xml = "data/stackexchange/Posts.xml"

# %%
# 2. Installer les dépendances (à exécuter une fois)
import sys, subprocess
subprocess.check_call([
    sys.executable, "-m", "pip", "install", "--upgrade",
    "transformers", "datasets", "accelerate",
    "peft", "bitsandbytes", "sentencepiece", "sacrebleu"
])

# %% [markdown]
# ## 3. Charger FQuAD manuellement  
# Ayez préalablement dézippé `download-form-fquad1.0.zip` dans `data/fquad/`  
# pour obtenir `train.json` et `valid.json`.

# %%
import json
from datasets import Dataset

with open(os.path.join(fquad_dir, "train.json"), encoding="utf-8") as f:
    raw = json.load(f)

qa_list = []
for art in raw["data"]:
    for para in art["paragraphs"]:
        for qa in para["qas"]:
            if qa.get("answers"):
                qa_list.append({
                    "question": qa["question"],
                    "answer":   qa["answers"][0]["text"]
                })

fquad = Dataset.from_list(qa_list)
print(f"✅ FQuAD chargé : {len(fquad)} exemples")
print(fquad[0])

# %% [markdown]
# ## 4. Extraire les paires Q/A depuis StackExchange

# %%
import xml.etree.ElementTree as ET
import pandas as pd
import re
from datasets import Dataset

# 4.1 Map {AcceptedAnswerId → question_body}
qmap = {}
for _, elem in ET.iterparse(posts_xml, events=("end",)):
    if elem.tag == "row" and elem.attrib.get("PostTypeId") == "1":
        acc = elem.attrib.get("AcceptedAnswerId")
        if acc:
            qmap[acc] = elem.attrib.get("Body", "")
    elem.clear()

# 4.2 Construire paires prompt/completion
pairs = []
for _, elem in ET.iterparse(posts_xml, events=("end",)):
    if elem.tag == "row" and elem.attrib.get("PostTypeId") == "2":
        aid = elem.attrib.get("Id")
        if aid in qmap:
            pairs.append({
                "prompt":     qmap[aid],
                "completion": elem.attrib.get("Body", "")
            })
    elem.clear()

# 4.3 Nettoyage HTML
def clean_html(text):
    return re.sub(r"<[^>]+>", "", text)

for p in pairs:
    p["prompt"]     = clean_html(p["prompt"])
    p["completion"] = clean_html(p["completion"])

# 4.4 Conversion en Dataset HF
df = pd.DataFrame(pairs)
fitness_ds = Dataset.from_pandas(df)
print(f"✅ Extrait {len(fitness_ds)} paires Q/A fitness")
print(fitness_ds[0])

# %% [markdown]
# ## 5. Formater en conversation “Utilisateur / Assistant”

# %%
def to_convo(ex, inp, out):
    ex["text"] = f"Utilisateur: {ex[inp]}\nAssistant: {ex[out]}"
    return ex

fquad_convo = fquad.map(
    lambda ex: to_convo(ex, "question", "answer"),
    remove_columns=[c for c in fquad.column_names if c != "text"]
)
fitness_convo = fitness_ds.map(
    lambda ex: to_convo(ex, "prompt", "completion"),
    remove_columns=["prompt", "completion"]
)

from datasets import concatenate_datasets
dataset = concatenate_datasets([fquad_convo, fitness_convo])
print(f"✅ Dataset total : {len(dataset)} exemples")
print(dataset[0]["text"])

# %% [markdown]
# ## 6. Fractionner en train / eval (10% eval)

# %%
from datasets import DatasetDict

splits = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = splits["train"]
eval_dataset  = splits["test"]
print(f"→ {len(train_dataset)} exemples train, {len(eval_dataset)} exemples eval")

# %% [markdown]
# ## 7. Tokenisation pour GPT-2 FR

# %%
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("dbddv01/gpt2-french-small")

def tokenize_fn(batch):
    return tokenizer(batch["text"], truncation=True, max_length=256)

tokenized_train = train_dataset.map(
    tokenize_fn, batched=True, remove_columns=["text"]
)
tokenized_eval = eval_dataset.map(
    tokenize_fn, batched=True, remove_columns=["text"]
)
print("✅ Tokenisation terminée")
print("  train cols:", tokenized_train.column_names)
print("   eval cols:", tokenized_eval.column_names)

# %% [markdown]
# ## 8. Fine-tuning (8-bit + LoRA) avec évaluation

# %% 8. Fine-tuning (8-bit + LoRA) — version compatible
import torch
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model

model = AutoModelForCausalLM.from_pretrained(
    "dbddv01/gpt2-french-small",
    load_in_8bit=True,
    device_map="auto"
)

lora_cfg = LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.1,
    target_modules=["c_attn", "c_proj", "c_fc"]
)
model = get_peft_model(model, lora_cfg)

training_args = TrainingArguments(
    output_dir="gpt2-fitness-fr",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    fp16=True,
    logging_steps=50,
    save_total_limit=2         # conserve 2 checkpoints
    # (on retire evaluation_strategy / save_strategy
)

def data_collator(features):
    return {
        "input_ids":      torch.stack([f["input_ids"] for f in features]),
        "attention_mask": torch.stack([f["attention_mask"] for f in features]),
        "labels":         torch.stack([f["input_ids"] for f in features]),
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,   # on garde l’éval ; elle sera utilisée plus tard
    data_collator=data_collator
)

trainer.train()
trainer.save_model("gpt2-fitness-fr")

# %% 9. Évaluation finale & perplexité
import math
metrics = trainer.evaluate()       # calcule eval_loss sur eval_dataset
print("Eval metrics:", metrics)
print("Perplexité :", math.exp(metrics["eval_loss"]))




# %% [markdown]
# ## 10. Test rapide du chatbot

# %%
from transformers import pipeline

chat = pipeline(
    "text-generation",
    model="gpt2-fitness-fr",
    tokenizer=tokenizer,
    device_map="auto"
)
prompt = "Utilisateur: Quels exercices pour les épaules ?\nAssistant:"
out = chat(prompt, max_new_tokens=60, do_sample=True, top_p=0.9, temperature=0.8)
print(out[0]["generated_text"])


CWD initial : /home/maxime/DataDevIA/chatbotcoach_project
CWD final   : /home/maxime/DataDevIA/chatbotcoach_project
data/fquad       exists: True
data/stackexchange exists: True
✅ FQuAD chargé : 20731 exemples
{'question': "Quel astronome a émit l'idée en premier d'une planète entre les orbites de Mars et Jupiter ?", 'answer': 'Johann Elert Bode'}
✅ Extrait 4761 paires Q/A fitness
{'prompt': "What's the difference? I'm looking at shake options and some contain whey isolate, some contain whey concentrate and some both.\n", 'completion': 'The main difference is in the "purity", how much lactose and fat is left with the protein after filtering. Whey isolate usually contains around 90% protein and whey concentrate is more like 70-85%.\n\nIf you have trouble digesting the lactose or are trying to minimize carbohydrate content, then whey isolate would be a good choice. Otherwise, it probably doesn\'t matter; just pick the concentrate since it\'s cheaper in terms of protein grams/dollar. \n'}

Map: 100%|██████████| 20731/20731 [00:00<00:00, 31616.58 examples/s]
Map: 100%|██████████| 4761/4761 [00:00<00:00, 26510.60 examples/s]


✅ Dataset total : 25492 exemples
Utilisateur: Quel astronome a émit l'idée en premier d'une planète entre les orbites de Mars et Jupiter ?
Assistant: Johann Elert Bode
→ 22942 exemples train, 2550 exemples eval


Map: 100%|██████████| 22942/22942 [00:01<00:00, 14155.67 examples/s]
Map: 100%|██████████| 2550/2550 [00:00<00:00, 15577.81 examples/s]


✅ Tokenisation terminée
  train cols: ['input_ids', 'attention_mask']
   eval cols: ['input_ids', 'attention_mask']


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


TypeError: expected Tensor as element 0 in argument 0, but got list

In [None]:
# %% [markdown]
# # Chatbot Fitness – Fine-Tuning GPT-2 French avec Évaluation
#
# Ce notebook vous guide, étape par étape, pour :
# 1. Positionner le cwd.  
# 2. Installer les dépendances.  
# 3. Charger FQuAD depuis `data/fquad/train.json`.  
# 4. Extraire les paires Q/A depuis `data/stackexchange/Posts.xml`.  
# 5. Formater en conversation.  
# 6. Fractionner train/éval.  
# 7. Tokeniser.  
# 8. Fine-tuner en 8-bit + LoRA avec évaluation.  
# 9. Évaluer & calculer la perplexité.  
# 10. Tester le chatbot.

# %%
import os

print("CWD initial :", os.getcwd())
if os.path.basename(os.getcwd()) == "notebook":
    os.chdir("..")
print("CWD final   :", os.getcwd())

assert os.path.isdir("data/fquad"), "⛔ data/fquad introuvable"
assert os.path.isdir("data/stackexchange"), "⛔ data/stackexchange introuvable"

fquad_dir = "data/fquad"
posts_xml = "data/stackexchange/Posts.xml"

# %%
# 2. Installer les dépendances (une fois)
import sys, subprocess
subprocess.check_call([
    sys.executable, "-m", "pip", "install", "--upgrade",
    "transformers", "datasets", "accelerate",
    "peft", "bitsandbytes", "sentencepiece", "sacrebleu"
])

# %% [markdown]
# ## 3. Charger FQuAD manuellement

# %%
import json
from datasets import Dataset

with open(os.path.join(fquad_dir, "train.json"), encoding="utf-8") as f:
    raw = json.load(f)

qa_list = []
for art in raw["data"]:
    for para in art["paragraphs"]:
        for qa in para["qas"]:
            if qa.get("answers"):
                qa_list.append({
                    "question": qa["question"],
                    "answer":   qa["answers"][0]["text"]
                })

fquad = Dataset.from_list(qa_list)
print(f"✅ FQuAD chargé : {len(fquad)} exemples")
print(fquad[0])

# %% [markdown]
# ## 4. Extraire les paires Q/A depuis StackExchange

# %%
import xml.etree.ElementTree as ET
import pandas as pd
import re
from datasets import Dataset

# mapping question → accepted answer ID
qmap = {}
for _, elem in ET.iterparse(posts_xml, events=("end",)):
    if elem.tag=="row" and elem.attrib.get("PostTypeId")=="1":
        acc=elem.attrib.get("AcceptedAnswerId")
        if acc: qmap[acc]=elem.attrib.get("Body","")
    elem.clear()

pairs=[]
for _, elem in ET.iterparse(posts_xml, events=("end",)):
    if elem.tag=="row" and elem.attrib.get("PostTypeId")=="2":
        aid=elem.attrib.get("Id")
        if aid in qmap:
            pairs.append({"prompt":qmap[aid],"completion":elem.attrib.get("Body","")})
    elem.clear()

def clean_html(t): return re.sub(r"<[^>]+>","",t)
for p in pairs:
    p["prompt"]=clean_html(p["prompt"])
    p["completion"]=clean_html(p["completion"])

fitness_ds=Dataset.from_pandas(pd.DataFrame(pairs))
print(f"✅ Fitness QA : {len(fitness_ds)} paires")
print(fitness_ds[0])

# %% [markdown]
# ## 5. Formater en conversation

# %%
def to_convo(ex, inp, out):
    ex["text"]=f"Utilisateur: {ex[inp]}\nAssistant: {ex[out]}"
    return ex

fquad_convo = fquad.map(lambda ex: to_convo(ex,"question","answer"), remove_columns=[c for c in fquad.column_names if c!="text"])
fitness_convo = fitness_ds.map(lambda ex: to_convo(ex,"prompt","completion"), remove_columns=["prompt","completion"])

from datasets import concatenate_datasets
dataset = concatenate_datasets([fquad_convo, fitness_convo])
print(f"✅ Total exemples : {len(dataset)}")
print(dataset[0]["text"])

# %% [markdown]
# ## 6. Split train / eval (10%)

# %%
splits = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = splits["train"]
eval_dataset  = splits["test"]
print(f"→ Train: {len(train_dataset)}, Eval: {len(eval_dataset)}")

# %% [markdown]
# ## 7. Tokenisation pour GPT-2 FR

# %%
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("dbddv01/gpt2-french-small")

def tokenize_fn(batch):
    return tokenizer(batch["text"], truncation=True, max_length=256)

tokenized_train = train_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
tokenized_eval  = eval_dataset.map(tokenize_fn,  batched=True, remove_columns=["text"])
print("✅ Tokenisation terminée")

# %% [markdown]
# ## 8. Fine-tuning (8-bit + LoRA) et évaluation

# %%
import torch
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model

# modèle + LoRA
model = AutoModelForCausalLM.from_pretrained("dbddv01/gpt2-french-small", load_in_8bit=True, device_map="auto")
lora_cfg = LoraConfig(r=8, lora_alpha=16, lora_dropout=0.1, target_modules=["c_attn","c_proj","c_fc"])
model = get_peft_model(model, lora_cfg)

# args avec éval par époque
training_args = TrainingArguments(
    output_dir="gpt2-fitness-fr",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    fp16=True,
    logging_steps=50,
    # ces deux ne sont pas supportés par ta version : on fera eval manuellement après
    # evaluation_strategy="epoch",
    # save_strategy="epoch",
    save_total_limit=2
)

# collator causal LM
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator
)

trainer.train()
trainer.save_model("gpt2-fitness-fr")




CWD initial : /home/maxime/DataDevIA/chatbotcoach_project
CWD final   : /home/maxime/DataDevIA/chatbotcoach_project
✅ FQuAD chargé : 20731 exemples
{'question': "Quel astronome a émit l'idée en premier d'une planète entre les orbites de Mars et Jupiter ?", 'answer': 'Johann Elert Bode'}
✅ Fitness QA : 4761 paires
{'prompt': "What's the difference? I'm looking at shake options and some contain whey isolate, some contain whey concentrate and some both.\n", 'completion': 'The main difference is in the "purity", how much lactose and fat is left with the protein after filtering. Whey isolate usually contains around 90% protein and whey concentrate is more like 70-85%.\n\nIf you have trouble digesting the lactose or are trying to minimize carbohydrate content, then whey isolate would be a good choice. Otherwise, it probably doesn\'t matter; just pick the concentrate since it\'s cheaper in terms of protein grams/dollar. \n'}


Map: 100%|██████████| 20731/20731 [00:00<00:00, 30773.22 examples/s]
Map: 100%|██████████| 4761/4761 [00:00<00:00, 26036.67 examples/s]


✅ Total exemples : 25492
Utilisateur: Quel astronome a émit l'idée en premier d'une planète entre les orbites de Mars et Jupiter ?
Assistant: Johann Elert Bode
→ Train: 22942, Eval: 2550


Map: 100%|██████████| 22942/22942 [00:01<00:00, 16065.46 examples/s]
Map: 100%|██████████| 2550/2550 [00:00<00:00, 15167.96 examples/s]
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


✅ Tokenisation terminée


No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,6.0841
100,5.3185
150,5.0039
200,4.8336
250,4.5921
300,4.4348
350,4.5521
400,4.5053
450,4.4127
500,4.4419




Eval metrics: {'eval_runtime': 30.1433, 'eval_samples_per_second': 84.596, 'eval_steps_per_second': 10.583, 'epoch': 3.0}


KeyError: 'eval_loss'

In [6]:
# %% [markdown]
# ## 9. Évaluation manuelle & calcul de la perplexité (sans OOM)

# %%
import math, torch
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling

# 9.1 Créer un DataLoader pour le jeu d'évaluation
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
eval_dataloader = DataLoader(
    tokenized_eval, 
    batch_size=1,               # mini-batch réduit pour tenir en VRAM
    collate_fn=data_collator
)

# 9.2 Passer en mode évaluation
model.eval()
total_loss = 0.0
count = 0

# 9.3 Boucle d'évaluation
for batch in eval_dataloader:
    # déplacer batch sur le même device que le modèle
    batch = {k: v.to(model.device) for k, v in batch.items()}
    with torch.no_grad():
        out = model(**batch)
    total_loss += out.loss.item()
    count += 1

# 9.4 Moyenne et perplexité
avg_loss = total_loss / count
print(f"→ Loss moyenne : {avg_loss:.4f}")
print(f"→ Perplexité   : {math.exp(avg_loss):.2f}")



→ Loss moyenne : 3.2777
→ Perplexité   : 26.51
