In [2]:
import json
from datasets import Dataset

In [27]:
# Load the FAQ records from the JSON file.
# The encoding is pinned to UTF-8 so that non-ASCII text is decoded the same
# way on every platform instead of depending on the locale's default encoding.
with open("../data/faq.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Convert the list of records into a column-oriented Hugging Face Dataset.
# Every record is expected to provide the "instruction", "input" and "output"
# keys; a missing key raises KeyError here.
dataset = Dataset.from_dict({
    "instruction": [item["instruction"] for item in data],
    "input": [item["input"] for item in data],
    "output": [item["output"] for item in data],
})

## Préparer le tokenizer et le modèle

In [17]:
import os

from huggingface_hub import login

# Never store the access token in the notebook: it would end up in version
# control and in every shared copy. Read it from the HF_TOKEN environment
# variable instead. When the variable is unset, token=None lets login()
# ask for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

In [22]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hub identifier of the base checkpoint; the causal-LM weights and the
# tokenizer are both loaded from this same identifier.
model_name = "meta-llama/Llama-3.2-1B"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

## Prétraitement des données

In [29]:
# Reuse the end-of-sequence token as the padding token so that
# tokenizer.pad_token_id is defined for the padding done below.
tokenizer.pad_token = tokenizer.eos_token
# Alternatively, you could add a new [PAD] token:
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Label value that the loss function skips; used to mask prompt and padding
# positions so that only answer tokens contribute to the loss.
IGNORE_INDEX = -100


def preprocess_function(examples, max_length=512):
    """Turn a batch of FAQ records into aligned causal-LM training features.

    For a causal language model, ``labels`` must be position-aligned with
    ``input_ids`` (the model shifts them internally). Each example is
    therefore encoded as ONE sequence, ``prompt + answer + EOS``, and the
    labels are a copy of that sequence in which the prompt tokens and the
    padding are replaced by ``IGNORE_INDEX``.

    Args:
        examples: Batch mapping with the keys "instruction", "input" and
            "output", each holding a list of strings of equal length.
        max_length: Fixed length every sequence is truncated / padded to.

    Returns:
        A dict with "input_ids", "attention_mask" and "labels"; each value is
        a list with one ``max_length``-long list of ints per example.
    """
    features = {"input_ids": [], "attention_mask": [], "labels": []}
    records = zip(examples["instruction"], examples["input"], examples["output"])

    for inst, inp, out in records:
        prompt = f"Instruction: {inst}\nInput: {inp}\nOutput: "

        # Prompt and answer are tokenized separately and concatenated at the
        # id level so the prompt/answer boundary is known exactly.
        prompt_ids = tokenizer(prompt)["input_ids"]
        answer_ids = tokenizer(out, add_special_tokens=False)["input_ids"]
        # Terminate the answer so the model learns where to stop generating.
        answer_ids = answer_ids + [tokenizer.eos_token_id]

        input_ids = (prompt_ids + answer_ids)[:max_length]
        # NOTE: if the prompt alone fills max_length, every label is masked
        # and the example contributes nothing to the loss.
        labels = ([IGNORE_INDEX] * len(prompt_ids) + answer_ids)[:max_length]
        attention_mask = [1] * len(input_ids)

        # Right-pad to the fixed length. Padding is masked by position, not by
        # token id, because the pad token is the EOS token and the real EOS
        # at the end of the answer must stay in the labels.
        pad_count = max_length - len(input_ids)
        input_ids = input_ids + [tokenizer.pad_token_id] * pad_count
        attention_mask = attention_mask + [0] * pad_count
        labels = labels + [IGNORE_INDEX] * pad_count

        features["input_ids"].append(input_ids)
        features["attention_mask"].append(attention_mask)
        features["labels"].append(labels)

    return features


# Apply the preprocessing to the whole dataset, one batch at a time.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 786/786 [00:00<00:00, 5018.84 examples/s]


## Configurer l'entraînement

- Utilise le Trainer de Hugging Face pour simplifier l'entraînement.

In [33]:
import torch
from transformers import Trainer, TrainingArguments

# Hold out 10% of the examples for validation. evaluation_strategy="steps"
# asks the Trainer to evaluate every `eval_steps`, which cannot work unless
# an eval_dataset is supplied. The seed keeps the split reproducible.
dataset_splits = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=100,
    save_total_limit=2,
    # Mixed-precision fp16 training needs a CUDA device; enable it only when
    # one is present so the notebook also runs on CPU-only machines.
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_splits["train"],
    eval_dataset=dataset_splits["test"],
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`

In [None]:
# Train the model (fine-tuning)
trainer.train()

# Évaluer et sauvegarder le modèle

- Évalue les performances du modèle sur un ensemble de validation.
- Sauvegarde le modèle fine-tuné.

In [None]:
# Write the model (through the trainer) and the tokenizer to the same
# directory.
save_dir = "../models/fine-tuned-model"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)