In [11]:
!pip install transformers datasets torch


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name = "google/flan-t5-small"  # Change si tu utilises un autre modèle
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [33]:
from datasets import load_dataset

dataset = load_dataset("gsm8k","socratic", split="train")  # Change "imdb" si tu as un autre dataset


Generating train split: 100%|██████████| 7473/7473 [00:00<00:00, 124308.57 examples/s]
Generating test split: 100%|██████████| 1319/1319 [00:00<00:00, 68660.09 examples/s]


In [40]:
print(dataset.select(range(5)))
# Afficher les premières lignes avec les valeurs des colonnes 'question' et 'answer'
for i in range(5):
    print(f"Question: {dataset['question'][i]}")
    print(f"Answer: {dataset['answer'][i]}")
    print("\n---\n")


Dataset({
    features: ['question', 'answer'],
    num_rows: 5
})
Question: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
Answer: How many clips did Natalia sell in May? ** Natalia sold 48/2 = <<48/2=24>>24 clips in May.
How many clips did Natalia sell altogether in April and May? ** Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72

---

Question: Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?
Answer: How much does Weng earn per minute? ** Weng earns 12/60 = $<<12/60=0.2>>0.2 per minute.
How much did Weng earn? ** Working 50 minutes, she earned 0.2 x 50 = $<<0.2*50=10>>10.
#### 10

---

Question: Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as muc

In [41]:
from transformers import AutoTokenizer

# Charger le tokenizer pour le modèle T5
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")

# Fonction de prétraitement
def preprocess_data(examples):
    # Les questions sont les entrées, les réponses sont les cibles
    inputs = [q.strip() for q in examples['question']]
    targets = [a.strip() for a in examples['answer']]

    # Tokenisation avec padding et troncature
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=256, truncation=True, padding="max_length").input_ids

    # Ajouter les labels aux entrées
    model_inputs["labels"] = labels
    return model_inputs

# Appliquer la fonction de prétraitement
tokenized_dataset = dataset.map(preprocess_data, batched=True)


Map: 100%|██████████| 7473/7473 [00:02<00:00, 3697.75 examples/s]


In [42]:
from transformers import DataCollatorForSeq2Seq

# Créer un collator pour gérer le padding dynamique
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


In [46]:
from datasets import DatasetDict

# Diviser le dataset en 80% pour l'entraînement et 20% pour la validation
split_dataset = tokenized_dataset.train_test_split(test_size=0.2)
dataset = DatasetDict({
    'train': split_dataset['train'],
    'validation': split_dataset['test']
})


In [48]:
from transformers import TrainingArguments, Trainer

# Définir les arguments d'entraînement
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=data_collator
)

# Lancer l'entraînement
trainer.train()



Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,1.1371,0.602876
2,0.6998,0.570264
3,0.6747,0.563626


TrainOutput(global_step=4485, training_loss=1.3061746960898306, metrics={'train_runtime': 997.1772, 'train_samples_per_second': 17.985, 'train_steps_per_second': 4.498, 'total_flos': 1666880155680768.0, 'train_loss': 1.3061746960898306, 'epoch': 3.0})

In [50]:
# Sauvegarder le modèle fine-tuné
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/tokenizer.json')

In [51]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Charger le modèle fine-tuné et le tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained("./fine_tuned_model")
tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_model")


In [52]:
def generate_answer(question):
    # Tokeniser la question en entrée
    inputs = tokenizer(question, return_tensors="pt", truncation=True, padding=True)
    
    # Générer la réponse avec le modèle
    outputs = model.generate(**inputs, max_length=150, num_beams=4, early_stopping=True)
    
    # Décoder la réponse en texte
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer


In [61]:
# Exemples de questions
questions = ["Sarah has 84 marbles and wants to divide them equally among her 7 friends. How many marbles does each friend get?"
     
]

# Afficher les réponses du modèle pour chaque question
for question in questions:
    print(f"Question: {question}")
    print(f"Answer: {generate_answer(question)}")
    print("\n---\n")


Question: Sarah has 84 marbles and wants to divide them equally among her 7 friends. How many marbles does each friend get?
Answer: How many marbles does Sarah have? ** Sarah has 84 x 7 = 84*7=84>>84 marbles. How many marbles does each friend get? ** Each friend gets 84 x 7 = 84*7=84>>84 marbles. #### 84

---



In [6]:
!pip install accelerate


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting accelerate
  Downloading accelerate-1.1.0-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.1.0-py3-none-any.whl (333 kB)
Installing collected packages: accelerate
Successfully installed accelerate-1.1.0


In [8]:
!pip install 'transformers[torch]'


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [10]:
!pip install "accelerate>=0.26.0"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [15]:
pip install 'accelerate>={ACCELERATE_MIN_VERSION}'

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[31mERROR: Invalid requirement: 'accelerate>={ACCELERATE_MIN_VERSION}': Expected end or semicolon (after name and no valid version specifier)
    accelerate>={ACCELERATE_MIN_VERSION}
              ^[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [60]:
print(tokenized_dataset[0])


{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?', 'answer': 'How many clips did Natalia sell in May? ** Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nHow many clips did Natalia sell altogether in April and May? ** Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72', 'input_ids': [9267, 5434, 1916, 16234, 12, 4678, 13, 160, 803, 16, 1186, 6, 11, 258, 255, 1916, 985, 38, 186, 16234, 16, 932, 5, 571, 186, 16234, 410, 9267, 5434, 1789, 16889, 16, 1186, 11, 932, 58, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,