In [6]:
import pandas as pd
import matplotlib.pyplot as plt

chemin_fichier_csv = 'DBfinal.csv'
dataset = pd.read_csv(chemin_fichier_csv)

In [7]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(dataset, test_size=0.01)

print(f"Taille de l'ensemble d'entraînement: {len(train)}")
print(f"Taille de l'ensemble de test: {len(test)}")


Taille de l'ensemble d'entraînement: 117203
Taille de l'ensemble de test: 1184


In [8]:
from transformers import MarianTokenizer

model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_name)

In [9]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(train)
test_dataset = Dataset.from_pandas(test)

In [10]:
def preprocess_function(examples):
    
    inputs = tokenizer(examples["eng"], max_length=35, truncation=True, padding="max_length")
    targets = tokenizer(examples["fr"], max_length=35, truncation=True, padding="max_length")
    return {"input_ids": inputs.input_ids, "attention_mask": inputs.attention_mask, "labels": targets.input_ids}

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/117203 [00:00<?, ? examples/s]

Map:   0%|          | 0/1184 [00:00<?, ? examples/s]

In [13]:
from transformers import MarianMTModel, MarianConfig, TrainingArguments, Trainer
import torch

config = MarianConfig.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name, config=config)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  
print(f"Le modèle utilise : {device}")

training_args = TrainingArguments(
    output_dir="./resultsENG",
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    eval_strategy="epoch",  
    save_strategy="epoch",  
    save_total_limit=3,  
    no_cuda=not torch.cuda.is_available(),  
    dataloader_num_workers=4,  
    dataloader_prefetch_factor=2  
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,  
    eval_dataset=tokenized_test_dataset     
)

trainer.train()



Le modèle utilise : cuda


Epoch,Training Loss,Validation Loss
1,0.3485,0.277609
2,0.2506,0.237238
3,0.1968,0.221928
4,0.1652,0.21634


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}


TrainOutput(global_step=29304, training_loss=0.30409445008911334, metrics={'train_runtime': 8287.2016, 'train_samples_per_second': 56.571, 'train_steps_per_second': 3.536, 'total_flos': 4345457883217920.0, 'train_loss': 0.30409445008911334, 'epoch': 4.0})

In [14]:
tokenizer.save_pretrained('resultsENG\checkpoint-29304')

('resultsENG\\checkpoint-29304\\tokenizer_config.json',
 'resultsENG\\checkpoint-29304\\special_tokens_map.json',
 'resultsENG\\checkpoint-29304\\vocab.json',
 'resultsENG\\checkpoint-29304\\source.spm',
 'resultsENG\\checkpoint-29304\\target.spm',
 'resultsENG\\checkpoint-29304\\added_tokens.json')

In [1]:
from transformers import MarianMTModel, MarianTokenizer

model_path = "resultsENG\checkpoint-29304"
model = MarianMTModel.from_pretrained(model_path)
tokenizer = MarianTokenizer.from_pretrained(model_path)


def translate(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(model.device)
    translated_tokens = model.generate(**inputs)
    translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
    return translated_text



source_text = "At the bar, she had the time of her life because she met someone really good"
translated_text = translate(source_text, model, tokenizer)
print(translated_text)   

Au bar, elle a passé le meilleur moment de sa vie car elle a vraiment rencontré quelqu'un de


In [35]:

model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)


def translate(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    outputs = model.generate(**inputs)
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_text


text = "It is the most famous painting in the world, and yet, when viewers manage to see"
translated_text = translate(text)
print("Translated text:", translated_text)


Translated text: C'est la peinture la plus célèbre au monde, et pourtant, quand les téléspectateurs parviennent à voir


In [None]:
"It is the most famous painting in the world, and yet, when viewers manage to see"