In [33]:
import pandas as pd
import torch
from transformers import T5Tokenizer,T5ForConditionalGeneration
from torch.utils.data import DataLoader
from nltk.translate.bleu_score import sentence_bleu
import nltk
from torch.optim import AdamW
from transformers import get_scheduler

In [34]:
df=pd.read_csv(r"D:\translation\en-fr.csv",nrows=1000)

In [35]:
df.shape

(1000, 2)

In [36]:
df=df.dropna()

In [37]:
df.head()

Unnamed: 0,en,fr
0,Changing Lives | Changing Society | How It Wor...,Il a transformé notre vie | Il a transformé la...
1,Site map,Plan du site
2,Feedback,Rétroaction
3,Credits,Crédits
4,Français,English


In [99]:
df.iloc[20::]

Unnamed: 0,en,fr
20,Where did we come from?,D'où venons-nous?
21,Are we alone?,Sommes-nous seuls?
22,The lure of these universal enigmas was the sp...,L'attrait exercé par ces énigmes universelles ...
23,Astronomy also plays a much more practical rol...,L'astronomie possède également une dimension b...
24,"Since the time of our earliest ancestors, huma...","Depuis toujours, nous observons les mouvements..."
...,...,...
995,A nova is a star that absorbs matter from a ne...,La matière absorbée finit par réchauffer l'éto...
996,It is a rare and spectacular event.,Il s'agit d'un phénomène rare et plutôt specta...
997,"In 1977, he became the French editor of the Na...","En 1977, il devient l'éditeur francophone du N..."
998,"In 1978, Lemay began the daunting task of asse...","En 1978, il entreprend la tâche colossale de r..."


In [39]:
english_sentences=df['en'].to_list()
french_sentence=df['fr'].to_list()

In [40]:
model_name="t5-base"
model=T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer=T5Tokenizer.from_pretrained(model_name,model_max_length=128)
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
try:
    model.to(device)
except RuntimeError:
    print("Not enough GPU memory! Staying on CPU.")
    device = torch.device('cpu')

In [41]:
input_encodings=tokenizer(english_sentences,padding=True,truncation=True,return_tensors="pt",max_length=128)
target_encodings=tokenizer(french_sentence,padding=True,truncation=True,return_tensors="pt",max_length=128)

In [42]:
class Transalationdataset(torch.utils.data.Dataset):
    def __init__(self,input_encodings,target_encodings):
        self.input_encodings=input_encodings
        self.target_encodings=target_encodings
    def __getitem__(self, idx):
        input_ids=self.input_encodings["input_ids"][idx]
        target_ids=self.target_encodings["input_ids"][idx]
        return {"input_ids":input_ids,"labels":target_ids}
    def __len__(self):
        return len(self.input_encodings["input_ids"])

In [43]:
dataset=Transalationdataset(input_encodings,target_encodings)
train_loader=DataLoader(dataset,batch_size=4,shuffle=True)
optimizer = AdamW(model.parameters(), lr=1e-4)
num_training_steps = len(train_loader) * 3  # 4 epochs
scheduler = get_scheduler("cosine", optimizer, num_warmup_steps=100, num_training_steps=num_training_steps)

In [None]:
model.train()
for epoch in range(5):
    total_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    print(f"Epoch [{epoch + 1}/3], Loss: {total_loss / len(train_loader)}")

# Save fine-tuned model
model.save_pretrained("fine_tuned_t5_model")

Epoch [1/3], Loss: 1.1454964004158974
Epoch [2/3], Loss: 0.3208753889799118
Epoch [3/3], Loss: 0.27298929768800734


In [95]:
tokenizer.save_pretrained("t5_model")

('t5_model\\tokenizer_config.json',
 't5_model\\special_tokens_map.json',
 't5_model\\spiece.model',
 't5_model\\added_tokens.json')

In [89]:
import random
import numpy as np

In [91]:
torch.manual_seed(0)
random.seed(0)
np.random.seed(0)
torch.use_deterministic_algorithms(False)
def translate(sentence):
    sentence = "translate English to French: " + sentence
    
    encoded = tokenizer(sentence, return_tensors="pt").to(device)

    with torch.no_grad():
        output = model.generate(
            encoded.input_ids,
            max_length=128,
            num_beams=4,
            do_sample=False,
            early_stopping=True,
            temperature=1.0
        )

    return tokenizer.decode(output[0], skip_special_tokens=True)


In [96]:
print(translate("Français"))
print(translate("Good morning, how are you?"))


English
Bonjour, et à quel point êtes-vous bien?
