### **Machine Translation using Pretrain Model**

In [29]:
from pytorch_lightning.utilities.types import EVAL_DATALOADERS

''' Import all important Library '''
import pandas as pd
import os
import torch
import pytorch_lightning
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch.nn as nn
from torchmetrics.text import  BLEUScore

In [30]:
''' Root and Dataset Path '''
Root_dir = '/Users/mahadiur/Desktop/Bongodev MLops Projects/Machine Translation using Pretrain Model/Data'

train_path = os.path.join(Root_dir, 'train.csv')
test_path = os.path.join(Root_dir, 'test.csv')
validation_path = os.path.join(Root_dir, 'val.csv')


In [31]:
''' Load Dataset '''
train_dataset = pd.read_csv(train_path)
test_dataset = pd.read_csv(test_path)
validation_dataset = pd.read_csv(validation_path)

train_dataset.head()

Unnamed: 0,en,bn
0,men with orange wristbands perform a dance .,কমলা রিস্টব্যান্ড পড়া লোক নাচছে ।
1,a man with a grey beard is sitting by a window .,ধূসর দাড়িওয়ালা একটি লোক জানালার কাছে বসে আছেন।
2,a dog walking through the water at the ocean .,একটি কুকুর সমুদ্র তীরে পানির মধ্যে হাঁটছে
3,a bike racer in a red jersey is pursued by ano...,দুজন বাইক চালক প্রতিযোগিতা করছে
4,two identical dogs bound across a lush green m...,দুইটি অভিন্ন কুকুর সতেজ তৃণভূমিতে আবদ্ধ


In [32]:
''' Device Check '''
Device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {Device}')

Device: cpu


In [33]:
''' Pretrained Model '''
Fine_Tune_Model_name = "shhossain/opus-mt-en-to-bn"


In [34]:
''' Tokenizer and Model '''
Tokenizer = AutoTokenizer.from_pretrained(Fine_Tune_Model_name)
Fine_Tune_Model = AutoModelForSeq2SeqLM.from_pretrained(Fine_Tune_Model_name)

### **Data(MT Part-1)**

In [35]:
''' Dataset Class '''
class MTDataset(Dataset):
    # Read csv file using pandas
    def __init__(self, data_path):
        super().__init__()
        self.data = pd.read_csv(data_path)
    # Find Dataset Length
    def __len__(self):
        return len(self.data)
    # Ready single example
    def __getitem__(self, item):
        # English
        source_te = str(self.data.iloc[item]['en'])
        # Bangla
        target_te = str(self.data.iloc[item]['bn'])

        # All Token size must be match
        source_encoder = Tokenizer(
            source_te,
            max_length=256,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )

        target_encoder = Tokenizer(
            target_te,
            max_length=256,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )

        # return outputs
        return {
            'source_encoder_input_ids': source_encoder['input_ids'].squeeze(),
            'source_encoder_attention_mask': source_encoder['attention_mask'].squeeze(),
            'target_encoder_input_ids': target_encoder['input_ids'].squeeze(),
            'target_encoder_attention_mask': target_encoder['attention_mask'].squeeze(),
        }


In [36]:
''' DataModule Class '''
class MTDataModule(pytorch_lightning.LightningDataModule):
    # Load Dataset using pandas
    def __init__(self, train_csv, test_csv, val_csv, batch_size):
        super().__init__()
        self.train = train_csv
        self.test = test_csv
        self.val = val_csv
        self.batch_size = batch_size

    # Dataset
    def setup(self, stage = None):
        self.train_dataset = MTDataset(self.train)
        self.test_dataset = MTDataset(self.test)
        self.val_dataset = MTDataset(self.val)

    # train Dataloader
    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
        )
    # validation Dataloader
    def val_dataloader(self):
        return DataLoader(
             self.val_dataset,
            batch_size=self.batch_size,
            shuffle=False,
        )
    # Test Dataloader
    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=False,
        )


In [37]:
Data_Module = MTDataModule(
    train_csv=train_path,
    test_csv=test_path,
    val_csv=validation_path,
    batch_size=32,
)

### **Model (Fine-Tune) (MT Part-2)**

In [38]:
''' Model '''
class MTModel(pytorch_lightning.LightningModule):
    def __init__(self):
        super().__init__()
        # Load Model
        self.model = AutoModelForSeq2SeqLM.from_pretrained(Fine_Tune_Model_name)
        # Load Tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(Fine_Tune_Model_name)
        # learning Rate
        self.lr = 0.001
        # loss func
        self.criterion = nn.CrossEntropyLoss()
        # Bleuscore
        self.blue = BLEUScore()

    def forward(self,sour_input_ids, sour_attention_mask, tar_input_ids, tar_attention_mask):
        outputs = self.model(
            input_ids=sour_input_ids,
            attention_mask=sour_attention_mask,
            decoder_input_ids=tar_input_ids[:, :-1],
            decoder_attention_mask=tar_attention_mask[:, :-1],
        )
        return outputs

    # Training Step
    def training_step(self, batch, batch_idx):
        loss = self.Compute_Loss(batch, batch_idx, 'train')
        self.log('train_loss', loss, prog_bar=True)
        return loss

    # Validation Step
    def validation_step(self, batch, batch_idx):
        loss = self.Compute_Loss(batch, batch_idx, 'val')
        self.log('val_loss', loss, prog_bar=True)
        return loss

    # Test Step
    def test_step(self, batch, batch_idx):
        loss = self.Compute_Loss(batch, batch_idx, 'test')
        self.log('test_loss', loss, prog_bar=True)
        return loss

    # Gradient Descent
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
        scheduler = torch.optim.lr_schedule.CosineAnnealingLR(
            optimizer,
            T_max=10,
        )
        return {'optimizer': optimizer, 'lr_scheduler': scheduler}

    # Compute loss
    def Compute_Loss(self, batch, batch_idx, stage):
        source_input_ids = batch['source_encoder_input_ids']
        source_attention_mask = batch['source_attention_mask']
        target_input_ids = batch['target_encoder_input_ids']
        target_attention_mask = batch['target_attention_mask']

        outputs = self.forward(
            source_input_ids,
            source_attention_mask,
            target_input_ids,
            target_attention_mask
        )

        logits = outputs.logits
        loss = self.criterion(
            logits.view(-1, logits.size(-1)),
            target_input_ids[:,1:].contiguous().view(-1)
        )

        if stage == 'test' or 'val':
            preds = torch.argmax(logits, dim=-1)
            pred_text = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
            target_text = self.tokenizer.batch_decode(target_input_ids, skip_special_tokens=True)
            bleu_score = self.blue(pred_text, [[target] for target in target_text])
            self.log(f'{stage}_bleu_score', bleu_score, prog_bar=True)

        return loss

In [39]:
Model = MTModel()

### **Train (MT Part-3)**

In [41]:
Training_Model = pytorch_lightning.Trainer(
    max_epochs=10,
    accelerator='gpu' if torch.cuda.is_available() else 'cpu',
    devices=1,
    precision=16,
    log_every_n_steps=10,
    val_check_interval=0.25,

)

/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/lightning_fabric/connector.py:571: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/pytorch_lightning/trainer/connectors/accelerator_connector.py:508: You passed `Trainer(accelerator='cpu', precision='16-mixed')` but AMP with fp16 is not supported on CPU. Using `precision='bf16-mixed'` instead.
Using bfloat16 Automatic Mixed Precision (AMP)
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/pytorch_lightning