### **Machine Translation using a Pretrained Model**

In [8]:
from pytorch_lightning.utilities.types import EVAL_DATALOADERS

''' Import all important Library '''
import pandas as pd
import os
import torch
import pytorch_lightning
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch.nn as nn
from torchmetrics.text import  BLEUScore

In [9]:
''' Root and Dataset Path '''
# The data directory can be overridden through the MT_DATA_DIR environment variable so the
# notebook also runs on machines where the original local path does not exist.
# When the variable is unset, the original location is used unchanged.
Root_dir = os.environ.get(
    'MT_DATA_DIR',
    '/Users/mahadiur/Desktop/Bongodev MLops Projects/Machine Translation using Pretrain Model/Data',
)

# One CSV file per split, all located directly inside Root_dir.
train_path = os.path.join(Root_dir, 'train.csv')
test_path = os.path.join(Root_dir, 'test.csv')
validation_path = os.path.join(Root_dir, 'val.csv')


In [10]:
''' Load Dataset '''
# Read the three splits into DataFrames in one pass (train, test, validation order).
train_dataset, test_dataset, validation_dataset = (
    pd.read_csv(split_path)
    for split_path in (train_path, test_path, validation_path)
)

# Preview the first rows of the training split as the cell output.
train_dataset.head()

Unnamed: 0,en,bn
0,men with orange wristbands perform a dance .,কমলা রিস্টব্যান্ড পড়া লোক নাচছে ।
1,a man with a grey beard is sitting by a window .,ধূসর দাড়িওয়ালা একটি লোক জানালার কাছে বসে আছেন।
2,a dog walking through the water at the ocean .,একটি কুকুর সমুদ্র তীরে পানির মধ্যে হাঁটছে
3,a bike racer in a red jersey is pursued by ano...,দুজন বাইক চালক প্রতিযোগিতা করছে
4,two identical dogs bound across a lush green m...,দুইটি অভিন্ন কুকুর সতেজ তৃণভূমিতে আবদ্ধ


In [11]:
''' Device Check '''
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    Device = torch.device('cuda')
else:
    Device = torch.device('cpu')
print(f'Device: {Device}')

Device: cpu


In [12]:
''' Pretrained Model '''
# Checkpoint identifier that both the tokenizer and the seq2seq model are loaded from.
Fine_Tune_Model_name = 'shhossain/opus-mt-en-to-bn'


In [13]:
''' Tokenizer and Model '''
# Tokenizer and model are both loaded from the same checkpoint name.
Tokenizer = AutoTokenizer.from_pretrained(
    Fine_Tune_Model_name,
)
Fine_Tune_Model = AutoModelForSeq2SeqLM.from_pretrained(
    Fine_Tune_Model_name,
)

### **Data (MT Part-1)**

In [14]:
''' Dataset Class '''
class MTDataset(Dataset):
    """English -> Bangla sentence pairs read from a CSV file.

    The CSV must provide an ``en`` column (source text) and a ``bn`` column
    (target text). Each example is tokenized when it is accessed and is
    padded/truncated to ``max_length`` so all examples share one length.

    Args:
        data_path: Path of the CSV file; read with ``pandas.read_csv``.
        tokenizer: Callable tokenizer applied to both sides. When omitted,
            the notebook-level ``Tokenizer`` is used.
        max_length: Length every encoded sequence is padded or truncated to.
            Defaults to 256.
    """

    def __init__(self, data_path, tokenizer=None, max_length=256):
        super().__init__()
        # Read csv file using pandas
        self.data = pd.read_csv(data_path)
        # Fall back to the notebook-level tokenizer only when none is injected,
        # so the dataset can also be built with a different tokenizer.
        self.tokenizer = Tokenizer if tokenizer is None else tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of sentence pairs in the CSV."""
        return len(self.data)

    def _encode(self, *args, **kwargs):
        """Tokenize one text with the padding/truncation settings shared by both sides."""
        return self.tokenizer(
            *args,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
            **kwargs,
        )

    def __getitem__(self, item):
        """Return the tokenized source/target tensors for the row at position ``item``.

        Returns:
            dict with ``source_encoder_input_ids``, ``source_encoder_attention_mask``,
            ``target_encoder_input_ids`` and ``target_encoder_attention_mask``;
            each value is a 1-D tensor of length ``max_length``.
        """
        row = self.data.iloc[item]
        # English
        source_te = str(row['en'])
        # Bangla
        target_te = str(row['bn'])

        source_encoder = self._encode(source_te)
        # The target sentence must be encoded in target mode: tokenizers with
        # separate source/target vocabularies (e.g. Marian/OPUS-MT) would otherwise
        # tokenize the Bangla text with the English-side model.
        target_encoder = self._encode(text_target=target_te)

        # return_tensors="pt" yields shape (1, max_length); drop only that leading
        # batch dimension. A bare squeeze() would also drop the sequence dimension
        # when max_length == 1.
        return {
            'source_encoder_input_ids': source_encoder['input_ids'].squeeze(0),
            'source_encoder_attention_mask': source_encoder['attention_mask'].squeeze(0),
            'target_encoder_input_ids': target_encoder['input_ids'].squeeze(0),
            'target_encoder_attention_mask': target_encoder['attention_mask'].squeeze(0),
        }


In [15]:
''' DataModule Class '''
class MTDataModule(pytorch_lightning.LightningDataModule):
    """Lightning data module bundling the train / validation / test CSV splits.

    Args:
        train_csv: Path of the training CSV.
        test_csv: Path of the test CSV.
        val_csv: Path of the validation CSV.
        batch_size: Batch size used by every DataLoader.
    """

    def __init__(self, train_csv, test_csv, val_csv, batch_size):
        super().__init__()
        # Only the paths are stored here; the datasets are built in setup().
        self.train = train_csv
        self.test = test_csv
        self.val = val_csv
        self.batch_size = batch_size

    def setup(self, stage = None):
        """Build one MTDataset per split; ``stage`` is accepted but not used."""
        self.train_dataset = MTDataset(self.train)
        self.test_dataset = MTDataset(self.test)
        self.val_dataset = MTDataset(self.val)

    def _build_loader(self, split_dataset, shuffle):
        """Wrap a dataset in a DataLoader using the module-wide batch size."""
        return DataLoader(
            split_dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
        )

    def train_dataloader(self):
        """Training batches, reshuffled each epoch."""
        return self._build_loader(self.train_dataset, True)

    def val_dataloader(self):
        """Validation batches in file order."""
        return self._build_loader(self.val_dataset, False)

    def test_dataloader(self):
        """Test batches in file order."""
        return self._build_loader(self.test_dataset, False)


In [16]:
# The original cell rebound the class name `MTDataModule` to an instance, so running the
# cell a second time failed with "TypeError: 'MTDataModule' object is not callable".
# Recover the class from the instance on re-runs so the cell stays idempotent.
_MTDataModuleClass = MTDataModule if isinstance(MTDataModule, type) else type(MTDataModule)

mt_data_module = _MTDataModuleClass(
    train_csv=train_path,
    test_csv=test_path,
    val_csv=validation_path,
    batch_size=32,
)

# Keep the old name bound to the instance for any later cell that still refers to it.
# Prefer `mt_data_module` in new code.
MTDataModule = mt_data_module