In [1]:
from typing import Any

from pytorch_lightning.utilities.types import STEP_OUTPUT

""" Class 25 | Project 2 | Machine Translation using Pretrained Model

Objectives:
1. End-to-end machine translation training pipeline
2. Fine-tune a pre-trained model for the custom dataset
"""

import pytorch_lightning as pl
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from torchmetrics.text import BLEUScore
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [3]:
"""Task: English to Bangla """

mt_pretrained_model_name = "shhossain/opus-mt-en-to-bn"

In [4]:
""" For NLP tasks, we basically need two entities:
1. Tokenizer
2. Model
"""

tokenizer = AutoTokenizer.from_pretrained(mt_pretrained_model_name)
mt_pretrained_model = AutoModelForSeq2SeqLM.from_pretrained(mt_pretrained_model_name)



In [8]:
print(mt_pretrained_model.forward) #downloded model e ki ki layer ase seta check kortesi

<bound method MarianMTModel.forward of MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(61760, 512, padding_idx=61759)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(61760, 512, padding_idx=61759)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_l

In [10]:
import inspect
print(inspect.signature(mt_pretrained_model.forward))# downloaded model er input parameter ki ki ase seta dekhtesi


(input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, decoder_input_ids: Optional[torch.LongTensor] = None, decoder_attention_mask: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, decoder_head_mask: Optional[torch.Tensor] = None, cross_attn_head_mask: Optional[torch.Tensor] = None, encoder_outputs: Union[tuple[torch.Tensor], transformers.modeling_outputs.BaseModelOutput, NoneType] = None, past_key_values: Optional[transformers.cache_utils.Cache] = None, inputs_embeds: Optional[torch.FloatTensor] = None, decoder_inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, cache_position: Optional[torch.Tensor] = None) -> transformers.modeling_outputs.Seq2SeqLMOutput


In [12]:
mt_pretrained_model.prepare_inputs_for_generation()

TypeError: GenerationMixin.prepare_inputs_for_generation() missing 1 required positional argument: 'input_ids'

In [2]:
mt_pretrained_model.config

NameError: name 'mt_pretrained_model' is not defined

# Data

In [5]:
"""
Sentence: How are you, dude?
Tokens: 'How', 'are', 'you', 'dude?'
ids: 125, 14, 145, 78
max_length = 3
ids: [125, 14, 145]
"""

class MTDataset(Dataset):
    def __init__(self, csv_file):
        self.data = pd.read_csv(csv_file)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        src_text = str(self.data.iloc[idx]['en'])
        tgt_text = str(self.data.iloc[idx]['bn'])

        src_encoding = tokenizer(
            src_text,
            max_length=128,
            padding='max_length',#max length theke choto hole 0 diye padding kore dibe
            truncation=True,# maximum length theke boro hole kete dibe
            return_tensors='pt',# pytorch er tensor datatype hisebe return korbe
        )

        tgt_encoding = tokenizer(
            tgt_text,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'src_input_ids': src_encoding['input_ids'].squeeze(),
            'src_attention_mask': src_encoding['attention_mask'].squeeze(),
            'tgt_input_ids': tgt_encoding['input_ids'].squeeze(),
            'tgt_attention_mask': tgt_encoding['attention_mask'].squeeze()
        }

"""
example: How are you, dude?
input_ids: 125, 14, 145, 78
max_length = 7
input_ids: [125, 14, 145, 147, 0, 0, 0]
attention_mask: [1, 1, 1, 1, 0, 0, 0] #jei value gula te 1 thake segulo hocce real token ar jegulate o thake segulo hocche padded token
"""

'\nexample: How are you, dude?\ninput_ids: 125, 14, 145, 78\nmax_length = 7\ninput_ids: [125, 14, 145, 147, 0, 0, 0]\nattention_mask: [1, 1, 1, 1, 0, 0, 0] #jei value gula te 1 thake segulo hocce real token ar jegulate o thake segulo hocche padded token\n'

In [6]:
class MTDataModule(pl.LightningDataModule):
    def __init__(self, train_csv, val_csv, test_csv, batch_size=32):
        super().__init__()
        self.train_csv = train_csv
        self.val_csv = val_csv
        self.test_csv = test_csv
        self.batch_size = batch_size

    def setup(self, stage=None):#sobar age ei setup ta call hoy
        self.train_dataset = MTDataset(self.train_csv)
        self.val_dataset = MTDataset(self.val_csv)
        self.test_dataset = MTDataset(self.test_csv)

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True
        )

    def val_dataloader(self):
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            shuffle=False
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=False
        )

In [12]:
data_module = MTDataModule(
    train_csv=r'train.csv',
    val_csv=r'val.csv',
    test_csv=r'test.csv',
    batch_size=32
)

# Model

In [8]:
class MTModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        # load pretrained model
        self.model = AutoModelForSeq2SeqLM.from_pretrained(mt_pretrained_model_name)
        # load pretrained tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(mt_pretrained_model_name)
        # learning rate
        self.learning_rate = 2e-5
        # loss function
        self.loss_fn = nn.CrossEntropyLoss(
            ignore_index=self.tokenizer.pad_token_id
        )
        # evaluation metric
        self.bleu = BLEUScore()#machine translation ta koto valo kaj kore seta check korar jonnno use kora hoy BLEUScore.jemon classification er jonno use kora hoy accuracy

    def forward(self,
                src_input_ids,
                src_attention_mask,
                tgt_input_ids,
                tgt_attention_mask
        ):
        outputs = self.model(
            input_ids=src_input_ids,
            attention_mask=src_attention_mask,
            decoder_input_ids=tgt_input_ids[:, :-1],
            decoder_attention_mask=tgt_attention_mask[:, :-1]
        )
        return outputs

    def training_step(self, batch, batch_idx):
        loss = self.compute_loss(batch, batch_idx, 'train')
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        loss = self.compute_loss(batch, batch_idx, 'val')
        self.log('val_loss', loss, prog_bar=True)
        return loss

    def test_step(self, batch, batch_idx):
        loss = self.compute_loss(batch, batch_idx, 'test')
        self.log('test_loss', loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=10
        )
        return {'optimizer': optimizer, 'lr_scheduler': scheduler}

    def compute_loss(self, batch, batch_idx, stage):
        src_input_ids = batch['src_input_ids']
        src_attention_mask = batch['src_attention_mask']
        tgt_input_ids = batch['tgt_input_ids']
        tgt_attention_mask = batch['tgt_attention_mask']

        outputs = self(
            src_input_ids,
            src_attention_mask,
            tgt_input_ids,
            tgt_attention_mask
        )
        """CrossEntropy needs input shape(N,C) where N=number of tokens=number of batch*number of tokens in each batch"""
        logits = outputs.logits
        loss = self.loss_fn(
            logits.view(-1, logits.size(-1)),
            tgt_input_ids[:, 1:].contiguous().view(-1)
        )

        if stage == 'val' or stage == 'test':
            preds = torch.argmax(logits, dim=-1)
            pred_texts = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
            tgt_texts = self.tokenizer.batch_decode(tgt_input_ids[:, 1:], skip_special_tokens=True)
            bleu_score = self.bleu(pred_texts, [[tgt] for tgt in tgt_texts])
            self.log(f'{stage}_bleu', bleu_score, prog_bar=True)

        return loss


In [9]:
model = MTModel()

# Train

In [2]:
trainer = pl.Trainer(
    max_epochs=2,
    accelerator='gpu' if torch.cuda.is_available() else 'cpu',
    devices=1,
    precision=16,
    log_every_n_steps=10,
    val_check_interval=0.25
)

NameError: name 'pl' is not defined

In [None]:
# trainer.fit(model, data_module)

C:\Users\User\Downloads\60 days of python\day-38(Aspect base sentiment analysis)\.venv\Lib\site-packages\pytorch_lightning\utilities\model_summary\model_summary.py:242: Precision bf16-mixed is not supported by the model summary.  Estimated model size in MB will not be accurate. Using 32 bits instead.


In [1]:
trainer.test(model, data_module)

NameError: name 'trainer' is not defined

In [None]:
model.model.config

NameError: name 'model' is not defined

In [None]:
for name, module in model.model.named_modules():
    print(name)

NameError: name 'model' is not defined