<a href="https://colab.research.google.com/github/Karabi-codehub/Machine-Translation-project-En_to_BN-/blob/main/Capstone_Project_Machine_Translation(English_to_Bangla)_using_Pretrained_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
"""
-For type hints
-Any means a variable or return type can be any data type (string, int, tensor, etc.).
"""
from typing import Any

import os

!pip install -U mlflow
import mlflow
from mlflow.models import infer_signature


"""
-Defines step output types for Lightning training
-In PyTorch Lightning, some functions (like training_step, validation_step, test_step) return outputs that can be different types
-Lightning gives you a ready-made type alias called STEP_OUTPUT.
           -STEP_OUTPUT = shorthand type hint for “whatever the training/validation/test step is allowed to return in Lightning.”
           -PyTorch Lightning makes writing and training deep learning models easier and cleaner.
           -It handles things like training loops, validation, logging, GPU/TPU support, etc., so you don’t have to write them manually.
"""
!pip install pytorch-lightning
from pytorch_lightning.utilities.types import STEP_OUTPUT


"""
-High-level training framework to simplify PyTorch code
"""
import pytorch_lightning as pl

from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping


"""
-Core PyTorch library for tensors and computations
"""
import torch


"""
-Build neural network layers and models
"""
import torch.nn as nn


"""
-Handle custom datasets and batch loading
"""
from torch.utils.data import Dataset, DataLoader


"""
-Load and preprocess dataset (CSV, Excel, etc.)
"""
import pandas as pd


"""
-BLEUScore measures how good a machine translation is by comparing it to reference translations.
-BLEU (Bilingual Evaluation Understudy): a metric that checks how closely your model’s translations match the correct/reference translations.
-torchmetrics.text.BLEUScore: makes it easy to calculate BLEU in PyTorch projects.
-In short: BLEU tells how closely your translations match the reference translations.
"""
from torchmetrics.text import BLEUScore


"""
-T5Tokenizer: Converts text into numbers (tokens) the model can understand.
-AutoModelForSeq2SeqLM: Pretrained sequence-to-sequence model for tasks like translation or summarization.
-Tokenizer prepares text, Model translates or generates text.
"""
%pip install -q transformers datasets sentencepiece
from transformers import T5Tokenizer, AutoModelForSeq2SeqLM


!pip install -U mlflow pyngrok pytorch-lightning gradio sacremoses


Collecting mlflow
  Downloading mlflow-3.4.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==3.4.0 (from mlflow)
  Downloading mlflow_skinny-3.4.0-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.4.0 (from mlflow)
  Downloading mlflow_tracing-3.4.0-py3-none-any.whl.metadata (19 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting fastmcp<3,>=2.0.0 (from mlflow)
  Downloading fastmcp-2.12.4-py3-none-any.whl.metadata (19 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.4.0->mlflow)
  Downloading databricks_sdk-0.67.0-py3-none-any.whl.metadata (39 kB)
Collecting opentelemetry-proto<3,>=1.9.0 (from mlflow-skinny==3.4.0->mlflow)
  Downloading opentelemetry_proto-1.37.0-py3-none-any.w

In [2]:
"""
-torch.cuda.is_available() → checks if your computer has a GPU ready.
-"cuda" → runs computations on GPU (faster).
-"cpu" → runs on the processor if no GPU is available.
-torch.device(...) → tells PyTorch where to run your model and tensors.
"""
# Prefer the CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device  # last expression of the cell, so the notebook echoes the chosen device

device(type='cuda')

In [3]:
# Hugging Face Hub id of the pretrained checkpoint. This name is passed to every
# `from_pretrained` call in the notebook (tokenizer and seq2seq model).
mt_pretrained_model_name = "csebuetnlp/banglat5_nmt_en_bn"

In [4]:
from transformers import T5Tokenizer
# Module-level tokenizer for the pretrained checkpoint. MTDataset reads this
# global by default when it encodes the English and Bangla columns.
tokenizer = T5Tokenizer.from_pretrained(mt_pretrained_model_name)


"""
✅ AutoModelForSeq2SeqLM is:Automatic Model for Sequence-to-Sequence Language Modeling

Let’s break it down:

AutoModel → “Auto” means it automatically picks the right pretrained model architecture (you just give the model name,
             e.g., "t5-small", "facebook/mbart-large-50", etc.).
Seq2Seq → stands for Sequence-to-Sequence (input sequence → output sequence).
          Used in tasks like translation, summarization, and text generation.
"""

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/766 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


'\n✅ AutoModelForSeq2SeqLM is:Automatic Model for Sequence-to-Sequence Language Modeling\n\nLet’s break it down:\n\nAutoModel → “Auto” means it automatically picks the right pretrained model architecture (you just give the model name,\n             e.g., "t5-small", "facebook/mbart-large-50", etc.).\nSeq2Seq → stands for Sequence-to-Sequence (input sequence → output sequence).\n          Used in tasks like translation, summarization, and text generation.\n'

In [5]:
"""A custom dataset class for Machine Translation (MT)."""
#MTDataset → Loads data from a CSV file (your custom dataset).
MAX_LENGTH = 128  # token length every source/target sequence is padded or truncated to


class MTDataset(Dataset):
    """Parallel English→Bangla dataset backed by a CSV file.

    The CSV must contain an ``en`` column (source text) and a ``bn`` column
    (target text). Rows where either side is missing are dropped when the file
    is loaded; otherwise ``str(NaN)`` would turn them into the literal training
    text ``"nan"``.

    Args:
        csv_file: Path (or buffer) accepted by ``pandas.read_csv``.
        tokenizer: Optional tokenizer callable. When omitted, the module-level
            ``tokenizer`` is looked up each time an item is fetched.
        max_length: Length used for padding/truncation (defaults to ``MAX_LENGTH``).

    Raises:
        ValueError: If the CSV does not contain both required columns.
    """

    SRC_COLUMN = 'en'
    TGT_COLUMN = 'bn'

    def __init__(self, csv_file, tokenizer=None, max_length=MAX_LENGTH):
        frame = pd.read_csv(csv_file)
        required = [self.SRC_COLUMN, self.TGT_COLUMN]
        absent = [name for name in required if name not in frame.columns]
        if absent:
            # Fail at construction time instead of on the first __getitem__ call.
            raise ValueError(f"CSV is missing required column(s): {absent}")

        # Drop incomplete pairs and renumber so positional indexing stays dense.
        self.data = frame.dropna(subset=required).reset_index(drop=True)
        self._tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of complete sentence pairs."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize one sentence to fixed-length tensors with a leading batch dim of 1."""
        # Resolve the global lazily so a tokenizer defined after this class still works.
        active_tokenizer = self._tokenizer if self._tokenizer is not None else tokenizer
        return active_tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',   # pad short sentences up to max_length
            truncation=True,        # cut sentences longer than max_length
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Return token ids and attention masks for the pair at position ``idx``."""
        row = self.data.iloc[idx]
        src_encoding = self._encode(str(row[self.SRC_COLUMN]))
        tgt_encoding = self._encode(str(row[self.TGT_COLUMN]))

        # squeeze(0) removes the batch dimension added by return_tensors='pt';
        # the DataLoader re-adds it when it stacks samples into a batch.
        return {
            'src_input_ids': src_encoding['input_ids'].squeeze(0),
            'src_attention_mask': src_encoding['attention_mask'].squeeze(0),  # 1 = real token, 0 = padding
            'tgt_input_ids': tgt_encoding['input_ids'].squeeze(0),
            'tgt_attention_mask': tgt_encoding['attention_mask'].squeeze(0)
        }

"""
example: How are you, dude?
input_ids (before padding): 125, 14, 145, 78   (illustrative ids)
max_length = 7
input_ids (after padding): [125, 14, 145, 78, 0, 0, 0]
attention_mask: [1, 1, 1, 1, 0, 0, 0] → 1 = real token, 0 = [PAD]; src_attention_mask tells the model to ignore the [PAD] tokens.

"""


'\nexample: How are you, dude?\ninput_ids: 125, 14, 145, 78\nmax_length = 7\ninput_ids: [125, 14, 145, 147, 0, 0, 0]\nattention_mask: [1, 1, 1, 1, 0, 0, 0],src_attention_mask → Mask to ignore [PAD] tokens.\n\n'

In [6]:
# DataModule definition
class MTDataModule(pl.LightningDataModule):
    """Lightning data module that exposes three CSV splits as DataLoaders.

    Args:
        train_csv: CSV path for the training split.
        val_csv: CSV path for the validation split.
        test_csv: CSV path for the test split.
        batch_size: Number of samples per batch, shared by all three loaders.
    """

    def __init__(self, train_csv, val_csv, test_csv, batch_size=32):
        super().__init__()
        self.train_csv, self.val_csv, self.test_csv = train_csv, val_csv, test_csv
        self.batch_size = batch_size

    def setup(self, stage=None):
        """Build an MTDataset per split; ``stage`` is accepted but not consulted."""
        self.train_dataset = MTDataset(self.train_csv)
        self.val_dataset = MTDataset(self.val_csv)
        self.test_dataset = MTDataset(self.test_csv)

    def _loader(self, dataset, shuffle):
        """Wrap ``dataset`` in a DataLoader using this module's batch size."""
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=shuffle)

    def train_dataloader(self):
        """Training loader; shuffled so the sample order differs every epoch."""
        return self._loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Validation loader; fixed order."""
        return self._loader(self.val_dataset, shuffle=False)

    def test_dataloader(self):
        """Test loader; fixed order."""
        return self._loader(self.test_dataset, shuffle=False)

# Build the data module from the three split CSVs (paths relative to the
# working directory), batching 32 sentence pairs at a time.
data_module = MTDataModule('train.csv', 'val.csv', 'test.csv', batch_size=32)

In [7]:
# Machine Translation Model
class MTModel(pl.LightningModule):
    """Lightning wrapper that fine-tunes a pretrained seq2seq translation model.

    Training uses teacher forcing: the decoder input is the target sequence
    shifted one position to the right behind the decoder start token, and the
    unshifted target sequence is the label. Every target token, including the
    first, is therefore a prediction target, which matches how generation
    starts from the decoder start token.

    Args:
        learning_rate: AdamW learning rate (kept small because this is fine-tuning).
    """

    def __init__(self, learning_rate=1e-5):
        super().__init__()

        # Pretrained seq2seq model and its matching tokenizer.
        self.model = AutoModelForSeq2SeqLM.from_pretrained(mt_pretrained_model_name)
        self.tokenizer = T5Tokenizer.from_pretrained(mt_pretrained_model_name)

        self.learning_rate = learning_rate

        # Padding positions are excluded from the loss.
        self.loss_fn = nn.CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)

        # BLEU between decoded predictions and decoded targets (val/test only).
        self.bleu = BLEUScore()

    def _shift_targets_right(self, tgt_input_ids):
        """Return decoder inputs: start token followed by all but the last target token."""
        start_id = self.model.config.decoder_start_token_id
        if start_id is None:
            # Fall back to the pad id when the config does not define a start token.
            start_id = self.tokenizer.pad_token_id
        start_column = tgt_input_ids.new_full((tgt_input_ids.size(0), 1), start_id)
        return torch.cat([start_column, tgt_input_ids[:, :-1]], dim=1)

    def forward(self,
                src_input_ids,        # English tokens
                src_attention_mask,   # Mask for English (ignore PAD tokens)
                tgt_input_ids,        # Bangla tokens
                tgt_attention_mask    # Mask for Bangla
        ):
        """Run a teacher-forced pass and return the Hugging Face model output.

        The output's ``logits`` have one position per target token: position
        ``t`` is the prediction for ``tgt_input_ids[:, t]``.
        """
        decoder_input_ids = self._shift_targets_right(tgt_input_ids)
        # Shift the mask the same way; the prepended start token is always attended.
        decoder_attention_mask = torch.cat(
            [
                tgt_attention_mask.new_ones((tgt_attention_mask.size(0), 1)),
                tgt_attention_mask[:, :-1],
            ],
            dim=1,
        )
        outputs = self.model(
            input_ids=src_input_ids,
            attention_mask=src_attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask
        )
        return outputs

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self.compute_loss(batch, batch_idx, 'train')
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss (BLEU is logged inside compute_loss)."""
        loss = self.compute_loss(batch, batch_idx, 'val')
        self.log('val_loss', loss, prog_bar=True)
        return loss

    def test_step(self, batch, batch_idx):
        """Compute and log the test loss (BLEU is logged inside compute_loss)."""
        loss = self.compute_loss(batch, batch_idx, 'test')
        self.log('test_loss', loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Return AdamW plus a cosine-annealing learning-rate schedule."""
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)

        # T_max is the number of scheduler steps over which the LR anneals from
        # its initial value down to the minimum.
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=10
        )

        return {'optimizer': optimizer, 'lr_scheduler': scheduler}

    def compute_loss(self, batch, batch_idx, stage):
        """Shared loss computation for train/val/test.

        Args:
            batch: Dict with ``src_input_ids``, ``src_attention_mask``,
                ``tgt_input_ids`` and ``tgt_attention_mask``.
            batch_idx: Index of the batch (unused).
            stage: ``'train'``, ``'val'`` or ``'test'``; BLEU is computed and
                logged as ``{stage}_bleu`` for the latter two.

        Returns:
            The cross-entropy loss over non-padding target tokens.
        """
        src_input_ids = batch['src_input_ids']
        src_attention_mask = batch['src_attention_mask']
        tgt_input_ids = batch['tgt_input_ids']
        tgt_attention_mask = batch['tgt_attention_mask']

        outputs = self(
            src_input_ids,
            src_attention_mask,
            tgt_input_ids,
            tgt_attention_mask
        )
        logits = outputs.logits

        # forward() already shifted the decoder inputs, so the labels are the
        # unshifted target ids and logits/labels line up position by position.
        labels = tgt_input_ids
        loss = self.loss_fn(
            logits.reshape(-1, logits.size(-1)),
            labels.reshape(-1)
        )

        if stage in ('val', 'test'):
            pad_id = self.tokenizer.pad_token_id
            preds = torch.argmax(logits, dim=-1)
            # Blank out predictions at padded label positions; whatever the
            # model emits there would otherwise be decoded into the hypothesis.
            preds = preds.masked_fill(labels == pad_id, pad_id)

            pred_texts = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
            tgt_texts = self.tokenizer.batch_decode(labels, skip_special_tokens=True)

            # BLEUScore expects one list of reference strings per hypothesis.
            bleu_score = self.bleu(pred_texts, [[tgt] for tgt in tgt_texts])
            self.log(f'{stage}_bleu', bleu_score, prog_bar=True)

        return loss



In [None]:
# Initialize Machine Translation model with the default learning rate (1e-5).
# Construction loads the pretrained seq2seq model and tokenizer via `from_pretrained`.
model = MTModel()

In [None]:
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
import os

# Stop training once `val_loss` has failed to improve for `patience`
# consecutive validation checks. Loss is a lower-is-better quantity.
early_stopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=2,
    verbose=True
)

# Keep only the single best checkpoint. For a loss the mode must be 'min';
# with 'max' the callback would keep the checkpoint with the HIGHEST
# validation loss, i.e. the worst model seen.
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    save_top_k=1,
    mode='min'
)

# NOTE(review): this path is not passed to ModelCheckpoint, so Lightning still
# writes checkpoints to its default location; only the directory is created here.
checkpoint_path = os.path.join(
    os.getcwd(), "checkpoints", "best_model.pth"
)
# exist_ok=True already makes this a no-op when the directory exists.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)


In [None]:

# Training driver: at most 2 epochs on one device (GPU when CUDA is visible,
# otherwise CPU), 32-bit precision, metrics logged every 10 steps, and
# validation run every quarter of a training epoch.
trainer = pl.Trainer(
    callbacks=[checkpoint_callback, early_stopping],
    val_check_interval=0.25,
    log_every_n_steps=10,
    precision=32,
    devices=1,
    accelerator='gpu' if torch.cuda.is_available() else 'cpu',
    max_epochs=2,
)


In [None]:
# Sanity check: `model` and `tokenizer` must already exist in the kernel
# before the save cells are run.
try:
    print("model:", type(model))
    print("tokenizer:", type(tokenizer))
except NameError:
    print("❌ Model or tokenizer not found — re-run your training or loading code first.")
else:
    print("✅ Both found — ready to save.")


In [None]:
import os
import torch
import shutil
from google.colab import drive

# NOTE: this cell exports whatever weights `model` holds at the moment it runs.
# In this notebook the `trainer.fit` call lives in a later cell, so run this
# cell after training if the fine-tuned weights are what you want to keep.

# Make Google Drive available under /content/drive.
drive.mount('/content/drive')

# 1) Local folder that receives the Hugging Face-format export.
save_dir = "/content/mt_model"
os.makedirs(save_dir, exist_ok=True)

# 2) Hugging Face export: the inner HF model (not the Lightning wrapper) + tokenizer.
model.model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

# 3) Plain PyTorch state dict of the inner HF model.
torch.save(model.model.state_dict(), "/content/mt_model_weights.pt")

# 4) Zip the export folder; this produces /content/mt_model.zip.
shutil.make_archive("/content/mt_model", 'zip', save_dir)

# 5) Move both artifacts into Google Drive.
for local_path, drive_path in (
    ("/content/mt_model.zip", "/content/drive/MyDrive/mt_model.zip"),
    ("/content/mt_model_weights.pt", "/content/drive/MyDrive/mt_model_weights.pt"),
):
    shutil.move(local_path, drive_path)

print("✅ Model folder and weights saved successfully to Google Drive!")
print("📁 Check your Google Drive → MyDrive → mt_model.zip and mt_model_weights.pt")


In [None]:
import shutil
import os
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Names of the artifacts that actually reached Google Drive. The closing
# message is derived from this list so it cannot claim success after a failure.
saved_files = []

# Zip the export folder only if it exists (path is relative to the working directory).
if os.path.exists("mt_model"):
    shutil.make_archive("mt_model", 'zip', "mt_model")
    print("✅ Folder zipped successfully.")
else:
    print("❌ Folder 'mt_model' not found — check your path.")

# Move the archive to Google Drive.
if os.path.exists("mt_model.zip"):
    shutil.move("mt_model.zip", "/content/drive/MyDrive/mt_model.zip")
    saved_files.append("mt_model.zip")
    print("✅ mt_model.zip saved to Google Drive.")
else:
    print("⚠️ mt_model.zip not found after zipping.")

# Move the raw weights too, if they are present.
if os.path.exists("mt_model_weights.pt"):
    shutil.move("mt_model_weights.pt", "/content/drive/MyDrive/mt_model_weights.pt")
    saved_files.append("mt_model_weights.pt")
    print("✅ mt_model_weights.pt saved to Google Drive.")
else:
    print("⚠️ mt_model_weights.pt not found.")

# Summarize what really happened instead of printing success unconditionally.
if len(saved_files) == 2:
    print("✅ Files saved to your Google Drive (MyDrive folder).")
elif saved_files:
    print(f"⚠️ Only {', '.join(saved_files)} was saved to Google Drive — see the warnings above.")
else:
    print("❌ Nothing was saved to Google Drive — see the messages above.")



In [None]:
import mlflow
#MLflow Tracking

# Hyperparameters for the tracked run. BATCH_SIZE and LEARNING_RATE are passed
# to MTDataModule / MTModel in the next cell.
# NOTE(review): the Trainer defined earlier is built with max_epochs = 2, not
# EPOCHS — confirm which value is intended.
EPOCHS = 1
BATCH_SIZE = 16
LEARNING_RATE = 1e-5

# Select (or create) the MLflow experiment that subsequent runs are recorded under.
mlflow.set_experiment("English-Bangla-Translation")




In [None]:
# Rebuild the data module and a fresh model from BATCH_SIZE / LEARNING_RATE;
# these replace the `data_module` and `model` objects created earlier.
data_module = MTDataModule(
    train_csv="train.csv",
    val_csv="val.csv",
    test_csv="test.csv",
    batch_size=BATCH_SIZE,
)
model = MTModel(learning_rate=LEARNING_RATE)

In [None]:
import mlflow
import mlflow.pytorch
from mlflow.models import infer_signature
from pyngrok import ngrok
import torch
import numpy as np

# ---------------- TRAIN + LOG ---------------- #
with mlflow.start_run() as run:
    mlflow.log_param("batch_size", BATCH_SIZE)
    mlflow.log_param("learning_rate", LEARNING_RATE)
    # Log the epoch budget the Trainer will really use. The EPOCHS constant is
    # not wired into `trainer`, so logging it would record a misleading value.
    mlflow.log_param("epochs", trainer.max_epochs)

    # Train & test
    trainer.fit(model=model, datamodule=data_module)
    evaluation_score = trainer.test(model=model, dataloaders=data_module.test_dataloader())
    test_metrics = evaluation_score[0]
    mlflow.log_metric("test_loss", test_metrics['test_loss'])
    if 'test_bleu' in test_metrics:
        mlflow.log_metric("test_bleu", test_metrics['test_bleu'])

    # One example is enough to infer a signature. A full batch of logits is
    # batch x seq_len x vocab floats; turning that into nested Python lists
    # can exhaust memory.
    sample_batch = next(iter(data_module.test_dataloader()))
    sample_batch = {key: value[:1] for key, value in sample_batch.items()}
    sample_input = {
        'src_input_ids': sample_batch['src_input_ids'],
        'src_attention_mask': sample_batch['src_attention_mask']
    }

    with torch.no_grad():
        model.eval()
        sample_output = model(
            sample_input['src_input_ids'].to(model.device),
            sample_input['src_attention_mask'].to(model.device),
            sample_batch['tgt_input_ids'].to(model.device),
            sample_batch['tgt_attention_mask'].to(model.device)
        ).logits
        model.train()

    # Keep the samples as numpy arrays so MLflow infers a tensor-based signature.
    sample_input_np = {k: v.cpu().numpy() for k, v in sample_input.items()}
    sample_output_np = sample_output.cpu().numpy()

    # Save model to MLflow
    # NOTE(review): the signature lists only the two source inputs, while
    # MTModel.forward also requires the target tensors — confirm how the logged
    # model is meant to be served.
    signature = infer_signature(sample_input_np, sample_output_np)
    mlflow.pytorch.log_model(
        pytorch_model=model,
        artifact_path="mt_model",
        signature=signature,
        input_example=sample_input_np
    )

    RUN_ID = run.info.run_id
    print("✅ Your MLflow run ID:", RUN_ID)



In [None]:
   # ---------------- START MLFLOW UI ---------------- #
# Start the MLflow tracking UI on port 5000 as a background shell job, with its
# output redirected to /dev/null so the cell returns immediately.
!mlflow ui --port 5000 &>/dev/null &



In [None]:
from pyngrok import ngrok
# Open an ngrok tunnel to local port 5000 (the port the MLflow UI is started on)
# and print the public URL.
# NOTE(review): ngrok usually needs an auth token configured in the runtime
# first — confirm it is set, and never hardcode the token in this notebook.
public_url = ngrok.connect(5000)
print("🔗 MLflow UI URL:", public_url.public_url)

In [None]:
!pip install gradio


In [None]:
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# ---------------- CONFIG ---------------- #
mt_pretrained_model_name = "csebuetnlp/banglat5_nmt_en_bn"  # base architecture
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MAX_LENGTH = 128

# ---------------- LOAD TOKENIZER ---------------- #
# use_fast=False selects the slow SentencePiece tokenizer, the same kind as the
# T5Tokenizer used while training, so inference tokenizes text the same way.
tokenizer = AutoTokenizer.from_pretrained(mt_pretrained_model_name, use_fast=False)

# ---------------- LOAD MODEL + YOUR WEIGHTS ---------------- #
# Load the base pretrained model
model = AutoModelForSeq2SeqLM.from_pretrained(mt_pretrained_model_name)

# Load your fine-tuned weights (must be in the same folder as app.py).
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state dict needs.
state_dict = torch.load("mt_model_weights.pt", map_location=device, weights_only=True)

# strict=False tolerates extra or missing keys, but that also hides a weights
# file that does not match this architecture, so report every mismatch.
load_result = model.load_state_dict(state_dict, strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print(
        "⚠️ Weights did not match the model exactly — "
        f"missing keys: {len(load_result.missing_keys)}, "
        f"unexpected keys: {len(load_result.unexpected_keys)}"
    )
model.to(device)
model.eval()

# ---------------- TRANSLATION FUNCTION ---------------- #
def translate_english_to_bangla(sentence: str) -> str:
    """Translate one English sentence to Bangla with beam search.

    Args:
        sentence: English input text. Empty or whitespace-only input (or
            ``None``) returns an empty string without calling the model.

    Returns:
        The decoded translation with special tokens removed.
    """
    if not sentence or not sentence.strip():
        return ""

    # A single sentence needs no padding. The attention mask is still passed
    # explicitly so generation never depends on it being inferred.
    encoded = tokenizer(
        sentence,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_LENGTH
    ).to(device)

    with torch.no_grad():
        output_tokens = model.generate(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            max_length=MAX_LENGTH,
            num_beams=4,
            early_stopping=True
        )

    return tokenizer.decode(output_tokens[0], skip_special_tokens=True)

# ---------------- GRADIO INTERFACE ---------------- #
# One multi-line textbox for the English input, one textbox for the Bangla
# output; `launch()` starts the web app.
english_box = gr.Textbox(lines=3, placeholder="Enter English sentence here...", label="English Text")
bangla_box = gr.Textbox(label="Bangla Translation")

demo = gr.Interface(
    fn=translate_english_to_bangla,
    inputs=english_box,
    outputs=bangla_box,
    title="English → Bangla Translator (Fine-tuned)",
    description="Translates English into Bangla using your fine-tuned model weights.",
)
demo.launch()
