# Banglish to Bengali Transliteration

This is a fine-tuned model for transliterating Banglish (Bengali written in Latin script) to Bangla (Bengali script). It is based on the `facebook/mbart-large-50` model.

In [1]:
!pip install -q transformers datasets torch sentencepiece accelerate tqdm

In [2]:
import torch
from tqdm.auto import tqdm
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset, Dataset, DatasetDict
import re

# Initialize tqdm for notebook compatibility
tqdm.pandas()

# Set device to cuda
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


## Data Preparation

In [3]:
def prepare_dataset():
    
    with tqdm(desc="Loading dataset", total=1) as pbar:
        dataset = load_dataset("SKNahin/bengali-transliteration-data")
        pbar.update(1)
    
    with tqdm(desc="Reversing dataset", total=3) as pbar:
        formatted_data = {
            "train": {
                "banglish": dataset["train"]["rm"],
                "bengali": dataset["train"]["bn"]
            }
        }
        pbar.update(1)
        
        min_length = min(len(formatted_data["train"]["banglish"]), 
                        len(formatted_data["train"]["bengali"]))
        pbar.update(1)
        
        aligned_dataset = DatasetDict({
            "train": Dataset.from_dict({
                "banglish": formatted_data["train"]["banglish"][:min_length],
                "bengali": formatted_data["train"]["bengali"][:min_length]
            })
        })
        pbar.update(1)
    
    return aligned_dataset

print("\n[1/4] Data Preparation:")
dataset = prepare_dataset()


[1/4] Data Preparation:


Loading dataset:   0%|          | 0/1 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/300 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/333k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5006 [00:00<?, ? examples/s]

Reversing dataset:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
def clean_text(examples):
    cleaned = {"banglish": [], "bengali": []}
    
    for bl, bn in tqdm(zip(examples["banglish"], examples["bengali"]), 
                      desc="Cleaning pairs", total=len(examples["banglish"])):
        # Clean Banglish
        bl_clean = re.sub(r'[^a-zA-Z\s\']', '', bl).strip().lower()
        bl_clean = re.sub(r'\s+', ' ', bl_clean)
        
        # Clean Bengali
        bn_clean = re.sub(r'[^\u0980-\u09FF\s]', '', bn).strip()
        bn_clean = re.sub(r'\s+', ' ', bn_clean)
        
        # Ensure records are within fixed length
        if 3 <= len(bl_clean) <= 100 and 3 <= len(bn_clean) <= 100:
            cleaned["banglish"].append(bl_clean)
            cleaned["bengali"].append(bn_clean)
    
    return cleaned

print("\n[2/4] Data Cleaning:")
cleaned_dataset = dataset.map(
    clean_text,
    batched=True,
    batch_size=1000,
    remove_columns=dataset["train"].column_names,
    desc="Processing batches"
)


[2/4] Data Cleaning:


Processing batches:   0%|          | 0/5006 [00:00<?, ? examples/s]

Cleaning pairs:   0%|          | 0/1000 [00:00<?, ?it/s]

Cleaning pairs:   0%|          | 0/1000 [00:00<?, ?it/s]

Cleaning pairs:   0%|          | 0/1000 [00:00<?, ?it/s]

Cleaning pairs:   0%|          | 0/1000 [00:00<?, ?it/s]

Cleaning pairs:   0%|          | 0/1000 [00:00<?, ?it/s]

Cleaning pairs:   0%|          | 0/6 [00:00<?, ?it/s]

## Split the dataset into train and test data

In [5]:
print("\n[3/4] Train-Test Split:")
with tqdm(total=1, desc="Splitting data") as pbar:
    split_dataset = cleaned_dataset["train"].train_test_split(
        test_size=0.2, 
        seed=42
    )
    pbar.update(1)


[3/4] Train-Test Split:


Splitting data:   0%|          | 0/1 [00:00<?, ?it/s]

## Model Setup

In [6]:
print("\n[4/4] Model Initialization:")
with tqdm(total=2, desc="Loading model") as pbar:
    model_name = "facebook/mbart-large-50-many-to-many-mmt"
    tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
    pbar.update(1)
    
    model = MBartForConditionalGeneration.from_pretrained(model_name).to(device)
    pbar.update(1)

# Set language codes
tokenizer.src_lang = "en_XX"
tokenizer.tgt_lang = "bn_IN"


[4/4] Model Initialization:


Loading model:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

## Tokenization

In [7]:
def tokenize_function(examples):
    inputs = tokenizer(
        examples["banglish"],
        max_length=128,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["bengali"],
            max_length=128,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
    
    inputs["labels"] = labels["input_ids"]
    return inputs

print("\nTokenization Progress:")
tokenized_datasets = split_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=split_dataset["train"].column_names,
    desc="Tokenizing"
)


Tokenization Progress:


Tokenizing:   0%|          | 0/3885 [00:00<?, ? examples/s]



Tokenizing:   0%|          | 0/972 [00:00<?, ? examples/s]

## Training the model

In [8]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./mbart-banglish-to-bengali",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-5,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_only_model=True,
    save_steps=1000,
    predict_with_generate=True,
    fp16=True if device == "cuda" else False,
    load_best_model_at_end=True,
    disable_tqdm=False,  # Ensure progress bars are enabled
    dataloader_num_workers=2,
    report_to="none"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
)

  trainer = Seq2SeqTrainer(


In [9]:
print("\nTraining Progress:")
train_result = trainer.train()


Training Progress:


Step,Training Loss,Validation Loss
100,10.5964,8.839693
200,5.9531,2.065823
300,0.6418,0.256509
400,0.2153,0.184206
500,0.2015,0.155849
600,0.1291,0.136801
700,0.1083,0.119557
800,0.1024,0.107817
900,0.0969,0.099055
1000,0.0667,0.096615


Could not locate the best model at ./mbart-banglish-to-bengali/checkpoint-2400/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


In [10]:
print("\nSaving Model:")
with tqdm(total=2, desc="Saving") as pbar:
    trainer.save_model("./mbart-banglish-to-bengali-final")
    pbar.update(1)
    tokenizer.save_pretrained("./mbart-banglish-to-bengali-final")
    pbar.update(1)


Saving Model:


Saving:   0%|          | 0/2 [00:00<?, ?it/s]

## Testing the fine tuned model

In [11]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Load the trained model and tokenizer
model_path = "./mbart-banglish-to-bengali-final"
model = MBartForConditionalGeneration.from_pretrained(model_path).to(device)
tokenizer = MBart50TokenizerFast.from_pretrained(model_path)

# Set language codes
tokenizer.src_lang = "en_XX"
tokenizer.tgt_lang = "bn_IN"

In [12]:
def transliterate_banglish_to_bengali(banglish_text):
    # Tokenize the input text
    inputs = tokenizer(
        banglish_text,
        max_length=128,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    ).to(device)
    
    # Generate the Bengali translation
    translated_tokens = model.generate(
        inputs["input_ids"],
        max_length=128,
        num_beams=4,
        early_stopping=True
    )
    
    # Decode the generated tokens to Bengali text
    bengali_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
    
    return bengali_text

In [13]:
test_cases = [
    "ami banglay gan gai",
    "tomar naam ki",
    "apni kothay jacchen",
    "ei khane ekta bhalo restaurant ache",
    "amar bari dhaka te",
    "ajke amar mon valo nei",
    "tumi ki bangla bolte paro",
    "amar ekta choto bhai ache",
    "amra ekhane notun bari kinlam",
    "ei boi ta khub valo"
]

for banglish_text in test_cases:
    bengali_text = transliterate_banglish_to_bengali(banglish_text)
    print(f"Banglish: {banglish_text}")
    print(f"Bengali: {bengali_text}")
    print("-" * 40)

Banglish: ami banglay gan gai
Bengali: আমি বাংলায় গান গাই
----------------------------------------
Banglish: tomar naam ki
Bengali: তোমার নাম কি
----------------------------------------
Banglish: apni kothay jacchen
Bengali: আপনি কোথায় যাচ্ছেন
----------------------------------------
Banglish: ei khane ekta bhalo restaurant ache
Bengali: এই খান একটা ভালো রেস্টুর্ট আছে
----------------------------------------
Banglish: amar bari dhaka te
Bengali: আমার বাড়ি নাটক তে
----------------------------------------
Banglish: ajke amar mon valo nei
Bengali: আজকে আমার মনে ভালো নেই
----------------------------------------
Banglish: tumi ki bangla bolte paro
Bengali: তুমি কি বাংলা বলতে পারো
----------------------------------------
Banglish: amar ekta choto bhai ache
Bengali: আমার একটা ছোট ভাই আছে
----------------------------------------
Banglish: amra ekhane notun bari kinlam
Bengali: আমরা এখানে নতুন বাড়ি কিনলাম
----------------------------------------
Banglish: ei boi ta khub valo
Bengali: এই বছর