In [None]:
# ==========================================
# BanglaT5 Fine-Tuning (Dialect → Standard Bangla)
# ==========================================

!pip install -q transformers[sentencepiece] datasets evaluate sacrebleu torch pandas openpyxl accelerate

import pandas as pd
import numpy as np
import torch
import evaluate
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
from google.colab import drive

# ==========================================
#  Mount Google Drive & Load Dataset
# ==========================================
drive.mount('/content/drive')

# Read the aligned dialect spreadsheet from Drive. If it is absent, fall back
# to a tiny in-memory frame (note: it only carries a 'Chittagong' dialect
# column) so that `df` is always defined.
try:
    df = pd.read_excel("/content/drive/MyDrive/bangla_dialect_aligned_18920.xlsx")
except FileNotFoundError:
    print(" File not found — using a small demo dataset instead.")
    df = pd.DataFrame({
        'Standard_Bangla': ["সে স্কুলে যায়।", "তুমি কোথায় যাও?", "আমি বই পড়ি।"],
        'Chittagong': ["হেই ইস্কুলে যায়।", "তুই কই যাও?", "আঁই বই পড়মু।"],
    })
else:
    # Only reached when the spreadsheet was read without error.
    print(f" Dataset loaded successfully. Total rows: {len(df)}")

# Dialect columns of `df` that get their own fine-tuned model.
DIALECTS_TO_TRAIN = ['Barisal', 'Sylhet']

# ==========================================
# Load Tokenizer
# (the model itself is instantiated per dialect inside train_and_evaluate)
# ==========================================
MODEL_NAME = "csebuetnlp/banglat5"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f"\n Tokenizer for '{MODEL_NAME}' loaded.\n")

# ==========================================
# 🧩 Dataset Preparation
# ==========================================
def create_dataset_dict(dialect_col):
    """Build train/validation/test splits for one dialect column of ``df``.

    Rows where either the standard sentence or the dialect sentence is
    missing are dropped. The remaining pairs are split 80/10/10 with a
    fixed seed so the splits are reproducible.

    Args:
        dialect_col: Name of the column in the module-level ``df`` that holds
            the dialect sentences (used as the model's ``source`` text).

    Returns:
        DatasetDict with ``train``, ``validation`` and ``test`` splits, each
        having ``source`` (dialect) and ``target`` (standard Bangla) columns.

    Raises:
        KeyError: If ``Standard_Bangla`` or ``dialect_col`` is not a column
            of ``df``.
    """
    wanted_cols = ['Standard_Bangla', dialect_col]
    missing = [col for col in wanted_cols if col not in df.columns]
    if missing:
        # Same exception type pandas would raise, but with an actionable message.
        raise KeyError(
            f"Column(s) {missing} not found in dataset; "
            f"available columns: {list(df.columns)}"
        )

    subset_df = df[wanted_cols].dropna()
    subset_df = subset_df.rename(columns={'Standard_Bangla': 'target', dialect_col: 'source'})
    print(f"Dialect '{dialect_col}' samples: {len(subset_df)}")

    # dropna() can leave a non-contiguous index, which from_pandas would
    # otherwise store as an extra '__index_level_0__' column.
    hf_dataset = Dataset.from_pandas(subset_df, preserve_index=False)

    # 80% train, then split the held-out 20% in half: 10% validation, 10% test.
    train_test_split = hf_dataset.train_test_split(test_size=0.2, seed=42)
    test_val_split = train_test_split['test'].train_test_split(test_size=0.5, seed=42)

    return DatasetDict({
        'train': train_test_split['train'],
        'validation': test_val_split['train'],
        'test': test_val_split['test']
    })

# ==========================================
#  Tokenization
# ==========================================
def tokenize_and_prepare_datasets(dataset_dict):
    """Tokenize every split of ``dataset_dict`` for seq2seq training.

    Each example's ``source`` text becomes the encoder input and its
    ``target`` text becomes ``labels``. Both sides are truncated/padded to
    64 tokens, and label padding positions are replaced by -100.

    Args:
        dataset_dict: Object exposing ``map(fn, batched=True)`` whose batches
            contain ``source`` and ``target`` columns.

    Returns:
        The result of ``dataset_dict.map`` with tokenizer outputs and a
        ``labels`` column added.
    """
    encode_options = dict(max_length=64, truncation=True, padding="max_length")

    def _encode_batch(batch):
        encoded = tokenizer(batch["source"], **encode_options)
        target_ids = tokenizer(text_target=batch["target"], **encode_options)["input_ids"]

        # Swap pad ids for -100 in the labels.
        pad_id = tokenizer.pad_token_id
        encoded["labels"] = [
            [-100 if token_id == pad_id else token_id for token_id in sequence]
            for sequence in target_ids
        ]
        return encoded

    prepared = dataset_dict.map(_encode_batch, batched=True)
    print(" Tokenization complete.")
    return prepared

# ==========================================
#  Train & Evaluate Function
# ==========================================
def train_and_evaluate(dialect_name, train_ds, val_ds, test_ds):
    """Fine-tune BanglaT5 on one dialect, save it, and report test BLEU.

    A fresh copy of ``MODEL_NAME`` is loaded, trained on ``train_ds`` with
    periodic evaluation on ``val_ds``, written to a per-dialect folder on
    Drive, and finally scored on ``test_ds``. Three example translations are
    printed for a quick qualitative check.

    Args:
        dialect_name: Dialect label; used in log messages and, lower-cased,
            in the output directory name.
        train_ds: Tokenized training split.
        val_ds: Tokenized validation split (evaluated every 200 steps).
        test_ds: Tokenized test split; must still carry the raw ``source``
            and ``target`` text columns for the example printout.

    Returns:
        dict: The metrics reported by ``trainer.predict`` on the test split
        (contains ``test_bleu`` when the BLEU metric was computed).
    """
    import inspect  # local import: only needed for the trainer-signature check below

    output_dir = f"/content/drive/MyDrive/Bangla_Dialect_Models/BanglaT5-{dialect_name.lower()}"

    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
    bleu_metric = evaluate.load("sacrebleu")

    def _to_decodable_ids(token_ids):
        """Return ``token_ids`` as an int64 array containing only valid vocab ids."""
        if isinstance(token_ids, tuple):
            # Some models return (sequences, extras); only the sequences are decoded.
            token_ids = token_ids[0]
        if torch.is_tensor(token_ids):
            token_ids = token_ids.detach().cpu().numpy()
        token_ids = np.asarray(token_ids).astype(np.int64, copy=False)
        # Negative ids (e.g. -100 padding) and ids beyond the vocabulary
        # cannot be decoded, so map them to the pad token instead.
        in_vocab = (token_ids >= 0) & (token_ids < tokenizer.vocab_size)
        return np.where(in_vocab, token_ids, tokenizer.pad_token_id)

    def compute_metrics(eval_pred):
        """Decode predictions/labels and score them with sacreBLEU."""
        preds, labels = eval_pred

        decoded_preds = tokenizer.batch_decode(_to_decodable_ids(preds), skip_special_tokens=True)
        # Labels use -100 for ignored positions; restore pad ids before decoding.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        decoded_preds = [p.strip() for p in decoded_preds]
        # sacreBLEU expects a list of reference lists per prediction.
        decoded_labels = [[l.strip()] for l in decoded_labels]

        result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
        return {"bleu": result["score"]}

    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        learning_rate=3e-4,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=5,
        weight_decay=0.01,
        predict_with_generate=True,
        fp16=False,                             # disable to avoid silent overflow
        logging_strategy="steps",
        logging_steps=50,
        eval_strategy="steps",
        eval_steps=200,
        save_steps=500,
        save_total_limit=2,
        generation_max_length=64,
        generation_num_beams=4,
        report_to="none",
    )

    trainer_kwargs = dict(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    # Newer transformers releases renamed the trainer's `tokenizer` argument
    # to `processing_class` and warn on the old name; use whichever the
    # installed version accepts.
    trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
    tokenizer_arg = "processing_class" if "processing_class" in trainer_params else "tokenizer"
    trainer_kwargs[tokenizer_arg] = tokenizer
    trainer = Seq2SeqTrainer(**trainer_kwargs)

    print(f"\n Training BanglaT5 for dialect: {dialect_name}")
    trainer.train()

    # Periodic checkpoints only land on multiples of save_steps and are
    # rotated by save_total_limit, so persist the final weights explicitly.
    trainer.save_model(output_dir)

    print(f"\n Evaluating model on test set for '{dialect_name}'...")
    results = trainer.predict(test_ds)
    bleu_score = results.metrics.get('test_bleu', 0.0)
    print(f" Test BLEU Score for {dialect_name}: {bleu_score:.2f}")

    print("\n Example Outputs:")
    preds = tokenizer.batch_decode(_to_decodable_ids(results.predictions), skip_special_tokens=True)
    for i, predicted in enumerate(preds[:3]):
        print(f"Dialect:   {test_ds[i]['source']}")
        print(f"Standard:  {test_ds[i]['target']}")
        print(f"Predicted: {predicted}\n")

    return results.metrics

# ==========================================
#  Run Fine-Tuning
# ==========================================
# Train one model per requested dialect. Dialects whose column is absent from
# `df` (e.g. when the demo fallback dataset is in use) are skipped instead of
# aborting the whole run with a KeyError.
trained_dialects = []
for dialect in DIALECTS_TO_TRAIN:
    if dialect not in df.columns:
        print(f" Skipping '{dialect}': column not found in dataset.")
        continue

    dataset_dict = create_dataset_dict(dialect)
    tokenized_data = tokenize_and_prepare_datasets(dataset_dict)
    train_and_evaluate(
        dialect,
        tokenized_data["train"],
        tokenized_data["validation"],
        tokenized_data["test"]
    )
    trained_dialects.append(dialect)

# Only claim success when at least one dialect was actually trained.
if trained_dialects:
    print("\n All training completed! Models saved to your Google Drive.")
else:
    print("\n No dialects were trained: none of the requested columns exist in the dataset.")

Mounted at /content/drive
 Dataset loaded successfully. Total rows: 3480


tokenizer_config.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565



 Tokenizer for 'csebuetnlp/banglat5' loaded.

Dialect 'Barisal' samples: 3480


Map:   0%|          | 0/2784 [00:00<?, ? examples/s]

Map:   0%|          | 0/348 [00:00<?, ? examples/s]

Map:   0%|          | 0/348 [00:00<?, ? examples/s]

 Tokenization complete.


pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Seq2SeqTrainer(



 Training BanglaT5 for dialect: Barisal


Step,Training Loss,Validation Loss,Bleu
200,3.1531,1.958206,30.610845
400,2.4437,1.44397,39.223573
600,1.8701,1.101344,50.173056
800,1.343,0.986512,53.732861
1000,1.181,0.944738,55.261774
1200,1.0135,0.907351,53.343809
1400,1.0269,0.855748,58.10162
1600,0.8295,0.84337,58.989037
1800,0.8328,0.830736,59.220432
2000,0.8473,0.811668,59.991041



 Evaluating model on test set for 'Barisal'...


 Test BLEU Score for Barisal: 58.47

 Example Outputs:
Dialect:   আম্নে কি মোর প্রশ্নের জবাব দেতে পারেন?
Standard:  আপনি কি আমার প্রশ্নের জবাব দিতে পারেন?
Predicted: আপনি কি আমার প্রশ্নের জবাব দিতে পারেন?

Dialect:   আকাশের নীল রংডা অসাধারন
Standard:  আকাশের নীল রঙটি অসাধারণ
Predicted: আকাশের নীল রংটি অসাধারণ

Dialect:   ছোডো বুইন চিল্লান দিয়া ওঠছে দারুণ
Standard:  ছোটো বোন চিৎকার দিয়ে উঠল, দারুণ
Predicted: ছোট বোন চিৎকার করে উঠছে দারুন

Dialect 'Sylhet' samples: 3480


Map:   0%|          | 0/2784 [00:00<?, ? examples/s]

Map:   0%|          | 0/348 [00:00<?, ? examples/s]

Map:   0%|          | 0/348 [00:00<?, ? examples/s]

 Tokenization complete.


  trainer = Seq2SeqTrainer(



 Training BanglaT5 for dialect: Sylhet


Step,Training Loss,Validation Loss,Bleu
200,3.7939,2.388562,21.209023
400,2.6296,1.637667,36.64059
600,2.0636,1.318723,40.996558
800,1.4469,1.189013,44.937054
1000,1.3548,1.152171,46.065186
1200,1.2103,1.119686,47.075142
1400,1.1185,1.058917,50.120182
1600,0.9135,1.063503,50.032885
1800,1.0284,1.011335,50.674826
2000,0.9105,0.998109,51.417693



 Evaluating model on test set for 'Sylhet'...


 Test BLEU Score for Sylhet: 49.80

 Example Outputs:
Dialect:   আফনে কিতা আমার প্রশ্ন'র জবাব দিতে ফারবা নি?
Standard:  আপনি কি আমার প্রশ্নের জবাব দিতে পারেন?
Predicted: আপনি কি আমার প্রশ্নের জবাব দিতে পারেন?

Dialect:   আকাশ ওর নীল রঙটা অসাধারণ
Standard:  আকাশের নীল রঙটি অসাধারণ
Predicted: আকাশের নীল রঙটা অসাধারণ

Dialect:   ছোট বইনে চিল্লাইয়া উঠলো, দারুণ
Standard:  ছোটো বোন চিৎকার দিয়ে উঠল, দারুণ
Predicted: ছোট বোন চিৎকার করে উঠলো, অসাধারণ


 All training completed! Models saved to your Google Drive.


In [None]:
# ==========================================
# BanglaT5 Fine-Tuning (Dialect → Standard Bangla)
# ==========================================

!pip install -q transformers[sentencepiece] datasets evaluate sacrebleu torch pandas openpyxl accelerate

import pandas as pd
import numpy as np
import torch
import evaluate
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
from google.colab import drive

# ==========================================
#  Mount Google Drive & Load Dataset
# ==========================================
drive.mount('/content/drive')

# Read the aligned dialect spreadsheet from Drive. If it is absent, fall back
# to a tiny in-memory frame (note: it only carries a 'Chittagong' dialect
# column) so that `df` is always defined.
try:
    df = pd.read_excel("/content/drive/MyDrive/bangla_dialect_aligned_18920.xlsx")
except FileNotFoundError:
    print(" File not found — using a small demo dataset instead.")
    df = pd.DataFrame({
        'Standard_Bangla': ["সে স্কুলে যায়।", "তুমি কোথায় যাও?", "আমি বই পড়ি।"],
        'Chittagong': ["হেই ইস্কুলে যায়।", "তুই কই যাও?", "আঁই বই পড়মু।"],
    })
else:
    # Only reached when the spreadsheet was read without error.
    print(f" Dataset loaded successfully. Total rows: {len(df)}")

# Dialect columns of `df` that get their own fine-tuned model.
DIALECTS_TO_TRAIN = ['Noakhali', 'Mymensingh']

# ==========================================
# Load Tokenizer
# (the model itself is instantiated per dialect inside train_and_evaluate)
# ==========================================
MODEL_NAME = "csebuetnlp/banglat5"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f"\n Tokenizer for '{MODEL_NAME}' loaded.\n")

# ==========================================
# 🧩 Dataset Preparation
# ==========================================
def create_dataset_dict(dialect_col):
    """Build train/validation/test splits for one dialect column of ``df``.

    Rows where either the standard sentence or the dialect sentence is
    missing are dropped. The remaining pairs are split 80/10/10 with a
    fixed seed so the splits are reproducible.

    Args:
        dialect_col: Name of the column in the module-level ``df`` that holds
            the dialect sentences (used as the model's ``source`` text).

    Returns:
        DatasetDict with ``train``, ``validation`` and ``test`` splits, each
        having ``source`` (dialect) and ``target`` (standard Bangla) columns.

    Raises:
        KeyError: If ``Standard_Bangla`` or ``dialect_col`` is not a column
            of ``df``.
    """
    wanted_cols = ['Standard_Bangla', dialect_col]
    missing = [col for col in wanted_cols if col not in df.columns]
    if missing:
        # Same exception type pandas would raise, but with an actionable message.
        raise KeyError(
            f"Column(s) {missing} not found in dataset; "
            f"available columns: {list(df.columns)}"
        )

    subset_df = df[wanted_cols].dropna()
    subset_df = subset_df.rename(columns={'Standard_Bangla': 'target', dialect_col: 'source'})
    print(f"Dialect '{dialect_col}' samples: {len(subset_df)}")

    # dropna() can leave a non-contiguous index, which from_pandas would
    # otherwise store as an extra '__index_level_0__' column.
    hf_dataset = Dataset.from_pandas(subset_df, preserve_index=False)

    # 80% train, then split the held-out 20% in half: 10% validation, 10% test.
    train_test_split = hf_dataset.train_test_split(test_size=0.2, seed=42)
    test_val_split = train_test_split['test'].train_test_split(test_size=0.5, seed=42)

    return DatasetDict({
        'train': train_test_split['train'],
        'validation': test_val_split['train'],
        'test': test_val_split['test']
    })

# ==========================================
#  Tokenization
# ==========================================
def tokenize_and_prepare_datasets(dataset_dict):
    """Tokenize every split of ``dataset_dict`` for seq2seq training.

    Each example's ``source`` text becomes the encoder input and its
    ``target`` text becomes ``labels``. Both sides are truncated/padded to
    64 tokens, and label padding positions are replaced by -100.

    Args:
        dataset_dict: Object exposing ``map(fn, batched=True)`` whose batches
            contain ``source`` and ``target`` columns.

    Returns:
        The result of ``dataset_dict.map`` with tokenizer outputs and a
        ``labels`` column added.
    """
    encode_options = dict(max_length=64, truncation=True, padding="max_length")

    def _encode_batch(batch):
        encoded = tokenizer(batch["source"], **encode_options)
        target_ids = tokenizer(text_target=batch["target"], **encode_options)["input_ids"]

        # Swap pad ids for -100 in the labels.
        pad_id = tokenizer.pad_token_id
        encoded["labels"] = [
            [-100 if token_id == pad_id else token_id for token_id in sequence]
            for sequence in target_ids
        ]
        return encoded

    prepared = dataset_dict.map(_encode_batch, batched=True)
    print(" Tokenization complete.")
    return prepared

# ==========================================
#  Train & Evaluate Function
# ==========================================
def train_and_evaluate(dialect_name, train_ds, val_ds, test_ds):
    """Fine-tune BanglaT5 on one dialect, save it, and report test BLEU.

    A fresh copy of ``MODEL_NAME`` is loaded, trained on ``train_ds`` with
    periodic evaluation on ``val_ds``, written to a per-dialect folder on
    Drive, and finally scored on ``test_ds``. Three example translations are
    printed for a quick qualitative check.

    Args:
        dialect_name: Dialect label; used in log messages and, lower-cased,
            in the output directory name.
        train_ds: Tokenized training split.
        val_ds: Tokenized validation split (evaluated every 200 steps).
        test_ds: Tokenized test split; must still carry the raw ``source``
            and ``target`` text columns for the example printout.

    Returns:
        dict: The metrics reported by ``trainer.predict`` on the test split
        (contains ``test_bleu`` when the BLEU metric was computed).
    """
    import inspect  # local import: only needed for the trainer-signature check below

    output_dir = f"/content/drive/MyDrive/Bangla_Dialect_Models/BanglaT5-{dialect_name.lower()}"

    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
    bleu_metric = evaluate.load("sacrebleu")

    def _to_decodable_ids(token_ids):
        """Return ``token_ids`` as an int64 array containing only valid vocab ids."""
        if isinstance(token_ids, tuple):
            # Some models return (sequences, extras); only the sequences are decoded.
            token_ids = token_ids[0]
        if torch.is_tensor(token_ids):
            token_ids = token_ids.detach().cpu().numpy()
        token_ids = np.asarray(token_ids).astype(np.int64, copy=False)
        # Negative ids (e.g. -100 padding) and ids beyond the vocabulary
        # cannot be decoded, so map them to the pad token instead.
        in_vocab = (token_ids >= 0) & (token_ids < tokenizer.vocab_size)
        return np.where(in_vocab, token_ids, tokenizer.pad_token_id)

    def compute_metrics(eval_pred):
        """Decode predictions/labels and score them with sacreBLEU."""
        preds, labels = eval_pred

        decoded_preds = tokenizer.batch_decode(_to_decodable_ids(preds), skip_special_tokens=True)
        # Labels use -100 for ignored positions; restore pad ids before decoding.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        decoded_preds = [p.strip() for p in decoded_preds]
        # sacreBLEU expects a list of reference lists per prediction.
        decoded_labels = [[l.strip()] for l in decoded_labels]

        result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
        return {"bleu": result["score"]}

    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        learning_rate=3e-4,                     # slightly higher for better convergence
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=5,
        weight_decay=0.01,
        predict_with_generate=True,
        fp16=False,                             # disable to avoid silent overflow
        logging_strategy="steps",
        logging_steps=50,
        eval_strategy="steps",
        eval_steps=200,
        save_steps=500,
        save_total_limit=2,
        generation_max_length=64,
        generation_num_beams=4,
        report_to="none",
    )

    trainer_kwargs = dict(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    # Newer transformers releases renamed the trainer's `tokenizer` argument
    # to `processing_class` and warn on the old name; use whichever the
    # installed version accepts.
    trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
    tokenizer_arg = "processing_class" if "processing_class" in trainer_params else "tokenizer"
    trainer_kwargs[tokenizer_arg] = tokenizer
    trainer = Seq2SeqTrainer(**trainer_kwargs)

    print(f"\n Training BanglaT5 for dialect: {dialect_name}")
    trainer.train()

    # Periodic checkpoints only land on multiples of save_steps and are
    # rotated by save_total_limit, so persist the final weights explicitly.
    trainer.save_model(output_dir)

    print(f"\n Evaluating model on test set for '{dialect_name}'...")
    results = trainer.predict(test_ds)
    bleu_score = results.metrics.get('test_bleu', 0.0)
    print(f" Test BLEU Score for {dialect_name}: {bleu_score:.2f}")

    print("\n Example Outputs:")
    preds = tokenizer.batch_decode(_to_decodable_ids(results.predictions), skip_special_tokens=True)
    for i, predicted in enumerate(preds[:3]):
        print(f"Dialect:   {test_ds[i]['source']}")
        print(f"Standard:  {test_ds[i]['target']}")
        print(f"Predicted: {predicted}\n")

    return results.metrics

# ==========================================
#  Run Fine-Tuning
# ==========================================
# Train one model per requested dialect. Dialects whose column is absent from
# `df` (e.g. when the demo fallback dataset is in use) are skipped instead of
# aborting the whole run with a KeyError.
trained_dialects = []
for dialect in DIALECTS_TO_TRAIN:
    if dialect not in df.columns:
        print(f" Skipping '{dialect}': column not found in dataset.")
        continue

    dataset_dict = create_dataset_dict(dialect)
    tokenized_data = tokenize_and_prepare_datasets(dataset_dict)
    train_and_evaluate(
        dialect,
        tokenized_data["train"],
        tokenized_data["validation"],
        tokenized_data["test"]
    )
    trained_dialects.append(dialect)

# Only claim success when at least one dialect was actually trained.
if trained_dialects:
    print("\n All training completed! Models saved to your Google Drive.")
else:
    print("\n No dialects were trained: none of the requested columns exist in the dataset.")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive
 Dataset loaded successfully. Total rows: 3480


tokenizer_config.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565



 Tokenizer for 'csebuetnlp/banglat5' loaded.

Dialect 'Noakhali' samples: 2500


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

 Tokenization complete.


pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Seq2SeqTrainer(



 Training BanglaT5 for dialect: Noakhali


Step,Training Loss,Validation Loss,Bleu
200,3.4139,2.243392,27.410134
400,2.5668,1.622468,39.750448
600,1.6951,1.431065,43.71052
800,1.3857,1.269101,46.724812
1000,1.5356,1.199299,48.437901
1200,1.0459,1.19185,50.434671
1400,1.1603,1.162313,50.46683
1600,0.849,1.192689,50.705832
1800,0.8335,1.158011,51.781202
2000,0.7884,1.175604,52.232141



 Evaluating model on test set for 'Noakhali'...


 Test BLEU Score for Noakhali: 51.63

 Example Outputs:
Dialect:   যাওয়ার লগে লগে কী এককান শব্দ কইরলো
Standard:  যাওয়ার সাথে সাথে কী একটা শব্দ করল
Predicted: যাওয়ার সাথে সাথে কী একটা শব্দ করলো

Dialect:   তুই কি আরে এই কাম আন করি দিতা হাইরবা নি?
Standard:  তুমি কি আমাকে এই কাজটি করে দিতে পারবে?
Predicted: তুমি কি আমাকে এই কাজ এনে দিতে পারবে ?

Dialect:   তোর হরিক্ষা কবে?
Standard:  তোর পরিক্ষা কবে ?
Predicted: তোমার পরীক্ষা কবে?

Dialect 'Mymensingh' samples: 2500


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

 Tokenization complete.


  trainer = Seq2SeqTrainer(



 Training BanglaT5 for dialect: Mymensingh


Step,Training Loss,Validation Loss,Bleu
200,2.2918,1.227936,52.082886
400,1.4559,0.685322,68.820078
600,0.8926,0.631858,72.238863
800,0.6086,0.606383,73.395013
1000,0.6096,0.562501,75.346593
1200,0.4322,0.560546,76.234691
1400,0.4399,0.545681,76.797089
1600,0.3826,0.531948,78.091593
1800,0.3343,0.54126,78.681931
2000,0.2743,0.561232,77.812434



 Evaluating model on test set for 'Mymensingh'...


 Test BLEU Score for Mymensingh: 77.53

 Example Outputs:
Dialect:   যাওনের লগে লগে কী একটা শব্দ করল
Standard:  যাওয়ার সাথে সাথে কী একটা শব্দ করল
Predicted: যাওয়ার সঙ্গে সঙ্গে কী একটা শব্দ করল

Dialect:   তুমি কিতা আমারে এই কামডা কইরা দিতা ফারবা?
Standard:  তুমি কি আমাকে এই কাজটি করে দিতে পারবে?
Predicted: তুমি কি আমাকে এই কাজটি করে দিতে পারবে?

Dialect:   তর পরিক্ষা কবে ?
Standard:  তোর পরিক্ষা কবে ?
Predicted: তোর পরিক্ষা কবে ?


 All training completed! Models saved to your Google Drive.
