## **MYMENSINGH**

In [None]:
# ===============================
#  Bangla Dialect ‚Üí Standard Bangla using mBART-50 (Improved)
# ===============================

!pip install transformers[sentencepiece] datasets sacrebleu evaluate torch pandas openpyxl --quiet

import pandas as pd
import numpy as np
import evaluate
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
import torch
import gc # <-- Added for memory cleanup
from google.colab import drive

print("‚úÖ Libraries installed and imported successfully.")
#drive.mount('/content/drive')

# ===============================
#  Load Dataset
# ===============================

# Load the parallel corpus from the Colab filesystem. If the workbook is
# missing, fall back to a one-row sample frame so the rest of the cell still
# runs. That frame has fewer than the 20 rows create_dataset_dict() requires,
# so every dialect is then reported as insufficient and skipped.
try:
    df = pd.read_excel("/content/bangla_dialect_aligned_18920.xlsx")
    print("‚úÖ Successfully loaded the dataset")
except FileNotFoundError:
    print("‚ö†Ô∏è Dataset not found ‚Äî using sample data.")
    df = pd.DataFrame({
        'Standard_Bangla': ["‡¶∏‡ßá ‡¶∏‡ßç‡¶ï‡ßÅ‡¶≤‡ßá ‡¶Ø‡¶æ‡¶Ø‡¶º„ÄÇ"],
        'Barisal': ["‡¶π‡ßá‡¶á ‡¶á‡¶∏‡ßç‡¶ï‡ßÅ‡¶≤‡ßá ‡¶Ø‡¶æ‡¶Ø‡¶º„ÄÇ"],
        'Chittagong': ["‡¶π‡ßá‡¶á ‡¶∏‡ßç‡¶ï‡ßã‡¶≤‡ßá ‡¶Ø‡¶æ‡¶Ø‡¶º„ÄÇ"],
        'Sylhet': ["‡¶§‡¶æ‡¶∞‡ßá ‡¶á‡¶∏‡ßç‡¶ï‡ßÅ‡¶≤‡ßá ‡¶Ø‡¶æ‡¶Ø‡¶º„ÄÇ"]
    })

# Choose dialects to train. Each entry must be a column name of `df`; the
# training loop at the bottom of the cell processes them in order.
# NOTE(review): this cell sits under the MYMENSINGH heading but trains
# 'Barisal' -- confirm which dialect column was intended.
DIALECTS_TO_TRAIN = ['Barisal'] # Can now train multiple, one after another

# ===============================
#  Load Tokenizer (mBART)
# ===============================

# Pretrained checkpoint; train_and_evaluate() loads a fresh copy of its
# weights for every dialect.
MODEL_NAME = "facebook/mbart-large-50-many-to-many-mmt"

# Load the tokenizer once at module level; the helper functions below read
# this shared `tokenizer` instead of taking it as an argument.
# For mBART, the source/target language is set on the tokenizer instance.
# Both sides use the same code because dialect and standard text are Bangla.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.src_lang = "bn_IN"
tokenizer.tgt_lang = "bn_IN"

print(f"\n‚úÖ Tokenizer for '{MODEL_NAME}' loaded.")
print(f" Language pair: bn_IN ‚Üí bn_IN (Dialect ‚Üí Standard).")


# ===============================
#  Helper Functions
# ===============================

def create_dataset_dict(dialect_col):
    """Build train/validation/test splits of (dialect, standard) sentence pairs.

    Reads the module-level `df`, keeps the rows where both the
    'Standard_Bangla' column and `dialect_col` hold non-blank strings, and
    renames those columns to 'target' and 'source'.

    Args:
        dialect_col: Name of the `df` column holding the dialect sentences.

    Returns:
        A DatasetDict with 'train', 'validation' and 'test' splits (80/10/10,
        fixed seed 42), or None when fewer than 20 usable pairs remain.
    """
    print(f"\n--- Processing dialect: {dialect_col} ---")

    def has_text(value):
        # A cell is usable only if it is a string with non-whitespace content.
        return isinstance(value, str) and value.strip() != ""

    pairs = df[['Standard_Bangla', dialect_col]].dropna()
    standard_ok = pairs['Standard_Bangla'].apply(has_text)
    dialect_ok = pairs[dialect_col].apply(has_text)
    pairs = pairs[standard_ok & dialect_ok].rename(
        columns={'Standard_Bangla': 'target', dialect_col: 'source'}
    )

    # Too few rows cannot be split three ways in a meaningful manner.
    if len(pairs) < 20:
        print(f"‚ö†Ô∏è Insufficient data for {dialect_col}. Skipping.")
        return None

    # Hold out 20% of the rows, then halve the held-out part into val/test.
    holdout_split = Dataset.from_pandas(pairs).train_test_split(test_size=0.2, seed=42)
    eval_split = holdout_split['test'].train_test_split(test_size=0.5, seed=42)

    splits = DatasetDict({
        'train': holdout_split['train'],
        'validation': eval_split['train'],
        'test': eval_split['test'],
    })
    n_train, n_val, n_test = (len(splits[name]) for name in ('train', 'validation', 'test'))
    print(f"‚úÖ Dataset splits created for {dialect_col}: Train {n_train}, Val {n_val}, Test {n_test}")
    return splits


def tokenize_and_prepare_datasets(dataset_dict, max_length=128):
    """Tokenize the 'source' and 'target' text columns of every split.

    Sequences are truncated to `max_length` tokens but deliberately left
    unpadded. Padding every label row to a fixed length would fill it with
    the pad token id, and those positions would then count towards the loss
    because only label value -100 is ignored. The DataCollatorForSeq2Seq
    built in train_and_evaluate() pads each batch instead and pads labels
    with -100.

    Args:
        dataset_dict: DatasetDict whose splits have 'source' and 'target'
            string columns.
        max_length: Maximum number of tokens kept per sequence.

    Returns:
        The mapped DatasetDict with the text columns removed and the
        tokenizer outputs plus a 'labels' column added.
    """

    def tokenize_fn(examples):
        # The mBART tokenizer uses the `src_lang` set on the tokenizer.
        model_inputs = tokenizer(
            examples["source"],
            max_length=max_length,
            truncation=True,
        )
        # The mBART tokenizer uses `tgt_lang` when `text_target` is provided.
        labels = tokenizer(
            text_target=examples["target"],
            max_length=max_length,
            truncation=True,
        )
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_datasets = dataset_dict.map(
        tokenize_fn,
        batched=True,
        remove_columns=['source', 'target'],
    )
    print("‚úÖ Tokenization complete.")
    return tokenized_datasets


def train_and_evaluate(dialect_name, train_ds, val_ds, test_ds, original_test_ds=None):
    """Fine-tune a fresh mBART model on one dialect and score it on the test split.

    Args:
        dialect_name: Dialect label used in log messages and, lower-cased,
            in the output directory name.
        train_ds: Tokenized training split.
        val_ds: Tokenized validation split, evaluated every 500 steps.
        test_ds: Tokenized test split, scored once after training.
        original_test_ds: Optional untokenized test split whose rows carry
            the 'source' and 'target' text printed next to the predictions.
            It must be in the same row order as `test_ds`. Defaults to the
            module-level `split_dataset_dict["test"]`.

    Returns:
        The Seq2SeqTrainer used for training. `load_best_model_at_end` is
        enabled, so it should hold the checkpoint with the best validation
        BLEU; that model is also saved to `<output_dir>/best_model`.
    """
    # Release whatever a previous run left behind before loading new weights.
    gc.collect()
    torch.cuda.empty_cache()

    output_dir = f"/content/Bangla_Dialect_Models/mbart-bangla-{dialect_name.lower()}"

    # Start every dialect from the pretrained checkpoint, not the previous run.
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    # mBART-50 expects generation to open with the target language code.
    model.generation_config.forced_bos_token_id = tokenizer.convert_tokens_to_ids(tokenizer.tgt_lang)

    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
    bleu_metric = evaluate.load("sacrebleu")

    def decode_ids(token_ids):
        """Decode a batch of token ids to text, tolerating -100 padding."""
        # Some models hand back a tuple; the ids are its first element.
        if isinstance(token_ids, tuple):
            token_ids = token_ids[0]
        # -100 marks ignored/padded positions and is not a vocabulary id,
        # so it has to be swapped for the pad token before decoding.
        token_ids = np.where(token_ids != -100, token_ids, tokenizer.pad_token_id)
        return tokenizer.batch_decode(token_ids, skip_special_tokens=True)

    def compute_metrics(eval_pred):
        """Return corpus-level sacreBLEU for one evaluation pass."""
        predictions, labels = eval_pred
        decoded_preds = [pred.strip() for pred in decode_ids(predictions)]
        # sacreBLEU expects a list of references per prediction.
        decoded_labels = [[label.strip()] for label in decode_ids(labels)]
        result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
        return {"bleu": result["score"]}

    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        learning_rate=2e-5,

        # Gradient accumulation gives an effective train batch size of 8.
        per_device_train_batch_size=4,
        gradient_accumulation_steps=2,

        per_device_eval_batch_size=8,
        weight_decay=0.01,
        save_total_limit=2,

        num_train_epochs=5,

        # Score with generated text rather than teacher-forced logits.
        predict_with_generate=True,
        fp16=torch.cuda.is_available(),

        # Reload the checkpoint with the best validation BLEU after training.
        load_best_model_at_end=True,
        metric_for_best_model="bleu",

        # Checkpoints must be saved on the same schedule as evaluation.
        eval_strategy="steps",
        eval_steps=500,
        save_steps=500,
        logging_steps=100,

        warmup_steps=300,
        max_grad_norm=1.0,
        generation_max_length=128,
        generation_num_beams=4,

        # Disable external experiment loggers such as wandb.
        report_to="none",
    )

    # NOTE(review): the captured output shows a warning raised at this call;
    # newer transformers releases may prefer `processing_class=` over
    # `tokenizer=` -- confirm against the installed version before changing.
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    print(f"\n Starting training for {dialect_name} dialect...")
    trainer.train()

    print(f"\n Evaluating on the test set for {dialect_name}...")
    if original_test_ds is None:
        # Backward-compatible fallback: the tokenized split no longer has the
        # text columns, so read them from the split built by the training loop.
        original_test_ds = split_dataset_dict["test"]

    test_results = trainer.predict(test_ds)
    final_bleu_score = test_results.metrics.get('test_bleu', 0.0)
    print(f" Test Set BLEU Score for {dialect_name}: {final_bleu_score:.2f}")

    print("\nüîç Example Translations:")
    predictions = decode_ids(test_results.predictions)
    for i, prediction in enumerate(predictions[:5]):
        example = original_test_ds[i]
        print(f" Input (Dialect):   {example['source']}")
        print(f" Actual (Standard): {example['target']}")
        print(f" Predicted:         {prediction}\n")

    # Save the final (best) model
    trainer.save_model(f"{output_dir}/best_model")
    print(f"‚úÖ Best model for {dialect_name} saved to {output_dir}/best_model")

    return trainer


# ===============================
# üöÄ Training Loop (Improved)
# ===============================
for dialect in DIALECTS_TO_TRAIN:
    split_dataset_dict = create_dataset_dict(dialect)

    if split_dataset_dict is None:
        continue # Skip if dataset creation failed

    tokenized_datasets = tokenize_and_prepare_datasets(split_dataset_dict)

    # Drop the reference to the previous dialect's trainer first. While it is
    # still bound here, the memory cleanup at the start of train_and_evaluate()
    # cannot free that model before the next one is loaded.
    trained_trainer = None

    # train_and_evaluate handles model loading, training, scoring and saving.
    trained_trainer = train_and_evaluate(
        dialect,
        tokenized_datasets["train"],
        tokenized_datasets["validation"],
        tokenized_datasets["test"]
    )

    print(f"--- Finished processing {dialect} ---")


# Models are written under /content (the Drive mount above is commented out),
# so the message names that location rather than Google Drive.
print("\n‚úÖ Finished training all dialect models. Best models are saved under /content/Bangla_Dialect_Models.")

‚úÖ Libraries installed and imported successfully.
‚úÖ Successfully loaded the dataset

‚úÖ Tokenizer for 'facebook/mbart-large-50-many-to-many-mmt' loaded.
 Language pair: bn_IN ‚Üí bn_IN (Dialect ‚Üí Standard).

--- Processing dialect: Barisal ---
‚úÖ Dataset splits created for Barisal: Train 2784, Val 348, Test 348


Map:   0%|          | 0/2784 [00:00<?, ? examples/s]

Map:   0%|          | 0/348 [00:00<?, ? examples/s]

Map:   0%|          | 0/348 [00:00<?, ? examples/s]

‚úÖ Tokenization complete.


model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Seq2SeqTrainer(



 Starting training for Barisal dialect...


Step,Training Loss,Validation Loss,Bleu
500,0.1257,0.113813,35.764739
1000,0.0604,0.08956,44.350685
1500,0.0303,0.088473,50.370624


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].



 Evaluating on the test set for Barisal...


 Test Set BLEU Score for Barisal: 47.85

üîç Example Translations:
 Input (Dialect):   ‡¶Ü‡¶Æ‡ßç‡¶®‡ßá ‡¶ï‡¶ø ‡¶Æ‡ßã‡¶∞ ‡¶™‡ßç‡¶∞‡¶∂‡ßç‡¶®‡ßá‡¶∞ ‡¶ú‡¶¨‡¶æ‡¶¨ ‡¶¶‡ßá‡¶§‡ßá ‡¶™‡¶æ‡¶∞‡ßá‡¶®?
 Actual (Standard): ‡¶Ü‡¶™‡¶®‡¶ø ‡¶ï‡¶ø ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶™‡ßç‡¶∞‡¶∂‡ßç‡¶®‡ßá‡¶∞ ‡¶ú‡¶¨‡¶æ‡¶¨ ‡¶¶‡¶ø‡¶§‡ßá ‡¶™‡¶æ‡¶∞‡ßá‡¶®?
 Predicted:         ‡¶Ü‡¶™‡¶®‡¶ø ‡¶ï‡¶ø ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶™‡¶õ‡¶®‡ßç‡¶¶‡ßá‡¶∞ ‡¶ú‡¶¨‡¶æ‡¶¨ ‡¶¶‡¶ø‡¶§‡ßá ‡¶™‡¶æ‡¶∞‡ßá‡¶®?

 Input (Dialect):   ‡¶Ü‡¶ï‡¶æ‡¶∂‡ßá‡¶∞ ‡¶®‡ßÄ‡¶≤ ‡¶∞‡¶Ç‡¶°‡¶æ ‡¶Ö‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶®
 Actual (Standard): ‡¶Ü‡¶ï‡¶æ‡¶∂‡ßá‡¶∞ ‡¶®‡ßÄ‡¶≤ ‡¶∞‡¶ô‡¶ü‡¶ø ‡¶Ö‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶£
 Predicted:         ‡¶Ü‡¶ï‡¶æ‡¶∂‡ßá‡¶∞ ‡¶®‡ßÄ‡¶≤ ‡¶∞‡¶Ç‡¶ü‡¶æ ‡¶Ö‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶£

 Input (Dialect):   ‡¶õ‡ßã‡¶°‡ßã ‡¶¨‡ßÅ‡¶á‡¶® ‡¶ö‡¶ø‡¶≤‡ßç‡¶≤‡¶æ‡¶® ‡¶¶‡¶ø‡ßü‡¶æ ‡¶ì‡¶†‡¶õ‡ßá ‡¶¶‡¶æ‡¶∞‡ßÅ‡¶£
 Actual (Standard): ‡¶õ‡ßã‡¶ü‡ßã ‡¶¨‡ßã‡¶® ‡¶ö‡¶ø‡ßé‡¶ï‡¶æ‡¶∞ ‡¶¶‡¶ø‡ßü‡ßá ‡¶â‡¶†‡¶≤, ‡¶¶‡¶æ‡¶∞‡ßÅ‡¶£
 Predicted:         ‡¶õ‡ßã‡¶ü ‡¶¨‡ßã‡¶® ‡¶ö‡¶ø‡ßé ‡¶•‡ßá‡¶ï‡ßá ‡¶â‡¶†‡ßá‡¶õ‡ßá ‡¶Ö‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶£

 Inp

## **BARISAL**

In [None]:
# ===============================
#  Bangla Dialect ‚Üí Standard Bangla using mBART-50 (Improved)
# ===============================

!pip install transformers[sentencepiece] datasets sacrebleu evaluate torch pandas openpyxl --quiet

import pandas as pd
import numpy as np
import evaluate
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
import torch
import gc # <-- Added for memory cleanup
from google.colab import drive

print("‚úÖ Libraries installed and imported successfully.")
#drive.mount('/content/drive')

# ===============================
#  Load Dataset
# ===============================

# Load the parallel corpus from the Colab filesystem. If the workbook is
# missing, fall back to a one-row sample frame so the rest of the cell still
# runs. That frame has fewer than the 20 rows create_dataset_dict() requires,
# so every dialect is then reported as insufficient and skipped.
try:
    df = pd.read_excel("/content/bangla_dialect_aligned_18920.xlsx")
    print("‚úÖ Successfully loaded the dataset")
except FileNotFoundError:
    print("‚ö†Ô∏è Dataset not found ‚Äî using sample data.")
    df = pd.DataFrame({
        'Standard_Bangla': ["‡¶∏‡ßá ‡¶∏‡ßç‡¶ï‡ßÅ‡¶≤‡ßá ‡¶Ø‡¶æ‡¶Ø‡¶º„ÄÇ"],
        'Barisal': ["‡¶π‡ßá‡¶á ‡¶á‡¶∏‡ßç‡¶ï‡ßÅ‡¶≤‡ßá ‡¶Ø‡¶æ‡¶Ø‡¶º„ÄÇ"],
        'Chittagong': ["‡¶π‡ßá‡¶á ‡¶∏‡ßç‡¶ï‡ßã‡¶≤‡ßá ‡¶Ø‡¶æ‡¶Ø‡¶º„ÄÇ"],
        'Sylhet': ["‡¶§‡¶æ‡¶∞‡ßá ‡¶á‡¶∏‡ßç‡¶ï‡ßÅ‡¶≤‡ßá ‡¶Ø‡¶æ‡¶Ø‡¶º„ÄÇ"]
    })

# Choose dialects to train. Each entry must be a column name of `df`; the
# training loop at the bottom of the cell processes them in order.
DIALECTS_TO_TRAIN = ['Barisal'] # Can now train multiple, one after another

# ===============================
#  Load Tokenizer (mBART)
# ===============================

# Pretrained checkpoint; train_and_evaluate() loads a fresh copy of its
# weights for every dialect.
MODEL_NAME = "facebook/mbart-large-50-many-to-many-mmt"

# Load the tokenizer once at module level; the helper functions below read
# this shared `tokenizer` instead of taking it as an argument.
# For mBART, the source/target language is set on the tokenizer instance.
# Both sides use the same code because dialect and standard text are Bangla.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.src_lang = "bn_IN"
tokenizer.tgt_lang = "bn_IN"

print(f"\n‚úÖ Tokenizer for '{MODEL_NAME}' loaded.")
print(f" Language pair: bn_IN ‚Üí bn_IN (Dialect ‚Üí Standard).")


# ===============================
#  Helper Functions
# ===============================

def create_dataset_dict(dialect_col):
    """Build train/validation/test splits of (dialect, standard) sentence pairs.

    Reads the module-level `df`, keeps the rows where both the
    'Standard_Bangla' column and `dialect_col` hold non-blank strings, and
    renames those columns to 'target' and 'source'.

    Args:
        dialect_col: Name of the `df` column holding the dialect sentences.

    Returns:
        A DatasetDict with 'train', 'validation' and 'test' splits (80/10/10,
        fixed seed 42), or None when fewer than 20 usable pairs remain.
    """
    print(f"\n--- Processing dialect: {dialect_col} ---")

    def has_text(value):
        # A cell is usable only if it is a string with non-whitespace content.
        return isinstance(value, str) and value.strip() != ""

    pairs = df[['Standard_Bangla', dialect_col]].dropna()
    standard_ok = pairs['Standard_Bangla'].apply(has_text)
    dialect_ok = pairs[dialect_col].apply(has_text)
    pairs = pairs[standard_ok & dialect_ok].rename(
        columns={'Standard_Bangla': 'target', dialect_col: 'source'}
    )

    # Too few rows cannot be split three ways in a meaningful manner.
    if len(pairs) < 20:
        print(f"‚ö†Ô∏è Insufficient data for {dialect_col}. Skipping.")
        return None

    # Hold out 20% of the rows, then halve the held-out part into val/test.
    holdout_split = Dataset.from_pandas(pairs).train_test_split(test_size=0.2, seed=42)
    eval_split = holdout_split['test'].train_test_split(test_size=0.5, seed=42)

    splits = DatasetDict({
        'train': holdout_split['train'],
        'validation': eval_split['train'],
        'test': eval_split['test'],
    })
    n_train, n_val, n_test = (len(splits[name]) for name in ('train', 'validation', 'test'))
    print(f"‚úÖ Dataset splits created for {dialect_col}: Train {n_train}, Val {n_val}, Test {n_test}")
    return splits


def tokenize_and_prepare_datasets(dataset_dict, max_length=128):
    """Tokenize the 'source' and 'target' text columns of every split.

    Sequences are truncated to `max_length` tokens but deliberately left
    unpadded. Padding every label row to a fixed length would fill it with
    the pad token id, and those positions would then count towards the loss
    because only label value -100 is ignored. The DataCollatorForSeq2Seq
    built in train_and_evaluate() pads each batch instead and pads labels
    with -100.

    Args:
        dataset_dict: DatasetDict whose splits have 'source' and 'target'
            string columns.
        max_length: Maximum number of tokens kept per sequence.

    Returns:
        The mapped DatasetDict with the text columns removed and the
        tokenizer outputs plus a 'labels' column added.
    """

    def tokenize_fn(examples):
        # The mBART tokenizer uses the `src_lang` set on the tokenizer.
        model_inputs = tokenizer(
            examples["source"],
            max_length=max_length,
            truncation=True,
        )
        # The mBART tokenizer uses `tgt_lang` when `text_target` is provided.
        labels = tokenizer(
            text_target=examples["target"],
            max_length=max_length,
            truncation=True,
        )
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_datasets = dataset_dict.map(
        tokenize_fn,
        batched=True,
        remove_columns=['source', 'target'],
    )
    print("‚úÖ Tokenization complete.")
    return tokenized_datasets


def train_and_evaluate(dialect_name, train_ds, val_ds, test_ds, original_test_ds=None):
    """Fine-tune a fresh mBART model on one dialect and score it on the test split.

    Args:
        dialect_name: Dialect label used in log messages and, lower-cased,
            in the output directory name.
        train_ds: Tokenized training split.
        val_ds: Tokenized validation split, evaluated every 500 steps.
        test_ds: Tokenized test split, scored once after training.
        original_test_ds: Optional untokenized test split whose rows carry
            the 'source' and 'target' text printed next to the predictions.
            It must be in the same row order as `test_ds`. Defaults to the
            module-level `split_dataset_dict["test"]`.

    Returns:
        The Seq2SeqTrainer used for training. `load_best_model_at_end` is
        enabled, so it should hold the checkpoint with the best validation
        BLEU; that model is also saved to `<output_dir>/best_model`.
    """
    # Release whatever a previous run left behind before loading new weights.
    gc.collect()
    torch.cuda.empty_cache()

    output_dir = f"/content/Bangla_Dialect_Models/mbart-bangla-{dialect_name.lower()}"

    # Start every dialect from the pretrained checkpoint, not the previous run.
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    # mBART-50 expects generation to open with the target language code.
    model.generation_config.forced_bos_token_id = tokenizer.convert_tokens_to_ids(tokenizer.tgt_lang)

    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
    bleu_metric = evaluate.load("sacrebleu")

    def decode_ids(token_ids):
        """Decode a batch of token ids to text, tolerating -100 padding."""
        # Some models hand back a tuple; the ids are its first element.
        if isinstance(token_ids, tuple):
            token_ids = token_ids[0]
        # -100 marks ignored/padded positions and is not a vocabulary id,
        # so it has to be swapped for the pad token before decoding.
        token_ids = np.where(token_ids != -100, token_ids, tokenizer.pad_token_id)
        return tokenizer.batch_decode(token_ids, skip_special_tokens=True)

    def compute_metrics(eval_pred):
        """Return corpus-level sacreBLEU for one evaluation pass."""
        predictions, labels = eval_pred
        decoded_preds = [pred.strip() for pred in decode_ids(predictions)]
        # sacreBLEU expects a list of references per prediction.
        decoded_labels = [[label.strip()] for label in decode_ids(labels)]
        result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
        return {"bleu": result["score"]}

    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        learning_rate=2e-5,

        # Gradient accumulation gives an effective train batch size of 8.
        per_device_train_batch_size=4,
        gradient_accumulation_steps=2,

        per_device_eval_batch_size=8,
        weight_decay=0.01,
        save_total_limit=2,

        num_train_epochs=5,

        # Score with generated text rather than teacher-forced logits.
        predict_with_generate=True,
        fp16=torch.cuda.is_available(),

        # Reload the checkpoint with the best validation BLEU after training.
        load_best_model_at_end=True,
        metric_for_best_model="bleu",

        # Checkpoints must be saved on the same schedule as evaluation.
        eval_strategy="steps",
        eval_steps=500,
        save_steps=500,
        logging_steps=100,

        warmup_steps=300,
        max_grad_norm=1.0,
        generation_max_length=128,
        generation_num_beams=4,

        # Disable external experiment loggers such as wandb.
        report_to="none",
    )

    # NOTE(review): the captured output shows a warning raised at this call;
    # newer transformers releases may prefer `processing_class=` over
    # `tokenizer=` -- confirm against the installed version before changing.
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    print(f"\n Starting training for {dialect_name} dialect...")
    trainer.train()

    print(f"\n Evaluating on the test set for {dialect_name}...")
    if original_test_ds is None:
        # Backward-compatible fallback: the tokenized split no longer has the
        # text columns, so read them from the split built by the training loop.
        original_test_ds = split_dataset_dict["test"]

    test_results = trainer.predict(test_ds)
    final_bleu_score = test_results.metrics.get('test_bleu', 0.0)
    print(f" Test Set BLEU Score for {dialect_name}: {final_bleu_score:.2f}")

    print("\nüîç Example Translations:")
    predictions = decode_ids(test_results.predictions)
    for i, prediction in enumerate(predictions[:5]):
        example = original_test_ds[i]
        print(f" Input (Dialect):   {example['source']}")
        print(f" Actual (Standard): {example['target']}")
        print(f" Predicted:         {prediction}\n")

    # Save the final (best) model
    trainer.save_model(f"{output_dir}/best_model")
    print(f"‚úÖ Best model for {dialect_name} saved to {output_dir}/best_model")

    return trainer


# ===============================
# üöÄ Training Loop (Improved)
# ===============================
for dialect in DIALECTS_TO_TRAIN:
    split_dataset_dict = create_dataset_dict(dialect)

    if split_dataset_dict is None:
        continue # Skip if dataset creation failed

    tokenized_datasets = tokenize_and_prepare_datasets(split_dataset_dict)

    # Drop the reference to the previous dialect's trainer first. While it is
    # still bound here, the memory cleanup at the start of train_and_evaluate()
    # cannot free that model before the next one is loaded.
    trained_trainer = None

    # train_and_evaluate handles model loading, training, scoring and saving.
    trained_trainer = train_and_evaluate(
        dialect,
        tokenized_datasets["train"],
        tokenized_datasets["validation"],
        tokenized_datasets["test"]
    )

    print(f"--- Finished processing {dialect} ---")


# Models are written under /content (the Drive mount above is commented out),
# so the message names that location rather than Google Drive.
print("\n‚úÖ Finished training all dialect models. Best models are saved under /content/Bangla_Dialect_Models.")

‚úÖ Libraries installed and imported successfully.
‚úÖ Successfully loaded the dataset

‚úÖ Tokenizer for 'facebook/mbart-large-50-many-to-many-mmt' loaded.
 Language pair: bn_IN ‚Üí bn_IN (Dialect ‚Üí Standard).

--- Processing dialect: Barisal ---
‚úÖ Dataset splits created for Barisal: Train 2784, Val 348, Test 348


Map:   0%|          | 0/2784 [00:00<?, ? examples/s]

Map:   0%|          | 0/348 [00:00<?, ? examples/s]

Map:   0%|          | 0/348 [00:00<?, ? examples/s]

‚úÖ Tokenization complete.


model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Seq2SeqTrainer(



 Starting training for Barisal dialect...


Step,Training Loss,Validation Loss,Bleu
500,0.1257,0.113813,35.764739
1000,0.0604,0.08956,44.350685
1500,0.0303,0.088473,50.370624


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].



 Evaluating on the test set for Barisal...


 Test Set BLEU Score for Barisal: 47.85

üîç Example Translations:
 Input (Dialect):   ‡¶Ü‡¶Æ‡ßç‡¶®‡ßá ‡¶ï‡¶ø ‡¶Æ‡ßã‡¶∞ ‡¶™‡ßç‡¶∞‡¶∂‡ßç‡¶®‡ßá‡¶∞ ‡¶ú‡¶¨‡¶æ‡¶¨ ‡¶¶‡ßá‡¶§‡ßá ‡¶™‡¶æ‡¶∞‡ßá‡¶®?
 Actual (Standard): ‡¶Ü‡¶™‡¶®‡¶ø ‡¶ï‡¶ø ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶™‡ßç‡¶∞‡¶∂‡ßç‡¶®‡ßá‡¶∞ ‡¶ú‡¶¨‡¶æ‡¶¨ ‡¶¶‡¶ø‡¶§‡ßá ‡¶™‡¶æ‡¶∞‡ßá‡¶®?
 Predicted:         ‡¶Ü‡¶™‡¶®‡¶ø ‡¶ï‡¶ø ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶™‡¶õ‡¶®‡ßç‡¶¶‡ßá‡¶∞ ‡¶ú‡¶¨‡¶æ‡¶¨ ‡¶¶‡¶ø‡¶§‡ßá ‡¶™‡¶æ‡¶∞‡ßá‡¶®?

 Input (Dialect):   ‡¶Ü‡¶ï‡¶æ‡¶∂‡ßá‡¶∞ ‡¶®‡ßÄ‡¶≤ ‡¶∞‡¶Ç‡¶°‡¶æ ‡¶Ö‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶®
 Actual (Standard): ‡¶Ü‡¶ï‡¶æ‡¶∂‡ßá‡¶∞ ‡¶®‡ßÄ‡¶≤ ‡¶∞‡¶ô‡¶ü‡¶ø ‡¶Ö‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶£
 Predicted:         ‡¶Ü‡¶ï‡¶æ‡¶∂‡ßá‡¶∞ ‡¶®‡ßÄ‡¶≤ ‡¶∞‡¶Ç‡¶ü‡¶æ ‡¶Ö‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶£

 Input (Dialect):   ‡¶õ‡ßã‡¶°‡ßã ‡¶¨‡ßÅ‡¶á‡¶® ‡¶ö‡¶ø‡¶≤‡ßç‡¶≤‡¶æ‡¶® ‡¶¶‡¶ø‡ßü‡¶æ ‡¶ì‡¶†‡¶õ‡ßá ‡¶¶‡¶æ‡¶∞‡ßÅ‡¶£
 Actual (Standard): ‡¶õ‡ßã‡¶ü‡ßã ‡¶¨‡ßã‡¶® ‡¶ö‡¶ø‡ßé‡¶ï‡¶æ‡¶∞ ‡¶¶‡¶ø‡ßü‡ßá ‡¶â‡¶†‡¶≤, ‡¶¶‡¶æ‡¶∞‡ßÅ‡¶£
 Predicted:         ‡¶õ‡ßã‡¶ü ‡¶¨‡ßã‡¶® ‡¶ö‡¶ø‡ßé ‡¶•‡ßá‡¶ï‡ßá ‡¶â‡¶†‡ßá‡¶õ‡ßá ‡¶Ö‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶£

 Inp

## **SYLHET**


In [None]:
# ===============================
#  Bangla Dialect ‚Üí Standard Bangla using mBART-50 (Improved)
# ===============================

!pip install transformers[sentencepiece] datasets sacrebleu evaluate torch pandas openpyxl --quiet

import pandas as pd
import numpy as np
import evaluate
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
import torch
import gc # <-- Added for memory cleanup
from google.colab import drive

print("‚úÖ Libraries installed and imported successfully.")
#drive.mount('/content/drive')

# ===============================
#  Load Dataset
# ===============================

# Load the parallel corpus from the Colab filesystem. If the workbook is
# missing, fall back to a one-row sample frame so the rest of the cell still
# runs. That frame has fewer than the 20 rows create_dataset_dict() requires,
# so every dialect is then reported as insufficient and skipped.
try:
    df = pd.read_excel("/content/bangla_dialect_aligned_18920.xlsx")
    print("‚úÖ Successfully loaded the dataset")
except FileNotFoundError:
    print("‚ö†Ô∏è Dataset not found ‚Äî using sample data.")
    df = pd.DataFrame({
        'Standard_Bangla': ["‡¶∏‡ßá ‡¶∏‡ßç‡¶ï‡ßÅ‡¶≤‡ßá ‡¶Ø‡¶æ‡¶Ø‡¶º„ÄÇ"],
        'Barisal': ["‡¶π‡ßá‡¶á ‡¶á‡¶∏‡ßç‡¶ï‡ßÅ‡¶≤‡ßá ‡¶Ø‡¶æ‡¶Ø‡¶º„ÄÇ"],
        'Chittagong': ["‡¶π‡ßá‡¶á ‡¶∏‡ßç‡¶ï‡ßã‡¶≤‡ßá ‡¶Ø‡¶æ‡¶Ø‡¶º„ÄÇ"],
        'Sylhet': ["‡¶§‡¶æ‡¶∞‡ßá ‡¶á‡¶∏‡ßç‡¶ï‡ßÅ‡¶≤‡ßá ‡¶Ø‡¶æ‡¶Ø‡¶º„ÄÇ"]
    })

# Choose dialects to train. Each entry must be a column name of `df`; the
# training loop at the bottom of the cell processes them in order.
DIALECTS_TO_TRAIN = ['Sylhet'] # Can now train multiple, one after another

# ===============================
#  Load Tokenizer (mBART)
# ===============================

# Pretrained checkpoint; train_and_evaluate() loads a fresh copy of its
# weights for every dialect.
MODEL_NAME = "facebook/mbart-large-50-many-to-many-mmt"

# Load the tokenizer once at module level; the helper functions below read
# this shared `tokenizer` instead of taking it as an argument.
# For mBART, the source/target language is set on the tokenizer instance.
# Both sides use the same code because dialect and standard text are Bangla.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.src_lang = "bn_IN"
tokenizer.tgt_lang = "bn_IN"

print(f"\n‚úÖ Tokenizer for '{MODEL_NAME}' loaded.")
print(f" Language pair: bn_IN ‚Üí bn_IN (Dialect ‚Üí Standard).")


# ===============================
#  Helper Functions
# ===============================

def create_dataset_dict(dialect_col):
    """Build train/validation/test splits of (dialect, standard) sentence pairs.

    Reads the module-level `df`, keeps the rows where both the
    'Standard_Bangla' column and `dialect_col` hold non-blank strings, and
    renames those columns to 'target' and 'source'.

    Args:
        dialect_col: Name of the `df` column holding the dialect sentences.

    Returns:
        A DatasetDict with 'train', 'validation' and 'test' splits (80/10/10,
        fixed seed 42), or None when fewer than 20 usable pairs remain.
    """
    print(f"\n--- Processing dialect: {dialect_col} ---")

    def has_text(value):
        # A cell is usable only if it is a string with non-whitespace content.
        return isinstance(value, str) and value.strip() != ""

    pairs = df[['Standard_Bangla', dialect_col]].dropna()
    standard_ok = pairs['Standard_Bangla'].apply(has_text)
    dialect_ok = pairs[dialect_col].apply(has_text)
    pairs = pairs[standard_ok & dialect_ok].rename(
        columns={'Standard_Bangla': 'target', dialect_col: 'source'}
    )

    # Too few rows cannot be split three ways in a meaningful manner.
    if len(pairs) < 20:
        print(f"‚ö†Ô∏è Insufficient data for {dialect_col}. Skipping.")
        return None

    # Hold out 20% of the rows, then halve the held-out part into val/test.
    holdout_split = Dataset.from_pandas(pairs).train_test_split(test_size=0.2, seed=42)
    eval_split = holdout_split['test'].train_test_split(test_size=0.5, seed=42)

    splits = DatasetDict({
        'train': holdout_split['train'],
        'validation': eval_split['train'],
        'test': eval_split['test'],
    })
    n_train, n_val, n_test = (len(splits[name]) for name in ('train', 'validation', 'test'))
    print(f"‚úÖ Dataset splits created for {dialect_col}: Train {n_train}, Val {n_val}, Test {n_test}")
    return splits


def tokenize_and_prepare_datasets(dataset_dict, max_length=128):
    """Tokenize the 'source' and 'target' text columns of every split.

    Sequences are truncated to `max_length` tokens but deliberately left
    unpadded. Padding every label row to a fixed length would fill it with
    the pad token id, and those positions would then count towards the loss
    because only label value -100 is ignored. The DataCollatorForSeq2Seq
    built in train_and_evaluate() pads each batch instead and pads labels
    with -100.

    Args:
        dataset_dict: DatasetDict whose splits have 'source' and 'target'
            string columns.
        max_length: Maximum number of tokens kept per sequence.

    Returns:
        The mapped DatasetDict with the text columns removed and the
        tokenizer outputs plus a 'labels' column added.
    """

    def tokenize_fn(examples):
        # The mBART tokenizer uses the `src_lang` set on the tokenizer.
        model_inputs = tokenizer(
            examples["source"],
            max_length=max_length,
            truncation=True,
        )
        # The mBART tokenizer uses `tgt_lang` when `text_target` is provided.
        labels = tokenizer(
            text_target=examples["target"],
            max_length=max_length,
            truncation=True,
        )
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_datasets = dataset_dict.map(
        tokenize_fn,
        batched=True,
        remove_columns=['source', 'target'],
    )
    print("‚úÖ Tokenization complete.")
    return tokenized_datasets


def train_and_evaluate(dialect_name, train_ds, val_ds, test_ds, original_test_ds=None):
    """Fine-tune a fresh mBART model on one dialect and score it on the test split.

    Args:
        dialect_name: Dialect label used in log messages and, lower-cased,
            in the output directory name.
        train_ds: Tokenized training split.
        val_ds: Tokenized validation split, evaluated every 500 steps.
        test_ds: Tokenized test split, scored once after training.
        original_test_ds: Optional untokenized test split whose rows carry
            the 'source' and 'target' text printed next to the predictions.
            It must be in the same row order as `test_ds`. Defaults to the
            module-level `split_dataset_dict["test"]`.

    Returns:
        The Seq2SeqTrainer used for training. `load_best_model_at_end` is
        enabled, so it should hold the checkpoint with the best validation
        BLEU; that model is also saved to `<output_dir>/best_model`.
    """
    # Release whatever a previous run left behind before loading new weights.
    gc.collect()
    torch.cuda.empty_cache()

    output_dir = f"/content/Bangla_Dialect_Models/mbart-bangla-{dialect_name.lower()}"

    # Start every dialect from the pretrained checkpoint, not the previous run.
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    # mBART-50 expects generation to open with the target language code.
    model.generation_config.forced_bos_token_id = tokenizer.convert_tokens_to_ids(tokenizer.tgt_lang)

    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
    bleu_metric = evaluate.load("sacrebleu")

    def decode_ids(token_ids):
        """Decode a batch of token ids to text, tolerating -100 padding."""
        # Some models hand back a tuple; the ids are its first element.
        if isinstance(token_ids, tuple):
            token_ids = token_ids[0]
        # -100 marks ignored/padded positions and is not a vocabulary id,
        # so it has to be swapped for the pad token before decoding.
        token_ids = np.where(token_ids != -100, token_ids, tokenizer.pad_token_id)
        return tokenizer.batch_decode(token_ids, skip_special_tokens=True)

    def compute_metrics(eval_pred):
        """Return corpus-level sacreBLEU for one evaluation pass."""
        predictions, labels = eval_pred
        decoded_preds = [pred.strip() for pred in decode_ids(predictions)]
        # sacreBLEU expects a list of references per prediction.
        decoded_labels = [[label.strip()] for label in decode_ids(labels)]
        result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
        return {"bleu": result["score"]}

    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        learning_rate=2e-5,

        # Gradient accumulation gives an effective train batch size of 8.
        per_device_train_batch_size=4,
        gradient_accumulation_steps=2,

        per_device_eval_batch_size=8,
        weight_decay=0.01,
        save_total_limit=2,

        num_train_epochs=5,

        # Score with generated text rather than teacher-forced logits.
        predict_with_generate=True,
        fp16=torch.cuda.is_available(),

        # Reload the checkpoint with the best validation BLEU after training.
        load_best_model_at_end=True,
        metric_for_best_model="bleu",

        # Checkpoints must be saved on the same schedule as evaluation.
        eval_strategy="steps",
        eval_steps=500,
        save_steps=500,
        logging_steps=100,

        warmup_steps=300,
        max_grad_norm=1.0,
        generation_max_length=128,
        generation_num_beams=4,

        # Disable external experiment loggers such as wandb.
        report_to="none",
    )

    # NOTE(review): the captured output shows a warning raised at this call;
    # newer transformers releases may prefer `processing_class=` over
    # `tokenizer=` -- confirm against the installed version before changing.
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    print(f"\n Starting training for {dialect_name} dialect...")
    trainer.train()

    print(f"\n Evaluating on the test set for {dialect_name}...")
    if original_test_ds is None:
        # Backward-compatible fallback: the tokenized split no longer has the
        # text columns, so read them from the split built by the training loop.
        original_test_ds = split_dataset_dict["test"]

    test_results = trainer.predict(test_ds)
    final_bleu_score = test_results.metrics.get('test_bleu', 0.0)
    print(f" Test Set BLEU Score for {dialect_name}: {final_bleu_score:.2f}")

    print("\nüîç Example Translations:")
    predictions = decode_ids(test_results.predictions)
    for i, prediction in enumerate(predictions[:5]):
        example = original_test_ds[i]
        print(f" Input (Dialect):   {example['source']}")
        print(f" Actual (Standard): {example['target']}")
        print(f" Predicted:         {prediction}\n")

    # Save the final (best) model
    trainer.save_model(f"{output_dir}/best_model")
    print(f"‚úÖ Best model for {dialect_name} saved to {output_dir}/best_model")

    return trainer


# ===============================
# üöÄ Training Loop (Improved)
# ===============================
for dialect in DIALECTS_TO_TRAIN:
    split_dataset_dict = create_dataset_dict(dialect)

    if split_dataset_dict is None:
        continue # Skip if dataset creation failed

    tokenized_datasets = tokenize_and_prepare_datasets(split_dataset_dict)

    # Drop the reference to the previous dialect's trainer first. While it is
    # still bound here, the memory cleanup at the start of train_and_evaluate()
    # cannot free that model before the next one is loaded.
    trained_trainer = None

    # train_and_evaluate handles model loading, training, scoring and saving.
    trained_trainer = train_and_evaluate(
        dialect,
        tokenized_datasets["train"],
        tokenized_datasets["validation"],
        tokenized_datasets["test"]
    )

    print(f"--- Finished processing {dialect} ---")


# Models are written under /content (the Drive mount above is commented out),
# so the message names that location rather than Google Drive.
print("\n‚úÖ Finished training all dialect models. Best models are saved under /content/Bangla_Dialect_Models.")

[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m51.8/51.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m104.1/104.1 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h‚úÖ Libraries installed and imported successfully.
‚úÖ Successfully loaded the dataset


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]


‚úÖ Tokenizer for 'facebook/mbart-large-50-many-to-many-mmt' loaded.
 Language pair: bn_IN ‚Üí bn_IN (Dialect ‚Üí Standard).

--- Processing dialect: Sylhet ---
‚úÖ Dataset splits created for Sylhet: Train 2784, Val 348, Test 348


Map:   0%|          | 0/2784 [00:00<?, ? examples/s]

Map:   0%|          | 0/348 [00:00<?, ? examples/s]

Map:   0%|          | 0/348 [00:00<?, ? examples/s]

‚úÖ Tokenization complete.


model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Seq2SeqTrainer(



 Starting training for Sylhet dialect...


Step,Training Loss,Validation Loss,Bleu
500,0.1384,0.131626,28.56629
1000,0.0721,0.108993,37.558164
1500,0.0357,0.108468,41.329051


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].



 Evaluating on the test set for Sylhet...


 Test Set BLEU Score for Sylhet: 38.55

üîç Example Translations:
 Input (Dialect):   ‡¶Ü‡¶´‡¶®‡ßá ‡¶ï‡¶ø‡¶§‡¶æ ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶™‡ßç‡¶∞‡¶∂‡ßç‡¶®'‡¶∞ ‡¶ú‡¶¨‡¶æ‡¶¨ ‡¶¶‡¶ø‡¶§‡ßá ‡¶´‡¶æ‡¶∞‡¶¨‡¶æ ‡¶®‡¶ø?
 Actual (Standard): ‡¶Ü‡¶™‡¶®‡¶ø ‡¶ï‡¶ø ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶™‡ßç‡¶∞‡¶∂‡ßç‡¶®‡ßá‡¶∞ ‡¶ú‡¶¨‡¶æ‡¶¨ ‡¶¶‡¶ø‡¶§‡ßá ‡¶™‡¶æ‡¶∞‡ßá‡¶®?
 Predicted:         ‡¶Ü‡¶™‡¶®‡¶ø ‡¶ï‡¶ø ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶¨‡¶®‡ßç‡¶ß‡ßÅ‡¶∞ ‡¶ú‡¶¨‡¶æ‡¶¨ ‡¶¶‡¶ø‡¶§‡ßá ‡¶™‡¶æ‡¶∞‡¶¨‡ßá‡¶®?

 Input (Dialect):   ‡¶Ü‡¶ï‡¶æ‡¶∂ ‡¶ì‡¶∞ ‡¶®‡ßÄ‡¶≤ ‡¶∞‡¶ô‡¶ü‡¶æ ‡¶Ö‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶£
 Actual (Standard): ‡¶Ü‡¶ï‡¶æ‡¶∂‡ßá‡¶∞ ‡¶®‡ßÄ‡¶≤ ‡¶∞‡¶ô‡¶ü‡¶ø ‡¶Ö‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶£
 Predicted:         ‡¶Ü‡¶ï‡¶æ‡¶∂‡ßá‡¶∞ ‡¶®‡ßÄ‡¶≤ ‡¶∞‡¶ô‡¶ü‡¶æ ‡¶Ö‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶£

 Input (Dialect):   ‡¶õ‡ßã‡¶ü ‡¶¨‡¶á‡¶®‡ßá ‡¶ö‡¶ø‡¶≤‡ßç‡¶≤‡¶æ‡¶á‡ßü‡¶æ ‡¶â‡¶†‡¶≤‡ßã, ‡¶¶‡¶æ‡¶∞‡ßÅ‡¶£
 Actual (Standard): ‡¶õ‡ßã‡¶ü‡ßã ‡¶¨‡ßã‡¶® ‡¶ö‡¶ø‡ßé‡¶ï‡¶æ‡¶∞ ‡¶¶‡¶ø‡ßü‡ßá ‡¶â‡¶†‡¶≤, ‡¶¶‡¶æ‡¶∞‡ßÅ‡¶£
 Predicted:         ‡¶õ‡ßã‡¶ü‡ßã ‡¶¨‡ßã‡¶® ‡¶ö‡¶ø‡¶∞‡¶ø‡¶Ø‡¶º‡ßá ‡¶â‡¶†‡¶≤‡ßã,‡¶¶‡¶æ‡¶∞‡ßÅ‡¶®

 Input (D

## **CHITTAGONG**

In [None]:
# ===============================
#  Bangla Dialect ‚Üí Standard Bangla using mBART-50 (Improved)
# ===============================

!pip install transformers[sentencepiece] datasets sacrebleu evaluate torch pandas openpyxl --quiet

# Data handling and metric libraries.
import pandas as pd
import numpy as np
import evaluate
from datasets import Dataset, DatasetDict
# Hugging Face seq2seq components used by the helper functions in this cell.
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
import torch
import gc # called together with torch.cuda.empty_cache() before each training run
from google.colab import drive # only referenced by the disabled drive.mount() call below

print("‚úÖ Libraries installed and imported successfully.")
# Drive mounting is disabled; the dataset path and model output directory used
# in this cell are both under /content.
#drive.mount('/content/drive')

# ===============================
#  Load Dataset
# ===============================

# Read the aligned dialect spreadsheet; if the file is absent, continue with a
# single-row sample frame so the rest of the cell can still execute.
try:
    df = pd.read_excel("/content/bangla_dialect_aligned_18920.xlsx")
except FileNotFoundError:
    print("‚ö†Ô∏è Dataset not found ‚Äî using sample data.")
    df = pd.DataFrame(
        [[
            "‡¶∏‡ßá ‡¶∏‡ßç‡¶ï‡ßÅ‡¶≤‡ßá ‡¶Ø‡¶æ‡¶Ø‡¶º„ÄÇ",
            "‡¶π‡ßá‡¶á ‡¶á‡¶∏‡ßç‡¶ï‡ßÅ‡¶≤‡ßá ‡¶Ø‡¶æ‡¶Ø‡¶º„ÄÇ",
            "‡¶π‡ßá‡¶á ‡¶∏‡ßç‡¶ï‡ßã‡¶≤‡ßá ‡¶Ø‡¶æ‡¶Ø‡¶º„ÄÇ",
            "‡¶§‡¶æ‡¶∞‡ßá ‡¶á‡¶∏‡ßç‡¶ï‡ßÅ‡¶≤‡ßá ‡¶Ø‡¶æ‡¶Ø‡¶º„ÄÇ",
        ]],
        columns=['Standard_Bangla', 'Barisal', 'Chittagong', 'Sylhet'],
    )
else:
    print("‚úÖ Successfully loaded the dataset")

# Dialect columns to fine-tune on; each entry is trained in turn by the loop
# at the bottom of the cell.
DIALECTS_TO_TRAIN = ['Chittagong']

# ===============================
#  Load Tokenizer (mBART)
# ===============================

MODEL_NAME = "facebook/mbart-large-50-many-to-many-mmt"

# One tokenizer instance is shared by every helper below. Source and target
# use the same mBART language code because both sides are Bangla.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.src_lang = tokenizer.tgt_lang = "bn_IN"

print(
    f"\n‚úÖ Tokenizer for '{MODEL_NAME}' loaded.",
    " Language pair: bn_IN ‚Üí bn_IN (Dialect ‚Üí Standard).",
    sep="\n",
)


# ===============================
#  Helper Functions
# ===============================

def create_dataset_dict(dialect_col):
    """Build train/validation/test splits pairing one dialect with Standard Bangla.

    Reads the module-level ``df``, keeps the rows where both the
    ``Standard_Bangla`` column and ``dialect_col`` hold non-blank strings,
    renames them to ``target`` / ``source`` and splits them 80/10/10 with a
    fixed seed.

    Args:
        dialect_col: Name of the dialect column in ``df``.

    Returns:
        A ``DatasetDict`` with ``train``, ``validation`` and ``test`` splits,
        or ``None`` when a required column is missing or fewer than 20 usable
        rows remain.
    """
    print(f"\n--- Processing dialect: {dialect_col} ---")

    # Guard against a missing column (e.g. the fallback sample frame does not
    # contain every dialect) instead of failing with a KeyError.
    required_cols = ['Standard_Bangla', dialect_col]
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        print(f"‚ö†Ô∏è Column(s) {missing_cols} not found in the dataset. Skipping {dialect_col}.")
        return None

    def is_nonblank_text(value):
        return isinstance(value, str) and value.strip() != ""

    pairs = df[required_cols].dropna()
    keep = (
        pairs['Standard_Bangla'].map(is_nonblank_text).astype(bool)
        & pairs[dialect_col].map(is_nonblank_text).astype(bool)
    )
    subset_df = pairs[keep].rename(
        columns={'Standard_Bangla': 'target', dialect_col: 'source'}
    )

    if len(subset_df) < 20: # Need enough data to split
        print(f"‚ö†Ô∏è Insufficient data for {dialect_col}. Skipping.")
        return None

    # preserve_index=False keeps the filtered pandas index from being carried
    # along as an extra "__index_level_0__" column.
    hf_dataset = Dataset.from_pandas(subset_df, preserve_index=False)

    # 80% train; the remaining 20% is halved into validation and test.
    train_test_split = hf_dataset.train_test_split(test_size=0.2, seed=42)
    test_val_split = train_test_split['test'].train_test_split(test_size=0.5, seed=42)

    dataset_dict = DatasetDict({
        'train': train_test_split['train'],
        'validation': test_val_split['train'],
        'test': test_val_split['test']
    })
    print(f"‚úÖ Dataset splits created for {dialect_col}: Train {len(dataset_dict['train'])}, Val {len(dataset_dict['validation'])}, Test {len(dataset_dict['test'])}")
    return dataset_dict


def tokenize_and_prepare_datasets(dataset_dict, max_length=128):
    """Tokenize the ``source`` and ``target`` text columns of every split.

    Uses the module-level ``tokenizer``. Inputs and labels are truncated and
    padded to ``max_length``. Padding positions in the labels are replaced by
    -100 so that the loss ignores them; leaving the pad token id in the labels
    would make the model train on (and be scored on) predicting padding.

    Args:
        dataset_dict: Dataset mapping whose splits have ``source`` and
            ``target`` text columns.
        max_length: Maximum (and padded) token length for inputs and labels.

    Returns:
        The mapped dataset with the text columns removed and the tokenizer
        outputs plus a ``labels`` column added.
    """
    pad_token_id = tokenizer.pad_token_id

    def tokenize_fn(examples):
        # The tokenizer applies the `src_lang` configured on the instance.
        model_inputs = tokenizer(
            examples["source"],
            max_length=max_length,
            truncation=True,
            padding="max_length"
        )
        # `text_target` makes the tokenizer apply `tgt_lang` instead.
        labels = tokenizer(
            text_target=examples["target"],
            max_length=max_length,
            truncation=True,
            padding="max_length"
        )
        # Mask padding in the labels with the ignore index (-100).
        model_inputs["labels"] = [
            [token_id if token_id != pad_token_id else -100 for token_id in label_ids]
            for label_ids in labels["input_ids"]
        ]
        return model_inputs

    tokenized_datasets = dataset_dict.map(
        tokenize_fn,
        batched=True,
        remove_columns=['source', 'target']
    )
    print("‚úÖ Tokenization complete.")
    return tokenized_datasets


def train_and_evaluate(dialect_name, train_ds, val_ds, test_ds, original_test_ds=None):
    """Fine-tune a fresh mBART model on one dialect and report test BLEU.

    Args:
        dialect_name: Dialect label; used in log messages and the output path.
        train_ds: Tokenized training split.
        val_ds: Tokenized validation split, evaluated every 500 steps.
        test_ds: Tokenized test split used for the final prediction pass.
        original_test_ds: Optional untokenized test split (rows exposing
            ``source`` and ``target``) in the same order as ``test_ds``; used
            only to print example translations. When omitted, the
            notebook-level ``split_dataset_dict["test"]`` is used, which keeps
            existing callers working.

    Returns:
        The ``Seq2SeqTrainer`` after training; ``load_best_model_at_end`` is
        enabled with BLEU as the selection metric.
    """

    # Free memory left over from a previous run before loading a new model.
    gc.collect()
    torch.cuda.empty_cache()

    output_dir = f"/content/Bangla_Dialect_Models/mbart-bangla-{dialect_name.lower()}"

    # Start every dialect from the pretrained checkpoint.
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
    bleu_metric = evaluate.load("sacrebleu")

    def decode_ids(token_ids):
        """Decode a batch of token ids after mapping the -100 ignore index to padding."""
        if isinstance(token_ids, tuple):
            token_ids = token_ids[0]
        # -100 is not a valid vocabulary id and cannot be decoded directly.
        token_ids = np.where(token_ids != -100, token_ids, tokenizer.pad_token_id)
        return tokenizer.batch_decode(token_ids, skip_special_tokens=True)

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        decoded_preds = [pred.strip() for pred in decode_ids(predictions)]
        # sacrebleu expects a list of reference lists, one per prediction.
        decoded_labels = [[label.strip()] for label in decode_ids(labels)]
        result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
        return {"bleu": result["score"]}

    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        learning_rate=2e-5,

        # Gradient accumulation gives an effective train batch size of 8.
        per_device_train_batch_size=4,
        gradient_accumulation_steps=2,

        per_device_eval_batch_size=8,
        weight_decay=0.01,
        save_total_limit=2,

        num_train_epochs=5,

        predict_with_generate=True,
        fp16=torch.cuda.is_available(),

        # Reload the checkpoint with the best validation BLEU when training ends.
        load_best_model_at_end=True,
        metric_for_best_model="bleu",

        # Evaluate and checkpoint on the same step schedule.
        eval_strategy="steps",
        eval_steps=500,
        save_steps=500,
        logging_steps=100,

        warmup_steps=300,
        max_grad_norm=1.0,
        generation_max_length=128,
        generation_num_beams=4,

        # Disable external experiment loggers.
        report_to="none",
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    print(f"\n Starting training for {dialect_name} dialect...")
    trainer.train()

    print(f"\n Evaluating on the test set for {dialect_name}...")
    if original_test_ds is None:
        # Backward compatibility: fall back to the split created by the
        # notebook-level training loop.
        original_test_ds = split_dataset_dict["test"]

    test_results = trainer.predict(test_ds)
    final_bleu_score = test_results.metrics.get('test_bleu', 0.0)
    print(f" Test Set BLEU Score for {dialect_name}: {final_bleu_score:.2f}")

    print("\nüîç Example Translations:")
    predictions = decode_ids(test_results.predictions)
    for i, predicted in enumerate(predictions[:5]):
        print(f" Input (Dialect):   {original_test_ds[i]['source']}")
        print(f" Actual (Standard): {original_test_ds[i]['target']}")
        print(f" Predicted:         {predicted}\n")

    # Save the model currently held by the trainer.
    trainer.save_model(f"{output_dir}/best_model")
    print(f"‚úÖ Best model for {dialect_name} saved to {output_dir}/best_model")

    return trainer


# ===============================
# 🚀 Training Loop (Improved)
# ===============================
# Train one model per requested dialect. `split_dataset_dict` stays a
# notebook-level name because train_and_evaluate reads it when printing
# example translations.
trained_dialects = []
for dialect in DIALECTS_TO_TRAIN:
    split_dataset_dict = create_dataset_dict(dialect)

    if split_dataset_dict is None:
        continue # Skip if dataset creation failed

    tokenized_datasets = tokenize_and_prepare_datasets(split_dataset_dict)

    # train_and_evaluate loads the model, trains it and saves the result.
    trained_trainer = train_and_evaluate(
        dialect,
        tokenized_datasets["train"],
        tokenized_datasets["validation"],
        tokenized_datasets["test"]
    )
    trained_dialects.append(dialect)

    print(f"--- Finished processing {dialect} ---")


# Report where the models actually are: the output directory is under
# /content, not Google Drive (the Drive mount is commented out above).
if trained_dialects:
    print("\n‚úÖ Finished training all dialect models. Best models are saved in /content/Bangla_Dialect_Models.")
else:
    print("\n‚ö†Ô∏è No dialect models were trained.")

[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m51.8/51.8 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m104.1/104.1 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h‚úÖ Libraries installed and imported successfully.
‚úÖ Successfully loaded the dataset


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]


‚úÖ Tokenizer for 'facebook/mbart-large-50-many-to-many-mmt' loaded.
 Language pair: bn_IN ‚Üí bn_IN (Dialect ‚Üí Standard).

--- Processing dialect: Chittagong ---
‚úÖ Dataset splits created for Chittagong: Train 2784, Val 348, Test 348


Map:   0%|          | 0/2784 [00:00<?, ? examples/s]

Map:   0%|          | 0/348 [00:00<?, ? examples/s]

Map:   0%|          | 0/348 [00:00<?, ? examples/s]

‚úÖ Tokenization complete.


model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Seq2SeqTrainer(



 Starting training for Chittagong dialect...


Step,Training Loss,Validation Loss,Bleu
500,0.1727,0.166116,20.532833
1000,0.0907,0.136891,32.607105
1500,0.0462,0.139361,33.216564


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].



 Evaluating on the test set for Chittagong...


 Test Set BLEU Score for Chittagong: 30.98

üîç Example Translations:
 Input (Dialect):   ‡¶Ö‡¶®‡ßá ‡¶ï‡¶ø ‡¶Ü‡¶Å‡¶∞ ‡¶´‡ßç‡¶∞‡¶∂‡ßç‡¶®‡¶∞ ‡¶ú‡¶¨‡¶æ‡¶¨ ‡¶¶‡¶ø‡¶§ ‡¶´‡¶æ‡¶∞‡¶ø‡¶¨‡¶æ‡¶® ‡¶®‡ßá
 Actual (Standard): ‡¶Ü‡¶™‡¶®‡¶ø ‡¶ï‡¶ø ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶™‡ßç‡¶∞‡¶∂‡ßç‡¶®‡ßá‡¶∞ ‡¶ú‡¶¨‡¶æ‡¶¨ ‡¶¶‡¶ø‡¶§‡ßá ‡¶™‡¶æ‡¶∞‡ßá‡¶®?
 Predicted:         ‡¶Ü‡¶™‡¶®‡¶ø ‡¶ï‡¶ø ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶™‡¶õ‡¶®‡ßç‡¶¶‡ßá‡¶∞ ‡¶ú‡¶¨‡¶æ‡¶¨ ‡¶¶‡¶ø‡¶§‡ßá ‡¶™‡¶æ‡¶∞‡ßá‡¶®?

 Input (Dialect):   ‡¶Ü‡¶∏‡ßç‡¶∏‡¶æ‡¶® ‡¶Ö‡¶∞ ‡¶®‡¶ø‡¶≤ ‡¶∞‡¶Ç ‡¶á‡¶¨‡¶æ ‡¶Ö‡¶∏‡¶æ‡¶¶‡¶æ‡¶∞‡¶®
 Actual (Standard): ‡¶Ü‡¶ï‡¶æ‡¶∂‡ßá‡¶∞ ‡¶®‡ßÄ‡¶≤ ‡¶∞‡¶ô‡¶ü‡¶ø ‡¶Ö‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶£
 Predicted:         ‡¶Ü‡¶ï‡¶æ‡¶∂‡ßá‡¶∞ ‡¶®‡ßÄ‡¶≤ ‡¶∞‡¶ô‡¶ü‡¶ø ‡¶Ö‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶£

 Input (Dialect):   ‡¶õ‡ßã‡ßã‡¶° ‡¶≠‡ßã‡¶á‡¶® ‡¶ö‡¶ø‡ßé‡¶ï‡¶æ‡¶∞ ‡¶¶‡¶ø‡ßü‡ßá‡¶∞‡ßá ‡¶â‡¶°‡¶ø ‡¶ó‡¶ø‡ßü‡ßá
 Actual (Standard): ‡¶õ‡ßã‡¶ü‡ßã ‡¶¨‡ßã‡¶® ‡¶ö‡¶ø‡ßé‡¶ï‡¶æ‡¶∞ ‡¶¶‡¶ø‡ßü‡ßá ‡¶â‡¶†‡¶≤, ‡¶¶‡¶æ‡¶∞‡ßÅ‡¶£
 Predicted:         ‡¶õ‡ßã‡¶ü ‡¶¨‡ßã‡¶® ‡¶ö‡¶ø‡ßé‡¶ï‡¶æ‡¶∞ ‡¶¶‡¶ø‡¶Ø‡¶º‡ßá ‡¶â‡¶†‡ßá ‡¶ó‡¶ø‡¶Ø‡

## **NOAKHALI**

In [None]:
# ===============================
#  Bangla Dialect → Standard Bangla using mBART-50 (Improved)
# ===============================

!pip install transformers[sentencepiece] datasets sacrebleu evaluate torch pandas openpyxl --quiet

# Data handling and metric libraries.
import pandas as pd
import numpy as np
import evaluate
from datasets import Dataset, DatasetDict
# Hugging Face seq2seq components used by the helper functions in this cell.
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
import torch
import gc # called together with torch.cuda.empty_cache() before each training run
from google.colab import drive # only referenced by the disabled drive.mount() call below

print("‚úÖ Libraries installed and imported successfully.")
# Drive mounting is disabled; the dataset path and model output directory used
# in this cell are both under /content.
#drive.mount('/content/drive')

# ===============================
#  Load Dataset
# ===============================

# Read the aligned dialect spreadsheet; if the file is absent, continue with a
# single-row sample frame so the rest of the cell can still execute.
try:
    df = pd.read_excel("/content/bangla_dialect_aligned_18920.xlsx")
except FileNotFoundError:
    print("‚ö†Ô∏è Dataset not found ‚Äî using sample data.")
    df = pd.DataFrame(
        [[
            "‡¶∏‡ßá ‡¶∏‡ßç‡¶ï‡ßÅ‡¶≤‡ßá ‡¶Ø‡¶æ‡¶Ø‡¶º„ÄÇ",
            "‡¶π‡ßá‡¶á ‡¶á‡¶∏‡ßç‡¶ï‡ßÅ‡¶≤‡ßá ‡¶Ø‡¶æ‡¶Ø‡¶º„ÄÇ",
            "‡¶π‡ßá‡¶á ‡¶∏‡ßç‡¶ï‡ßã‡¶≤‡ßá ‡¶Ø‡¶æ‡¶Ø‡¶º„ÄÇ",
            "‡¶§‡¶æ‡¶∞‡ßá ‡¶á‡¶∏‡ßç‡¶ï‡ßÅ‡¶≤‡ßá ‡¶Ø‡¶æ‡¶Ø‡¶º„ÄÇ",
        ]],
        columns=['Standard_Bangla', 'Barisal', 'Chittagong', 'Sylhet'],
    )
else:
    print("‚úÖ Successfully loaded the dataset")

# Dialect columns to fine-tune on; each entry is trained in turn by the loop
# at the bottom of the cell.
DIALECTS_TO_TRAIN = ['Noakhali']

# ===============================
#  Load Tokenizer (mBART)
# ===============================

MODEL_NAME = "facebook/mbart-large-50-many-to-many-mmt"

# One tokenizer instance is shared by every helper below. Source and target
# use the same mBART language code because both sides are Bangla.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.src_lang = tokenizer.tgt_lang = "bn_IN"

print(
    f"\n‚úÖ Tokenizer for '{MODEL_NAME}' loaded.",
    " Language pair: bn_IN ‚Üí bn_IN (Dialect ‚Üí Standard).",
    sep="\n",
)


# ===============================
#  Helper Functions
# ===============================

def create_dataset_dict(dialect_col):
    """Build train/validation/test splits pairing one dialect with Standard Bangla.

    Reads the module-level ``df``, keeps the rows where both the
    ``Standard_Bangla`` column and ``dialect_col`` hold non-blank strings,
    renames them to ``target`` / ``source`` and splits them 80/10/10 with a
    fixed seed.

    Args:
        dialect_col: Name of the dialect column in ``df``.

    Returns:
        A ``DatasetDict`` with ``train``, ``validation`` and ``test`` splits,
        or ``None`` when a required column is missing or fewer than 20 usable
        rows remain.
    """
    print(f"\n--- Processing dialect: {dialect_col} ---")

    # Guard against a missing column (e.g. the fallback sample frame does not
    # contain every dialect) instead of failing with a KeyError.
    required_cols = ['Standard_Bangla', dialect_col]
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        print(f"‚ö†Ô∏è Column(s) {missing_cols} not found in the dataset. Skipping {dialect_col}.")
        return None

    def is_nonblank_text(value):
        return isinstance(value, str) and value.strip() != ""

    pairs = df[required_cols].dropna()
    keep = (
        pairs['Standard_Bangla'].map(is_nonblank_text).astype(bool)
        & pairs[dialect_col].map(is_nonblank_text).astype(bool)
    )
    subset_df = pairs[keep].rename(
        columns={'Standard_Bangla': 'target', dialect_col: 'source'}
    )

    if len(subset_df) < 20: # Need enough data to split
        print(f"‚ö†Ô∏è Insufficient data for {dialect_col}. Skipping.")
        return None

    # preserve_index=False keeps the filtered pandas index from being carried
    # along as an extra "__index_level_0__" column.
    hf_dataset = Dataset.from_pandas(subset_df, preserve_index=False)

    # 80% train; the remaining 20% is halved into validation and test.
    train_test_split = hf_dataset.train_test_split(test_size=0.2, seed=42)
    test_val_split = train_test_split['test'].train_test_split(test_size=0.5, seed=42)

    dataset_dict = DatasetDict({
        'train': train_test_split['train'],
        'validation': test_val_split['train'],
        'test': test_val_split['test']
    })
    print(f"‚úÖ Dataset splits created for {dialect_col}: Train {len(dataset_dict['train'])}, Val {len(dataset_dict['validation'])}, Test {len(dataset_dict['test'])}")
    return dataset_dict


def tokenize_and_prepare_datasets(dataset_dict, max_length=128):
    """Tokenize the ``source`` and ``target`` text columns of every split.

    Uses the module-level ``tokenizer``. Inputs and labels are truncated and
    padded to ``max_length``. Padding positions in the labels are replaced by
    -100 so that the loss ignores them; leaving the pad token id in the labels
    would make the model train on (and be scored on) predicting padding.

    Args:
        dataset_dict: Dataset mapping whose splits have ``source`` and
            ``target`` text columns.
        max_length: Maximum (and padded) token length for inputs and labels.

    Returns:
        The mapped dataset with the text columns removed and the tokenizer
        outputs plus a ``labels`` column added.
    """
    pad_token_id = tokenizer.pad_token_id

    def tokenize_fn(examples):
        # The tokenizer applies the `src_lang` configured on the instance.
        model_inputs = tokenizer(
            examples["source"],
            max_length=max_length,
            truncation=True,
            padding="max_length"
        )
        # `text_target` makes the tokenizer apply `tgt_lang` instead.
        labels = tokenizer(
            text_target=examples["target"],
            max_length=max_length,
            truncation=True,
            padding="max_length"
        )
        # Mask padding in the labels with the ignore index (-100).
        model_inputs["labels"] = [
            [token_id if token_id != pad_token_id else -100 for token_id in label_ids]
            for label_ids in labels["input_ids"]
        ]
        return model_inputs

    tokenized_datasets = dataset_dict.map(
        tokenize_fn,
        batched=True,
        remove_columns=['source', 'target']
    )
    print("‚úÖ Tokenization complete.")
    return tokenized_datasets


def train_and_evaluate(dialect_name, train_ds, val_ds, test_ds, original_test_ds=None):
    """Fine-tune a fresh mBART model on one dialect and report test BLEU.

    Args:
        dialect_name: Dialect label; used in log messages and the output path.
        train_ds: Tokenized training split.
        val_ds: Tokenized validation split, evaluated every 500 steps.
        test_ds: Tokenized test split used for the final prediction pass.
        original_test_ds: Optional untokenized test split (rows exposing
            ``source`` and ``target``) in the same order as ``test_ds``; used
            only to print example translations. When omitted, the
            notebook-level ``split_dataset_dict["test"]`` is used, which keeps
            existing callers working.

    Returns:
        The ``Seq2SeqTrainer`` after training; ``load_best_model_at_end`` is
        enabled with BLEU as the selection metric.
    """

    # Free memory left over from a previous run before loading a new model.
    gc.collect()
    torch.cuda.empty_cache()

    output_dir = f"/content/Bangla_Dialect_Models/mbart-bangla-{dialect_name.lower()}"

    # Start every dialect from the pretrained checkpoint.
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
    bleu_metric = evaluate.load("sacrebleu")

    def decode_ids(token_ids):
        """Decode a batch of token ids after mapping the -100 ignore index to padding."""
        if isinstance(token_ids, tuple):
            token_ids = token_ids[0]
        # -100 is not a valid vocabulary id and cannot be decoded directly.
        token_ids = np.where(token_ids != -100, token_ids, tokenizer.pad_token_id)
        return tokenizer.batch_decode(token_ids, skip_special_tokens=True)

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        decoded_preds = [pred.strip() for pred in decode_ids(predictions)]
        # sacrebleu expects a list of reference lists, one per prediction.
        decoded_labels = [[label.strip()] for label in decode_ids(labels)]
        result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
        return {"bleu": result["score"]}

    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        learning_rate=2e-5,

        # Gradient accumulation gives an effective train batch size of 8.
        per_device_train_batch_size=4,
        gradient_accumulation_steps=2,

        per_device_eval_batch_size=8,
        weight_decay=0.01,
        save_total_limit=2,

        num_train_epochs=5,

        predict_with_generate=True,
        fp16=torch.cuda.is_available(),

        # Reload the checkpoint with the best validation BLEU when training ends.
        load_best_model_at_end=True,
        metric_for_best_model="bleu",

        # Evaluate and checkpoint on the same step schedule.
        eval_strategy="steps",
        eval_steps=500,
        save_steps=500,
        logging_steps=100,

        warmup_steps=300,
        max_grad_norm=1.0,
        generation_max_length=128,
        generation_num_beams=4,

        # Disable external experiment loggers.
        report_to="none",
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    print(f"\n Starting training for {dialect_name} dialect...")
    trainer.train()

    print(f"\n Evaluating on the test set for {dialect_name}...")
    if original_test_ds is None:
        # Backward compatibility: fall back to the split created by the
        # notebook-level training loop.
        original_test_ds = split_dataset_dict["test"]

    test_results = trainer.predict(test_ds)
    final_bleu_score = test_results.metrics.get('test_bleu', 0.0)
    print(f" Test Set BLEU Score for {dialect_name}: {final_bleu_score:.2f}")

    print("\nüîç Example Translations:")
    predictions = decode_ids(test_results.predictions)
    for i, predicted in enumerate(predictions[:5]):
        print(f" Input (Dialect):   {original_test_ds[i]['source']}")
        print(f" Actual (Standard): {original_test_ds[i]['target']}")
        print(f" Predicted:         {predicted}\n")

    # Save the model currently held by the trainer.
    trainer.save_model(f"{output_dir}/best_model")
    print(f"‚úÖ Best model for {dialect_name} saved to {output_dir}/best_model")

    return trainer


# ===============================
# 🚀 Training Loop (Improved)
# ===============================
# Train one model per requested dialect. `split_dataset_dict` stays a
# notebook-level name because train_and_evaluate reads it when printing
# example translations.
trained_dialects = []
for dialect in DIALECTS_TO_TRAIN:
    split_dataset_dict = create_dataset_dict(dialect)

    if split_dataset_dict is None:
        continue # Skip if dataset creation failed

    tokenized_datasets = tokenize_and_prepare_datasets(split_dataset_dict)

    # train_and_evaluate loads the model, trains it and saves the result.
    trained_trainer = train_and_evaluate(
        dialect,
        tokenized_datasets["train"],
        tokenized_datasets["validation"],
        tokenized_datasets["test"]
    )
    trained_dialects.append(dialect)

    print(f"--- Finished processing {dialect} ---")


# Report where the models actually are: the output directory is under
# /content, not Google Drive (the Drive mount is commented out above).
if trained_dialects:
    print("\n‚úÖ Finished training all dialect models. Best models are saved in /content/Bangla_Dialect_Models.")
else:
    print("\n‚ö†Ô∏è No dialect models were trained.")

[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m51.8/51.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m104.1/104.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h‚úÖ Libraries installed and imported successfully.
‚úÖ Successfully loaded the dataset


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]


‚úÖ Tokenizer for 'facebook/mbart-large-50-many-to-many-mmt' loaded.
 Language pair: bn_IN ‚Üí bn_IN (Dialect ‚Üí Standard).

--- Processing dialect: Noakhali ---
‚úÖ Dataset splits created for Noakhali: Train 2000, Val 250, Test 250


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

‚úÖ Tokenization complete.


model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Seq2SeqTrainer(



 Starting training for Noakhali dialect...


Step,Training Loss,Validation Loss,Bleu
500,0.1623,0.15191,28.633643
1000,0.0591,0.140994,35.489501


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].



 Evaluating on the test set for Noakhali...


 Test Set BLEU Score for Noakhali: 39.56

üîç Example Translations:
 Input (Dialect):   ‡¶Ø‡¶æ‡¶ì‡ßü‡¶æ‡¶∞ ‡¶≤‡¶ó‡ßá ‡¶≤‡¶ó‡ßá ‡¶ï‡ßÄ ‡¶è‡¶ï‡¶ï‡¶æ‡¶® ‡¶∂‡¶¨‡ßç‡¶¶ ‡¶ï‡¶á‡¶∞‡¶≤‡ßã
 Actual (Standard): ‡¶Ø‡¶æ‡¶ì‡ßü‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶•‡ßá ‡¶∏‡¶æ‡¶•‡ßá ‡¶ï‡ßÄ ‡¶è‡¶ï‡¶ü‡¶æ ‡¶∂‡¶¨‡ßç‡¶¶ ‡¶ï‡¶∞‡¶≤
 Predicted:         ‡¶Ø‡¶æ‡¶ì‡¶Ø‡¶º‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶•‡ßá ‡¶∏‡¶æ‡¶•‡ßá ‡¶ï‡ßÄ ‡¶è‡¶ï‡¶ü‡¶æ ‡¶∏‡ßÅ‡¶®‡ßç‡¶¶‡¶∞ ‡¶ï‡¶∞‡¶≤‡ßã

 Input (Dialect):   ‡¶§‡ßÅ‡¶á ‡¶ï‡¶ø ‡¶Ü‡¶∞‡ßá ‡¶è‡¶á ‡¶ï‡¶æ‡¶Æ ‡¶Ü‡¶® ‡¶ï‡¶∞‡¶ø ‡¶¶‡¶ø‡¶§‡¶æ ‡¶π‡¶æ‡¶á‡¶∞‡¶¨‡¶æ ‡¶®‡¶ø?
 Actual (Standard): ‡¶§‡ßÅ‡¶Æ‡¶ø ‡¶ï‡¶ø ‡¶Ü‡¶Æ‡¶æ‡¶ï‡ßá ‡¶è‡¶á ‡¶ï‡¶æ‡¶ú‡¶ü‡¶ø ‡¶ï‡¶∞‡ßá ‡¶¶‡¶ø‡¶§‡ßá ‡¶™‡¶æ‡¶∞‡¶¨‡ßá?
 Predicted:         ‡¶§‡ßÅ‡¶Æ‡¶ø ‡¶ï‡¶ø ‡¶Ü‡¶Æ‡¶æ‡¶ï‡ßá ‡¶è‡¶á ‡¶ï‡¶æ‡¶ú‡¶ü‡¶ø ‡¶¶‡¶ø‡¶Ø‡¶º‡ßá ‡¶¶‡¶ø‡¶§‡ßá ‡¶™‡¶æ‡¶∞‡¶¨‡ßá ?

 Input (Dialect):   ‡¶§‡ßã‡¶∞ ‡¶π‡¶∞‡¶ø‡¶ï‡ßç‡¶∑‡¶æ ‡¶ï‡¶¨‡ßá?
 Actual (Standard): ‡¶§‡ßã‡¶∞ ‡¶™‡¶∞‡¶ø‡¶ï‡ßç‡¶∑‡¶æ ‡¶ï‡¶¨‡ßá ?
 Predicted:         ‡¶§‡ßã‡¶Æ‡¶æ‡¶∞ ‡¶™‡¶õ‡¶®‡ßç‡¶¶ ‡¶ï‡¶¨‡ßá?

 Input (Dialect):   ‡¶π‡ßá‡¶§‡ßá 