# **Barishal** **3** **Epoch**

In [None]:
# ===============================
# 🧠 Bangla Dialect → Standard Bangla using NLLB-200
# ===============================

!pip install transformers[sentencepiece] datasets sacrebleu evaluate torch pandas openpyxl --quiet

import pandas as pd
import numpy as np
import evaluate
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
import torch
from google.colab import drive

print("✅ Libraries installed and imported successfully.")
#drive.mount('/content/drive')

# ===============================
# 📂 Load Dataset
# ===============================

try:
    df = pd.read_excel("/content/bangla_dialect_aligned_18920.xlsx")
    print("✅ Successfully loaded the dataset.")
except FileNotFoundError:
    print("⚠ Dataset not found — using sample data.")
    df = pd.DataFrame({
        'Standard_Bangla': ["সে স্কুলে যায়。"],
        'Barisal': ["হেই ইস্কুলে যায়。"],
        'Chittagong': ["হেই স্কোলে যায়。"],
        'Sylhet': ["তারে ইস্কুলে যায়。"]
    })

# Choose dialects to train (can be one or multiple)
DIALECTS_TO_TRAIN = ['Barisal'] # You can change this, e.g., ['Barisal', 'Sylhet']

# ===============================
# ⚙ Load Model & Tokenizer (NLLB Version)
# ===============================

MODEL_NAME = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Set source and target languages (Bangla, using NLLB's code)
SRC_LANG = "ben_Beng"
TGT_LANG = "ben_Beng"

tokenizer.src_lang = SRC_LANG
tokenizer.tgt_lang = TGT_LANG

# --- CRITICAL for NLLB ---
# Force the model to always generate Bengali tokens
model.config.forced_bos_token_id = tokenizer.convert_tokens_to_ids(TGT_LANG)
# -------------------------

print(f"\n✅ Tokenizer for '{MODEL_NAME}' loaded.")
print(f"🌐 Language pair: {SRC_LANG} → {TGT_LANG} (Dialect → Standard).")


# ===============================
# 🧩 Helper Functions (Unchanged)
# ===============================

def create_dataset_dict(dialect_col):
    """Creates a preprocessed DatasetDict for a given dialect."""
    print(f"\n--- Processing dialect: {dialect_col} ---")
    # Ensure data is string and not empty, just in case
    df_clean = df[['Standard_Bangla', dialect_col]].dropna()
    df_clean = df_clean[
        (df_clean['Standard_Bangla'].apply(lambda x: isinstance(x, str) and x.strip() != "")) &
        (df_clean[dialect_col].apply(lambda x: isinstance(x, str) and x.strip() != ""))
    ]

    subset_df = df_clean.rename(columns={'Standard_Bangla': 'target', dialect_col: 'source'})

    if len(subset_df) == 0:
        print(f"⚠ No valid data found for {dialect_col}. Skipping.")
        return None

    hf_dataset = Dataset.from_pandas(subset_df)

    # Handle small datasets
    test_size = min(0.2, 500 / len(hf_dataset)) if len(hf_dataset) > 500 else 0.2
    if len(hf_dataset) * (1 - test_size) < 10: # Ensure train set is not too small
         print(f"⚠ Very small dataset for {dialect_col}. Using all for training.")
         return DatasetDict({'train': hf_dataset, 'validation': hf_dataset, 'test': hf_dataset})

    train_test_split = hf_dataset.train_test_split(test_size=test_size, seed=42)

    val_size = min(0.5, 250 / len(train_test_split['test'])) if len(train_test_split['test']) > 250 else 0.5
    if len(train_test_split['test']) * (1 - val_size) < 5: # Ensure val set is not too small
        test_val_split = train_test_split['test'].train_test_split(test_size=0.5, seed=42)
    else:
        test_val_split = train_test_split['test'].train_test_split(test_size=val_size, seed=42)

    dataset_dict = DatasetDict({
        'train': train_test_split['train'],
        'validation': test_val_split['train'],
        'test': test_val_split['test']
    })
    print(f"✅ Dataset splits created for {dialect_col}: Train/Val/Test")
    return dataset_dict


def tokenize_and_prepare_datasets(dataset_dict):
    """Tokenizes the source and target text in the dataset."""

    def tokenize_fn(examples):
        # The tokenizer will use tokenizer.src_lang set earlier
        model_inputs = tokenizer(
            examples["source"],
            max_length=128,
            truncation=True,
            padding="max_length"
        )

        # The tokenizer will use tokenizer.tgt_lang when text_target is provided
        labels = tokenizer(
            text_target=examples["target"],
            max_length=128,
            truncation=True,
            padding="max_length"
        )

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_datasets = dataset_dict.map(tokenize_fn, batched=True, remove_columns=['source', 'target'])
    print("✅ Tokenization complete.")
    return tokenized_datasets


def train_and_evaluate(dialect_name, train_ds, val_ds, test_ds):
    """Initializes and runs the training, then evaluates on the test set."""

    # --- MODIFIED: Updated output directory for NLLB ---
    output_dir = f"/content/Bangla_Dialect_Models/nllb-bangla-{dialect_name.lower()}"

    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
    bleu_metric = evaluate.load("sacrebleu")

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred

        # Decode predictions
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

        # Replace -100 in labels as we can't decode them
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Post-processing
        decoded_preds = [pred.strip() for pred in decoded_preds]
        decoded_labels = [[label.strip()] for label in decoded_labels]

        # Compute BLEU
        result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
        return {"bleu": result["score"]}

    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        learning_rate=2e-5,
        per_device_train_batch_size=8,  # Increased batch size for NLLB-600M, adjust based on VRAM
        per_device_eval_batch_size=8,   # Increased batch size for NLLB-600M
        weight_decay=0.01,
        save_total_limit=2,
        num_train_epochs=3,
        predict_with_generate=True,     # This is crucial!
        fp16=torch.cuda.is_available(), # Use FP16 if you have a GPU
        warmup_steps=300,
        max_grad_norm=1.0,
        generation_max_length=128,      # Must match tokenization max_length
        generation_num_beams=4,
        do_eval=True,
        eval_strategy="steps",
        eval_steps=500,
        save_steps=500,
        logging_steps=100,
        load_best_model_at_end=True,    # Good practice
        metric_for_best_model="bleu",   # Good practice
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    print(f"\n🚀 Starting training for {dialect_name} dialect...")
    trainer.train()

    print(f"\n📊 Evaluating on the test set for {dialect_name}...")
    test_results = trainer.predict(test_ds)
    final_bleu_score = test_results.metrics.get('test_bleu', 0.0)
    print(f"🎯 Test Set BLEU Score for {dialect_name}: {final_bleu_score:.2f}")

    print("\n🔍 Example Translations:")
    # We need to get the original items from the test_ds before tokenization
    # Since tokenized_datasets removed columns, let's re-tokenize just for prediction examples

    # Re-fetch original test data for comparison
    original_test_ds = split_dataset_dict["test"]
    predictions = tokenizer.batch_decode(test_results.predictions, skip_special_tokens=True)

    for i in range(min(3, len(predictions))):
        print(f"🗣 Input (Dialect):   {original_test_ds[i]['source']}")
        print(f"📘 Actual (Standard): {original_test_ds[i]['target']}")
        print(f"🤖 Predicted:         {predictions[i]}\n")

    # Save the final model
    trainer.save_model(f"{output_dir}/final_model")
    print(f"✅ Final model for {dialect_name} saved to {output_dir}/final_model")

    return trainer


# ===============================
# 🚀 Training Loop
# ===============================
global_model = model # Use one model instance to train sequentially
global_tokenizer = tokenizer

for dialect in DIALECTS_TO_TRAIN:
    split_dataset_dict = create_dataset_dict(dialect)

    if split_dataset_dict is None:
        continue # Skip if dataset creation failed

    tokenized_datasets = tokenize_and_prepare_datasets(split_dataset_dict)

    # Pass the global model to continue fine-tuning
    # Or re-load the base model if you want separate models:
    # model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    # model.config.forced_bos_token_id = tokenizer.convert_tokens_to_ids(TGT_LANG)

    trained_trainer = train_and_evaluate(
        dialect,
        tokenized_datasets["train"],
        tokenized_datasets["validation"],
        tokenized_datasets["test"]
    )

    # Update the global model with the newly trained one for the next loop
    # This creates a "curriculum" where the model learns one dialect, then the next
    # If you want separate models, uncomment the reload lines above.
    global_model = trained_trainer.model


print("\n✅ Finished training all dialect models. Models are saved in Google Drive.")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h✅ Libraries installed and imported successfully.
✅ Successfully loaded the dataset.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]


✅ Tokenizer for 'facebook/nllb-200-distilled-600M' loaded.
🌐 Language pair: ben_Beng → ben_Beng (Dialect → Standard).

--- Processing dialect: Barisal ---
✅ Dataset splits created for Barisal: Train/Val/Test


Map:   0%|          | 0/2980 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

✅ Tokenization complete.


Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Seq2SeqTrainer(



🚀 Starting training for Barisal dialect...


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Bleu
500,3.6399,2.834762,30.199009
1000,0.5203,0.398505,39.161888


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].



📊 Evaluating on the test set for Barisal...


🎯 Test Set BLEU Score for Barisal: 40.50

🔍 Example Translations:
🗣 Input (Dialect):   মোর চাকরি করতে আর ভালো লাগদেয়াছে না
📘 Actual (Standard): চাকরি করতে আর ভালো লাগে না
🤖 Predicted:         আমার চাকরি করতে আর ভালো লাগে না

🗣 Input (Dialect):   পরথমে এউক্কা দোকানে যাইতে হইবে
📘 Actual (Standard): প্রথমে একটি দোকানে যেতে হবে
🤖 Predicted:         প্রথমে একটা দোকানে যেতে হবে

🗣 Input (Dialect):   হেইহানের পানি টলটইল্লা
📘 Actual (Standard): সেখানকার পানি টলটলে
🤖 Predicted:         তার পানি ঝলমলে ছিল

✅ Final model for Barisal saved to /content/Bangla_Dialect_Models/nllb-bangla-barisal/final_model

✅ Finished training all dialect models. Models are saved in Google Drive.


# **Barishal** **with** **Epoch** **5**

In [1]:
# ===============================
# 🧠 Bangla Dialect → Standard Bangla using NLLB-200
# ===============================

!pip install transformers[sentencepiece] datasets sacrebleu evaluate torch pandas openpyxl --quiet

import pandas as pd
import numpy as np
import evaluate
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
import torch
from google.colab import drive

print("✅ Libraries installed and imported successfully.")
#drive.mount('/content/drive')

# ===============================
# 📂 Load Dataset
# ===============================

try:
    df = pd.read_excel("/content/bangla_dialect_aligned_18920.xlsx")
    print("✅ Successfully loaded the dataset.")
except FileNotFoundError:
    print("⚠ Dataset not found — using sample data.")
    df = pd.DataFrame({
        'Standard_Bangla': ["সে স্কুলে যায়。"],
        'Barisal': ["হেই ইস্কুলে যায়。"],
        'Chittagong': ["হেই স্কোলে যায়。"],
        'Sylhet': ["তারে ইস্কুলে যায়。"]
    })

# Choose dialects to train (can be one or multiple)
DIALECTS_TO_TRAIN = ['Barisal'] # You can change this, e.g., ['Barisal', 'Sylhet']

# ===============================
# ⚙ Load Model & Tokenizer (NLLB Version)
# ===============================

MODEL_NAME = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Set source and target languages (Bangla, using NLLB's code)
SRC_LANG = "ben_Beng"
TGT_LANG = "ben_Beng"

tokenizer.src_lang = SRC_LANG
tokenizer.tgt_lang = TGT_LANG

# --- CRITICAL for NLLB ---
# Force the model to always generate Bengali tokens
model.config.forced_bos_token_id = tokenizer.convert_tokens_to_ids(TGT_LANG)
# -------------------------

print(f"\n✅ Tokenizer for '{MODEL_NAME}' loaded.")
print(f"🌐 Language pair: {SRC_LANG} → {TGT_LANG} (Dialect → Standard).")


# ===============================
# 🧩 Helper Functions (Unchanged)
# ===============================

def create_dataset_dict(dialect_col):
    """Creates a preprocessed DatasetDict for a given dialect."""
    print(f"\n--- Processing dialect: {dialect_col} ---")
    # Ensure data is string and not empty, just in case
    df_clean = df[['Standard_Bangla', dialect_col]].dropna()
    df_clean = df_clean[
        (df_clean['Standard_Bangla'].apply(lambda x: isinstance(x, str) and x.strip() != "")) &
        (df_clean[dialect_col].apply(lambda x: isinstance(x, str) and x.strip() != ""))
    ]

    subset_df = df_clean.rename(columns={'Standard_Bangla': 'target', dialect_col: 'source'})

    if len(subset_df) == 0:
        print(f"⚠ No valid data found for {dialect_col}. Skipping.")
        return None

    hf_dataset = Dataset.from_pandas(subset_df)

    # Handle small datasets
    test_size = min(0.2, 500 / len(hf_dataset)) if len(hf_dataset) > 500 else 0.2
    if len(hf_dataset) * (1 - test_size) < 10: # Ensure train set is not too small
         print(f"⚠ Very small dataset for {dialect_col}. Using all for training.")
         return DatasetDict({'train': hf_dataset, 'validation': hf_dataset, 'test': hf_dataset})

    train_test_split = hf_dataset.train_test_split(test_size=test_size, seed=42)

    val_size = min(0.5, 250 / len(train_test_split['test'])) if len(train_test_split['test']) > 250 else 0.5
    if len(train_test_split['test']) * (1 - val_size) < 5: # Ensure val set is not too small
        test_val_split = train_test_split['test'].train_test_split(test_size=0.5, seed=42)
    else:
        test_val_split = train_test_split['test'].train_test_split(test_size=val_size, seed=42)

    dataset_dict = DatasetDict({
        'train': train_test_split['train'],
        'validation': test_val_split['train'],
        'test': test_val_split['test']
    })
    print(f"✅ Dataset splits created for {dialect_col}: Train/Val/Test")
    return dataset_dict


def tokenize_and_prepare_datasets(dataset_dict):
    """Tokenizes the source and target text in the dataset."""

    def tokenize_fn(examples):
        # The tokenizer will use tokenizer.src_lang set earlier
        model_inputs = tokenizer(
            examples["source"],
            max_length=128,
            truncation=True,
            padding="max_length"
        )

        # The tokenizer will use tokenizer.tgt_lang when text_target is provided
        labels = tokenizer(
            text_target=examples["target"],
            max_length=128,
            truncation=True,
            padding="max_length"
        )

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_datasets = dataset_dict.map(tokenize_fn, batched=True, remove_columns=['source', 'target'])
    print("✅ Tokenization complete.")
    return tokenized_datasets


def train_and_evaluate(dialect_name, train_ds, val_ds, test_ds):
    """Initializes and runs the training, then evaluates on the test set."""

    # --- MODIFIED: Updated output directory for NLLB ---
    output_dir = f"/content/Bangla_Dialect_Models/nllb-bangla-{dialect_name.lower()}"

    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
    bleu_metric = evaluate.load("sacrebleu")

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred

        # Decode predictions
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

        # Replace -100 in labels as we can't decode them
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Post-processing
        decoded_preds = [pred.strip() for pred in decoded_preds]
        decoded_labels = [[label.strip()] for label in decoded_labels]

        # Compute BLEU
        result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
        return {"bleu": result["score"]}

    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        learning_rate=2e-5,
        per_device_train_batch_size=8,  # Increased batch size for NLLB-600M, adjust based on VRAM
        per_device_eval_batch_size=8,   # Increased batch size for NLLB-600M
        weight_decay=0.01,
        save_total_limit=2,
        num_train_epochs=5,
        predict_with_generate=True,     # This is crucial!
        fp16=torch.cuda.is_available(), # Use FP16 if you have a GPU
        warmup_steps=300,
        max_grad_norm=1.0,
        generation_max_length=128,      # Must match tokenization max_length
        generation_num_beams=4,
        do_eval=True,
        eval_strategy="steps",
        eval_steps=500,
        save_steps=500,
        logging_steps=100,
        load_best_model_at_end=True,    # Good practice
        metric_for_best_model="bleu",   # Good practice
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    print(f"\n🚀 Starting training for {dialect_name} dialect...")
    trainer.train()

    print(f"\n📊 Evaluating on the test set for {dialect_name}...")
    test_results = trainer.predict(test_ds)
    final_bleu_score = test_results.metrics.get('test_bleu', 0.0)
    print(f"🎯 Test Set BLEU Score for {dialect_name}: {final_bleu_score:.2f}")

    print("\n🔍 Example Translations:")
    # We need to get the original items from the test_ds before tokenization
    # Since tokenized_datasets removed columns, let's re-tokenize just for prediction examples

    # Re-fetch original test data for comparison
    original_test_ds = split_dataset_dict["test"]
    predictions = tokenizer.batch_decode(test_results.predictions, skip_special_tokens=True)

    for i in range(min(5, len(predictions))):
        print(f"🗣 Input (Dialect):   {original_test_ds[i]['source']}")
        print(f"📘 Actual (Standard): {original_test_ds[i]['target']}")
        print(f"🤖 Predicted:         {predictions[i]}\n")

    # Save the final model
    trainer.save_model(f"{output_dir}/final_model")
    print(f"✅ Final model for {dialect_name} saved to {output_dir}/final_model")

    return trainer


# ===============================
# 🚀 Training Loop
# ===============================
global_model = model # Use one model instance to train sequentially
global_tokenizer = tokenizer

for dialect in DIALECTS_TO_TRAIN:
    split_dataset_dict = create_dataset_dict(dialect)

    if split_dataset_dict is None:
        continue # Skip if dataset creation failed

    tokenized_datasets = tokenize_and_prepare_datasets(split_dataset_dict)

    # Pass the global model to continue fine-tuning
    # Or re-load the base model if you want separate models:
    # model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    # model.config.forced_bos_token_id = tokenizer.convert_tokens_to_ids(TGT_LANG)

    trained_trainer = train_and_evaluate(
        dialect,
        tokenized_datasets["train"],
        tokenized_datasets["validation"],
        tokenized_datasets["test"]
    )

    # Update the global model with the newly trained one for the next loop
    # This creates a "curriculum" where the model learns one dialect, then the next
    # If you want separate models, uncomment the reload lines above.
    global_model = trained_trainer.model


print("\n✅ Finished training all dialect models. Models are saved in Google Drive.")

✅ Libraries installed and imported successfully.
✅ Successfully loaded the dataset.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



✅ Tokenizer for 'facebook/nllb-200-distilled-600M' loaded.
🌐 Language pair: ben_Beng → ben_Beng (Dialect → Standard).

--- Processing dialect: Barisal ---
✅ Dataset splits created for Barisal: Train/Val/Test


Map:   0%|          | 0/2980 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

✅ Tokenization complete.


  trainer = Seq2SeqTrainer(



🚀 Starting training for Barisal dialect...


[34m[1mwandb[0m: Currently logged in as: [33mimran-bhuiyan[0m ([33mimran-bhuiyan-united-international-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Bleu
500,3.5412,2.670437,30.917659
1000,0.1347,0.096729,45.113745
1500,0.0624,0.068346,49.71968


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].



📊 Evaluating on the test set for Barisal...


🎯 Test Set BLEU Score for Barisal: 49.01

🔍 Example Translations:
🗣 Input (Dialect):   মোর চাকরি করতে আর ভালো লাগদেয়াছে না
📘 Actual (Standard): চাকরি করতে আর ভালো লাগে না
🤖 Predicted:         আমার চাকরি করতে আর ভালো লাগছে না

🗣 Input (Dialect):   পরথমে এউক্কা দোকানে যাইতে হইবে
📘 Actual (Standard): প্রথমে একটি দোকানে যেতে হবে
🤖 Predicted:         প্রথমে একটি দোকানে যেতে হবে

🗣 Input (Dialect):   হেইহানের পানি টলটইল্লা
📘 Actual (Standard): সেখানকার পানি টলটলে
🤖 Predicted:         সেখানকার পানি টলটল

🗣 Input (Dialect):   তোমারে দেখলাম
📘 Actual (Standard): তোমাকে দেখলাম
🤖 Predicted:         তোমাকে দেখলাম

🗣 Input (Dialect):   একজন আরেকজনের প্রতি যত বেশি বিশ্বাস করবে , হ্যাগো ভালোবাসার পালা-পইরান হ্যাতো ভারী হইবে
📘 Actual (Standard): একে অপরের প্রতি যতো বেশী বিশ্বাস থাকবে, তাদের ভালোবাসার পাল্লা ততো ভারী হবে
🤖 Predicted:         একজন আরেকজনের প্রতি যত বেশি বিশ্বাস করবে, তাদের ভালোবাসার পালা তত ভারি হবে

✅ Final model for Barisal saved to /content/Bangla_Dialect_Models/nllb-bangla-barisal/fi

# **Sylhet** **with** **3** **Epoch**

In [None]:
# ===============================
# 🧠 Bangla Dialect → Standard Bangla using NLLB-200
# ===============================

!pip install transformers[sentencepiece] datasets sacrebleu evaluate torch pandas openpyxl --quiet

import pandas as pd
import numpy as np
import evaluate
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
import torch
from google.colab import drive

print("✅ Libraries installed and imported successfully.")
#drive.mount('/content/drive')

# ===============================
# 📂 Load Dataset
# ===============================

try:
    df = pd.read_excel("/content/bangla_dialect_aligned_18920.xlsx")
    print("✅ Successfully loaded the dataset.")
except FileNotFoundError:
    print("⚠ Dataset not found — using sample data.")
    df = pd.DataFrame({
        'Standard_Bangla': ["সে স্কুলে যায়。"],
        'Barisal': ["হেই ইস্কুলে যায়。"],
        'Chittagong': ["হেই স্কোলে যায়。"],
        'Sylhet': ["তারে ইস্কুলে যায়。"]
    })

# Choose dialects to train (can be one or multiple)
DIALECTS_TO_TRAIN = ['Sylhet'] # You can change this, e.g., ['Barisal', 'Sylhet']

# ===============================
# ⚙ Load Model & Tokenizer (NLLB Version)
# ===============================

MODEL_NAME = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Set source and target languages (Bangla, using NLLB's code)
SRC_LANG = "ben_Beng"
TGT_LANG = "ben_Beng"

tokenizer.src_lang = SRC_LANG
tokenizer.tgt_lang = TGT_LANG

# --- CRITICAL for NLLB ---
# Force the model to always generate Bengali tokens
model.config.forced_bos_token_id = tokenizer.convert_tokens_to_ids(TGT_LANG)
# -------------------------

print(f"\n✅ Tokenizer for '{MODEL_NAME}' loaded.")
print(f"🌐 Language pair: {SRC_LANG} → {TGT_LANG} (Dialect → Standard).")


# ===============================
# 🧩 Helper Functions (Unchanged)
# ===============================

def create_dataset_dict(dialect_col):
    """Creates a preprocessed DatasetDict for a given dialect."""
    print(f"\n--- Processing dialect: {dialect_col} ---")
    # Ensure data is string and not empty, just in case
    df_clean = df[['Standard_Bangla', dialect_col]].dropna()
    df_clean = df_clean[
        (df_clean['Standard_Bangla'].apply(lambda x: isinstance(x, str) and x.strip() != "")) &
        (df_clean[dialect_col].apply(lambda x: isinstance(x, str) and x.strip() != ""))
    ]

    subset_df = df_clean.rename(columns={'Standard_Bangla': 'target', dialect_col: 'source'})

    if len(subset_df) == 0:
        print(f"⚠ No valid data found for {dialect_col}. Skipping.")
        return None

    hf_dataset = Dataset.from_pandas(subset_df)

    # Handle small datasets
    test_size = min(0.2, 500 / len(hf_dataset)) if len(hf_dataset) > 500 else 0.2
    if len(hf_dataset) * (1 - test_size) < 10: # Ensure train set is not too small
         print(f"⚠ Very small dataset for {dialect_col}. Using all for training.")
         return DatasetDict({'train': hf_dataset, 'validation': hf_dataset, 'test': hf_dataset})

    train_test_split = hf_dataset.train_test_split(test_size=test_size, seed=42)

    val_size = min(0.5, 250 / len(train_test_split['test'])) if len(train_test_split['test']) > 250 else 0.5
    if len(train_test_split['test']) * (1 - val_size) < 5: # Ensure val set is not too small
        test_val_split = train_test_split['test'].train_test_split(test_size=0.5, seed=42)
    else:
        test_val_split = train_test_split['test'].train_test_split(test_size=val_size, seed=42)

    dataset_dict = DatasetDict({
        'train': train_test_split['train'],
        'validation': test_val_split['train'],
        'test': test_val_split['test']
    })
    print(f"✅ Dataset splits created for {dialect_col}: Train/Val/Test")
    return dataset_dict


def tokenize_and_prepare_datasets(dataset_dict):
    """Tokenizes the source and target text in the dataset."""

    def tokenize_fn(examples):
        # The tokenizer will use tokenizer.src_lang set earlier
        model_inputs = tokenizer(
            examples["source"],
            max_length=128,
            truncation=True,
            padding="max_length"
        )

        # The tokenizer will use tokenizer.tgt_lang when text_target is provided
        labels = tokenizer(
            text_target=examples["target"],
            max_length=128,
            truncation=True,
            padding="max_length"
        )

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_datasets = dataset_dict.map(tokenize_fn, batched=True, remove_columns=['source', 'target'])
    print("✅ Tokenization complete.")
    return tokenized_datasets


def train_and_evaluate(dialect_name, train_ds, val_ds, test_ds):
    """Initializes and runs the training, then evaluates on the test set."""

    # --- MODIFIED: Updated output directory for NLLB ---
    output_dir = f"/content/Bangla_Dialect_Models/nllb-bangla-{dialect_name.lower()}"

    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
    bleu_metric = evaluate.load("sacrebleu")

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred

        # Decode predictions
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

        # Replace -100 in labels as we can't decode them
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Post-processing
        decoded_preds = [pred.strip() for pred in decoded_preds]
        decoded_labels = [[label.strip()] for label in decoded_labels]

        # Compute BLEU
        result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
        return {"bleu": result["score"]}

    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        learning_rate=2e-5,
        per_device_train_batch_size=8,  # Increased batch size for NLLB-600M, adjust based on VRAM
        per_device_eval_batch_size=8,   # Increased batch size for NLLB-600M
        weight_decay=0.01,
        save_total_limit=2,
        num_train_epochs=3,
        predict_with_generate=True,     # This is crucial!
        fp16=torch.cuda.is_available(), # Use FP16 if you have a GPU
        warmup_steps=300,
        max_grad_norm=1.0,
        generation_max_length=128,      # Must match tokenization max_length
        generation_num_beams=4,
        do_eval=True,
        eval_strategy="steps",
        eval_steps=500,
        save_steps=500,
        logging_steps=100,
        load_best_model_at_end=True,    # Good practice
        metric_for_best_model="bleu",   # Good practice
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    print(f"\n🚀 Starting training for {dialect_name} dialect...")
    trainer.train()

    print(f"\n📊 Evaluating on the test set for {dialect_name}...")
    test_results = trainer.predict(test_ds)
    final_bleu_score = test_results.metrics.get('test_bleu', 0.0)
    print(f"🎯 Test Set BLEU Score for {dialect_name}: {final_bleu_score:.2f}")

    print("\n🔍 Example Translations:")
    # We need to get the original items from the test_ds before tokenization
    # Since tokenized_datasets removed columns, let's re-tokenize just for prediction examples

    # Re-fetch original test data for comparison
    original_test_ds = split_dataset_dict["test"]
    predictions = tokenizer.batch_decode(test_results.predictions, skip_special_tokens=True)

    for i in range(min(3, len(predictions))):
        print(f"🗣 Input (Dialect):   {original_test_ds[i]['source']}")
        print(f"📘 Actual (Standard): {original_test_ds[i]['target']}")
        print(f"🤖 Predicted:         {predictions[i]}\n")

    # Save the final model
    trainer.save_model(f"{output_dir}/final_model")
    print(f"✅ Final model for {dialect_name} saved to {output_dir}/final_model")

    return trainer


# ===============================
# 🚀 Training Loop
# ===============================
global_model = model # Use one model instance to train sequentially
global_tokenizer = tokenizer

for dialect in DIALECTS_TO_TRAIN:
    split_dataset_dict = create_dataset_dict(dialect)

    if split_dataset_dict is None:
        continue # Skip if dataset creation failed

    tokenized_datasets = tokenize_and_prepare_datasets(split_dataset_dict)

    # Pass the global model to continue fine-tuning
    # Or re-load the base model if you want separate models:
    # model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    # model.config.forced_bos_token_id = tokenizer.convert_tokens_to_ids(TGT_LANG)

    trained_trainer = train_and_evaluate(
        dialect,
        tokenized_datasets["train"],
        tokenized_datasets["validation"],
        tokenized_datasets["test"]
    )

    # Update the global model with the newly trained one for the next loop
    # This creates a "curriculum" where the model learns one dialect, then the next
    # If you want separate models, uncomment the reload lines above.
    global_model = trained_trainer.model


print("\n✅ Finished training all dialect models. Models are saved in Google Drive.")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h✅ Libraries installed and imported successfully.
✅ Successfully loaded the dataset.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]


✅ Tokenizer for 'facebook/nllb-200-distilled-600M' loaded.
🌐 Language pair: ben_Beng → ben_Beng (Dialect → Standard).

--- Processing dialect: Sylhet ---
✅ Dataset splits created for Sylhet: Train/Val/Test


Map:   0%|          | 0/2980 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

✅ Tokenization complete.


Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Seq2SeqTrainer(



🚀 Starting training for Sylhet dialect...


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mimran-bhuiyan[0m ([33mimran-bhuiyan-united-international-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Bleu
500,3.648,2.847757,20.404228
1000,0.5294,0.408332,29.890219


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].



📊 Evaluating on the test set for Sylhet...


🎯 Test Set BLEU Score for Sylhet: 29.08

🔍 Example Translations:
🗣 Input (Dialect):   চাকরি করতে আর বালা লাগরো না
📘 Actual (Standard): চাকরি করতে আর ভালো লাগে না
🤖 Predicted:         চাকরি করতে আর ভালো লাগে না

🗣 Input (Dialect):   পয়লা এখটা দুকানো যাওয়া লাগবো
📘 Actual (Standard): প্রথমে একটি দোকানে যেতে হবে
🤖 Predicted:         প্রথমে একটা দোকানে যেতে হবে

🗣 Input (Dialect):   হনোর পানি টলটলে
📘 Actual (Standard): সেখানকার পানি টলটলে
🤖 Predicted:         হোনার পানি টলটলে

✅ Final model for Sylhet saved to /content/Bangla_Dialect_Models/nllb-bangla-sylhet/final_model

✅ Finished training all dialect models. Models are saved in Google Drive.


# **Sylhet** **5** **Epoch**

In [1]:
# ===============================
# 🧠 Bangla Dialect → Standard Bangla using NLLB-200
# ===============================

!pip install transformers[sentencepiece] datasets sacrebleu evaluate torch pandas openpyxl --quiet

import pandas as pd
import numpy as np
import evaluate
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
import torch
from google.colab import drive

print("✅ Libraries installed and imported successfully.")
#drive.mount('/content/drive')

# ===============================
# 📂 Load Dataset
# ===============================

try:
    df = pd.read_excel("/content/bangla_dialect_aligned_18920.xlsx")
    print("✅ Successfully loaded the dataset.")
except FileNotFoundError:
    print("⚠ Dataset not found — using sample data.")
    df = pd.DataFrame({
        'Standard_Bangla': ["সে স্কুলে যায়。"],
        'Barisal': ["হেই ইস্কুলে যায়。"],
        'Chittagong': ["হেই স্কোলে যায়。"],
        'Sylhet': ["তারে ইস্কুলে যায়。"]
    })

# Choose dialects to train (can be one or multiple)
DIALECTS_TO_TRAIN = ['Sylhet'] # You can change this, e.g., ['Barisal', 'Sylhet']

# ===============================
# ⚙ Load Model & Tokenizer (NLLB Version)
# ===============================

MODEL_NAME = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Set source and target languages (Bangla, using NLLB's code)
SRC_LANG = "ben_Beng"
TGT_LANG = "ben_Beng"

tokenizer.src_lang = SRC_LANG
tokenizer.tgt_lang = TGT_LANG

# --- CRITICAL for NLLB ---
# Force the model to always generate Bengali tokens
model.config.forced_bos_token_id = tokenizer.convert_tokens_to_ids(TGT_LANG)
# -------------------------

print(f"\n✅ Tokenizer for '{MODEL_NAME}' loaded.")
print(f"🌐 Language pair: {SRC_LANG} → {TGT_LANG} (Dialect → Standard).")


# ===============================
# 🧩 Helper Functions (Unchanged)
# ===============================

def create_dataset_dict(dialect_col):
    """Creates a preprocessed DatasetDict for a given dialect."""
    print(f"\n--- Processing dialect: {dialect_col} ---")
    # Ensure data is string and not empty, just in case
    df_clean = df[['Standard_Bangla', dialect_col]].dropna()
    df_clean = df_clean[
        (df_clean['Standard_Bangla'].apply(lambda x: isinstance(x, str) and x.strip() != "")) &
        (df_clean[dialect_col].apply(lambda x: isinstance(x, str) and x.strip() != ""))
    ]

    subset_df = df_clean.rename(columns={'Standard_Bangla': 'target', dialect_col: 'source'})

    if len(subset_df) == 0:
        print(f"⚠ No valid data found for {dialect_col}. Skipping.")
        return None

    hf_dataset = Dataset.from_pandas(subset_df)

    # Handle small datasets
    test_size = min(0.2, 500 / len(hf_dataset)) if len(hf_dataset) > 500 else 0.2
    if len(hf_dataset) * (1 - test_size) < 10: # Ensure train set is not too small
         print(f"⚠ Very small dataset for {dialect_col}. Using all for training.")
         return DatasetDict({'train': hf_dataset, 'validation': hf_dataset, 'test': hf_dataset})

    train_test_split = hf_dataset.train_test_split(test_size=test_size, seed=42)

    val_size = min(0.5, 250 / len(train_test_split['test'])) if len(train_test_split['test']) > 250 else 0.5
    if len(train_test_split['test']) * (1 - val_size) < 5: # Ensure val set is not too small
        test_val_split = train_test_split['test'].train_test_split(test_size=0.5, seed=42)
    else:
        test_val_split = train_test_split['test'].train_test_split(test_size=val_size, seed=42)

    dataset_dict = DatasetDict({
        'train': train_test_split['train'],
        'validation': test_val_split['train'],
        'test': test_val_split['test']
    })
    print(f"✅ Dataset splits created for {dialect_col}: Train/Val/Test")
    return dataset_dict


def tokenize_and_prepare_datasets(dataset_dict):
    """Tokenizes the source and target text in the dataset."""

    def tokenize_fn(examples):
        # The tokenizer will use tokenizer.src_lang set earlier
        model_inputs = tokenizer(
            examples["source"],
            max_length=128,
            truncation=True,
            padding="max_length"
        )

        # The tokenizer will use tokenizer.tgt_lang when text_target is provided
        labels = tokenizer(
            text_target=examples["target"],
            max_length=128,
            truncation=True,
            padding="max_length"
        )

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_datasets = dataset_dict.map(tokenize_fn, batched=True, remove_columns=['source', 'target'])
    print("✅ Tokenization complete.")
    return tokenized_datasets


def train_and_evaluate(dialect_name, train_ds, val_ds, test_ds):
    """Initializes and runs the training, then evaluates on the test set."""

    # --- MODIFIED: Updated output directory for NLLB ---
    output_dir = f"/content/Bangla_Dialect_Models/nllb-bangla-{dialect_name.lower()}"

    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
    bleu_metric = evaluate.load("sacrebleu")

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred

        # Decode predictions
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

        # Replace -100 in labels as we can't decode them
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Post-processing
        decoded_preds = [pred.strip() for pred in decoded_preds]
        decoded_labels = [[label.strip()] for label in decoded_labels]

        # Compute BLEU
        result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
        return {"bleu": result["score"]}

    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        learning_rate=2e-5,
        per_device_train_batch_size=8,  # Increased batch size for NLLB-600M, adjust based on VRAM
        per_device_eval_batch_size=8,   # Increased batch size for NLLB-600M
        weight_decay=0.01,
        save_total_limit=2,
        num_train_epochs=5,
        predict_with_generate=True,     # This is crucial!
        fp16=torch.cuda.is_available(), # Use FP16 if you have a GPU
        warmup_steps=300,
        max_grad_norm=1.0,
        generation_max_length=128,      # Must match tokenization max_length
        generation_num_beams=4,
        do_eval=True,
        eval_strategy="steps",
        eval_steps=500,
        save_steps=500,
        logging_steps=100,
        load_best_model_at_end=True,    # Good practice
        metric_for_best_model="bleu",   # Good practice
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    print(f"\n🚀 Starting training for {dialect_name} dialect...")
    trainer.train()

    print(f"\n📊 Evaluating on the test set for {dialect_name}...")
    test_results = trainer.predict(test_ds)
    final_bleu_score = test_results.metrics.get('test_bleu', 0.0)
    print(f"🎯 Test Set BLEU Score for {dialect_name}: {final_bleu_score:.2f}")

    print("\n🔍 Example Translations:")
    # We need to get the original items from the test_ds before tokenization
    # Since tokenized_datasets removed columns, let's re-tokenize just for prediction examples

    # Re-fetch original test data for comparison
    original_test_ds = split_dataset_dict["test"]
    predictions = tokenizer.batch_decode(test_results.predictions, skip_special_tokens=True)

    for i in range(min(5, len(predictions))):
        print(f"🗣 Input (Dialect):   {original_test_ds[i]['source']}")
        print(f"📘 Actual (Standard): {original_test_ds[i]['target']}")
        print(f"🤖 Predicted:         {predictions[i]}\n")

    # Save the final model
    trainer.save_model(f"{output_dir}/final_model")
    print(f"✅ Final model for {dialect_name} saved to {output_dir}/final_model")

    return trainer


# ===============================
# 🚀 Training Loop
# ===============================
global_model = model # Use one model instance to train sequentially
global_tokenizer = tokenizer

for dialect in DIALECTS_TO_TRAIN:
    split_dataset_dict = create_dataset_dict(dialect)

    if split_dataset_dict is None:
        continue # Skip if dataset creation failed

    tokenized_datasets = tokenize_and_prepare_datasets(split_dataset_dict)

    # Pass the global model to continue fine-tuning
    # Or re-load the base model if you want separate models:
    # model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    # model.config.forced_bos_token_id = tokenizer.convert_tokens_to_ids(TGT_LANG)

    trained_trainer = train_and_evaluate(
        dialect,
        tokenized_datasets["train"],
        tokenized_datasets["validation"],
        tokenized_datasets["test"]
    )

    # Update the global model with the newly trained one for the next loop
    # This creates a "curriculum" where the model learns one dialect, then the next
    # If you want separate models, uncomment the reload lines above.
    global_model = trained_trainer.model


print("\n✅ Finished training all dialect models. Models are saved in Google Drive.")

✅ Libraries installed and imported successfully.
✅ Successfully loaded the dataset.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



✅ Tokenizer for 'facebook/nllb-200-distilled-600M' loaded.
🌐 Language pair: ben_Beng → ben_Beng (Dialect → Standard).

--- Processing dialect: Sylhet ---
✅ Dataset splits created for Sylhet: Train/Val/Test


Map:   0%|          | 0/2980 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

✅ Tokenization complete.


  trainer = Seq2SeqTrainer(



🚀 Starting training for Sylhet dialect...


[34m[1mwandb[0m: Currently logged in as: [33mimran-bhuiyan[0m ([33mimran-bhuiyan-united-international-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Bleu
500,3.5489,2.682799,21.355164
1000,0.1474,0.109067,38.129366
1500,0.0761,0.080838,45.353688


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].



📊 Evaluating on the test set for Sylhet...


🎯 Test Set BLEU Score for Sylhet: 39.75

🔍 Example Translations:
🗣 Input (Dialect):   চাকরি করতে আর বালা লাগরো না
📘 Actual (Standard): চাকরি করতে আর ভালো লাগে না
🤖 Predicted:         চাকরি করতে ভালো লাগে না

🗣 Input (Dialect):   পয়লা এখটা দুকানো যাওয়া লাগবো
📘 Actual (Standard): প্রথমে একটি দোকানে যেতে হবে
🤖 Predicted:         প্রথমে একটা দোকানে যেতে হবে

🗣 Input (Dialect):   হনোর পানি টলটলে
📘 Actual (Standard): সেখানকার পানি টলটলে
🤖 Predicted:         হান্নার পানি টলটলে

🗣 Input (Dialect):   তুমারে দেখলাম
📘 Actual (Standard): তোমাকে দেখলাম
🤖 Predicted:         তোমাকে দেখলাম

🗣 Input (Dialect):   একে অপরের পতি যতো বেশি বিশ্বাস থাকবো,  তাদের ভালা ফাইয়া পাল্লা ততো ভারি অইবো
📘 Actual (Standard): একে অপরের প্রতি যতো বেশী বিশ্বাস থাকবে, তাদের ভালোবাসার পাল্লা ততো ভারী হবে
🤖 Predicted:         পরস্পরের স্বামী যত বেশি বিশ্বাস থাকবে, তাদের ভালোবাসার পালা তত ভারী হবে

✅ Final model for Sylhet saved to /content/Bangla_Dialect_Models/nllb-bangla-sylhet/final_model

✅ Finished training all dialect 

In [None]:
# Add this to a new cell before loading the model
print("Clearing NLLB model cache...")
!rm -rf ~/.cache/huggingface/hub/models--facebook--nllb-200-distilled-600M
print("Cache cleared.")

Clearing NLLB model cache...
Cache cleared.


# **Chittagong** **with** **3** **Epoch**

In [None]:
# ===============================
# 🧠 Bangla Dialect → Standard Bangla using NLLB-200
# ===============================

!pip install transformers[sentencepiece] datasets sacrebleu evaluate torch pandas openpyxl --quiet

import pandas as pd
import numpy as np
import evaluate
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
import torch
from google.colab import drive

print("✅ Libraries installed and imported successfully.")
#drive.mount('/content/drive')

# ===============================
# 📂 Load Dataset
# ===============================

try:
    df = pd.read_excel("/content/bangla_dialect_aligned_18920.xlsx")
    print("✅ Successfully loaded the dataset.")
except FileNotFoundError:
    print("⚠ Dataset not found — using sample data.")
    df = pd.DataFrame({
        'Standard_Bangla': ["সে স্কুলে যায়。"],
        'Barisal': ["হেই ইস্কুলে যায়。"],
        'Chittagong': ["হেই স্কোলে যায়。"],
        'Sylhet': ["তারে ইস্কুলে যায়。"]
    })

# Choose dialects to train (can be one or multiple)
DIALECTS_TO_TRAIN = ['Chittagong'] # You can change this, e.g., ['Barisal', 'Sylhet']

# ===============================
# ⚙ Load Model & Tokenizer (NLLB Version)
# ===============================

MODEL_NAME = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Set source and target languages (Bangla, using NLLB's code)
SRC_LANG = "ben_Beng"
TGT_LANG = "ben_Beng"

tokenizer.src_lang = SRC_LANG
tokenizer.tgt_lang = TGT_LANG

# --- CRITICAL for NLLB ---
# Force the model to always generate Bengali tokens
model.config.forced_bos_token_id = tokenizer.convert_tokens_to_ids(TGT_LANG)
# -------------------------

print(f"\n✅ Tokenizer for '{MODEL_NAME}' loaded.")
print(f"🌐 Language pair: {SRC_LANG} → {TGT_LANG} (Dialect → Standard).")


# ===============================
# 🧩 Helper Functions (Unchanged)
# ===============================

def create_dataset_dict(dialect_col):
    """Creates a preprocessed DatasetDict for a given dialect."""
    print(f"\n--- Processing dialect: {dialect_col} ---")
    # Ensure data is string and not empty, just in case
    df_clean = df[['Standard_Bangla', dialect_col]].dropna()
    df_clean = df_clean[
        (df_clean['Standard_Bangla'].apply(lambda x: isinstance(x, str) and x.strip() != "")) &
        (df_clean[dialect_col].apply(lambda x: isinstance(x, str) and x.strip() != ""))
    ]

    subset_df = df_clean.rename(columns={'Standard_Bangla': 'target', dialect_col: 'source'})

    if len(subset_df) == 0:
        print(f"⚠ No valid data found for {dialect_col}. Skipping.")
        return None

    hf_dataset = Dataset.from_pandas(subset_df)

    # Handle small datasets
    test_size = min(0.2, 500 / len(hf_dataset)) if len(hf_dataset) > 500 else 0.2
    if len(hf_dataset) * (1 - test_size) < 10: # Ensure train set is not too small
         print(f"⚠ Very small dataset for {dialect_col}. Using all for training.")
         return DatasetDict({'train': hf_dataset, 'validation': hf_dataset, 'test': hf_dataset})

    train_test_split = hf_dataset.train_test_split(test_size=test_size, seed=42)

    val_size = min(0.5, 250 / len(train_test_split['test'])) if len(train_test_split['test']) > 250 else 0.5
    if len(train_test_split['test']) * (1 - val_size) < 5: # Ensure val set is not too small
        test_val_split = train_test_split['test'].train_test_split(test_size=0.5, seed=42)
    else:
        test_val_split = train_test_split['test'].train_test_split(test_size=val_size, seed=42)

    dataset_dict = DatasetDict({
        'train': train_test_split['train'],
        'validation': test_val_split['train'],
        'test': test_val_split['test']
    })
    print(f"✅ Dataset splits created for {dialect_col}: Train/Val/Test")
    return dataset_dict


def tokenize_and_prepare_datasets(dataset_dict):
    """Tokenizes the source and target text in the dataset."""

    def tokenize_fn(examples):
        # The tokenizer will use tokenizer.src_lang set earlier
        model_inputs = tokenizer(
            examples["source"],
            max_length=128,
            truncation=True,
            padding="max_length"
        )

        # The tokenizer will use tokenizer.tgt_lang when text_target is provided
        labels = tokenizer(
            text_target=examples["target"],
            max_length=128,
            truncation=True,
            padding="max_length"
        )

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_datasets = dataset_dict.map(tokenize_fn, batched=True, remove_columns=['source', 'target'])
    print("✅ Tokenization complete.")
    return tokenized_datasets


def train_and_evaluate(dialect_name, train_ds, val_ds, test_ds):
    """Initializes and runs the training, then evaluates on the test set."""

    # --- MODIFIED: Updated output directory for NLLB ---
    output_dir = f"/content/Bangla_Dialect_Models/nllb-bangla-{dialect_name.lower()}"

    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
    bleu_metric = evaluate.load("sacrebleu")

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred

        # Decode predictions
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

        # Replace -100 in labels as we can't decode them
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Post-processing
        decoded_preds = [pred.strip() for pred in decoded_preds]
        decoded_labels = [[label.strip()] for label in decoded_labels]

        # Compute BLEU
        result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
        return {"bleu": result["score"]}

    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        learning_rate=2e-5,
        per_device_train_batch_size=8,  # Increased batch size for NLLB-600M, adjust based on VRAM
        per_device_eval_batch_size=8,   # Increased batch size for NLLB-600M
        weight_decay=0.01,
        save_total_limit=2,
        num_train_epochs=3,
        predict_with_generate=True,     # This is crucial!
        fp16=torch.cuda.is_available(), # Use FP16 if you have a GPU
        warmup_steps=300,
        max_grad_norm=1.0,
        generation_max_length=128,      # Must match tokenization max_length
        generation_num_beams=4,
        do_eval=True,
        eval_strategy="steps",
        eval_steps=500,
        save_steps=500,
        logging_steps=100,
        load_best_model_at_end=True,    # Good practice
        metric_for_best_model="bleu",   # Good practice
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    print(f"\n🚀 Starting training for {dialect_name} dialect...")
    trainer.train()

    print(f"\n📊 Evaluating on the test set for {dialect_name}...")
    test_results = trainer.predict(test_ds)
    final_bleu_score = test_results.metrics.get('test_bleu', 0.0)
    print(f"🎯 Test Set BLEU Score for {dialect_name}: {final_bleu_score:.2f}")

    print("\n🔍 Example Translations:")
    # We need to get the original items from the test_ds before tokenization
    # Since tokenized_datasets removed columns, let's re-tokenize just for prediction examples

    # Re-fetch original test data for comparison
    original_test_ds = split_dataset_dict["test"]
    predictions = tokenizer.batch_decode(test_results.predictions, skip_special_tokens=True)

    for i in range(min(3, len(predictions))):
        print(f"🗣 Input (Dialect):   {original_test_ds[i]['source']}")
        print(f"📘 Actual (Standard): {original_test_ds[i]['target']}")
        print(f"🤖 Predicted:         {predictions[i]}\n")

    # Save the final model
    trainer.save_model(f"{output_dir}/final_model")
    print(f"✅ Final model for {dialect_name} saved to {output_dir}/final_model")

    return trainer


# ===============================
# 🚀 Training Loop
# ===============================
global_model = model # Use one model instance to train sequentially
global_tokenizer = tokenizer

for dialect in DIALECTS_TO_TRAIN:
    split_dataset_dict = create_dataset_dict(dialect)

    if split_dataset_dict is None:
        continue # Skip if dataset creation failed

    tokenized_datasets = tokenize_and_prepare_datasets(split_dataset_dict)

    # Pass the global model to continue fine-tuning
    # Or re-load the base model if you want separate models:
    # model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    # model.config.forced_bos_token_id = tokenizer.convert_tokens_to_ids(TGT_LANG)

    trained_trainer = train_and_evaluate(
        dialect,
        tokenized_datasets["train"],
        tokenized_datasets["validation"],
        tokenized_datasets["test"]
    )

    # Update the global model with the newly trained one for the next loop
    # This creates a "curriculum" where the model learns one dialect, then the next
    # If you want separate models, uncomment the reload lines above.
    global_model = trained_trainer.model


print("\n✅ Finished training all dialect models. Models are saved in Google Drive.")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h✅ Libraries installed and imported successfully.
✅ Successfully loaded the dataset.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]


✅ Tokenizer for 'facebook/nllb-200-distilled-600M' loaded.
🌐 Language pair: ben_Beng → ben_Beng (Dialect → Standard).

--- Processing dialect: Chittagong ---
✅ Dataset splits created for Chittagong: Train/Val/Test


Map:   0%|          | 0/2980 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

✅ Tokenization complete.


Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Seq2SeqTrainer(



🚀 Starting training for Chittagong dialect...


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mimran-bhuiyan[0m ([33mimran-bhuiyan-united-international-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Bleu
500,3.7074,2.910214,10.343039
1000,0.5804,0.45642,16.748862


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].



📊 Evaluating on the test set for Chittagong...


🎯 Test Set BLEU Score for Chittagong: 19.70

🔍 Example Translations:
🗣 Input (Dialect):   চঁঅরি গইরত আত্তুন আর গম ন লাগের
📘 Actual (Standard): চাকরি করতে আর ভালো লাগে না
🤖 Predicted:         চারপাশে গিয়ে দেখলাম আর ভালো লাগে না

🗣 Input (Dialect):   ফাস্টে এক্কান দোয়ান ওত যা ফরিবো
📘 Actual (Standard): প্রথমে একটি দোকানে যেতে হবে
🤖 Predicted:         তাড়াতাড়ি একটা ডোয়ান খেতে হবে

🗣 Input (Dialect):   ইয়ানোর ফানি টলটল
📘 Actual (Standard): সেখানকার পানি টলটলে
🤖 Predicted:         আমার মেয়ে টলটল



# **Chittagong** **with** **Epoch** **5**




In [None]:
# ===============================
# 🧠 Bangla Dialect → Standard Bangla using NLLB-200
# ===============================

!pip install transformers[sentencepiece] datasets sacrebleu evaluate torch pandas openpyxl --quiet

import pandas as pd
import numpy as np
import evaluate
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
import torch
from google.colab import drive

print("✅ Libraries installed and imported successfully.")
#drive.mount('/content/drive')

# ===============================
# 📂 Load Dataset
# ===============================

try:
    df = pd.read_excel("/content/bangla_dialect_aligned_18920.xlsx")
    print("✅ Successfully loaded the dataset.")
except FileNotFoundError:
    print("⚠ Dataset not found — using sample data.")
    df = pd.DataFrame({
        'Standard_Bangla': ["সে স্কুলে যায়。"],
        'Barisal': ["হেই ইস্কুলে যায়。"],
        'Chittagong': ["হেই স্কোলে যায়。"],
        'Sylhet': ["তারে ইস্কুলে যায়。"]
    })

# Choose dialects to train (can be one or multiple)
DIALECTS_TO_TRAIN = ['Chittagong'] # You can change this, e.g., ['Barisal', 'Sylhet']

# ===============================
# ⚙ Load Model & Tokenizer (NLLB Version)
# ===============================

MODEL_NAME = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Set source and target languages (Bangla, using NLLB's code)
SRC_LANG = "ben_Beng"
TGT_LANG = "ben_Beng"

tokenizer.src_lang = SRC_LANG
tokenizer.tgt_lang = TGT_LANG

# --- CRITICAL for NLLB ---
# Force the model to always generate Bengali tokens
model.config.forced_bos_token_id = tokenizer.convert_tokens_to_ids(TGT_LANG)
# -------------------------

print(f"\n✅ Tokenizer for '{MODEL_NAME}' loaded.")
print(f"🌐 Language pair: {SRC_LANG} → {TGT_LANG} (Dialect → Standard).")


# ===============================
# 🧩 Helper Functions (Unchanged)
# ===============================

def create_dataset_dict(dialect_col):
    """Creates a preprocessed DatasetDict for a given dialect."""
    print(f"\n--- Processing dialect: {dialect_col} ---")
    # Ensure data is string and not empty, just in case
    df_clean = df[['Standard_Bangla', dialect_col]].dropna()
    df_clean = df_clean[
        (df_clean['Standard_Bangla'].apply(lambda x: isinstance(x, str) and x.strip() != "")) &
        (df_clean[dialect_col].apply(lambda x: isinstance(x, str) and x.strip() != ""))
    ]

    subset_df = df_clean.rename(columns={'Standard_Bangla': 'target', dialect_col: 'source'})

    if len(subset_df) == 0:
        print(f"⚠ No valid data found for {dialect_col}. Skipping.")
        return None

    hf_dataset = Dataset.from_pandas(subset_df)

    # Handle small datasets
    test_size = min(0.2, 500 / len(hf_dataset)) if len(hf_dataset) > 500 else 0.2
    if len(hf_dataset) * (1 - test_size) < 10: # Ensure train set is not too small
         print(f"⚠ Very small dataset for {dialect_col}. Using all for training.")
         return DatasetDict({'train': hf_dataset, 'validation': hf_dataset, 'test': hf_dataset})

    train_test_split = hf_dataset.train_test_split(test_size=test_size, seed=42)

    val_size = min(0.5, 250 / len(train_test_split['test'])) if len(train_test_split['test']) > 250 else 0.5
    if len(train_test_split['test']) * (1 - val_size) < 5: # Ensure val set is not too small
        test_val_split = train_test_split['test'].train_test_split(test_size=0.5, seed=42)
    else:
        test_val_split = train_test_split['test'].train_test_split(test_size=val_size, seed=42)

    dataset_dict = DatasetDict({
        'train': train_test_split['train'],
        'validation': test_val_split['train'],
        'test': test_val_split['test']
    })
    print(f"✅ Dataset splits created for {dialect_col}: Train/Val/Test")
    return dataset_dict


def tokenize_and_prepare_datasets(dataset_dict):
    """Tokenizes the source and target text in the dataset."""

    def tokenize_fn(examples):
        # The tokenizer will use tokenizer.src_lang set earlier
        model_inputs = tokenizer(
            examples["source"],
            max_length=128,
            truncation=True,
            padding="max_length"
        )

        # The tokenizer will use tokenizer.tgt_lang when text_target is provided
        labels = tokenizer(
            text_target=examples["target"],
            max_length=128,
            truncation=True,
            padding="max_length"
        )

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_datasets = dataset_dict.map(tokenize_fn, batched=True, remove_columns=['source', 'target'])
    print("✅ Tokenization complete.")
    return tokenized_datasets


def train_and_evaluate(dialect_name, train_ds, val_ds, test_ds):
    """Initializes and runs the training, then evaluates on the test set."""

    # --- MODIFIED: Updated output directory for NLLB ---
    output_dir = f"/content/Bangla_Dialect_Models/nllb-bangla-{dialect_name.lower()}"

    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
    bleu_metric = evaluate.load("sacrebleu")

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred

        # Decode predictions
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

        # Replace -100 in labels as we can't decode them
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Post-processing
        decoded_preds = [pred.strip() for pred in decoded_preds]
        decoded_labels = [[label.strip()] for label in decoded_labels]

        # Compute BLEU
        result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
        return {"bleu": result["score"]}

    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        learning_rate=2e-5,
        per_device_train_batch_size=8,  # Increased batch size for NLLB-600M, adjust based on VRAM
        per_device_eval_batch_size=8,   # Increased batch size for NLLB-600M
        weight_decay=0.01,
        save_total_limit=2,
        num_train_epochs=5,
        predict_with_generate=True,     # This is crucial!
        fp16=torch.cuda.is_available(), # Use FP16 if you have a GPU
        warmup_steps=300,
        max_grad_norm=1.0,
        generation_max_length=128,      # Must match tokenization max_length
        generation_num_beams=4,
        do_eval=True,
        eval_strategy="steps",
        eval_steps=500,
        save_steps=500,
        logging_steps=100,
        load_best_model_at_end=True,    # Good practice
        metric_for_best_model="bleu",   # Good practice
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    print(f"\n🚀 Starting training for {dialect_name} dialect...")
    trainer.train()

    print(f"\n📊 Evaluating on the test set for {dialect_name}...")
    test_results = trainer.predict(test_ds)
    final_bleu_score = test_results.metrics.get('test_bleu', 0.0)
    print(f"🎯 Test Set BLEU Score for {dialect_name}: {final_bleu_score:.2f}")

    print("\n🔍 Example Translations:")
    # We need to get the original items from the test_ds before tokenization
    # Since tokenized_datasets removed columns, let's re-tokenize just for prediction examples

    # Re-fetch original test data for comparison
    original_test_ds = split_dataset_dict["test"]
    predictions = tokenizer.batch_decode(test_results.predictions, skip_special_tokens=True)

    for i in range(min(3, len(predictions))):
        print(f"🗣 Input (Dialect):   {original_test_ds[i]['source']}")
        print(f"📘 Actual (Standard): {original_test_ds[i]['target']}")
        print(f"🤖 Predicted:         {predictions[i]}\n")

    # Save the final model
    trainer.save_model(f"{output_dir}/final_model")
    print(f"✅ Final model for {dialect_name} saved to {output_dir}/final_model")

    return trainer


# ===============================
# 🚀 Training Loop
# ===============================
global_model = model # Use one model instance to train sequentially
global_tokenizer = tokenizer

for dialect in DIALECTS_TO_TRAIN:
    split_dataset_dict = create_dataset_dict(dialect)

    if split_dataset_dict is None:
        continue # Skip if dataset creation failed

    tokenized_datasets = tokenize_and_prepare_datasets(split_dataset_dict)

    # Pass the global model to continue fine-tuning
    # Or re-load the base model if you want separate models:
    # model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    # model.config.forced_bos_token_id = tokenizer.convert_tokens_to_ids(TGT_LANG)

    trained_trainer = train_and_evaluate(
        dialect,
        tokenized_datasets["train"],
        tokenized_datasets["validation"],
        tokenized_datasets["test"]
    )

    # Update the global model with the newly trained one for the next loop
    # This creates a "curriculum" where the model learns one dialect, then the next
    # If you want separate models, uncomment the reload lines above.
    global_model = trained_trainer.model


print("\n✅ Finished training all dialect models. Models are saved in Google Drive.")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h✅ Libraries installed and imported successfully.
✅ Successfully loaded the dataset.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]


✅ Tokenizer for 'facebook/nllb-200-distilled-600M' loaded.
🌐 Language pair: ben_Beng → ben_Beng (Dialect → Standard).

--- Processing dialect: Chittagong ---
✅ Dataset splits created for Chittagong: Train/Val/Test


Map:   0%|          | 0/2980 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

✅ Tokenization complete.


Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Seq2SeqTrainer(



🚀 Starting training for Chittagong dialect...


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mimran-bhuiyan[0m ([33mimran-bhuiyan-united-international-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Bleu
500,3.609,2.745966,10.544315
1000,0.1842,0.140305,22.97658
1500,0.1023,0.105756,30.671225


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].



📊 Evaluating on the test set for Chittagong...


🎯 Test Set BLEU Score for Chittagong: 30.57

🔍 Example Translations:
🗣 Input (Dialect):   চঁঅরি গইরত আত্তুন আর গম ন লাগের
📘 Actual (Standard): চাকরি করতে আর ভালো লাগে না
🤖 Predicted:         চুরি করতে আমার আর ভালো লাগে না

🗣 Input (Dialect):   ফাস্টে এক্কান দোয়ান ওত যা ফরিবো
📘 Actual (Standard): প্রথমে একটি দোকানে যেতে হবে
🤖 Predicted:         তাড়াতাড়ি একটা দোকানে যেতে হবে

🗣 Input (Dialect):   ইয়ানোর ফানি টলটল
📘 Actual (Standard): সেখানকার পানি টলটলে
🤖 Predicted:         পানির পানি টলটল

✅ Final model for Chittagong saved to /content/Bangla_Dialect_Models/nllb-bangla-chittagong/final_model

✅ Finished training all dialect models. Models are saved in Google Drive.
