# Barisal dialect: 5 epochs, 80/10/10 train/validation/test split

In [None]:
!pip install transformers datasets sacrebleu evaluate torch pandas openpyxl accelerate --quiet

import os, pandas as pd, numpy as np, evaluate, torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
from google.colab import drive
import shutil
from sklearn.model_selection import train_test_split

# Reached only after the install/import lines above ran without raising.
print("✅ Libraries installed.")


# Mount Google Drive so files under /content/drive/MyDrive can be read and written.
drive.mount('/content/drive')


# Excel workbook holding the aligned sentences; the printed column index shows
# which columns are available to the per-dialect cells that follow.
path = "/content/drive/MyDrive/bangla_dialect_aligned_18920.xlsx"

df = pd.read_excel(path)
print("✅ Loaded dataset from Drive.")
print(df.columns)

# Build the (dialect -> standard) sentence pairs for this run from `df`.
dialect = "Barisal"
# Drop rows where either the standard or the dialect sentence is missing.
subset = df[['Standard_Bangla', dialect]].dropna()
# Pairs whose dialect text is identical to the standard text are filtered out.
subset = subset[subset['Standard_Bangla'] != subset[dialect]]
# Rename by column name instead of assigning a positional list, so the
# source/target roles cannot be swapped silently if the selection order changes.
subset = subset.rename(columns={'Standard_Bangla': 'target', dialect: 'source'})


print("Splitting data into 80/10/10...")

# Hold out 20% of the pairs, then split that hold-out in half into validation
# and test; the fixed random_state keeps the split reproducible.
train_df, temp_df = train_test_split(subset, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# preserve_index=False stops the shuffled pandas index from being stored as an
# extra "__index_level_0__" column in each Dataset.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset   = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset  = Dataset.from_pandas(test_df, preserve_index=False)

print(f"Train size: {len(train_dataset)} | Validation size: {len(val_dataset)} | Test size: {len(test_dataset)}")


# Load the tokenizer and seq2seq model from the pretrained checkpoint named
# by MODEL_NAME.
MODEL_NAME = "alirezamsh/small100"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)


# Source and target use the same language code ("bn"): both sides of every
# pair are Bangla text, only the variety differs.
tokenizer.src_lang = "bn"
tokenizer.tgt_lang = "bn"

print("✅ Tokenizer and model loaded.")

def tokenize_fn(examples):
    """Tokenize one batch of dialect/standard sentence pairs for seq2seq training.

    Args:
        examples: Batch mapping with a "source" list (dialect sentences) and a
            "target" list (standard sentences), as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the prompted source sentences, with an added
        "labels" entry holding the target token ids. Padding positions in the
        labels are set to -100.
    """
    inputs = ["normalize from Barisal to standard bangla: " + ex for ex in examples["source"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(text_target=examples["target"], max_length=128,
                       truncation=True, padding="max_length")
    # Labels are padded to a fixed length here, so the data collator has no
    # padding left to mask. Replace pad ids with -100 (the index the loss
    # ignores); otherwise the loss is averaged over padding and both the
    # reported loss and the best-checkpoint choice are dominated by pad tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs

# Tokenize each split in batches with tokenize_fn.
tokenized_train = train_dataset.map(tokenize_fn, batched=True)
tokenized_val   = val_dataset.map(tokenize_fn, batched=True)
tokenized_test  = test_dataset.map(tokenize_fn, batched=True)

print("✅ Tokenization complete for train, validation, and test sets.")


output_dir = "/content/drive/MyDrive/Bangla_Dialect_Models/small100-Barisal-801010"

# Best-effort removal of checkpoint-* folders left in output_dir by an earlier
# run; failures are deliberately ignored so cleanup never blocks training.
if os.path.isdir(output_dir):
    for entry in os.listdir(output_dir):
        if entry.startswith("checkpoint-"):
            shutil.rmtree(os.path.join(output_dir, entry), ignore_errors=True)

# Evaluate and save once per epoch so the best checkpoint (lowest validation
# loss, the Trainer default) can be reloaded when training ends.
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=1,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its
    # replacement (requires a transformers release that accepts it).
    processing_class=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)


print("🚀 Training started (80% train, 10% validation)...")
trainer.train()

# Persist the model held by the trainer together with its tokenizer files.
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"✅ Best model saved to {output_dir}")

# Score the fine-tuned model on the held-out test split with sacreBLEU and
# save every (input, prediction, reference) triple for inspection.
bleu = evaluate.load("sacrebleu")

print("📊 Evaluating BLEU on held-out 10% test set...")


model.to("cuda" if torch.cuda.is_available() else "cpu")
# generate() does not change the module's training flag, so switch to eval
# mode explicitly to keep dropout disabled while decoding.
model.eval()

preds, refs, srcs = [], [], []
for row in test_dataset:
    src = row['source']
    tgt = row['target']

    # Must be the same prompt prefix used by tokenize_fn during training.
    input_text = f"normalize from Barisal to standard bangla: {src}"
    # Same 128-token cap as training; keep the attention mask with the ids.
    encoded = tokenizer(input_text, return_tensors="pt", truncation=True,
                        max_length=128).to(model.device)

    with torch.no_grad():
        output = model.generate(**encoded, max_length=128, num_beams=4)
    pred = tokenizer.decode(output[0], skip_special_tokens=True)

    preds.append(pred.strip())
    refs.append([tgt.strip()])  # sacreBLEU expects a list of references per prediction
    srcs.append(src.strip())

result = bleu.compute(predictions=preds, references=refs)
print(f"🎯 Test BLEU = {result['score']:.2f}")

# Save predictions for inspection
output_df = pd.DataFrame({
    "Dialect_Input_Text": srcs,
    "Predicted_Text": preds,
    "Standard_Text": [r[0] for r in refs]
})

predictions_path = "/content/drive/MyDrive/Bangla_Dialect_Models/Barisal_801010_predictions.csv"
output_df.to_csv(predictions_path, index=False)
print(f"💾 Predictions saved to {predictions_path}")

# Optional: Print the first 5 rows to see the result
print("\n--- Sample Predictions ---")
print(output_df.head())

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h✅ Libraries installed.
Mounted at /content/drive
✅ Loaded dataset from Drive.
Index(['ID', 'Standard_Bangla', 'Barisal', 'Chittagong', 'Sylhet', 'Noakhali',
       'Mymensingh'],
      dtype='object')
Splitting data into 80/10/10...
Train size: 2724 | Validation size: 340 | Test size: 341


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/890 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

✅ Tokenizer and model loaded.


Map:   0%|          | 0/2724 [00:00<?, ? examples/s]

Map:   0%|          | 0/340 [00:00<?, ? examples/s]

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

✅ Tokenization complete for train, validation, and test sets.


  trainer = Seq2SeqTrainer(


🚀 Training started (80% train, 10% validation)...


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mirfanuzzamanmontasir75[0m ([33mirfanuzzamanmontasir75-united-international-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.098,0.083384
2,0.0679,0.070229
3,0.0483,0.066988
4,0.0373,0.06536
5,0.0336,0.065117


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


✅ Best model saved to /content/drive/MyDrive/Bangla_Dialect_Models/small100-Barisal-801010


Downloading builder script: 0.00B [00:00, ?B/s]

📊 Evaluating BLEU on held-out 10% test set...
🎯 Test BLEU = 45.14
💾 Predictions saved to /content/drive/MyDrive/Bangla_Dialect_Models/Barisal_801010_predictions.csv

--- Sample Predictions ---
                           Dialect_Input_Text  \
0                           শ্যাষ করলাম হগলডি   
1              তোমার এত্ত ঘুম আয় কির লইগ্গা?   
2  তোমারে দেইখ্যা অসুস্থ্য বইললা মনে হইতেয়াছে   
3         জামাইর উপরে মোর একটুও কোন বিচার নাই   
4                                 ঘুমাইয়া লও   

                            Predicted_Text  \
0                           শেষ করেছি সবাই   
1                    তোমার এত ঘুম আসে কেন?   
2        তোমাকে দেখে অসুস্থ বইটি মনে হচ্ছে   
3  স্বামীর প্রতি আমার একটুও কোন অভিযোগ নেই   
4                              ঘুমিয়ে চলো   

                                   Standard_Text  
0                               শেষ করলাম সবকিছু  
1                          তোমার এত ঘুম আসে কেন?  
2               তোমাকে দেখে অসুস্থ বলে মনে হচ্ছে  
3  স্বামীর প্রতি আমার বিন্দুমাত

# Noakhali dialect: 5 epochs, 80/10/10 train/validation/test split



In [None]:
# Build the (dialect -> standard) sentence pairs for this run from `df`,
# the workbook loaded earlier in the notebook.
dialect = "Noakhali"
# Drop rows where either the standard or the dialect sentence is missing.
subset = df[['Standard_Bangla', dialect]].dropna()
# Pairs whose dialect text is identical to the standard text are filtered out.
subset = subset[subset['Standard_Bangla'] != subset[dialect]]
# Rename by column name instead of assigning a positional list, so the
# source/target roles cannot be swapped silently if the selection order changes.
subset = subset.rename(columns={'Standard_Bangla': 'target', dialect: 'source'})


print("Splitting data into 80/10/10...")

# Hold out 20% of the pairs, then split that hold-out in half into validation
# and test; the fixed random_state keeps the split reproducible.
train_df, temp_df = train_test_split(subset, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# preserve_index=False stops the shuffled pandas index from being stored as an
# extra "__index_level_0__" column in each Dataset.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset   = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset  = Dataset.from_pandas(test_df, preserve_index=False)

print(f"Train size: {len(train_dataset)} | Validation size: {len(val_dataset)} | Test size: {len(test_dataset)}")


# Reload the tokenizer and seq2seq model from the pretrained checkpoint named
# by MODEL_NAME, so this run rebinds `model` to freshly loaded weights.
MODEL_NAME = "alirezamsh/small100"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)


# Source and target use the same language code ("bn"): both sides of every
# pair are Bangla text, only the variety differs.
tokenizer.src_lang = "bn"
tokenizer.tgt_lang = "bn"

print("✅ Tokenizer and model loaded.")

def tokenize_fn(examples):
    """Tokenize one batch of dialect/standard sentence pairs for seq2seq training.

    Args:
        examples: Batch mapping with a "source" list (dialect sentences) and a
            "target" list (standard sentences), as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the prompted source sentences, with an added
        "labels" entry holding the target token ids. Padding positions in the
        labels are set to -100.
    """
    inputs = ["normalize from Noakhali to standard bangla: " + ex for ex in examples["source"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(text_target=examples["target"], max_length=128,
                       truncation=True, padding="max_length")
    # Labels are padded to a fixed length here, so the data collator has no
    # padding left to mask. Replace pad ids with -100 (the index the loss
    # ignores); otherwise the loss is averaged over padding and both the
    # reported loss and the best-checkpoint choice are dominated by pad tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs

# Tokenize each split in batches with tokenize_fn.
tokenized_train = train_dataset.map(tokenize_fn, batched=True)
tokenized_val   = val_dataset.map(tokenize_fn, batched=True)
tokenized_test  = test_dataset.map(tokenize_fn, batched=True)

print("✅ Tokenization complete for train, validation, and test sets.")


output_dir = "/content/drive/MyDrive/Bangla_Dialect_Models/small100-Noakhali-801010"

# Best-effort removal of checkpoint-* folders left in output_dir by an earlier
# run; failures are deliberately ignored so cleanup never blocks training.
if os.path.isdir(output_dir):
    for entry in os.listdir(output_dir):
        if entry.startswith("checkpoint-"):
            shutil.rmtree(os.path.join(output_dir, entry), ignore_errors=True)

# Evaluate and save once per epoch so the best checkpoint (lowest validation
# loss, the Trainer default) can be reloaded when training ends.
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=1,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its
    # replacement (requires a transformers release that accepts it).
    processing_class=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)


print("🚀 Training started (80% train, 10% validation)...")
trainer.train()

# Persist the model held by the trainer together with its tokenizer files.
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"✅ Best model saved to {output_dir}")

# Score the fine-tuned model on the held-out test split with sacreBLEU and
# save every (input, prediction, reference) triple for inspection.
bleu = evaluate.load("sacrebleu")

print("📊 Evaluating BLEU on held-out 10% test set...")


model.to("cuda" if torch.cuda.is_available() else "cpu")
# generate() does not change the module's training flag, so switch to eval
# mode explicitly to keep dropout disabled while decoding.
model.eval()

preds, refs, srcs = [], [], []
for row in test_dataset:
    src = row['source']
    tgt = row['target']

    # Must be the same prompt prefix used by tokenize_fn during training.
    input_text = f"normalize from Noakhali to standard bangla: {src}"
    # Same 128-token cap as training; keep the attention mask with the ids.
    encoded = tokenizer(input_text, return_tensors="pt", truncation=True,
                        max_length=128).to(model.device)

    with torch.no_grad():
        output = model.generate(**encoded, max_length=128, num_beams=4)
    pred = tokenizer.decode(output[0], skip_special_tokens=True)

    preds.append(pred.strip())
    refs.append([tgt.strip()])  # sacreBLEU expects a list of references per prediction
    srcs.append(src.strip())

result = bleu.compute(predictions=preds, references=refs)
print(f"🎯 Test BLEU = {result['score']:.2f}")

# Save predictions for inspection
output_df = pd.DataFrame({
    "Dialect_Input_Text": srcs,
    "Predicted_Text": preds,
    "Standard_Text": [r[0] for r in refs]
})

predictions_path = "/content/drive/MyDrive/Bangla_Dialect_Models/noakhali_801010_predictions.csv"
output_df.to_csv(predictions_path, index=False)
print(f"💾 Predictions saved to {predictions_path}")

# Optional: Print the first 5 rows to see the result
print("\n--- Sample Predictions ---")
print(output_df.head())

Splitting data into 80/10/10...
Train size: 1981 | Validation size: 248 | Test size: 248
✅ Tokenizer and model loaded.


Map:   0%|          | 0/1981 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

✅ Tokenization complete for train, validation, and test sets.


  trainer = Seq2SeqTrainer(


🚀 Training started (80% train, 10% validation)...


Epoch,Training Loss,Validation Loss
1,0.1531,0.110305
2,0.1093,0.094853
3,0.0733,0.092551
4,0.0631,0.090829
5,0.0534,0.090447


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


✅ Best model saved to /content/drive/MyDrive/Bangla_Dialect_Models/small100-Noakhali-801010
📊 Evaluating BLEU on held-out 10% test set...
🎯 Test BLEU = 38.81
💾 Predictions saved to /content/drive/MyDrive/Bangla_Dialect_Models/noakhali_801010_predictions.csv

--- Sample Predictions ---
                                  Dialect_Input_Text  \
0              আর হেসবুক আইডিকান যদি চিড়িয়াখানা অইতো   
1                                   আরে সত্যি কতা কন   
2                আইচ্ছা বলো সাই বাংলাদেশে কোগা জেলা?   
3  তবে আর গার্লফ্রেন্ড আরে কইছে আই হেতিরে যেন্নে ...   
4                   আন্নে এতো খারাপ অই কিল্লাই আছেন?   

                                      Predicted_Text  \
0                                            আমার হে   
1                               আমাকে সত্যি কথা বলুন   
2               আইচ্ছা বলো এসে বাংলাদেশে কোথাও জেলা?   
3  তবে আমার গার্লফ্রেন্ড আমাকে বলেছে আমি তাকে যেম...   
4                      আপনি এতো খারাপ হয়ে কেন আছেন?   

                                       S

# Chittagong dialect: 5 epochs, 80/10/10 train/validation/test split


In [None]:
# Build the (dialect -> standard) sentence pairs for this run from `df`,
# the workbook loaded earlier in the notebook.
dialect = "Chittagong"
# Drop rows where either the standard or the dialect sentence is missing.
subset = df[['Standard_Bangla', dialect]].dropna()
# Pairs whose dialect text is identical to the standard text are filtered out.
subset = subset[subset['Standard_Bangla'] != subset[dialect]]
# Rename by column name instead of assigning a positional list, so the
# source/target roles cannot be swapped silently if the selection order changes.
subset = subset.rename(columns={'Standard_Bangla': 'target', dialect: 'source'})


print("Splitting data into 80/10/10...")

# Hold out 20% of the pairs, then split that hold-out in half into validation
# and test; the fixed random_state keeps the split reproducible.
train_df, temp_df = train_test_split(subset, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# preserve_index=False stops the shuffled pandas index from being stored as an
# extra "__index_level_0__" column in each Dataset.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset   = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset  = Dataset.from_pandas(test_df, preserve_index=False)

print(f"Train size: {len(train_dataset)} | Validation size: {len(val_dataset)} | Test size: {len(test_dataset)}")


# Reload the tokenizer and seq2seq model from the pretrained checkpoint named
# by MODEL_NAME, so this run rebinds `model` to freshly loaded weights.
MODEL_NAME = "alirezamsh/small100"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)


# Source and target use the same language code ("bn"): both sides of every
# pair are Bangla text, only the variety differs.
tokenizer.src_lang = "bn"
tokenizer.tgt_lang = "bn"

print("✅ Tokenizer and model loaded.")

def tokenize_fn(examples):
    """Tokenize one batch of dialect/standard sentence pairs for seq2seq training.

    Args:
        examples: Batch mapping with a "source" list (dialect sentences) and a
            "target" list (standard sentences), as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the prompted source sentences, with an added
        "labels" entry holding the target token ids. Padding positions in the
        labels are set to -100.
    """
    inputs = ["normalize from Chittagong to standard bangla: " + ex for ex in examples["source"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(text_target=examples["target"], max_length=128,
                       truncation=True, padding="max_length")
    # Labels are padded to a fixed length here, so the data collator has no
    # padding left to mask. Replace pad ids with -100 (the index the loss
    # ignores); otherwise the loss is averaged over padding and both the
    # reported loss and the best-checkpoint choice are dominated by pad tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs

# Tokenize each split in batches with tokenize_fn.
tokenized_train = train_dataset.map(tokenize_fn, batched=True)
tokenized_val   = val_dataset.map(tokenize_fn, batched=True)
tokenized_test  = test_dataset.map(tokenize_fn, batched=True)

print("✅ Tokenization complete for train, validation, and test sets.")


output_dir = "/content/drive/MyDrive/Bangla_Dialect_Models/small100-Chittagong-801010"

# Best-effort removal of checkpoint-* folders left in output_dir by an earlier
# run; failures are deliberately ignored so cleanup never blocks training.
if os.path.isdir(output_dir):
    for entry in os.listdir(output_dir):
        if entry.startswith("checkpoint-"):
            shutil.rmtree(os.path.join(output_dir, entry), ignore_errors=True)

# Evaluate and save once per epoch so the best checkpoint (lowest validation
# loss, the Trainer default) can be reloaded when training ends.
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=1,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its
    # replacement (requires a transformers release that accepts it).
    processing_class=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)


print("🚀 Training started (80% train, 10% validation)...")
trainer.train()

# Persist the model held by the trainer together with its tokenizer files.
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"✅ Best model saved to {output_dir}")

# Score the fine-tuned model on the held-out test split with sacreBLEU and
# save every (input, prediction, reference) triple for inspection.
bleu = evaluate.load("sacrebleu")

print("📊 Evaluating BLEU on held-out 10% test set...")


model.to("cuda" if torch.cuda.is_available() else "cpu")
# generate() does not change the module's training flag, so switch to eval
# mode explicitly to keep dropout disabled while decoding.
model.eval()

preds, refs, srcs = [], [], []
for row in test_dataset:
    src = row['source']
    tgt = row['target']

    # Must be the same prompt prefix used by tokenize_fn during training.
    input_text = f"normalize from Chittagong to standard bangla: {src}"
    # Same 128-token cap as training; keep the attention mask with the ids.
    encoded = tokenizer(input_text, return_tensors="pt", truncation=True,
                        max_length=128).to(model.device)

    with torch.no_grad():
        output = model.generate(**encoded, max_length=128, num_beams=4)
    pred = tokenizer.decode(output[0], skip_special_tokens=True)

    preds.append(pred.strip())
    refs.append([tgt.strip()])  # sacreBLEU expects a list of references per prediction
    srcs.append(src.strip())

result = bleu.compute(predictions=preds, references=refs)
print(f"🎯 Test BLEU = {result['score']:.2f}")

# Save predictions for inspection
output_df = pd.DataFrame({
    "Dialect_Input_Text": srcs,
    "Predicted_Text": preds,
    "Standard_Text": [r[0] for r in refs]
})

predictions_path = "/content/drive/MyDrive/Bangla_Dialect_Models/Chittagong_801010_predictions.csv"
output_df.to_csv(predictions_path, index=False)
print(f"💾 Predictions saved to {predictions_path}")

# Optional: Print the first 5 rows to see the result
print("\n--- Sample Predictions ---")
print(output_df.head())

Splitting data into 80/10/10...
Train size: 2760 | Validation size: 345 | Test size: 345
✅ Tokenizer and model loaded.


Map:   0%|          | 0/2760 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

✅ Tokenization complete for train, validation, and test sets.


  trainer = Seq2SeqTrainer(


🚀 Training started (80% train, 10% validation)...


Epoch,Training Loss,Validation Loss
1,0.1651,0.134617
2,0.1077,0.113991
3,0.0827,0.108634
4,0.0746,0.105924
5,0.0619,0.106261


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


✅ Best model saved to /content/drive/MyDrive/Bangla_Dialect_Models/small100-Chittagong-801010
📊 Evaluating BLEU on held-out 10% test set...
🎯 Test BLEU = 30.66
💾 Predictions saved to /content/drive/MyDrive/Bangla_Dialect_Models/Chittagong_801010_predictions.csv

--- Sample Predictions ---
                                  Dialect_Input_Text  \
0  আঁই তোঁয়ার ফোয়ারে রাতিয়া দইরগার চর অত তিয়ায় তা...   
1                       অনে কি এই বইওয়া ফরিত ফার অন?   
2  তুঁই যাইবর ফর তোঁয়ার মাইয়্যা কেন যানি আন্ত অয় ...   
3             ইবা একদিন পাশর গেরাম অত যাই টুফি বেচের   
4                              তুই কি জাম্বুরা হোনা?   

                                  Predicted_Text  \
0    আমি তোমার সাথে রাতের দিকে তাকিয়ে থাকতে চাই   
1                   আপনি কি এই বইটা পড়তে পারেন?   
2  তুমি যাওয়ার পর তোমার মেয়েরা কেমন জানে এনেছে   
3    সে একদিন পাশের গ্রামে গিয়ে টুপি বিক্রি করে   
4                             তুমি কি জাম্বুরা ?   

                                       Standard_Text  
0    

# Sylhet dialect: 5 epochs, 80/10/10 train/validation/test split


In [None]:
# Build the (dialect -> standard) sentence pairs for this run from `df`,
# the workbook loaded earlier in the notebook.
dialect = "Sylhet"
# Drop rows where either the standard or the dialect sentence is missing.
subset = df[['Standard_Bangla', dialect]].dropna()
# Pairs whose dialect text is identical to the standard text are filtered out.
subset = subset[subset['Standard_Bangla'] != subset[dialect]]
# Rename by column name instead of assigning a positional list, so the
# source/target roles cannot be swapped silently if the selection order changes.
subset = subset.rename(columns={'Standard_Bangla': 'target', dialect: 'source'})


print("Splitting data into 80/10/10...")

# Hold out 20% of the pairs, then split that hold-out in half into validation
# and test; the fixed random_state keeps the split reproducible.
train_df, temp_df = train_test_split(subset, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# preserve_index=False stops the shuffled pandas index from being stored as an
# extra "__index_level_0__" column in each Dataset.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset   = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset  = Dataset.from_pandas(test_df, preserve_index=False)

print(f"Train size: {len(train_dataset)} | Validation size: {len(val_dataset)} | Test size: {len(test_dataset)}")


# Reload the tokenizer and seq2seq model from the pretrained checkpoint named
# by MODEL_NAME, so this run rebinds `model` to freshly loaded weights.
MODEL_NAME = "alirezamsh/small100"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)


# Source and target use the same language code ("bn"): both sides of every
# pair are Bangla text, only the variety differs.
tokenizer.src_lang = "bn"
tokenizer.tgt_lang = "bn"

print("✅ Tokenizer and model loaded.")

def tokenize_fn(examples):
    """Tokenize one batch of dialect/standard sentence pairs for seq2seq training.

    Args:
        examples: Batch mapping with a "source" list (dialect sentences) and a
            "target" list (standard sentences), as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the prompted source sentences, with an added
        "labels" entry holding the target token ids. Padding positions in the
        labels are set to -100.
    """
    inputs = ["normalize from Sylhet to standard bangla: " + ex for ex in examples["source"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(text_target=examples["target"], max_length=128,
                       truncation=True, padding="max_length")
    # Labels are padded to a fixed length here, so the data collator has no
    # padding left to mask. Replace pad ids with -100 (the index the loss
    # ignores); otherwise the loss is averaged over padding and both the
    # reported loss and the best-checkpoint choice are dominated by pad tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs

# Tokenize each split in batches with tokenize_fn.
tokenized_train = train_dataset.map(tokenize_fn, batched=True)
tokenized_val   = val_dataset.map(tokenize_fn, batched=True)
tokenized_test  = test_dataset.map(tokenize_fn, batched=True)

print("✅ Tokenization complete for train, validation, and test sets.")


output_dir = "/content/drive/MyDrive/Bangla_Dialect_Models/small100-Sylhet-801010"

# Best-effort removal of checkpoint-* folders left in output_dir by an earlier
# run; failures are deliberately ignored so cleanup never blocks training.
if os.path.isdir(output_dir):
    for entry in os.listdir(output_dir):
        if entry.startswith("checkpoint-"):
            shutil.rmtree(os.path.join(output_dir, entry), ignore_errors=True)

# Evaluate and save once per epoch so the best checkpoint (lowest validation
# loss, the Trainer default) can be reloaded when training ends.
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=1,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,     # validation split drives per-epoch evaluation
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its
    # replacement (requires a transformers release that accepts it).
    processing_class=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)


print("🚀 Training started (80% train, 10% validation)...")
trainer.train()

# Persist the model held by the trainer together with its tokenizer files.
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"✅ Best model saved to {output_dir}")

# Score the fine-tuned model on the held-out test split with sacreBLEU and
# save every (input, prediction, reference) triple for inspection.
bleu = evaluate.load("sacrebleu")

print("📊 Evaluating BLEU on held-out 10% test set...")


model.to("cuda" if torch.cuda.is_available() else "cpu")
# generate() does not change the module's training flag, so switch to eval
# mode explicitly to keep dropout disabled while decoding.
model.eval()

preds, refs, srcs = [], [], []
for row in test_dataset:
    src = row['source']
    tgt = row['target']

    # Must be the same prompt prefix used by tokenize_fn during training.
    input_text = f"normalize from Sylhet to standard bangla: {src}"
    # Same 128-token cap as training; keep the attention mask with the ids.
    encoded = tokenizer(input_text, return_tensors="pt", truncation=True,
                        max_length=128).to(model.device)

    with torch.no_grad():
        output = model.generate(**encoded, max_length=128, num_beams=4)
    pred = tokenizer.decode(output[0], skip_special_tokens=True)

    preds.append(pred.strip())
    refs.append([tgt.strip()])  # sacreBLEU expects a list of references per prediction
    srcs.append(src.strip())

result = bleu.compute(predictions=preds, references=refs)
print(f"🎯 Test BLEU = {result['score']:.2f}")

# Save predictions for inspection
output_df = pd.DataFrame({
    "Dialect_Input_Text": srcs,
    "Predicted_Text": preds,
    "Standard_Text": [r[0] for r in refs]
})

predictions_path = "/content/drive/MyDrive/Bangla_Dialect_Models/Sylhet_801010_predictions.csv"
output_df.to_csv(predictions_path, index=False)
print(f"💾 Predictions saved to {predictions_path}")

# Optional: Print the first 5 rows to see the result
print("\n--- Sample Predictions ---")
print(output_df.head())

Splitting data into 80/10/10...
Train size: 2726 | Validation size: 341 | Test size: 341
✅ Tokenizer and model loaded.


Map:   0%|          | 0/2726 [00:00<?, ? examples/s]

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

✅ Tokenization complete for train, validation, and test sets.


  trainer = Seq2SeqTrainer(


🚀 Training started (80% train, 10% validation)...


Epoch,Training Loss,Validation Loss
1,0.1204,0.104159
2,0.0793,0.089536
3,0.0606,0.085421
4,0.0503,0.083769
5,0.045,0.083675


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


✅ Best model saved to /content/drive/MyDrive/Bangla_Dialect_Models/small100-Sylhet-801010
📊 Evaluating BLEU on held-out 10% test set...
🎯 Test BLEU = 37.01
💾 Predictions saved to /content/drive/MyDrive/Bangla_Dialect_Models/Sylhet_801010_predictions.csv

--- Sample Predictions ---
                                  Dialect_Input_Text  \
0                     তুমাইতাইন সরকারি চাকরি করছ নই?   
1                  বিয়ার ১ বৎসর ফরেও তারার ফুড়ি অইছে   
2                     ইতা করের কিতার লাগি বইয়া বইয়া?   
3                             কুন্টা খাইতে ভালা লাগে   
4  নিজর বাড়িত আইয়া হুনি পুরা এলাকাত আমরার নামে খা...   

                                      Predicted_Text  \
0                            তুমি সরকারি চাকরি করছো?   
1               বিয়ের ১ বছর আগেও তাদের মেয়ে হয়েছে   
2                          এসব করতে কেন বসে বসে বসে?   
3                                কিছু খেতে ভালো লাগে   
4  নিজের বাড়িতে এসে শুনি পুরো এলাকায় আমাদের নাম...   

                                       Stand

# Mymensingh dialect: 5 epochs, 80/10/10 train/validation/test split


In [None]:
# Build the (dialect -> standard) sentence pairs for this run from `df`,
# the workbook loaded earlier in the notebook.
dialect = "Mymensingh"
# Drop rows where either the standard or the dialect sentence is missing.
subset = df[['Standard_Bangla', dialect]].dropna()
# Pairs whose dialect text is identical to the standard text are filtered out.
subset = subset[subset['Standard_Bangla'] != subset[dialect]]
# Rename by column name instead of assigning a positional list, so the
# source/target roles cannot be swapped silently if the selection order changes.
subset = subset.rename(columns={'Standard_Bangla': 'target', dialect: 'source'})


print("Splitting data into 80/10/10...")

# Hold out 20% of the pairs, then split that hold-out in half into validation
# and test; the fixed random_state keeps the split reproducible.
train_df, temp_df = train_test_split(subset, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# preserve_index=False stops the shuffled pandas index from being stored as an
# extra "__index_level_0__" column in each Dataset.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset   = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset  = Dataset.from_pandas(test_df, preserve_index=False)

print(f"Train size: {len(train_dataset)} | Validation size: {len(val_dataset)} | Test size: {len(test_dataset)}")


# Reload the tokenizer and seq2seq model from the pretrained checkpoint named
# by MODEL_NAME, so this run rebinds `model` to freshly loaded weights.
MODEL_NAME = "alirezamsh/small100"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)


# Source and target use the same language code ("bn"): both sides of every
# pair are Bangla text, only the variety differs.
tokenizer.src_lang = "bn"
tokenizer.tgt_lang = "bn"

print("✅ Tokenizer and model loaded.")

def tokenize_fn(examples):
    """Tokenize one batch of dialect/standard sentence pairs for seq2seq training.

    Args:
        examples: Batch mapping with a "source" list (dialect sentences) and a
            "target" list (standard sentences), as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the prompted source sentences, with an added
        "labels" entry holding the target token ids. Padding positions in the
        labels are set to -100.
    """
    inputs = ["normalize from Mymensingh to standard bangla: " + ex for ex in examples["source"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(text_target=examples["target"], max_length=128,
                       truncation=True, padding="max_length")
    # Labels are padded to a fixed length here, so the data collator has no
    # padding left to mask. Replace pad ids with -100 (the index the loss
    # ignores); otherwise the loss is averaged over padding and both the
    # reported loss and the best-checkpoint choice are dominated by pad tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs

# Tokenize each split in batches with tokenize_fn.
tokenized_train = train_dataset.map(tokenize_fn, batched=True)
tokenized_val   = val_dataset.map(tokenize_fn, batched=True)
tokenized_test  = test_dataset.map(tokenize_fn, batched=True)

print("✅ Tokenization complete for train, validation, and test sets.")


output_dir = "/content/drive/MyDrive/Bangla_Dialect_Models/small100-Mymensingh-801010"

# Best-effort removal of checkpoint-* folders left in output_dir by an earlier
# run; failures are deliberately ignored so cleanup never blocks training.
if os.path.isdir(output_dir):
    for entry in os.listdir(output_dir):
        if entry.startswith("checkpoint-"):
            shutil.rmtree(os.path.join(output_dir, entry), ignore_errors=True)

# Evaluate and save once per epoch so the best checkpoint (lowest validation
# loss, the Trainer default) can be reloaded when training ends.
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=1,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,     # validation split drives per-epoch evaluation
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its
    # replacement (requires a transformers release that accepts it).
    processing_class=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)


print("🚀 Training started (80% train, 10% validation)...")
trainer.train()

# Persist the model held by the trainer together with its tokenizer files.
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"✅ Best model saved to {output_dir}")

# Score the fine-tuned model on the held-out test split with sacreBLEU and
# save every (input, prediction, reference) triple for inspection.
bleu = evaluate.load("sacrebleu")

print("📊 Evaluating BLEU on held-out 10% test set...")


model.to("cuda" if torch.cuda.is_available() else "cpu")
# generate() does not change the module's training flag, so switch to eval
# mode explicitly to keep dropout disabled while decoding.
model.eval()

preds, refs, srcs = [], [], []
for row in test_dataset:
    src = row['source']
    tgt = row['target']

    # Must be the same prompt prefix used by tokenize_fn during training.
    input_text = f"normalize from Mymensingh to standard bangla: {src}"
    # Same 128-token cap as training; keep the attention mask with the ids.
    encoded = tokenizer(input_text, return_tensors="pt", truncation=True,
                        max_length=128).to(model.device)

    with torch.no_grad():
        output = model.generate(**encoded, max_length=128, num_beams=4)
    pred = tokenizer.decode(output[0], skip_special_tokens=True)

    preds.append(pred.strip())
    refs.append([tgt.strip()])  # sacreBLEU expects a list of references per prediction
    srcs.append(src.strip())

result = bleu.compute(predictions=preds, references=refs)
print(f"🎯 Test BLEU = {result['score']:.2f}")

# Save predictions for inspection
output_df = pd.DataFrame({
    "Dialect_Input_Text": srcs,
    "Predicted_Text": preds,
    "Standard_Text": [r[0] for r in refs]
})

predictions_path = "/content/drive/MyDrive/Bangla_Dialect_Models/Mymensingh_801010_predictions.csv"
output_df.to_csv(predictions_path, index=False)
print(f"💾 Predictions saved to {predictions_path}")

# Optional: Print the first 5 rows to see the result
print("\n--- Sample Predictions ---")
print(output_df.head())

Splitting data into 80/10/10...
Train size: 1909 | Validation size: 239 | Test size: 239
✅ Tokenizer and model loaded.


Map:   0%|          | 0/1909 [00:00<?, ? examples/s]

Map:   0%|          | 0/239 [00:00<?, ? examples/s]

Map:   0%|          | 0/239 [00:00<?, ? examples/s]

✅ Tokenization complete for train, validation, and test sets.


  trainer = Seq2SeqTrainer(


🚀 Training started (80% train, 10% validation)...


Epoch,Training Loss,Validation Loss
1,0.0835,0.047852
2,0.0424,0.035711
3,0.0266,0.033175
4,0.0212,0.031559
5,0.0169,0.031535


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


✅ Best model saved to /content/drive/MyDrive/Bangla_Dialect_Models/small100-Mymensingh-801010
📊 Evaluating BLEU on held-out 10% test set...
🎯 Test BLEU = 56.77
💾 Predictions saved to /content/drive/MyDrive/Bangla_Dialect_Models/Mymensingh_801010_predictions.csv

--- Sample Predictions ---
                                  Dialect_Input_Text  \
0                            আমি গতকাল ইস্কুলে আইনাই   
1                       এক চানাচুরওয়ালারে দেহন যায়   
2        তুমি যদি কাউরে ভালোবাসো,তবে তাকে ছাইড়া দেও   
3  আমার দ্বিতীয় বিয়া নিয়া আমার নিজের কাক্কারা, মা...   
4  আফনেরে কিসু কওন লাগে না , আফনে এমনিতেই আমার মন...   

                                      Predicted_Text  \
0                         আমি গতকাল বিদ্যালয়ে আইনাই   
1                       এক চানাচুরওয়ালারে দেখা যায়   
2         তুমি যদি কাউকে ভালোবাসো,তবে তাকে ছেড়ে দাও   
3  আমার দ্বিতীয় বিয়ে নিয়ে আমার নিজের কুকুরা, ম...   
4  আপনাকে কিছু বলতে লাগে না , আপনাকে এমনিতেই আমার...   

                                    