In [1]:
# ======================
# PART 1: SETUP & DATA PREPARATION
# ======================
import warnings
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments, Seq2SeqTrainer,
)
import torch
import os
from google.colab import drive
import numpy as np

# Mount Google Drive FIRST
print("🔗 Mounting Google Drive...")
drive.mount('/content/drive')

🔗 Mounting Google Drive...
Mounted at /content/drive


In [2]:


# Hide warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# 1) Load and prepare data
print("📊 Loading dataset...")
file_path = "/content/drive/MyDrive/Dataset/Roman_Balti_dataset.csv"
df = pd.read_csv(file_path, encoding="ISO-8859-1")
df = df.rename(columns={"Roman Balti": "balti", "English": "english"})





📊 Loading dataset...


In [3]:
print(df.head())

                          balti               english
0                   do rox song           that helped
1         namna chuchik songfin  it is already eleven
2            kho naatpa thongat   he seems to be sick
3  charpha ong ma tsokh che yod           it may rain
4       nga gonchas phuday youd       i am undressing


In [4]:
import re

def clean_text(text):
    if not isinstance(text, str):
        return ""
    # Lowercase text
    text = text.lower()
    # Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()
    # Remove unwanted symbols (keep letters, numbers, basic punctuation)
    text = re.sub(r"[^a-zA-Z0-9ء-ے.,!?؛،؟ ]+", "", text)
    return text

# Apply cleaning on both columns
df["balti"] = df["balti"].apply(clean_text)
df["english"] = df["english"].apply(clean_text)

print("✅ Text preprocessing completed. Sample after cleaning:")
print(df.head())


✅ Text preprocessing completed. Sample after cleaning:
                          balti               english
0                   do rox song           that helped
1         namna chuchik songfin  it is already eleven
2            kho naatpa thongat   he seems to be sick
3  charpha ong ma tsokh che yod           it may rain
4       nga gonchas phuday youd       i am undressing


In [5]:
# Clean data - remove any NaN values
df = df.dropna()
print(f"✅ Loaded {len(df)} samples")

# Create bidirectional dataset (both directions)
print("🔄 Creating bidirectional training data...")
balti_to_english_data = df[['balti', 'english']].rename(columns={'balti': 'src', 'english': 'tgt'})
balti_to_english_data['direction'] = 'balti_to_english'

english_to_balti_data = df[['english', 'balti']].rename(columns={'english': 'src', 'balti': 'tgt'})
english_to_balti_data['direction'] = 'english_to_balti'

# Combine both directions
bidirectional_df = pd.concat([balti_to_english_data, english_to_balti_data], ignore_index=True)
print(f"📈 Total training samples (bidirectional): {len(bidirectional_df)}")

dataset = Dataset.from_pandas(bidirectional_df)
split = dataset.train_test_split(test_size=0.1, seed=42)
train_ds = split["train"]
eval_ds = split["test"]

print(f"📊 Training samples: {len(train_ds)}")
print(f"📊 Validation samples: {len(eval_ds)}")

✅ Loaded 13077 samples
🔄 Creating bidirectional training data...
📈 Total training samples (bidirectional): 26154
📊 Training samples: 23538
📊 Validation samples: 2616


In [6]:
# ======================
# PART 2: MODEL & TOKENIZATION SETUP (FIXED FOR BIDIRECTIONAL)
# ======================
print("🔄 Loading model and tokenizer...")

# 2) Load model & tokenizer - Using mBART (Bidirectional)
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define language codes
BALTI_LANG_CODE = "ur_PK"  # Using Urdu as proxy for Balti
ENGLISH_LANG_CODE = "en_XX"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# 3) Enhanced tokenization with language codes
max_length = 64

def tokenize_fn(examples):
    # Determine language direction
    directions = examples['direction']
    src_texts = examples['src']
    tgt_texts = examples['tgt']

    model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}

    for i, direction in enumerate(directions):
        # Set source language based on direction
        if direction == 'balti_to_english':
            tokenizer.src_lang = BALTI_LANG_CODE
            tokenizer.tgt_lang = ENGLISH_LANG_CODE
        else:  # english_to_balti
            tokenizer.src_lang = ENGLISH_LANG_CODE
            tokenizer.tgt_lang = BALTI_LANG_CODE

        # Tokenize source
        src_encoding = tokenizer(
            src_texts[i],
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Tokenize target
        with tokenizer.as_target_tokenizer():
            tgt_encoding = tokenizer(
                tgt_texts[i],
                max_length=max_length,
                padding="max_length",
                truncation=True,
                return_tensors="pt"
            )

        model_inputs["input_ids"].append(src_encoding["input_ids"][0])
        model_inputs["attention_mask"].append(src_encoding["attention_mask"][0])
        model_inputs["labels"].append(tgt_encoding["input_ids"][0])

    return model_inputs

# Apply tokenization
print("🔤 Tokenizing data...")
tokenized_train = train_ds.map(
    tokenize_fn,
    batched=True,
    batch_size=32,
    remove_columns=train_ds.column_names
)
tokenized_eval = eval_ds.map(
    tokenize_fn,
    batched=True,
    batch_size=32,
    remove_columns=eval_ds.column_names
)

print("✅ Tokenization completed successfully!")

# Verify GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🖥️  Using device: {device}")

if device.type == "cuda":
    print(f"🎮 GPU: {torch.cuda.get_device_name(0)}")
    print(f"💾 GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
    model = model.to(device)
else:
    print("⚠️  No GPU detected - using CPU (training will be slower)")

print("✅ PART 2 COMPLETED: Model and tokenization ready")



🔄 Loading model and tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

🔤 Tokenizing data...


Map:   0%|          | 0/23538 [00:00<?, ? examples/s]



Map:   0%|          | 0/2616 [00:00<?, ? examples/s]

✅ Tokenization completed successfully!
🖥️  Using device: cuda
🎮 GPU: Tesla T4
💾 GPU Memory: 14.7 GB
✅ PART 2 COMPLETED: Model and tokenization ready


In [7]:
# ======================
# PART 3: TRAINING CONFIGURATION & EXECUTION
# ======================
print("⚙️ Setting up training configuration...")

# Prepare for training with proper save paths
save_path = "/content/drive/MyDrive/FYP/bidirectional_roman_balti_translation_model"
os.makedirs(save_path, exist_ok=True)

# Check transformers version
import transformers
print(f"Transformers version: {transformers.__version__}")

# Training arguments
training_args_dict = {
    "output_dir": save_path,
    "per_device_train_batch_size": 2,
    "per_device_eval_batch_size": 2,
    "logging_dir": os.path.join(save_path, "logs"),
    "logging_steps": 500,      # Log every 500 steps
    "num_train_epochs": 10,
    "learning_rate": 3e-5,
    "weight_decay": 0.01,
    "fp16": torch.cuda.is_available(),
    "report_to": "none",
    "predict_with_generate": True,
    "load_best_model_at_end": True,
    "save_total_limit": 2,
    "metric_for_best_model": "eval_loss",
    "greater_is_better": False,
    "save_strategy": "steps",   # Save every 500 steps
    "save_steps": 500,
    "eval_strategy": "steps",   # ✅ Correct key for Transformers ≥4.57
    "eval_steps": 500,
}

training_args = Seq2SeqTrainingArguments(**training_args_dict)

# Data collator for seq2seq
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding=True
)

# Initialize trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Save configuration
config = {
    "model_name": model_name,
    "max_length": max_length,
    "total_epochs": 10,
    "batch_size": 2,
    "learning_rate": 3e-5,
    "save_path": save_path,
    "train_samples": len(tokenized_train),
    "eval_samples": len(tokenized_eval),
    "transformers_version": transformers.__version__,
    "balti_lang": BALTI_LANG_CODE,
    "english_lang": ENGLISH_LANG_CODE
}

import json
with open(os.path.join(save_path, "training_config.json"), "w") as f:
    json.dump(config, f, indent=2)

print("📋 Configuration saved for future sessions")
print("✅ PART 3 COMPLETED: Training configured and ready")


⚙️ Setting up training configuration...
Transformers version: 4.57.1
📋 Configuration saved for future sessions
✅ PART 3 COMPLETED: Training configured and ready


In [None]:
# ======================
# START TRAINING
# ======================
print("\n🚀 STARTING BIDIRECTIONAL TRAINING...")
print("=" * 60)

trainer.train(resume_from_checkpoint="/content/drive/MyDrive/Colab Notebooks/checkpoint-26000")







🚀 STARTING BIDIRECTIONAL TRAINING...


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].
	logging_steps: 500 (from args) != 200 (from trainer_state.json)


Step,Training Loss,Validation Loss


In [None]:
print("\n💾 Saving final model...")
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)
print(f"✅ Model saved to: {save_path}")

# Evaluation
print("\n🧪 Evaluating model...")
eval_results = trainer.evaluate()
print(f"📊 Final Evaluation Loss: {eval_results['eval_loss']:.4f}")

print("\n🎉 BIDIRECTIONAL TRAINING COMPLETED SUCCESSFULLY!")


💾 Saving final model...
✅ Model saved to: /content/drive/MyDrive/FYP/bidirectional_roman_balti_translation_model

🧪 Evaluating model...


Epoch,Training Loss,Validation Loss
0,No log,0.30911


📊 Final Evaluation Loss: 0.3091

🎉 BIDIRECTIONAL TRAINING COMPLETED SUCCESSFULLY!


In [None]:
# ======================
# TRANSLATION FUNCTIONS
# ======================
def translate_balti_to_english(sentence):
    """Translate Balti to English"""
    tokenizer.src_lang = BALTI_LANG_CODE
    tokenizer.tgt_lang = ENGLISH_LANG_CODE

    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=128,
            num_beams=5,
            early_stopping=True
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def translate_english_to_balti(sentence):
    """Translate English to Balti"""
    tokenizer.src_lang = ENGLISH_LANG_CODE
    tokenizer.tgt_lang = BALTI_LANG_CODE

    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=128,
            num_beams=5,
            early_stopping=True
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)



In [None]:
# ======================
# TEST BOTH DIRECTIONS
# ======================
print("\n🧪 TESTING BOTH TRANSLATION DIRECTIONS...")

# Test sentences
balti_test_sentences = [
    "nga de birkuq ping song phinpa",
    "nga yari bakshesh po yagat",
    "musharaf chatpa minang yot",
    "nga du la thatat pin",
    "kho bank song sat"
]

english_test_sentences = [
    "Hello how are you",
    "What is your name",
    "Thank you very much",
    "Where is the market",
    "I am learning translation"
]

print("\n📤 BALTI TO ENGLISH TRANSLATION:")
print("=" * 40)
for i, sentence in enumerate(balti_test_sentences, 1):
    translation = translate_balti_to_english(sentence)
    print(f"{i}. {sentence}")
    print(f"   → {translation}")
    print()

print("\n📥 ENGLISH TO BALTI TRANSLATION:")
print("=" * 40)
for i, sentence in enumerate(english_test_sentences, 1):
    translation = translate_english_to_balti(sentence)
    print(f"{i}. {sentence}")
    print(f"   → {translation}")
    print()

print("🎯 BIDIRECTIONAL TRANSLATION TESTING COMPLETED!")


🧪 TESTING BOTH TRANSLATION DIRECTIONS...

📤 BALTI TO ENGLISH TRANSLATION:
1. nga de birkuq ping song phinpa
   → i went to that party

2. nga yari bakshesh po yagat
   → i accept your apology

3. musharaf chatpa minang yot
   → musharaf is paying off

4. nga du la thatat pin
   → i like that

5. kho bank song sat
   → he went into the bank


📥 ENGLISH TO BALTI TRANSLATION:
1. Hello how are you
   → cheena yod kheri cheena yo

2. What is your name
   → yeri mingtakh po chi in

3. Thank you very much
   → yang ishan zarat

4. Where is the market
   → market gar yo

5. I am learning translation
   → nga tamlzoq zaben yod pin

🎯 BIDIRECTIONAL TRANSLATION TESTING COMPLETED!
