In [None]:
# Cell 1: Install dependencies and imports
!pip install evaluate datasets sacrebleu transformers

import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
import os
import re
from tqdm import tqdm  
import warnings
import json
from datetime import datetime
import wandb

warnings.filterwarnings("ignore")
torch.cuda.empty_cache()

try:
    from transformers.trainer_utils import get_last_checkpoint
except ImportError:
    from transformers.trainer_callback import TrainerState

try:
    import evaluate
except ImportError:
    evaluate = None
    print("Warning: 'evaluate' package not found. Will attempt to use sacrebleu directly if needed.")

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: po

In [None]:
# Cell 2: Configuration
CONFIG = {
    "data_file": "C:\python\2022-CS-645,2022-CS-649,2022-CS-656,2022-CS-669\Datasets\Combined Dataset Sample.csv", 
    "model_name": "facebook/nllb-200-distilled-600M",
    "output_dir": "./fine-tuned-ur-en-model",
    "batch_size": 2,
    "learning_rate": 5e-5,
    "epochs": 3,
    "max_length": 128,
    "seed": 42,
    "eval_split": 0.1,
    "early_stopping_patience": 3,
    "test_examples": 500,
    "bleu_output_file": "bleu_scores.json"
}

# Global variables
model = None
tokenizer = None
split_dataset = None

In [3]:
# Cell 3: Helper functions
def clean_text(text):
    """Clean and normalize text data"""
    if pd.isna(text):
        return ""
    text = str(text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s\.\?\!\,\;\:\-\'\"\[\]\{\}\(\)،؟]', '', text)
    return text.strip()

def load_and_preprocess_data(file_path, eval_split=0.1):
    """Load and preprocess the dataset"""
    print(f"Loading data from {file_path}...")

    try:
        df = pd.read_csv(file_path)
    except Exception as e:
        print(f"Error loading file: {e}")
        raise

    # Check and handle required columns
    required_columns = ['Urdu Sentence', 'English Sentence']
    missing_columns = [col for col in required_columns if col not in df.columns]

    if missing_columns:
        print(f"Warning: Missing columns: {missing_columns}")
        for missing_col in missing_columns:
            base_name = missing_col.split()[0].lower()
            potential_matches = [col for col in df.columns if base_name.lower() in col.lower()]
            if potential_matches:
                df[missing_col] = df[potential_matches[0]]
                print(f"Using '{potential_matches[0]}' as '{missing_col}'")

    # Clean text
    df['Urdu Sentence'] = df['Urdu Sentence'].apply(clean_text)
    df['English Sentence'] = df['English Sentence'].apply(clean_text)
    df = df[(df['Urdu Sentence'] != "") & (df['English Sentence'] != "")]

    if len(df) == 0:
        raise ValueError("No valid translation pairs found after cleaning.")

    print(f"Dataset size after cleaning: {len(df)} pairs")

    # Convert to HF dataset and split
    dataset = Dataset.from_pandas(df)
    split_dataset = dataset.shuffle(seed=CONFIG["seed"]).train_test_split(test_size=eval_split)
    return split_dataset

In [4]:
# Cell 4: Preprocessing function
def preprocess_function(examples, tokenizer, max_length):
    """Tokenize the inputs and targets"""
    source_texts = examples["Urdu Sentence"]
    target_texts = examples["English Sentence"]

    # Tokenize inputs
    model_inputs = tokenizer(
        source_texts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors=None
    )

    # Tokenize targets
    try:
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(
                target_texts,
                padding="max_length",
                truncation=True,
                max_length=max_length,
                return_tensors=None
            )
    except (AttributeError, TypeError):
        prefix = ""
        if hasattr(tokenizer, 'lang_code_to_id'):
            if 'en' in tokenizer.lang_code_to_id:
                prefix = "en_XX "
        labels = tokenizer(
            [prefix + txt for txt in target_texts],
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors=None
        )

    model_inputs["labels"] = labels["input_ids"]

    if tokenizer.pad_token_id is not None:
        model_inputs["labels"] = [
            [(label if label != tokenizer.pad_token_id else -100) for label in labels_example]
            for labels_example in model_inputs["labels"]
        ]

    return model_inputs

In [5]:
# Cell 5: Metrics and training functions
def compute_metrics(eval_preds):
    """Compute BLEU score for evaluation"""
    try:
        bleu = evaluate.load("sacrebleu")
        preds, labels = eval_preds
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        formatted_labels = [[label] for label in decoded_labels]

        result = bleu.compute(predictions=decoded_preds, references=formatted_labels)
        return {"bleu": result["score"]}
    except Exception:
        try:
            import sacrebleu
            preds, labels = eval_preds
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

            decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
            formatted_labels = [[label] for label in decoded_labels]

            bleu = sacrebleu.corpus_bleu(decoded_preds, formatted_labels)
            return {"bleu": bleu.score}
        except ImportError:
            print("Warning: Could not calculate BLEU score.")
            return {"bleu": 0.0}

# Cell 5 (Updated): Metrics and training functions
def train_model():
    """Train the Urdu-to-English translation model"""
    global model, tokenizer, split_dataset

    # Set random seeds
    torch.manual_seed(CONFIG["seed"])
    np.random.seed(CONFIG["seed"])

    # Load data
    split_dataset = load_and_preprocess_data(CONFIG["data_file"], CONFIG["eval_split"])
    print(f"Training set size: {len(split_dataset['train'])}")
    print(f"Validation set size: {len(split_dataset['test'])}")

    # Load model and tokenizer
    model = AutoModelForSeq2SeqLM.from_pretrained(CONFIG["model_name"])
    tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_name"])

    # Tokenize datasets
    tokenized_train = split_dataset["train"].map(
        lambda x: preprocess_function(x, tokenizer, CONFIG["max_length"]),
        batched=True
    )
    tokenized_eval = split_dataset["test"].map(
        lambda x: preprocess_function(x, tokenizer, CONFIG["max_length"]),
        batched=True
    )

    # Training arguments (compatible with older versions)
    training_args = Seq2SeqTrainingArguments(
        output_dir=CONFIG["output_dir"],
        per_device_train_batch_size=CONFIG["batch_size"],
        per_device_eval_batch_size=CONFIG["batch_size"],
        learning_rate=CONFIG["learning_rate"],
        num_train_epochs=CONFIG["epochs"],
        # For older versions, use these parameters instead:
        eval_steps=500,               # Evaluation every 500 steps
        save_steps=500,               # Save every 500 steps
        save_total_limit=2,
        predict_with_generate=True,
        fp16=torch.cuda.is_available(),
        report_to="none",
    )

    # Data collator
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        model=model,
        padding="max_length",
        max_length=CONFIG["max_length"],
    )

    # Trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Start training
    print("Starting training...")
    trainer.train()

    # Save model
    print(f"Saving model to {CONFIG['output_dir']}")
    trainer.save_model(CONFIG["output_dir"])
    tokenizer.save_pretrained(CONFIG["output_dir"])

    # Evaluate
    results = trainer.evaluate()
    print(f"Evaluation results: {results}")
    print("\n✅ Training completed successfully!")

In [6]:
# Cell 6: Translation and testing functions
def translate_text(text, model, tokenizer, max_length=128):
    """Translate Urdu text to English"""
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=max_length)

    if torch.cuda.is_available():
        model.to("cuda")
        inputs = {k: v.to("cuda") for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_length, num_beams=4)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def test_model():
    """Test the model and generate BLEU scores"""
    global model, tokenizer, split_dataset

    # Load model if not already loaded
    if model is None or tokenizer is None:
        try:
            model = AutoModelForSeq2SeqLM.from_pretrained(CONFIG["output_dir"])
            tokenizer = AutoTokenizer.from_pretrained(CONFIG["output_dir"])
        except Exception as e:
            print(f"❌ Failed to load model: {str(e)}")
            return

    # Load dataset if not already loaded
    if split_dataset is None:
        split_dataset = load_and_preprocess_data(CONFIG["data_file"], CONFIG["eval_split"])

    # Select test examples
    test_dataset = split_dataset["test"]
    num_examples = min(CONFIG["test_examples"], len(test_dataset))
    examples = test_dataset.select(range(num_examples))

    # Initialize variables for BLEU
    all_references = []
    all_predictions = []

    print(f"Testing model on {num_examples} examples:")

    # Process examples
    for i, example in enumerate(tqdm(examples, desc="Testing")):
        urdu_text = example["Urdu Sentence"]
        reference_english = example["English Sentence"]
        translated_english = translate_text(urdu_text, model, tokenizer)

        all_references.append([reference_english])
        all_predictions.append(translated_english)

        if i < 5:  # Print first 5 examples
            print(f"\nExample {i+1}:")
            print(f"Urdu: {urdu_text}")
            print(f"Reference English: {reference_english}")
            print(f"Translated English: {translated_english}")

    # Calculate BLEU score
    try:
        if evaluate:
            bleu = evaluate.load("sacrebleu")
            bleu_result = bleu.compute(predictions=all_predictions, references=all_references)
            bleu_score = bleu_result["score"]
        else:
            import sacrebleu
            bleu_result = sacrebleu.corpus_bleu(all_predictions, all_references)
            bleu_score = bleu_result.score
    except Exception as e:
        print(f"Warning: Error calculating BLEU score: {str(e)}")
        bleu_score = 0.0

    print(f"\nFinal BLEU score on {num_examples} test examples: {bleu_score:.2f}")

    # Save results
    bleu_data = {
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "model_name": CONFIG["model_name"],
        "num_test_examples": num_examples,
        "bleu_score": bleu_score,
        "examples": [
            {
                "urdu": example["Urdu Sentence"],
                "reference_english": example["English Sentence"],
                "translated_english": translate_text(example["Urdu Sentence"], model, tokenizer)
            }
            for example in examples.select(range(min(5, num_examples)))
        ]
    }

    with open(CONFIG["bleu_output_file"], "w", encoding="utf-8") as f:
        json.dump(bleu_data, f, ensure_ascii=False, indent=2)

    print(f"\n✅ Testing completed! Results saved to {CONFIG['bleu_output_file']}")

In [7]:
# Cell 7: Main execution
if __name__ == "__main__":
    print("Urdu-to-English Translation Model")
    print("=================================")

    # Uncomment to run training
    print("\nStarting training...")
    train_model()

    # Uncomment to run testing
    print("\nStarting testing...")
    test_model()

Urdu-to-English Translation Model

Starting training...
Loading data from /content/combined_dataset1.csv...
Dataset size after cleaning: 8000 pairs
Training set size: 7200
Validation set size: 800


config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

Map:   0%|          | 0/7200 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Starting training...


Step,Training Loss
500,1.1295
1000,1.0465
1500,1.0083
2000,1.0564
2500,0.9714
3000,1.0313
3500,0.9819
4000,0.7204
4500,0.6936
5000,0.7224


Saving model to ./fine-tuned-ur-en-model


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Evaluation results: {'eval_loss': 1.0765668153762817, 'eval_bleu': 37.62028230383172, 'eval_runtime': 193.1399, 'eval_samples_per_second': 4.142, 'eval_steps_per_second': 2.071, 'epoch': 3.0}

✅ Training completed successfully!

Starting testing...
Testing model on 500 examples:


Testing:   0%|          | 1/500 [00:00<05:54,  1.41it/s]


Example 1:
Urdu: تو میرا دوست آرہا ہے اور آپ تیار ہیں
Reference English: So my friend is coming, you are ready
Translated English: So my friend is coming and you're ready.


Testing:   0%|          | 2/500 [00:01<04:04,  2.04it/s]


Example 2:
Urdu: آج کی تاریخ بہت سمجھ گئی
Reference English: Today's history is very understandable
Translated English: Today's history was well understood.


Testing:   1%|          | 3/500 [00:01<04:07,  2.01it/s]


Example 3:
Urdu: مجھے باہر جانا پڑا لیکن مجھے یہاں ایک اچھی یونیورسٹی جانا پڑا
Reference English: University educated and will bring a degree for you
Translated English: I had to go out but I had to go to a good university here


Testing:   1%|          | 4/500 [00:01<03:33,  2.32it/s]


Example 4:
Urdu: میں یہ کرنے سے قاصر ہوں
Reference English: I can't do this.
Translated English: I'm not able to do it.


Testing:   1%|          | 5/500 [00:02<03:26,  2.40it/s]


Example 5:
Urdu: یہ ہے کہ اب کوئی حقیقی چیز نہیں ہے
Reference English: It's no real thing now
Translated English: It's that there's no real thing anymore.


Testing: 100%|██████████| 500/500 [03:13<00:00,  2.59it/s]



Final BLEU score on 500 test examples: 38.54

✅ Testing completed! Results saved to bleu_scores.json


In [9]:
!zip -r Ouput.zip /content/fine-tuned-ur-en-model


  adding: content/fine-tuned-ur-en-model/ (stored 0%)
  adding: content/fine-tuned-ur-en-model/model.safetensors (deflated 7%)
  adding: content/fine-tuned-ur-en-model/generation_config.json (deflated 34%)
  adding: content/fine-tuned-ur-en-model/checkpoint-10500/ (stored 0%)
  adding: content/fine-tuned-ur-en-model/checkpoint-10500/model.safetensors (deflated 7%)
  adding: content/fine-tuned-ur-en-model/checkpoint-10500/scaler.pt (deflated 60%)
  adding: content/fine-tuned-ur-en-model/checkpoint-10500/rng_state.pth (deflated 25%)
  adding: content/fine-tuned-ur-en-model/checkpoint-10500/generation_config.json (deflated 34%)
  adding: content/fine-tuned-ur-en-model/checkpoint-10500/optimizer.pt (deflated 8%)
  adding: content/fine-tuned-ur-en-model/checkpoint-10500/tokenizer_config.json (deflated 94%)
  adding: content/fine-tuned-ur-en-model/checkpoint-10500/config.json (deflated 57%)
  adding: content/fine-tuned-ur-en-model/checkpoint-10500/scheduler.pt (deflated 56%)
  adding: conten

In [11]:
from google.colab import files
files.download('/content/Ouput.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>