In [None]:
# CELL 1: Install Dependencies
%pip install transformers==4.44.2 datasets==3.0.1 sentencepiece sacrebleu torch accelerate pandas tqdm evaluate rouge-score IndicTransToolkit -q
print("‚úÖ Installation complete!")

In [1]:
# CELL 2: Imports and Device Setup
import torch
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
import evaluate
from tqdm.auto import tqdm
import warnings
import os
warnings.filterwarnings('ignore')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"üñ•Ô∏è Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

üñ•Ô∏è Using device: cuda
GPU: NVIDIA GeForce RTX 3050 Laptop GPU
Memory: 4.29 GB


In [2]:
# CELL 3: Configuration
import torch
class Config:
    MODEL_NAME = "ai4bharat/indictrans2-en-indic-dist-200M"
    SOURCE_LANG = "eng_Latn"  # English (Latin script)
    TARGET_LANG = "tam_Taml"  # Tamil (Tamil script)
    SOURCE_CODE = "en"
    TARGET_CODE = "ta"
    MAX_INPUT_LENGTH = 64  # Increased for longer sentences
    MAX_TARGET_LENGTH = 64  # Increased for longer sentences
    BATCH_SIZE = 3
    LEARNING_RATE = 3e-5  # Slightly higher for faster adaptation
    NUM_EPOCHS = 1  # More epochs
    WARMUP_STEPS = 500  # Adjusted for larger data
    WEIGHT_DECAY = 0.01
    NUM_BEAMS = 2  # Increased for better generation
    OUTPUT_DIR = "./indictrans2-finetuned-news-translation"
    LOGGING_DIR = "./logs"
    SEED = 42
    FP16 = torch.cuda.is_available()
    SAVE_STEPS = 250 # Eval more often
    EVAL_STEPS = 250
    LOGGING_STEPS = 100
    GRAD_ACCUM_STEPS = 4 # For effective larger batch

config = Config()
print("‚úÖ Configuration loaded!")
print(f"üìä Training: {config.SOURCE_LANG} ‚Üí {config.TARGET_LANG}")

‚úÖ Configuration loaded!
üìä Training: eng_Latn ‚Üí tam_Taml


In [3]:
# CELL 4: Load EnTam v2 Dataset from Kaggle Path
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
import random
import os
import torch

SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)

print("üì• Loading EnTam v2 dataset from Kaggle path...")
extracted_path = "D:\Work\Projects\S5\TA\en-ta-parallel-v2\en-ta-parallel-v2"
print(f"\nContents of directory ({extracted_path}):")
if os.path.exists(extracted_path):
    print(os.listdir(extracted_path))
else:
    raise Exception(f"Directory not found: {extracted_path}")

try:
    with open(os.path.join(extracted_path, "corpus.bcn.train.en"), 'r', encoding='utf-8') as en_f, \
         open(os.path.join(extracted_path, "corpus.bcn.train.ta"), 'r', encoding='utf-8') as ta_f:
        en_lines = [line.strip() for line in en_f.readlines() if line.strip()]
        ta_lines = [line.strip() for line in ta_f.readlines() if line.strip()]
    min_length = min(len(en_lines), len(ta_lines))
    df = pd.DataFrame({'src': en_lines[:min_length], 'tgt': ta_lines[:min_length]})
except FileNotFoundError:
    raise Exception(f"Dataset files corpus.bcn.train.en or corpus.bcn.train.ta not found in {extracted_path}")
except Exception as e:
    raise Exception(f"Error loading dataset files: {e}")

full_dataset = Dataset.from_pandas(df)
print(f"\nOriginal full train examples: {len(full_dataset)}")

train_test = full_dataset.train_test_split(test_size=0.2, seed=SEED)
train_dataset_full = train_test['train']
val_test = train_test['test'].train_test_split(test_size=0.5, seed=SEED)
val_dataset_full = val_test['train']
test_dataset_full = val_test['test']

train_size = 5000 # Scaled up
val_size = 500
test_size = 500

train_dataset = train_dataset_full.shuffle(seed=SEED).select(range(min(train_size, len(train_dataset_full))))
val_dataset = val_dataset_full.shuffle(seed=SEED).select(range(min(val_size, len(val_dataset_full))))
test_dataset = test_dataset_full.shuffle(seed=SEED).select(range(min(test_size, len(test_dataset_full))))

raw_datasets = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

print(f"\n‚úÖ Dataset loaded and subsetted!")
print(f"Train examples: {len(raw_datasets['train'])}")
print(f"Validation examples: {len(raw_datasets['validation'])}")
print(f"Test examples: {len(raw_datasets['test'])}")

print("\nüìù Sample data:")
sample = raw_datasets['train'][0]
print(f"English (src): {sample['src'][:100]}...")
print(f"Tamil (tgt): {sample['tgt'][:100]}...")

üì• Loading EnTam v2 dataset from Kaggle path...

Contents of directory (D:\Work\Projects\S5\TA\en-ta-parallel-v2\en-ta-parallel-v2):
['corpus.bcn.dev.en', 'corpus.bcn.dev.ta', 'corpus.bcn.test.en', 'corpus.bcn.test.ta', 'corpus.bcn.train.en', 'corpus.bcn.train.ta']

Original full train examples: 166871

‚úÖ Dataset loaded and subsetted!
Train examples: 5000
Validation examples: 500
Test examples: 500

üìù Sample data:
English (src): The Centre Party also claimed that Estonian 'independence' was being sacrificed by joining the EU....
Tamil (tgt): ‡Æê‡Æ∞‡Øá‡Ææ‡Æ™‡Øç‡Æ™‡Æø‡ÆØ ‡Æí‡Æ©‡Øç‡Æ±‡Æø‡ÆØ‡Æ§‡Øç‡Æ§‡Æø‡Æ≤‡Øç ‡Æá‡Æ£‡Øà‡Æµ‡Æ§‡Æ©‡Øç ‡ÆÆ‡ØÇ‡Æ≤‡ÆÆ‡Øç ‡Æé‡Æ∏‡Øç‡Æ§‡Øá‡Ææ‡Æ©‡Æø‡ÆØ ‡Æ®‡Ææ‡Æü‡Øç‡Æü‡Æø‡Æ©‡Øç "‡Æö‡ØÅ‡Æ§‡Æ®‡Øç‡Æ§‡Æø‡Æ∞‡ÆÆ‡Øç" ‡Æ™‡Æ±‡Æø‡Æ™‡Øá‡Ææ‡ÆØ‡Øç‡Æµ‡Æø‡Æü‡ØÅ‡ÆÆ‡Øç ‡Æé‡Æ©‡Øç‡Æ±‡ØÅ ‡ÆÆ‡Æ§‡Øç‡Æ§‡Æø‡ÆØ ‡Æï‡Æü‡Øç...


In [4]:
# CELL 5: Load Model and Tokenizer

%pip install IndicTransToolkit

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from IndicTransToolkit.processor import IndicProcessor
import torch

print("ü§ñ Loading IndicTrans2 model and tokenizer from Hugging Face...")
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

try:
    # Load tokenizer and model directly from Hugging Face
    tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indictrans2-en-indic-dist-200M", trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/indictrans2-en-indic-dist-200M", trust_remote_code=True)
    model = model.to(device)

    # Initialize IndicProcessor without lang argument, handle language later if needed
    processor = IndicProcessor(inference=True)  # Removed 'lang=config.TARGET_LANG'

    print(f"‚úÖ Model loaded successfully!")
    print(f"Model parameters: {model.num_parameters() / 1e6:.2f}M")

except Exception as e:
    print(f"‚ùå Model loading failed: {e}")
    print("Ensure internet is enabled and IndicTransToolkit is installed. Try restarting the kernel.")

Note: you may need to restart the kernel to use updated packages.
ü§ñ Loading IndicTrans2 model and tokenizer from Hugging Face...
Using device: cuda
‚úÖ Model loaded successfully!
Model parameters: 211.78M


In [5]:
# CELL 6: Preprocessing Function
def preprocess_function(examples):
    inputs = [f"{config.SOURCE_LANG} {config.TARGET_LANG} {src}" for src in examples['src']]  # Add language tags
    targets = examples['tgt']
    
    # Tokenize inputs
    model_inputs = tokenizer(
        inputs,
        max_length=config.MAX_INPUT_LENGTH,
        truncation=True,
        padding=False
    )
    
    # Tokenize targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=config.MAX_TARGET_LENGTH,
            truncation=True,
            padding=False
        )
    
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

print("‚úÖ Preprocessing function defined!")

‚úÖ Preprocessing function defined!


In [6]:
# CELL 7: Tokenize Dataset
print("üîÑ Tokenizing dataset...")

tokenized_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_datasets['train'].column_names,
    desc="Tokenizing"
)

print("‚úÖ Tokenization complete!")
print(f"Train samples: {len(tokenized_datasets['train'])}")
print(f"Validation samples: {len(tokenized_datasets['validation'])}")
print(f"Test samples: {len(tokenized_datasets['test'])}")

üîÑ Tokenizing dataset...


Tokenizing:   0%|          | 0/5000 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/500 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/500 [00:00<?, ? examples/s]

‚úÖ Tokenization complete!
Train samples: 5000
Validation samples: 500
Test samples: 500


In [7]:
# CELL 8: Setup Evaluation Metrics
import evaluate
import numpy as np  ### CHANGED: Added explicit numpy import (was missing)
from IndicTransToolkit.processor import IndicProcessor  # Ensure this is imported

bleu_metric = evaluate.load('sacrebleu')
rouge_metric = evaluate.load('rouge')  ### CHANGED: Added ROUGE and chrF loads
chrf_metric = evaluate.load('chrf')

# Reuse processor from CELL 5 (assume it's global; reinitialize if needed)
# processor = IndicProcessor(inference=True)  ### CHANGED: Commented reinitialize option for safety

def postprocess_text(preds, labels):
    # Strip and normalize predictions
    preds = [pred.strip() for pred in preds]
    preds = processor.postprocess_batch(preds, lang=config.TARGET_LANG)  ### CHANGED: Swapped to correct batch method (was post_process_sentence)
    
    # Normalize references (labels) similarly for consistency
    normalized_labels = []  ### CHANGED: New loop to batch-process labels (prevents single-item errors)
    for label in labels:
        norm_label = processor.postprocess_batch([label.strip()], lang=config.TARGET_LANG)[0]  ### CHANGED: Batch even single labels
        normalized_labels.append(norm_label)
    labels = [[lbl] for lbl in normalized_labels]  # SacreBLEU expects list of list(str)
    
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)  ### CHANGED: Now uses np (imported above)
    
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    
    bleu_result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
    rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)  ### CHANGED: Added ROUGE/chrF computes
    chrf_result = chrf_metric.compute(predictions=decoded_preds, references=decoded_labels)
    
    result = {
        'bleu': bleu_result['score'],
        'rouge1': rouge_result['rouge1'],  ### CHANGED: Added to result dict
        'chrf': chrf_result['score']  ### CHANGED: Added to result dict
    }
    
    result = {k: round(v, 4) for k, v in result.items()}
    return result

print("‚úÖ Evaluation metrics configured!")

‚úÖ Evaluation metrics configured!


In [8]:

from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import torch

# Make sure device is defined
device = "cuda" if torch.cuda.is_available() else "cpu"

# Optional: define output directory if config not available
OUTPUT_DIR = getattr(config, "OUTPUT_DIR", "./mt5-finetuned-news-translation")

training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="steps",          # use eval_strategy instead of evaluation_strategy
    eval_steps=getattr(config, "EVAL_STEPS", 1000),
    save_strategy="steps",
    save_steps=getattr(config, "SAVE_STEPS", 1000),
    per_device_train_batch_size=getattr(config, "BATCH_SIZE", 4),
    per_device_eval_batch_size=getattr(config, "BATCH_SIZE", 4),
    gradient_accumulation_steps=getattr(config, "GRAD_ACCUM_STEPS", 4),  # New: larger effective batch
    learning_rate=getattr(config, "LEARNING_RATE", 5e-5),
    weight_decay=getattr(config, "WEIGHT_DECAY", 0.01),
    save_total_limit=getattr(config, "SAVE_TOTAL_LIMIT", 3),
    num_train_epochs=getattr(config, "NUM_EPOCHS", 1),
    predict_with_generate=True,
    generation_max_length=getattr(config, "MAX_TARGET_LENGTH", 128),
    generation_num_beams=getattr(config, "NUM_BEAMS", 4),
    fp16=getattr(config, "FP16", torch.cuda.is_available()),
    logging_dir=getattr(config, "LOGGING_DIR", "./logs"),
    load_best_model_at_end=True,
    metric_for_best_model='bleu',
    greater_is_better=True,
    warmup_steps=getattr(config, "WARMUP_STEPS", 500),
    #report_to="none",
    seed=getattr(config, "SEED", 42),
    save_safetensors=False, # Added to fix non-contiguous tensor save error
    logging_strategy="steps",  # Log every X steps
    logging_steps=50,  # Frequent (every 50 steps; adjust to 100 if too noisy)
    report_to=["tensorboard"],  # Enables live graphs (install !pip install tensorboard if needed)
)

# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    return_tensors="pt"
)

print("‚úÖ Training arguments configured!")

‚úÖ Training arguments configured!


In [9]:
# CELL 10: Initialize Trainer
from transformers import EarlyStoppingCallback
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Stop if no improvement
)

print("‚úÖ Trainer initialized!")
print("\nüìä Training Configuration:")
print(f"  ‚Ä¢ Batch size: {config.BATCH_SIZE} (effective: {config.BATCH_SIZE * 4})")
print(f"  ‚Ä¢ Learning rate: {config.LEARNING_RATE}")
print(f"  ‚Ä¢ Epochs: {config.NUM_EPOCHS}")
print(f"  ‚Ä¢ Total training steps: {len(tokenized_datasets['train']) // config.BATCH_SIZE * config.NUM_EPOCHS}")

‚úÖ Trainer initialized!

üìä Training Configuration:
  ‚Ä¢ Batch size: 3 (effective: 12)
  ‚Ä¢ Learning rate: 3e-05
  ‚Ä¢ Epochs: 1
  ‚Ä¢ Total training steps: 1666


In [None]:
# CELL 11: Start Training
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

print("üöÄ Starting training...")
print("="*70)

try:
    torch.cuda.empty_cache()
    print("üßπ GPU cache cleared!")
    print(f"Memory before training: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
    
    train_result = trainer.train()
    
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    
    print("\n‚úÖ Training completed successfully!")
    print("\nüìä Final Training Metrics:")
    for key, value in metrics.items():
        print(f"  ‚Ä¢ {key}: {value}")
except Exception as e:
    print(f"\n‚ùå Training failed: {e}")
    print("Try reducing MAX_INPUT_LENGTH to 32 or restarting the kernel")

üöÄ Starting training...
üßπ GPU cache cleared!
Memory before training: 0.85 GB


  0%|          | 0/416 [00:00<?, ?it/s]

{'loss': 9.4834, 'grad_norm': 7.160135269165039, 'learning_rate': 3e-06, 'epoch': 0.12}
{'loss': 7.7502, 'grad_norm': 5.7943115234375, 'learning_rate': 6e-06, 'epoch': 0.24}
{'loss': 5.6016, 'grad_norm': 4.3690595626831055, 'learning_rate': 9e-06, 'epoch': 0.36}
{'loss': 4.6176, 'grad_norm': 3.24072003364563, 'learning_rate': 1.2e-05, 'epoch': 0.48}
{'loss': 4.0149, 'grad_norm': 2.606715440750122, 'learning_rate': 1.5e-05, 'epoch': 0.6}


  0%|          | 0/167 [00:00<?, ?it/s]

In [None]:
# NEW CELL: Test Sample Translation (Run after training)
def translate_sample(text):
    inputs = tokenizer(f"{config.SOURCE_LANG} {config.TARGET_LANG} {text}", return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=config.MAX_TARGET_LENGTH, num_beams=config.NUM_BEAMS)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Normalize with processor
    pred = processor.post_process_batch(pred, lang=config.TARGET_CODE)
    return pred

# Pick a test sample
sample_en = raw_datasets['test'][0]['src']
pred_ta = translate_sample(sample_en)
ref_ta = raw_datasets['test'][0]['tgt']

print("üìù Sample Translation Test:")
print(f"English: {sample_en}")
print(f"Predicted Tamil: {pred_ta}")
print(f"Reference Tamil: {ref_ta}")

# Manual BLEU for this sample
manual_bleu = bleu_metric.compute(predictions=[pred_ta], references=[[ref_ta]])
print(f"Manual BLEU for this sample: {manual_bleu['score']:.2f}")