# 1_Setup_and_Testing

In [35]:
# All imports
import sys
import pip
import torch
from datasets import get_dataset_split_names, load_dataset, load_dataset_builder, get_dataset_config_names,  load_metric
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, MBart50Tokenizer, MBartForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
import evaluate
import numpy as np
print("All imports are successful ✅")

print("--" * 50)

# ------------------------------------------------------------------
# Environment report: Python, pip and PyTorch versions, CUDA support
# ------------------------------------------------------------------
print("Python version:", sys.version)
print("Pip version:", pip.__version__)
print("Pytorch version:", torch.__version__)

# Query CUDA availability once and reuse the answer for every check below.
cuda_available = torch.cuda.is_available()

if cuda_available:
    # CUDA version
    print("CUDA version:", torch.version.cuda)

    # GPU inventory: count plus the name of every visible device
    print("GPU is available.")
    print("Number of GPUs:", torch.cuda.device_count())
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

    # Smoke test: place a small random tensor on the GPU
    x = torch.rand(5, 3).cuda()
    if x.is_cuda:
        print("Pytorch can use CUDA ✅Tensor on GPU")
else:
    print("CUDA is not available.")
    print("No GPU available.")
    print("Pytorch is not using CUDA.")

print("--" * 50)
# Check if evaluate is working.
# The check runs inside the kernel (not via `!python -c ...`) so that it tests the
# same environment the notebook uses, and so that a failure raises instead of
# being followed by an unconditional success message.
exact_match_result = evaluate.load("exact_match").compute(references=["hello"], predictions=["hello"])
print(exact_match_result)
if exact_match_result["exact_match"] != 1.0:
    raise RuntimeError(f"Unexpected result from evaluate's exact_match metric: {exact_match_result}")
print("Evaluate is working ✅")


All imports are successful ✅
----------------------------------------------------------------------------------------------------
Python version: 3.12.11 | packaged by Anaconda, Inc. | (main, Jun  5 2025, 13:09:17) [GCC 11.2.0]
Pip version: 25.1
Pytorch version: 2.7.0+cu126
CUDA version: 12.6
GPU is available.
Number of GPUs: 2
GPU 0: NVIDIA GeForce RTX 4070
GPU 1: NVIDIA GeForce RTX 4070
Pytorch can use CUDA ✅Tensor on GPU
----------------------------------------------------------------------------------------------------
{'exact_match': np.float64(1.0)}
Evaluate is working ✅


# 2_Load_Dataset

In [36]:
# Reference: https://huggingface.co/docs/datasets/load_hub
# Ask the Hub which splits and configurations the dataset offers before downloading it.
splits, configs = (
    get_dataset_split_names("rahular/itihasa"),
    get_dataset_config_names("rahular/itihasa"),
)
print("Available dataset splits:", splits)
print("Available dataset configurations:", configs)

Available dataset splits: ['train', 'validation', 'test']
Available dataset configurations: ['Itihasa']


In [37]:
ds_builder = load_dataset_builder("rahular/itihasa")

# Inspect dataset description. Only the last expression of a cell is echoed,
# so the description has to be printed explicitly to be visible.
print(ds_builder.info.description)

# Inspect dataset features (shown as the cell's output)
ds_builder.info.features

{'translation': Translation(languages=['sn', 'en'], id=None)}

In [38]:
from datasets import load_dataset

# Load each split separately, in the order train -> validation -> test.
loaded_splits = {
    split_name: load_dataset("rahular/itihasa", split=split_name)
    for split_name in ("train", "validation", "test")
}
train_dataset = loaded_splits["train"]
valid_dataset = loaded_splits["validation"]
test_dataset = loaded_splits["test"]
print("Datasets loaded successfully ✅.")

# Report how many examples each split holds
for label, split in (("Train", train_dataset), ("Validation", valid_dataset), ("Test", test_dataset)):
    print(f"{label} dataset size: {len(split)}")

Datasets loaded successfully ✅.
Train dataset size: 75162
Validation dataset size: 6149
Test dataset size: 11722


In [39]:
# Each record is a dict with a single 'translation' key holding the 'en'/'sn' text pair.
train_dataset[0]  # Inspect the first example in the train dataset

{'translation': {'en': 'The ascetic Vālmīki asked Nārada, the best of sages and foremost of those conversant with words, ever engaged in austerities and Vedic studies.',
  'sn': 'ॐ तपः स्वाध्यायनिरतं तपस्वी वाग्विदां वरम्। नारदं परिपप्रच्छ वाल्मीकिर्मुनिपुङ्गवम्॥'}}

In [40]:
# Same indexing as for the train split: returns the record at position 0.
test_dataset[0]  # Inspect the first example in the test dataset

{'translation': {'en': 'Hearing the words of Viśvāmitra, Rāghava, together with Laksmana, was struck with amazement, and spoke to Viśvāmitra, saying,',
  'sn': 'विश्वामित्रवचः श्रुत्वा राघवः सहलक्ष्मणः। विस्मयं परमं गत्वा विश्वामित्रमथाब्रवीत्॥'}}

In [41]:
# Same indexing as for the train split: returns the record at position 0.
valid_dataset[0] # Inspect the first example in the validation dataset

{'translation': {'en': 'When Şītā, having a husband although seeming as if she had none, was putting on the ascetic guise, the people got into a wrath and exclaimed, “O Dasaratha, fie on you!"',
  'sn': 'तस्यां चीरं वसानायां नाथवत्यामनाथवत्। प्रचुक्रोश जनः सर्वो धिक् त्वां दशरथं त्विति ॥'}}

In [42]:
# Indexing the datasets: walk down the nested structure of one training example,
# printing a divider between each level.
first_example = train_dataset[0]
views = (
    first_example,                       # full content of the first example
    first_example["translation"],        # root of the nested dictionary
    first_example["translation"]["en"],  # English side
    first_example["translation"]["sn"],  # Sanskrit side
)
for view in views:
    print(view)
    print("--" * 50)

# Show the first three sentence pairs side by side
for i in range(3):
    pair = train_dataset[i]["translation"]
    print(f"Example {i}: (English: {pair['en']}) (Sanskrit: {pair['sn']})")

{'translation': {'en': 'The ascetic Vālmīki asked Nārada, the best of sages and foremost of those conversant with words, ever engaged in austerities and Vedic studies.', 'sn': 'ॐ तपः स्वाध्यायनिरतं तपस्वी वाग्विदां वरम्। नारदं परिपप्रच्छ वाल्मीकिर्मुनिपुङ्गवम्॥'}}
----------------------------------------------------------------------------------------------------
{'en': 'The ascetic Vālmīki asked Nārada, the best of sages and foremost of those conversant with words, ever engaged in austerities and Vedic studies.', 'sn': 'ॐ तपः स्वाध्यायनिरतं तपस्वी वाग्विदां वरम्। नारदं परिपप्रच्छ वाल्मीकिर्मुनिपुङ्गवम्॥'}
----------------------------------------------------------------------------------------------------
The ascetic Vālmīki asked Nārada, the best of sages and foremost of those conversant with words, ever engaged in austerities and Vedic studies.
----------------------------------------------------------------------------------------------------
ॐ तपः स्वाध्यायनिरतं तपस्वी वाग्विदां वर

# 3_Modelling

In [43]:
# One checkpoint name shared by the model and its tokenizer
MBART_CHECKPOINT = "facebook/mbart-large-50-many-to-many-mmt"

MODEL = MBartForConditionalGeneration.from_pretrained(MBART_CHECKPOINT)
TOKENIZER = MBart50Tokenizer.from_pretrained(MBART_CHECKPOINT)

# Translation direction: English source; the Hindi code stands in for Sanskrit
TOKENIZER.src_lang, TOKENIZER.tgt_lang = "en_XX", "hi_IN"


TEXT_TO_TRANSLATE = (
    "For one who has conquered the mind, the mind is the best of friends; "
    "but for one who has failed to do so, his very mind will be the greatest enemy."
)

In [44]:
def translate_text(text, model=MODEL, tokenizer=TOKENIZER, src_lang=TOKENIZER.src_lang, tgt_lang=TOKENIZER.tgt_lang, skip_special_tokens=True):
    """Translate ``text`` from ``src_lang`` to ``tgt_lang`` and return the decoded string.

    Args:
        text: Source sentence to translate.
        model: Seq2seq model used for generation (defaults to the notebook's MODEL).
        tokenizer: Tokenizer paired with ``model`` (defaults to the notebook's TOKENIZER).
        src_lang: Language code of ``text``. The default is captured from
            ``TOKENIZER.src_lang`` when this function is defined.
        tgt_lang: Language code the decoder is forced to start with. The default is
            captured from ``TOKENIZER.tgt_lang`` when this function is defined.
        skip_special_tokens: Forwarded to ``tokenizer.decode``.

    Returns:
        The decoded first generated sequence.

    Raises:
        KeyError: If ``tgt_lang`` is not a key of ``tokenizer.lang_code_to_id``.
    """
    # Apply the requested source language for this call only, then restore the
    # tokenizer's previous setting so the shared tokenizer is not left modified.
    previous_src_lang = tokenizer.src_lang
    tokenizer.src_lang = src_lang
    try:
        inputs = tokenizer(text, return_tensors="pt")
    finally:
        tokenizer.src_lang = previous_src_lang

    # Keep the inputs on the same device as the model (it may be on GPU after training).
    inputs = inputs.to(model.device)

    # Force decoder to use target language
    output_ids = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang],
    )

    return tokenizer.decode(output_ids[0], skip_special_tokens=skip_special_tokens)

translate_text(text=TEXT_TO_TRANSLATE)

'जिसने मन पर विजय प्राप्त की है तो वह सबसे अच्छा मित्र है, पर जिसने ऐसा नहीं किया तो उसका मन ही सबसे बड़ा शत्रु होगा।'

In [45]:
# Human-readable language names mapped to mBART-50 language codes.
# The table is meant to cover every code in TOKENIZER.lang_code_to_id; any code
# missing here is reported as "Unknown" by the loop below.
lang_code_to_name = {
    "ar_AR": "Arabic", "cs_CZ": "Czech", "de_DE": "German", "en_XX": "English", "es_XX": "Spanish",
    "et_EE": "Estonian", "fi_FI": "Finnish", "fr_XX": "French", "gu_IN": "Gujarati", "hi_IN": "Hindi",
    "it_IT": "Italian", "ja_XX": "Japanese", "kk_KZ": "Kazakh", "ko_KR": "Korean", "lt_LT": "Lithuanian",
    "lv_LV": "Latvian", "my_MM": "Burmese", "ne_NP": "Nepali", "nl_XX": "Dutch", "ro_RO": "Romanian",
    "ru_RU": "Russian", "si_LK": "Sinhala", "tr_TR": "Turkish", "vi_VN": "Vietnamese", "zh_CN": "Chinese (Simplified)",
    "af_ZA": "Afrikaans", "az_AZ": "Azerbaijani", "bn_IN": "Bengali", "fa_IR": "Persian", "he_IL": "Hebrew",
    "hr_HR": "Croatian", "id_ID": "Indonesian", "ka_GE": "Georgian", "km_KH": "Khmer", "mk_MK": "Macedonian",
    "ml_IN": "Malayalam", "mn_MN": "Mongolian", "mr_IN": "Marathi", "pl_PL": "Polish", "ps_AF": "Pashto",
    "pt_XX": "Portuguese", "sv_SE": "Swedish", "sw_KE": "Swahili", "ta_IN": "Tamil", "te_IN": "Telugu",
    "th_TH": "Thai", "tl_XX": "Tagalog", "uk_UA": "Ukrainian", "ur_PK": "Urdu", "xh_ZA": "Xhosa",
    "gl_ES": "Galician", "sl_SI": "Slovenian"
}

# Print total number of languages
print("Total languages supported by the tokenizer:", len(TOKENIZER.lang_code_to_id))

# Print human-readable name for each language code
for lang_code, token_id in TOKENIZER.lang_code_to_id.items():
    name = lang_code_to_name.get(lang_code, "Unknown")
    print(f"Language Code: {lang_code}, Human Name: {name}, Token ID: {token_id}")

Total languages supported by the tokenizer: 52
Language Code: ar_AR, Human Name: Arabic, Token ID: 250001
Language Code: cs_CZ, Human Name: Czech, Token ID: 250002
Language Code: de_DE, Human Name: German, Token ID: 250003
Language Code: en_XX, Human Name: English, Token ID: 250004
Language Code: es_XX, Human Name: Spanish, Token ID: 250005
Language Code: et_EE, Human Name: Estonian, Token ID: 250006
Language Code: fi_FI, Human Name: Finnish, Token ID: 250007
Language Code: fr_XX, Human Name: French, Token ID: 250008
Language Code: gu_IN, Human Name: Gujarati, Token ID: 250009
Language Code: hi_IN, Human Name: Hindi, Token ID: 250010
Language Code: it_IT, Human Name: Italian, Token ID: 250011
Language Code: ja_XX, Human Name: Japanese, Token ID: 250012
Language Code: kk_KZ, Human Name: Kazakh, Token ID: 250013
Language Code: ko_KR, Human Name: Korean, Token ID: 250014
Language Code: lt_LT, Human Name: Lithuanian, Token ID: 250015
Language Code: lv_LV, Human Name: Latvian, Token ID: 250

# 4_Preprocessing

In [46]:
# Measure how many tokens each Sanskrit sentence in the training split produces.
# The numbers guide the max_length choice in the preprocessing function: Sanskrit
# has many samasa (compound words), which can be long, so lengths are worth checking.


def _token_count(sentence):
    """Return the number of input ids TOKENIZER produces for ``sentence``."""
    return len(TOKENIZER(sentence)["input_ids"])


# Sanskrit side of every training pair
sanskrit_texts = [example["translation"]["sn"] for example in train_dataset]

# One token count per sentence, in dataset order
token_lens = list(map(_token_count, sanskrit_texts))

# Report the maximum and the ten largest counts (ascending)
print("Max length:", max(token_lens))
ascending_lens = sorted(token_lens)
print("Top 10 longest:", ascending_lens[-10:])


Max length: 1068
Top 10 longest: [451, 460, 484, 485, 642, 668, 794, 814, 911, 1068]


In [47]:
def preprocess_function(examples):
    """Tokenize a batch of English -> Sanskrit pairs for seq2seq training.

    Args:
        examples: Batch whose ``"translation"`` entry is a list of dicts with
            ``"en"`` (source) and ``"sn"`` (target) strings.

    Returns:
        The tokenizer output for the English inputs, with an added ``"labels"``
        entry holding the token ids of the Sanskrit targets.
    """
    inputs = [t["en"] for t in examples["translation"]]
    targets = [t["sn"] for t in examples["translation"]]  # Sanskrit texts

    model_inputs = TOKENIZER(inputs, max_length=512, truncation=True, padding=False)

    # Tokenize the targets through `text_target` so the tokenizer treats them as
    # target-side text (target language code) rather than as source text.
    # Padding is left off here; padding can also be set to 'longest' to pad only
    # to the longest target in the batch.
    labels = TOKENIZER(text_target=targets, max_length=1024, truncation=True, padding=False)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize the train, validation and test splits (in that order) with the same
# batched preprocessing function.
tokenized_train, tokenized_valid, tokenized_test = [
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, valid_dataset, test_dataset)
]


# 5_Training

In [48]:
# Data collator for Seq2Seq models used for padding and creating attention masks.
# Preprocessing tokenizes with padding=False, so padding is left to this collator,
# which is handed to the Seq2SeqTrainer together with the tokenized datasets.
data_collator = DataCollatorForSeq2Seq(tokenizer=TOKENIZER, model=MODEL)

In [49]:
# Load the BLEU metric once at cell level; compute_metrics reuses this object on every evaluation.
bleu = evaluate.load("bleu")
def compute_metrics(eval_pred):
    """Compute corpus BLEU for a batch of generated predictions.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. ``predictions``
            may also be a tuple whose first element is the token-id array.

    Returns:
        ``{"bleu": <score>}`` taken from the ``evaluate`` BLEU result.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # Some models return extra outputs alongside the generated ids; keep only the ids.
        predictions = predictions[0]

    # -100 is a padding/ignore marker, not a real token id, so it cannot be decoded.
    # Labels always use it; gathered predictions may be padded with it as well.
    pad_id = TOKENIZER.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = TOKENIZER.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = TOKENIZER.batch_decode(labels, skip_special_tokens=True)

    # The `evaluate` BLEU metric tokenizes internally: it expects plain strings for
    # predictions and a list of reference strings per prediction ([[ref1], [ref2], ...]).
    decoded_preds = [pred.strip() for pred in decoded_preds]
    references = [[label.strip()] for label in decoded_labels]

    result = bleu.compute(predictions=decoded_preds, references=references)
    return {"bleu": result["bleu"]}

In [50]:
# Training configuration, grouped by concern and then unpacked into the arguments object.
_optimisation = dict(
    learning_rate=5e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)
_evaluation = dict(
    eval_strategy="steps",
    eval_steps=500,
    predict_with_generate=True,  # important for seq2seq tasks
)
_checkpoints_and_logs = dict(
    output_dir="./results",
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
)

training_args = Seq2SeqTrainingArguments(
    **_checkpoints_and_logs,
    **_evaluation,
    **_optimisation,
)

In [52]:
# Assemble the trainer from the pieces built in the previous cells: the model,
# the training arguments, the tokenized train/validation splits, the tokenizer,
# the seq2seq data collator and the BLEU-based compute_metrics function.
trainer = Seq2SeqTrainer(
    model=MODEL,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,  # the validation split is used for evaluation during training
    processing_class=TOKENIZER,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Train the model with the trainer configured above.
# As the cell's last expression, the return value of train() is shown as the cell output.
trainer.train()

# 6_Evaluation

In [None]:
# Evaluate the model on the test dataset