# 1_Setup_and_Testing

In [14]:
# All imports
import sys
import pip
import torch
from datasets import get_dataset_split_names, load_dataset, load_dataset_builder, get_dataset_config_names
from transformers import pipeline
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, MBart50Tokenizer, MBartForConditionalGeneration

print("All imports are successful ✅")

print("--" * 50)

# ---------------------------------------------------------------
# Environment report: interpreter, pip, PyTorch, then CUDA / GPU
# ---------------------------------------------------------------
print("Python version:", sys.version)
print("Pip version:", pip.__version__)
print("Pytorch version:", torch.__version__)

# Ask PyTorch once whether CUDA is usable and branch on that single answer.
cuda_ready = torch.cuda.is_available()

if cuda_ready:
    # CUDA version as reported by torch.version.cuda
    print("CUDA version:", torch.version.cuda)

    # Enumerate every visible device by index and name
    print("GPU is available.")
    gpu_count = torch.cuda.device_count()
    print("Number of GPUs:", gpu_count)
    for i in range(gpu_count):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

    # Smoke test: create a small tensor on the CPU, move it to the GPU,
    # and confirm that it really lives there.
    x = torch.rand(5, 3).to("cuda")
    if x.is_cuda:
        print("Pytorch can use CUDA ✅Tensor on GPU")
else:
    # Same three messages, in the same order, as the CPU-only path of each check
    print("CUDA is not available.")
    print("No GPU available.")
    print("Pytorch is not using CUDA.")


All imports are successful ✅
----------------------------------------------------------------------------------------------------
Python version: 3.12.11 | packaged by Anaconda, Inc. | (main, Jun  5 2025, 13:09:17) [GCC 11.2.0]
Pip version: 25.1
Pytorch version: 2.7.0+cu126
CUDA version: 12.6
GPU is available.
Number of GPUs: 2
GPU 0: NVIDIA GeForce RTX 4070
GPU 1: NVIDIA GeForce RTX 4070
Pytorch can use CUDA ✅Tensor on GPU


# 2_Load_Dataset_and_Preprocess

In [15]:
# Reference: https://huggingface.co/docs/datasets/load_hub
ITIHASA_REPO = "rahular/itihasa"

# Look up the split names and the configuration names published for the dataset.
splits = get_dataset_split_names(ITIHASA_REPO)
configs = get_dataset_config_names(ITIHASA_REPO)

print("Available dataset splits:", splits)
print("Available dataset configurations:", configs)

Available dataset splits: ['train', 'validation', 'test']
Available dataset configurations: ['Itihasa']


In [16]:
# Create the dataset builder so its metadata (description, features) can be inspected.
ds_builder = load_dataset_builder("rahular/itihasa")

# Inspect dataset description.
# A notebook only renders the value of a cell's *last* expression, so a bare
# `ds_builder.info.description` in the middle of the cell is silently discarded.
# Print it explicitly so it actually shows up in the output.
print("Dataset description:", ds_builder.info.description)

# Inspect dataset features (last expression of the cell -> rendered as its output)
ds_builder.info.features

{'translation': Translation(languages=['sn', 'en'], id=None)}

In [17]:
# `load_dataset` comes from the import cell at the top of the notebook, so it is
# not imported a second time here.

# Resolve the dataset once. Without a `split` argument `load_dataset` returns a
# mapping of split name -> dataset, which avoids resolving the same Hub
# repository three separate times.
itihasa_splits = load_dataset("rahular/itihasa")

train_dataset = itihasa_splits["train"]
valid_dataset = itihasa_splits["validation"]
test_dataset = itihasa_splits["test"]
print("Datasets loaded successfully ✅.")

# Report the number of examples in each split
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(valid_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")


Datasets loaded successfully ✅.
Train dataset size: 75162
Validation dataset size: 6149
Test dataset size: 11722


In [18]:
# Display the first example of the train split. Each record is a dict whose
# 'translation' entry holds the English ('en') and Sanskrit ('sn') text.
train_dataset[0]

{'translation': {'en': 'The ascetic Vālmīki asked Nārada, the best of sages and foremost of those conversant with words, ever engaged in austerities and Vedic studies.',
  'sn': 'ॐ तपः स्वाध्यायनिरतं तपस्वी वाग्विदां वरम्। नारदं परिपप्रच्छ वाल्मीकिर्मुनिपुङ्गवम्॥'}}

In [19]:
# Display the first example of the test split to confirm it has the same
# {'translation': {'en': ..., 'sn': ...}} layout as the train split.
test_dataset[0]

{'translation': {'en': 'Hearing the words of Viśvāmitra, Rāghava, together with Laksmana, was struck with amazement, and spoke to Viśvāmitra, saying,',
  'sn': 'विश्वामित्रवचः श्रुत्वा राघवः सहलक्ष्मणः। विस्मयं परमं गत्वा विश्वामित्रमथाब्रवीत्॥'}}

In [20]:
# Display the first example of the validation split to confirm it has the same
# {'translation': {'en': ..., 'sn': ...}} layout as the train split.
valid_dataset[0]

{'translation': {'en': 'When Şītā, having a husband although seeming as if she had none, was putting on the ascetic guise, the people got into a wrath and exclaimed, “O Dasaratha, fie on you!"',
  'sn': 'तस्यां चीरं वसानायां नाथवत्यामनाथवत्। प्रचुक्रोश जनः सर्वो धिक् त्वां दशरथं त्विति ॥'}}

In [21]:
# Indexing the datasets: peel one record apart, one nesting level per print.
first_example = train_dataset[0]
sentence_pair = first_example["translation"]
divider = "--" * 50

print(first_example)  # full record
print(divider)
print(sentence_pair)  # dict stored under the 'translation' key
print(divider)
print(sentence_pair["en"])  # English text of the first example
print(divider)
print(sentence_pair["sn"])  # Sanskrit text of the first example
print(divider)

# Print the first three English/Sanskrit pairs on one line each.
for i in range(3):
    pair = train_dataset[i]["translation"]
    print(f"Example {i}: (English: {pair['en']}) (Sanskrit: {pair['sn']})")

{'translation': {'en': 'The ascetic Vālmīki asked Nārada, the best of sages and foremost of those conversant with words, ever engaged in austerities and Vedic studies.', 'sn': 'ॐ तपः स्वाध्यायनिरतं तपस्वी वाग्विदां वरम्। नारदं परिपप्रच्छ वाल्मीकिर्मुनिपुङ्गवम्॥'}}
----------------------------------------------------------------------------------------------------
{'en': 'The ascetic Vālmīki asked Nārada, the best of sages and foremost of those conversant with words, ever engaged in austerities and Vedic studies.', 'sn': 'ॐ तपः स्वाध्यायनिरतं तपस्वी वाग्विदां वरम्। नारदं परिपप्रच्छ वाल्मीकिर्मुनिपुङ्गवम्॥'}
----------------------------------------------------------------------------------------------------
The ascetic Vālmīki asked Nārada, the best of sages and foremost of those conversant with words, ever engaged in austerities and Vedic studies.
----------------------------------------------------------------------------------------------------
ॐ तपः स्वाध्यायनिरतं तपस्वी वाग्विदां वर

# 3_Modelling_and_Training

In [22]:
# One checkpoint name shared by the tokenizer and the model weights.
MBART_CHECKPOINT = "facebook/mbart-large-50-many-to-many-mmt"

tokenizer = MBart50Tokenizer.from_pretrained(MBART_CHECKPOINT)
model = MBartForConditionalGeneration.from_pretrained(MBART_CHECKPOINT)

# Source language code first, then target language code.
tokenizer.src_lang, tokenizer.tgt_lang = "en_XX", "hi_IN"

# Minimal input used as a smoke test of the tokenize -> generate -> decode path.
text = "."
inputs = tokenizer(text, return_tensors="pt")

# Force the decoder to begin with the token id of the target language code.
target_lang_token_id = tokenizer.lang_code_to_id[tokenizer.tgt_lang]
output_ids = model.generate(**inputs, forced_bos_token_id=target_lang_token_id)

# Decode the first generated sequence, dropping special tokens, and show it.
decoded_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(decoded_text)


.


In [None]:
# Show the tokenizer's full mapping from language code (e.g. 'en_XX', 'hi_IN')
# to the token id used for that language.
tokenizer.lang_code_to_id


{'ar_AR': 250001,
 'cs_CZ': 250002,
 'de_DE': 250003,
 'en_XX': 250004,
 'es_XX': 250005,
 'et_EE': 250006,
 'fi_FI': 250007,
 'fr_XX': 250008,
 'gu_IN': 250009,
 'hi_IN': 250010,
 'it_IT': 250011,
 'ja_XX': 250012,
 'kk_KZ': 250013,
 'ko_KR': 250014,
 'lt_LT': 250015,
 'lv_LV': 250016,
 'my_MM': 250017,
 'ne_NP': 250018,
 'nl_XX': 250019,
 'ro_RO': 250020,
 'ru_RU': 250021,
 'si_LK': 250022,
 'tr_TR': 250023,
 'vi_VN': 250024,
 'zh_CN': 250025,
 'af_ZA': 250026,
 'az_AZ': 250027,
 'bn_IN': 250028,
 'fa_IR': 250029,
 'he_IL': 250030,
 'hr_HR': 250031,
 'id_ID': 250032,
 'ka_GE': 250033,
 'km_KH': 250034,
 'mk_MK': 250035,
 'ml_IN': 250036,
 'mn_MN': 250037,
 'mr_IN': 250038,
 'pl_PL': 250039,
 'ps_AF': 250040,
 'pt_XX': 250041,
 'sv_SE': 250042,
 'sw_KE': 250043,
 'ta_IN': 250044,
 'te_IN': 250045,
 'th_TH': 250046,
 'tl_XX': 250047,
 'uk_UA': 250048,
 'ur_PK': 250049,
 'xh_ZA': 250050,
 'gl_ES': 250051,
 'sl_SI': 250052}