# 1_Setup_and_Testing

In [None]:
# All imports
import sys
import pip
import torch
from datasets import get_dataset_split_names, load_dataset, load_dataset_builder, get_dataset_config_names
from transformers import pipeline

print("All imports are successful ✅")

print("--" * 50)

# ------------------------------------------------------------------
# Environment report: interpreter, pip and PyTorch versions, followed
# by the CUDA / GPU status of this machine.
# ------------------------------------------------------------------
print("Python version:", sys.version)
print("Pip version:", pip.__version__)
print("Pytorch version:", torch.__version__)

# Ask PyTorch about CUDA once and branch a single time; the messages are
# emitted in the same order as three separate checks would produce.
cuda_available = torch.cuda.is_available()
if cuda_available:
    # CUDA toolkit version this PyTorch build reports.
    print("CUDA version:", torch.version.cuda)

    # Inventory of the visible GPUs.
    print("GPU is available.")
    gpu_count = torch.cuda.device_count()
    print("Number of GPUs:", gpu_count)
    for i in range(gpu_count):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

    # Smoke test: move a small random tensor to the GPU and confirm
    # that it really lives there.
    x = torch.rand(5, 3).cuda()
    if x.is_cuda:
        print("Pytorch can use CUDA ✅Tensor on GPU")
else:
    print("CUDA is not available.")
    print("No GPU available.")
    print("Pytorch is not using CUDA.")


# 2_Load_Dataset_and_Preprocess

In [None]:
# https://huggingface.co/docs/datasets/load_hub
# Look up which splits and configurations the dataset repository exposes
# before loading any data.
dataset_id = "rahular/itihasa"

splits = get_dataset_split_names(dataset_id)
print("Available dataset splits:", splits)

configs = get_dataset_config_names(dataset_id)
print("Available dataset configurations:", configs)

In [None]:
# Create the dataset builder so the dataset's metadata can be inspected
# through its `info` attribute.
ds_builder = load_dataset_builder("rahular/itihasa")

# Inspect dataset description.
# A bare expression is only rendered when it is the last statement of a
# notebook cell (the default display mode), so the description is printed
# explicitly instead of being evaluated and silently discarded.
print(ds_builder.info.description)

# Inspect dataset features.
# Left as the cell's final expression so the notebook renders it as the
# cell output.
ds_builder.info.features

In [None]:
from datasets import load_dataset

# Load the three splits in a fixed order (train, validation, test) and
# bind each one to its own name.
train_dataset, valid_dataset, test_dataset = (
    load_dataset("rahular/itihasa", split=split_name)
    for split_name in ("train", "validation", "test")
)
print("Datasets loaded successfully ✅.")

# Report how many examples each split contains.
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(valid_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")


In [None]:
# Inspect the first example in the train dataset.
# The bare expression is the only statement in this cell, so the notebook
# renders its value as the cell output.
train_dataset[0]

In [None]:
# Inspect the first example in the test dataset.
# The bare expression is the only statement in this cell, so the notebook
# renders its value as the cell output.
test_dataset[0]

In [None]:
# Inspect the first example in the validation dataset.
# The bare expression is the only statement in this cell, so the notebook
# renders its value as the cell output.
valid_dataset[0]

In [None]:
# Indexing the datasets: walk from a whole record down to a single sentence.
divider = "--" * 50
first_example = train_dataset[0]
translation = first_example["translation"]

print(first_example)  # Full content of the first example
print(divider)
print(translation)  # Root of the nested dictionary
print(divider)
print(translation["en"])  # English side of the first example
print(divider)
print(translation["sn"])  # Sanskrit side of the first example
print(divider)

# Show the first three sentence pairs side by side.
for i in range(3):
    print(f"Example {i}: (English: {train_dataset[i]['translation']['en']}) (Sanskrit: {train_dataset[i]['translation']['sn']})")

In [None]:

# Import the factory under an alias. This cell rebinds the name `pipeline`
# further down, so calling the bare name on a re-run of the cell would hit
# the already-built translator object instead of the factory and fail.
from transformers import pipeline as build_pipeline

# Use the first GPU when CUDA is present, otherwise fall back to the CPU
# (-1 is the pipeline API's CPU device index). Half precision is only
# requested on the GPU; float32 is the safe choice on CPU.
use_cuda = torch.cuda.is_available()

# English -> French translation with the multilingual mBART-50 checkpoint.
translator = build_pipeline(
    task="translation",
    model="facebook/mbart-large-50-many-to-many-mmt",
    device=0 if use_cuda else -1,
    torch_dtype=torch.float16 if use_cuda else torch.float32,
    src_lang="en_XX",
    tgt_lang="fr_XX",
)

# Backward-compatible binding: after this cell ran, `pipeline` has always
# referred to the translator object, so keep that name available for any
# code that relies on it. Prefer `translator` in new code.
pipeline = translator

print(translator("UN Chief Says There Is No Military Solution in Syria"))