In [1]:
# In this notebook, you learn:
#
# 1) How to load a dataset from the Hugging Face Datasets library.
# 2) How to explore the dataset and the data it contains.
# 
# I am building a transformer model for English to Telugu translation. So, we need a dataset of English and Telugu 
# sentences. I will be using the dataset from the Hugging Face Datasets library. 
# The dataset is called ai4bharat/samanantar (https://huggingface.co/datasets/ai4bharat/samanantar). It contains 
# sentences for several Indian languages. However, I will be using only the English to Telugu translation pairs.
# 
# This notebook explores the ai4bharat dataset and the data it contains.

In [3]:
from datasets import load_dataset, load_from_disk

In [4]:
# Path at which the smaller datasets created from the original dataset are stored.
AI4_BHARAT_DATA_PATH = "../../Data/AI4Bharat"
# Just a separator to print in the console.
SEPARATOR_LENGTH = 150

In [5]:
# NOTE: This piece of code downloads the entire AI4BHARAT dataset that is about 7.5GB.
# So, it might take very long when you run this line for the first time. For the subsequent runs, it will load 
# the dataset from the machine itself and so it should be significantly faster.
# 
# Here we are only loading the English to Telugu translation dataset for now. However, the entire dataset is
# downloaded to the machine.
en_te_translation_dataset = load_dataset("ai4bharat/samanantar", 'te', trust_remote_code=True)

In [6]:
print(en_te_translation_dataset)
# We can essentaially use this as a dictionary to access the data.
print(type(en_te_translation_dataset))

DatasetDict({
    train: Dataset({
        features: ['idx', 'src', 'tgt'],
        num_rows: 4946035
    })
})
<class 'datasets.dataset_dict.DatasetDict'>


In [7]:
print(en_te_translation_dataset["train"])
# Notice the type of the dataset. This is the hugging face dataset object.
print(type(en_te_translation_dataset["train"]))

Dataset({
    features: ['idx', 'src', 'tgt'],
    num_rows: 4946035
})
<class 'datasets.arrow_dataset.Dataset'>


In [8]:
# Note that the translation dataset itself is not perfect. Some of the English sentences are not translated very 
# well to Telugu. For example, a good translation for second sentence in the dataset is "మీరు చీమలను తింటారా?".
print(en_te_translation_dataset["train"][0])
print(en_te_translation_dataset["train"][10000])
print(en_te_translation_dataset["train"][1249849])

{'idx': 0, 'src': 'Have you heard about Foie gras?', 'tgt': 'ఇక ఫ్రూట్ ఫ్లైస్ గురించి మీరు విన్నారా?'}
{'idx': 10000, 'src': 'You eat ants?', 'tgt': 'మీరు చీమలు తినడానికి?'}
{'idx': 1249849, 'src': 'Ban on international flights extended till Oct 31', 'tgt': 'అంతర్జాతీయ విమాన సర్వీసులపై నిషేధాన్ని కేంద్ర ప్రభుత్వం అక్టోబర్ 31 వరకు పొడగించింది'}


In [7]:
# Dataset to be used to train the translation model.
train_dataset = en_te_translation_dataset["train"].select(range(0, 250000))
# Dataset to be used for validation purposes.
validation_dataset = en_te_translation_dataset["train"].select(range(4946034, 4941035, -1))
# Complete dataset. This will be used for training the Tokenizers.
full_en_te_dataset = en_te_translation_dataset["train"]
# Example datasets with 200 examples to test run and resolve any issues in the model.
# A very small dataset to perform sample runs and debug issues.
debug_dataset = en_te_translation_dataset["train"].select(range(200))

In [8]:
print("train_dataset: ", train_dataset)
print(type(train_dataset))
print("-" * SEPARATOR_LENGTH)
print("validation_dataset: ", validation_dataset)
print("-" * SEPARATOR_LENGTH)
print("debug_dataset: ", debug_dataset)
print("-" * SEPARATOR_LENGTH)
print("full_en_te_dataset: ", full_en_te_dataset)

train_dataset:  Dataset({
    features: ['idx', 'src', 'tgt'],
    num_rows: 250000
})
<class 'datasets.arrow_dataset.Dataset'>
------------------------------------------------------------------------------------------------------------------------------------------------------
validation_dataset:  Dataset({
    features: ['idx', 'src', 'tgt'],
    num_rows: 4999
})
------------------------------------------------------------------------------------------------------------------------------------------------------
debug_dataset:  Dataset({
    features: ['idx', 'src', 'tgt'],
    num_rows: 200
})
------------------------------------------------------------------------------------------------------------------------------------------------------
full_en_te_dataset:  Dataset({
    features: ['idx', 'src', 'tgt'],
    num_rows: 4946035
})


In [10]:
# Saving the filtered datasets to the disk. These datasets will be used for training the transformer model.
# Running thie cell will replace the existing datasets in this repository although the dataset itself should
# remain the same.
train_dataset.save_to_disk(dataset_path=f"{AI4_BHARAT_DATA_PATH}/train_dataset")
validation_dataset.save_to_disk(dataset_path=f"{AI4_BHARAT_DATA_PATH}/validation_dataset")
debug_dataset.save_to_disk(dataset_path=f"{AI4_BHARAT_DATA_PATH}/debug_dataset")
full_en_te_dataset.save_to_disk(dataset_path=f"{AI4_BHARAT_DATA_PATH}/full_en_te_dataset")

Saving the dataset (1/1 shards): 100%|██████████| 250000/250000 [00:00<00:00, 1419601.67 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 4999/4999 [00:00<00:00, 137025.22 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 200/200 [00:00<00:00, 30148.82 examples/s]
Saving the dataset (3/3 shards): 100%|██████████| 4946035/4946035 [00:02<00:00, 1740910.28 examples/s]


In [11]:
# Load the saved datasets and print the data points which we printed from the original dataset.
# Notice that the datapoints are exactly the same as before as they should be.
train_dataset_loaded = load_from_disk(dataset_path=f"{AI4_BHARAT_DATA_PATH}/train_dataset")
print(train_dataset_loaded)
print(type(train_dataset_loaded))
print("-" * SEPARATOR_LENGTH)
print(train_dataset_loaded[0])
print(train_dataset_loaded[10000])

Dataset({
    features: ['idx', 'src', 'tgt'],
    num_rows: 250000
})
<class 'datasets.arrow_dataset.Dataset'>
------------------------------------------------------------------------------------------------------------------------------------------------------
{'idx': 0, 'src': 'Have you heard about Foie gras?', 'tgt': 'ఇక ఫ్రూట్ ఫ్లైస్ గురించి మీరు విన్నారా?'}
{'idx': 10000, 'src': 'You eat ants?', 'tgt': 'మీరు చీమలు తినడానికి?'}


In [17]:
# Note that accessing the hugging face dataset this way is returning a dictionary. This is important.
# I ran into extremely annoying bugs in the later notebooks not noticing this before hand.
debug_dataset[:5]

{'idx': [0, 1, 2, 3, 4],
 'src': ['Have you heard about Foie gras?',
  'I never thought of acting in films.',
  'Installed Software',
  'A case has been registered under Sections 302 and 376, IPC.',
  'Of this, 10 people succumbed to the injuries.'],
 'tgt': ['ఇక ఫ్రూట్ ఫ్లైస్ గురించి మీరు విన్నారా?',
  'సూర్య సినిమాల్లో నటించాలని ఎప్పుడూ అనుకోలేదు.',
  'స్థాపించబడిన సాఫ్ట్\u200dవేర్',
  'నిందితులపై సెక్షన్ 376 మరియు 302ల కింద కేసు నమోదు చేశాం.',
  'అందులో 10 మంది తీవ్రంగా గాయపడ్డారు.']}

In [18]:
print(type(debug_dataset[:5]))

<class 'dict'>


In [19]:
# Use this format to find the size of the dataset i.e., number of examples in the dataset.
print(debug_dataset.num_rows)

200


In [20]:
# Returns the data_point at index 4 which is a dictionary in this case.
debug_dataset[4]

{'idx': 4,
 'src': 'Of this, 10 people succumbed to the injuries.',
 'tgt': 'అందులో 10 మంది తీవ్రంగా గాయపడ్డారు.'}