In [1]:
# In this notebook, you learn:
#
# 1) How to use DataLoader in Pytorch?

In [2]:
# Resources to go through before continuing further in this notebook:
#
# 1) https://machinelearningmastery.com/training-a-pytorch-model-with-dataloader-and-dataset/
#       -- Explains how to use Dataset and DataLoader in pytorch.
# 2) https://stanford.edu/~shervine/blog/pytorch-how-to-generate-data-parallel
#       -- Explains how to use Dataset and DataLoader in pytorch.
# 3) https://blog.paperspace.com/dataloaders-abstractions-pytorch/
#       -- Explains how to use Dataset and DataLoader in pytorch.
# 4) https://pytorch.org/docs/stable/data.html
#       -- Official pytorch documentation for DataLoader interface.
# 5) https://geekflare.com/python-unpacking-operators/
#       -- Explains unpacking operators(*, **) in python.
# 6) https://realpython.com/python-zip-function/
#       -- zip function in python.

In [5]:
from torch import Tensor
from torch.utils.data import Dataset, DataLoader
from typing import List, Tuple

import torch

In [7]:
# Example to show the usage of zip in python.
sample_iterable_1 = [1, 2, 3, 4]
sample_iterable_2 = ['a', 'b', 'c', 'd']
# zip combines the corresponding elements of each iterable into a tuple.
# It returns a lazy 'zip' iterator (not a list), which is why printing it directly
# only shows the object and why we wrap it in list() to see the tuples.
zip_output_1 = zip(sample_iterable_1, sample_iterable_2)
print(type(zip_output_1), zip_output_1)
# NOTE: list(...) consumes the iterator; zip_output_1 is exhausted after this line.
print(list(zip_output_1))
print("-" * 150)
# zip stops at the shortest iterable. The last element 'E' will be ignored since the
# shortest iterable among the passed ones has size '4'.
sample_iterable_3 = ['A', 'B', 'C', 'D', 'E']
zip_output_2 = zip(sample_iterable_1, sample_iterable_2, sample_iterable_3)
print(type(zip_output_2), zip_output_2)
print(list(zip_output_2))

<class 'zip'> <zip object at 0x7fef43a6dd00>
[(1, 'a'), (2, 'b'), (3, 'c'), (4, 'd')]
------------------------------------------------------------------------------------------------------------------------------------------------------
<class 'zip'> <zip object at 0x7fef43a6ec80>
[(1, 'a', 'A'), (2, 'b', 'B'), (3, 'c', 'C'), (4, 'd', 'D')]


In [8]:
# Example to show the usage of the unpacking operator (*).
# * unpacks an iterable and passes its elements as individual positional arguments.
# NOTE: 'sample_iterable_1' is rebound here to a new 3-element list.
sample_iterable_1 = [1, 2, 3]
# Equivalent to print(1, 2, 3).
print(*sample_iterable_1)
sample_iterable2 = [(1, 2), (3, 4), (5, 6), (7, 8)]
# Equivalent to print((1, 2), (3, 4), (5, 6), (7, 8)): only the outer list is unpacked,
# the inner tuples are passed through unchanged.
print(*sample_iterable2)

1 2 3
(1, 2) (3, 4) (5, 6) (7, 8)


In [10]:
# Creating a SimpleDataset to be used in association with pytorch 'DataLoader'.
# The pytorch 'Dataset' class needs to be inherited (subclassed) to create a custom dataset.
# 'DataLoader' uses the 'Dataset' type to iterate over the data.
class SimpleDataset(Dataset):
    """Minimal pytorch 'Dataset' that wraps an in-memory list of (feature, target) tuples.

    It exists only to demonstrate how 'Dataset' and 'DataLoader' work together;
    the data itself is an arbitrary list of integer pairs.
    """

    def __init__(self, data: List[Tuple[int, int]]):
        """Keep a reference to the list of tuples that backs this dataset."""
        self.data = data

    def __len__(self) -> int:
        """Return the number of datapoints.

        Must be implemented for DataLoader to work with this Dataset: the
        DataLoader uses the length to create batches.
        """
        num_datapoints = len(self.data)
        return num_datapoints

    def __getitem__(self, index: int) -> Tuple[int, int]:
        """Return the datapoint stored at 'index'.

        Must be implemented for DataLoader to work with this Dataset.
        """
        datapoint = self.data[index]
        return datapoint

In [26]:
# We define a very simple custom collate_fn that gets a batch of input tuples (features, target)
# and combines them into a batched tensor format which is the input format expected by our
# transformer model.
def custom_collate_fn(batch: List[Tuple[int, int]]) -> Tuple[Tensor, Tensor]:
    """Turn a list of (feature, target) pairs into a (features, labels) pair of tensors.

    Args:
        batch: List of (feature, target) tuples. This is what the DataLoader
            hands to collate_fn for every batch.

    Returns:
        A tuple (features, labels) of 1-D float32 tensors, each holding one
        value per datapoint in 'batch'.

    The print statements are intentional: they trace how the data flows
    through the function.
    """
    # Both output tensors use the same dtype, so name it once.
    tensor_dtype = torch.float32

    print(f"In custom_collate_fn:batch: {batch}, -- type(batch): {type(batch)}")

    # Transpose the batch (see the zip and * example cells above):
    #   *batch       : [(0, 1), (1, 2), (2, 3), (3, 4)] --> (0, 1), (1, 2), (2, 3), (3, 4)
    #   zip(*batch)  : (0, 1), (1, 2), (2, 3), (3, 4)   --> (0, 1, 2, 3), (1, 2, 3, 4)
    # so that features = (0, 1, 2, 3) and labels = (1, 2, 3, 4).
    features, labels = zip(*batch)
    print(f"In custom_collate_fn:features: {features}, -- type(features): {type(features)}")
    print(f"In custom_collate_fn:labels: {labels}, -- type(labels): {type(labels)}")

    # Tuple of features --> tensor, which is the format the transformer expects.
    features = torch.tensor(features, dtype=tensor_dtype)
    print(f"In custom_collate_fn:features: {features}")

    # Tuple of labels --> tensor, which is the format the transformer expects.
    labels = torch.tensor(labels, dtype=tensor_dtype)
    print(f"In custom_collate_fn:labels: {labels}")

    return features, labels
    

In [27]:
# Create the dataset object which is passed to the DataLoader.
# The data is 12 (feature, target) pairs of the form (i, i + 1) for i = 0..11.
data = [(i, i + 1) for i in range(12)]
print(f"data: {data}")
# Notice that HuggingFace's 'datasets.arrow_dataset.Dataset', which we used in the step 1 and
# step 2 notebooks, is not the same as pytorch's 'torch.utils.data.Dataset'.
my_dataset = SimpleDataset(data=data)
print(type(my_dataset), " -- ", my_dataset)

data: [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 12)]
<class '__main__.SimpleDataset'>  --  <__main__.SimpleDataset object at 0x7fef43a53be0>


In [28]:
# Create a 'DataLoader' object by passing the pytorch 'Dataset' (my_dataset) object created above.
# 'num_workers' specifies the number of worker subprocesses used to load the data; '0' means the
# data is loaded in the main process. Increase the number of workers to load the data faster.
# 'batch_size' is the number of datapoints per batch, and 'collate_fn' is the function that
# turns the list of datapoints in a batch into the batched output.
my_dataloader = DataLoader(dataset=my_dataset, num_workers=0, batch_size=4, collate_fn=custom_collate_fn)

In [29]:
# Show the type and default repr of the DataLoader object; no data is loaded by this line.
print(type(my_dataloader), " -- ", my_dataloader)

<class 'torch.utils.data.dataloader.DataLoader'>  --  <torch.utils.data.dataloader.DataLoader object at 0x7fef43a52980>


**Note:** Parallel (multi-process) data loading behaves differently on Windows and macOS, which leads to the error discussed in the next cell. The official PyTorch [documentation](https://pytorch.org/docs/stable/data.html#multi-process-data-loading) gives more details about this issue (though it is not fully clear) and explains how to avoid the error.

In [30]:
# I faced the error described in this PyTorch forum thread
# (https://discuss.pytorch.org/t/errors-when-using-num-workers-0-in-dataloader/97564)
# when I first tried num_workers >= 1 while creating the DataLoader.

In [31]:
# NOTE: This cell works fine (without any errors on Windows and Mac) if 'num_workers' is set
# to '0', but fails if 'num_workers' is set to a number >= 1. It works fine on Linux even with
# num_workers >= 1.
# Observe that each batch is passed through the custom collate function to create a single tensor
# each for features and labels. The print statements in custom_collate_fn help you understand the
# flow of data. However, the prints will be jumbled up if 'num_workers' is set to a number >= 1,
# since the processing is then done in parallel.
# Use the 'DataLoader' object to iterate through the 'Dataset' in batches.
for features, labels in my_dataloader:
    # Each iteration yields the (features, labels) tuple returned by custom_collate_fn.
    print(f"features: {features}, -- type(features): {type(features)}")
    print(f"labels: {labels}, -- type(labels): {type(labels)}")
    print("-" * 150)

In custom_collate_fn:batch: [(0, 1), (1, 2), (2, 3), (3, 4)], -- type(batch): <class 'list'>
In custom_collate_fn:features: (0, 1, 2, 3), -- type(features): <class 'tuple'>
In custom_collate_fn:labels: (1, 2, 3, 4), -- type(labels): <class 'tuple'>
In custom_collate_fn:features: tensor([0., 1., 2., 3.])
In custom_collate_fn:labels: tensor([1., 2., 3., 4.])
features: tensor([0., 1., 2., 3.]), -- type(features): <class 'torch.Tensor'>
labels: tensor([1., 2., 3., 4.]), -- type(labels): <class 'torch.Tensor'>
------------------------------------------------------------------------------------------------------------------------------------------------------
In custom_collate_fn:batch: [(4, 5), (5, 6), (6, 7), (7, 8)], -- type(batch): <class 'list'>
In custom_collate_fn:features: (4, 5, 6, 7), -- type(features): <class 'tuple'>
In custom_collate_fn:labels: (5, 6, 7, 8), -- type(labels): <class 'tuple'>
In custom_collate_fn:features: tensor([4., 5., 6., 7.])
In custom_collate_fn:labels: tens