In [2]:
import os

import numpy as np
import pandas as pd
from easydict import EasyDict

import torch
from torch.utils.data import (
    Dataset
    , IterableDataset
    , DataLoader
    , SubsetRandomSampler
    , random_split
)

# Building a Data Pipeline

To build a data pipeline with `PyTorch`, there are three major components:

* `Dataset`
* `DataLoader`
* `Sampler`

In general, the workflow is: (1) load the raw dataset from disk/web to create a torch `Dataset` object, (2) determine a sampling scheme and instantiate `Sampler` object(s), and lastly (3) glue the `Dataset` and `Sampler` together, along with other training parameters, to generate a `DataLoader`. During model training, we load batches of data by iterating through the `DataLoader` we have created. Examples are given below.

## The Dataset Object

The official documentation can be found [here](https://pytorch.org/docs/stable/data.html). There are two types of datasets: (1) **Map-style Dataset** and (2) **Iterable-style Dataset**. In short, map-style datasets act like a **list** or **table** that we can index into. For instance, we can select the 7th sample. On the other hand, iterable-style datasets work like iterators, meaning that we only define how to **retrieve the next sample/batch**. Both types of datasets can be very useful. I will introduce the map-style dataset first, as it is more intuitive.

### Map-style Dataset

To construct a custom map-style dataset, we must implement at least three methods:

* `__init__(self, ...)` (obviously)
* `__len__(self)`
* `__getitem__(self, idx)`

As a reminder, the third method is what the square bracket operator `[]` calls. For instance, `my_dataset[0]` gives the first item (well, it depends on how you implement the method, e.g. whether it supports ranged slicing).

A simple template is shown below.

In [5]:
class MyDataset(torch.utils.data.Dataset):
    """Skeleton of a custom map-style dataset.

    Every method is a placeholder that does nothing and returns ``None``;
    replace each body with real logic when adapting the template.
    """

    def __init__(self, path):  # <-- My habit, not necessarily <path> only
        """Placeholder constructor: accepts ``path`` but does not use it."""
        return None

    def __len__(self):
        """Placeholder: a real dataset reports its number of samples here."""
        return None

    def __getitem__(self, idx):
        """Placeholder: a real dataset returns the sample selected by ``idx``."""
        return None

# Sample creation of a dataset.
# The path is only a placeholder, so construction stays best-effort and never
# aborts the notebook; a failure is reported instead of being silently dropped.
try:
    my_dataset = MyDataset("~/path/to/your/raw/data.csv")
except Exception as exc:  # a bare `except:` would also trap KeyboardInterrupt/SystemExit
    print(f"Could not create MyDataset: {exc!r}")

#### A Toy Example

The toy example consists of only 10 samples with two attributes: factor and target, both of which are numerical data.

In [9]:
# Read the toy CSV into a DataFrame and preview its first rows.
df_toy = pd.read_csv("../data/classification/toy_example/data.csv")
df_toy.head()

Unnamed: 0,factor,target
0,10.0,0
1,5.0,1
2,2.5,1
3,11.0,0
4,15.0,0


In [6]:
class ToyMapDataset(Dataset):
    """Map-style dataset backed by a DataFrame read from a CSV file."""

    def __init__(self, path):
        """Read the CSV at ``path`` with ``pd.read_csv`` and keep it as ``self.df``."""
        frame = pd.read_csv(path)
        self.df = frame

    def __len__(self):
        """Return the number of rows held in ``self.df``."""
        n_rows = self.df.shape[0]
        return n_rows

    def __getitem__(self, idx):
        """Return the row(s) at position ``idx`` with all columns.

        ``idx`` is handed straight to ``DataFrame.iloc``, so an integer gives
        a single row (a Series) and a slice gives a DataFrame.
        """
        selected = self.df.iloc[idx, :]
        return selected

# Build the map-style dataset from the toy CSV.
# NOTE: the name `t` is rebound to a ToyLargeDataset in a later cell.
t = ToyMapDataset("../data/classification/toy_example/data.csv")


slice(0, 5, None)


Unnamed: 0,factor,target
0,10.0,0
1,5.0,1
2,2.5,1
3,11.0,0
4,15.0,0


## The Iris Dataset

## The Toxic-Comments Dataset

## Large Dataset: Read by Chunks

In [83]:
class ToyLargeDataset(IterableDataset):
    """Iterable-style dataset that streams a CSV file chunk by chunk.

    The file is never loaded whole: each iteration opens a chunked reader and
    yields one pair of tensors per chunk, taken from the first and second
    columns of the CSV.

    NOTE(review): no per-worker sharding is implemented, so with a
    ``DataLoader`` using ``num_workers > 0`` every worker would presumably
    replay the entire file -- confirm single-process use or add sharding.
    """

    def __init__(self, path, chunksize=4):
        """Remember where the CSV lives and how many rows to read at a time.

        Args:
            path: Location of the CSV file, forwarded to ``pd.read_csv``.
            chunksize: Number of rows per yielded chunk. Defaults to 4.

        Raises:
            ValueError: If ``chunksize`` is smaller than 1.
        """
        if chunksize < 1:
            raise ValueError(f"chunksize must be >= 1, got {chunksize}")

        self.path = path
        self.chunksize = chunksize

    def __iter__(self):
        """Yield ``(factor, target)`` tensor pairs, one pair per CSV chunk.

        The last chunk may hold fewer than ``chunksize`` rows.
        """
        # A fresh, local reader per call keeps the dataset re-iterable and avoids
        # leaving an exhausted reader on the instance. The `with` block closes the
        # underlying file handle even when the consumer stops iterating early.
        with pd.read_csv(self.path, chunksize=self.chunksize) as reader:
            for chunk in reader:
                fct = torch.tensor(chunk.iloc[:, 0].values)  # first column
                tgt = torch.tensor(chunk.iloc[:, 1].values)  # second column
                yield fct, tgt

In [93]:
# Stream the toy CSV through the iterable-style dataset.
t = ToyLargeDataset("../data/classification/toy_example/data.csv")
# batch_size=None turns off the DataLoader's automatic batching, so each item
# yielded by the dataset is passed through as-is instead of being re-batched.
l = DataLoader(t, batch_size=None)

In [94]:
# Each step delivers one (factor, target) pair from the dataset; print the targets only.
for x, y in l:
    print(y)

tensor([0, 1, 1, 0])
tensor([0, 0, 1, 1])
tensor([0, 0])


In [95]:
# Second pass over the same loader: ToyLargeDataset.__iter__ calls pd.read_csv
# again on every iteration, so the chunks are produced afresh.
for x, y in l:
    print(y)


tensor([0, 1, 1, 0])
tensor([0, 0, 1, 1])
tensor([0, 0])
