In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm

# Datasets and DataLoaders

In order to efficiently train a neural network using batched data, we will use two of [PyTorch's data utilities](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html): the `Dataset` and the `DataLoader` classes.

PyTorch (like other frameworks / packages) provides several well-known datasets, such as MNIST, CIFAR-10, Fashion-MNIST, etc. However, we are interested in creating our own custom dataset based on the data we have.

Once the `Dataset` is created, we can pass it to a `DataLoader` object. The `DataLoader` object wraps around the `Dataset` and provides us with the ability to iterate over the dataset in batches. This is especially useful when the dataset is large and cannot fit into memory. The `DataLoader` also provides other utilities such as shuffling and random sampling of the data.

## Minimal Interface of a Dataset

In order to create a custom dataset, we need to create a class that inherits from the `Dataset` class and implements the following methods:

- `__init__`: The constructor method where we can read in any data, such as a CSV file.
- `__len__`: The method that returns the length of the dataset.
- `__getitem__`: The method that returns a sample from the dataset given an index.

Once these three methods are implemented, we can instantiate the class, pass it to a `DataLoader` object and start iterating over it to train our model.

In [None]:
# A custom dataset is simply a subclass of torch's Dataset
class ExampleDataset(Dataset):
    """Minimal map-style dataset pairing each feature row with its target row.

    Both inputs are converted to tensors once, at construction time. Doing the
    conversion here rather than on every lookup reduces the overhead of
    iterating over the dataset, but might require more memory.
    """

    def __init__(self, X, y):
        """Store the features X and the targets y as tensors.

        What gets stored is an arbitrary choice; here X holds the
        features/input and y the target/output.
        """
        self.X = torch.Tensor(X)
        self.y = torch.Tensor(y)

    def __len__(self):
        """Return the number of samples (size of the first axis of y)."""
        # y is NxK (N observations, K outputs), so axis 0 counts observations
        return self.y.size(0)

    def __getitem__(self, i):
        """Return the i-th sample as a (features, target) pair."""
        features = self.X[i]
        target = self.y[i]
        return features, target

In [None]:
# Toy data: 4 observations, each with 3 features and 1 target
X = np.arange(12).reshape(4, 3)  # feature matrix, 4x3
y = np.arange(4).reshape(4, 1)   # target matrix, 4x1

# Wrap the arrays in our custom dataset ...
dataset = ExampleDataset(X=X, y=y)

# ... and let a dataloader serve it in ordered batches of two samples
dataloader = DataLoader(dataset=dataset, batch_size=2, shuffle=False)

In [None]:
# Walk through the loader and show the features and targets of every batch
for i, batch in enumerate(dataloader):
    x, y = batch
    # One print call; joining the parts with newlines yields the same output
    # as printing them one after the other
    print(
        f"Batch {i+1}:\n",
        f" X:\n{x}",
        f" y:\n{y}",
        "-" * 25,
        sep="\n",
    )

Notice two important things from the code above:

+ Each batch has 2 elements. This is given by the `batch_size` parameter in the `DataLoader` object. Consider what happens when the `batch_size` does not divide the length of the dataset. For example, if we set the `batch_size` to 3, then the last batch will have only 1 element. This is not a problem and the `DataLoader` handles this case automatically. However, it is important to keep this in mind when writing the training code.
+ The batches are ordered the way our data was created. This is because we set `shuffle` to `False`. If we set `shuffle` to `True`, then the `DataLoader` will reshuffle the samples before grouping them into batches, so every epoch will see the data in a different order.

## Special Considerations for RNNs

Datasets and data loaders are useful for all kinds of neural networks. However, we should keep in mind that different types of networks expect different input shapes. For instance, a feedforward neural network expects a batch of input to have 2 dimensions: (`batch_size`, `input_size`). A recurrent neural network, on the other hand, expects its batched input to have 3 dimensions: (`batch_size`, `sequence_length`, `input_size`) if we set the parameter `batch_first` to `True` in the RNN's constructor. Otherwise, the input shape will be (`sequence_length`, `batch_size`, `input_size`).

### Batching and Sequence Length

Suppose we have 100 observations $x_1, x_2, \dots, x_{100}$. If we were to batch this data for a FFNN, we would have no trouble at all. Suppose we would like batches of size $B=5$: we simply partition the data into $\lceil 100/B \rceil$ non-overlapping chunks, e.g., $\{x_1, x_2, x_3, x_4, x_5\}, \{x_6, x_7, x_8, x_9, x_{10}\}, \dots, \{x_{96}, x_{97}, x_{98}, x_{99}, x_{100}\}$.

Unfortunately, we want to work with models that can handle sequential data, and so we need to be a bit more careful. Indeed, our models don't only take an observation as input but rather a **sequence of observations**. Hence if we choose $B=5$, we should have 5 sequences of observations per batch, not just 5 observations! But what is the length of each sequence? Well, this is a design choice, but we can choose to have all sequences of the same length, e.g., $L=10$. This means that we will have 10 observations per sequence, and so we will have $L \cdot B = 50$ observations per batch. Hence, we would have only two batches instead of the 20 we had in the FFNN case.

#### FFNN Batching
Batch 1: $\{x_1, x_2, x_3, x_4, x_5\}$  
Batch 2: $\{x_6, x_7, x_8, x_9, x_{10}\}$  
$\dots$  
Batch 20: $\{x_{96}, x_{97}, x_{98}, x_{99}, x_{100}\}$


#### RNN Batching
Batch 1: $\{(x_1, x_2, \dots, x_{10}), (x_{11}, x_{12}, \dots, x_{20}), \dots, (x_{41}, x_{42}, \dots, x_{50})\}$  
Batch 2: $\{(x_{51}, x_{52}, \dots, x_{60}), \dots, (x_{91}, x_{92}, \dots, x_{100})\}$

#### Pause and Ponder
What is stopping us from having overlapping sequences? E.g., with $L=10$ and $B=5$, the first batch could be $\{(x_1, x_2, \dots, x_{10}), (x_2, x_3, \dots, x_{11}), \dots, (x_5, x_6, \dots, x_{14})\}$. This would leave us with 19 batches, so are we just *losing data* by not doing this?

# Batching PM2.5 Data

Let us make things a bit more concrete and look at how to batch the data of the predictive challenge. To keep things simple, we will focus our example on the observations in Delhi only.

In [None]:
# NOTE(review): unpickling can execute arbitrary code -- only load "train.pkl"
# when it comes from a trusted source.
# Keep only observations from Delhi
df = pd.read_pickle("train.pkl").xs("Delhi", level="city")
# Keep only the first 5 columns to make it more legible, and replace NaNs by 0
# so that the data-loading showcase does not have to deal with missing values.
# The result of replace() is assigned instead of using inplace=True, so the
# column subset is never mutated in place (which can trigger pandas'
# chained-assignment warning).
df = df[df.columns[:5]].replace({np.nan: 0})
df

In [None]:
class CityDataset(Dataset):
    """Sliding-window dataset serving fixed-length sequences.

    Sample ``idx`` is the window of ``seq_len`` consecutive rows of X and y
    that starts at row ``idx``; consecutive samples therefore overlap.

    We could also define our constructor to take a pandas dataframe and
    extract the features / targets directly. Notice that we must also specify
    a sequence length.
    """

    def __init__(self, X, y, seq_len):
        """Convert X (features) and y (targets) to tensors and store seq_len."""
        self.X = torch.Tensor(X)
        self.y = torch.Tensor(y)
        self.seq_len = seq_len

    def __len__(self):
        """Return the number of full windows of length seq_len."""
        # N rows admit N - seq_len + 1 full windows (start rows 0 .. N - seq_len);
        # clamp at 0 when the data is shorter than a single window.
        return max(self.y.shape[0] - self.seq_len + 1, 0)

    def __getitem__(self, idx):
        """Return the (X, y) window starting at row idx.

        Negative indices count from the end. Raises IndexError when idx does
        not address a full window, so that out-of-range lookups never return
        truncated sequences and plain iteration over the dataset terminates.
        """
        n_windows = len(self)
        if idx < 0:
            idx += n_windows
        if not 0 <= idx < n_windows:
            raise IndexError(
                f"window index out of range for dataset with {n_windows} windows"
            )
        stop = idx + self.seq_len
        return self.X[idx:stop], self.y[idx:stop]

In [None]:
# Build the windowed dataset from our training data: every column except the
# target is a feature, and the target is kept as a 2-D single-column array
target_col = "PM2.5_target"
dataset = CityDataset(
    X=df.drop(columns=target_col).values,
    y=df[[target_col]].values,
    seq_len=24,
)

# Create a dataloader, loading batches of 32 sequences at a time
dataloader = DataLoader(dataset=dataset, batch_size=32)

In [None]:
# Look at what one single batch looks like (nothing is printed if the loader
# yields no batch at all)
first_batch = next(iter(dataloader), None)
if first_batch is not None:
    x, y = first_batch
    print("First Batch:")
    print(f" X shape: {x.shape}")
    print(f" Y shape: {y.shape}")

## Example Usage

Finally, now that we have specified how our data should be loaded, we can turn to a simple example of training a neural network using this particular dataloader. The way we built our dataloader implies that the first dimension is the batch size. Hence, we have to specify `batch_first=True` when instantiating the recurrent layers!

In [None]:
# A small recurrent network
class MyRNN(nn.Module):
    """GRU layer followed by a linear layer applied to the GRU output."""

    def __init__(self, n_inputs, n_outputs, n_hidden):
        """Build a GRU (n_inputs -> n_hidden) and a linear layer (n_hidden -> n_outputs)."""
        super().__init__()
        # Gated Recurrent Unit; beware of batch_first: with True, the batch
        # dimension is expected to come first in the input
        self.rnn = nn.GRU(
            input_size=n_inputs,
            hidden_size=n_hidden,
            batch_first=True,
        )
        self.linear = nn.Linear(in_features=n_hidden, out_features=n_outputs)

    def forward(self, x):
        """Run x through the GRU and the linear layer; the hidden state is dropped."""
        # The GRU returns (output, hidden state); only the output is needed
        rnn_output = self.rnn(x)[0]
        return self.linear(rnn_output)

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# RNN with 4 inputs, 1 output and 32 hidden nodes, moved to the chosen device
gru = MyRNN(n_inputs=4, n_outputs=1, n_hidden=32)
gru = gru.to(device)

# Mean-square error serves as the loss
loss_fn = nn.MSELoss()

# Adam optimizer over the network's parameters, learning rate 0.0001
opt = optim.Adam(params=gru.parameters(), lr=0.0001)

In [None]:
# Notice that the typical training structure doesn't change. The batch size is
# not set here: it is whatever the DataLoader was constructed with.
n_epochs = 10
losses = np.zeros(n_epochs)

for epoch in tqdm(range(n_epochs), desc="Training Network...", unit="epoch"):
    # Per-epoch accumulators used to compute the epoch-average loss
    running_loss = 0.0  # sum of batch MSEs, each weighted by its batch size
    n_sequences = 0     # number of sequences seen in this epoch

    # Within each epoch, we iterate over the batches
    for X, y in dataloader:
        # Pass the batch to the GPU (or keep it on the CPU, depending on device)
        X = X.to(device)
        y = y.to(device)

        # Reset the gradients
        opt.zero_grad()

        # Compute the forward pass
        y_pred = gru(X)

        # Compute the loss; the loss module expects (prediction, target)
        loss = loss_fn(y_pred, y)

        # Compute the gradients
        loss.backward()

        # Update the parameters
        opt.step()

        # .item() returns the scalar loss as a Python float and, unlike
        # .numpy(), also works when the loss lives on the GPU. The loss is
        # already a mean over the batch, so it is weighted by the batch size
        # here (the last batch may be smaller than the others).
        n_batch = y.shape[0]
        running_loss += loss.item() * n_batch
        n_sequences += n_batch

    # Store the epoch average loss to plot later on (stays 0 if the loader
    # yielded no batch)
    if n_sequences:
        losses[epoch] = running_loss / n_sequences

In [None]:
# Training curve: one marker per epoch, connected by a solid line
fig, ax = plt.subplots(figsize=(8, 4))

ax.plot(losses, linestyle="-", marker="o")
ax.set(xlabel="Epoch", ylabel="MSE (on train set)")
# Faint grid to make the values easier to read off
ax.grid(True, alpha=0.3)