### Dataset Class and DataLoaders

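Datasets and DataLoaders supply the samples, in batches, that a neural network is trained on (forward pass followed by the backpropagation step).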

**Date:** 29/10/2021  
**Author:** Murad Popattia

### Basic Dataset Class

In [1]:
import torch
from sklearn.datasets import make_classification
import cv2
import numpy as np

In [2]:
# a dataset class is how samples are handed to your model
class CustomDataset:
    """Map-style dataset over a feature array and a matching array of labels.

    Indexing with an integer returns one sample as a dict of tensors.
    """

    def __init__(self, data, targets):
        # `data` is indexed as data[idx, :] below, so it has to support
        # 2-D indexing (a NumPy array does); `targets` is indexed as targets[idx]
        self.data, self.targets = data, targets

    def __len__(self):
        """Number of samples, taken from the length of ``data``."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return sample ``idx`` as ``{"sample": float tensor, "target": long tensor}``."""
        row = self.data[idx, :]
        label = self.targets[idx]

        # keyword form builds the same dict, with the same key order
        return dict(
            sample=torch.tensor(row, dtype=torch.float),
            target=torch.tensor(label, dtype=torch.long),
        )

In [3]:
# run `?make_classification` to see the available options
# fixed random_state: every Restart & Run All produces the same synthetic samples
data, targets = make_classification(n_samples=1000, random_state=42)

In [4]:
# targets are binary
data.shape, targets.shape

((1000, 20), (1000,))

In [5]:
# Looping over the dataset class
# CustomDataset defines no __iter__, so the for loop falls back to calling
# __getitem__ with 0, 1, 2, ... until an IndexError is raised.

custom_dataset = CustomDataset(data=data, targets=targets)

for sample in custom_dataset:
    print(sample)
    # one sample is enough to inspect the structure of the returned dict
    break

{'sample': tensor([-0.4422, -0.5011,  1.5107,  0.7165,  0.5521,  1.0575, -1.2364, -0.1809,
         0.2105,  0.9634,  0.2448, -2.1240,  0.2823, -0.4810, -0.8553, -0.2054,
         0.1008,  0.2937,  0.6091, -0.6417]), 'target': tensor(0)}


In [6]:
custom_dataset[0]

{'sample': tensor([-0.4422, -0.5011,  1.5107,  0.7165,  0.5521,  1.0575, -1.2364, -0.1809,
          0.2105,  0.9634,  0.2448, -2.1240,  0.2823, -0.4810, -0.8553, -0.2054,
          0.1008,  0.2937,  0.6091, -0.6417]),
 'target': tensor(0)}

### Dataset class for basic NLP problem

In [7]:
# classification/regression probs
class CustomDataset:
    """Dataset for a basic NLP problem: tokenizes one text per access.

    Args:
        data: indexable collection of texts; ``data[idx]`` must return the
            text of sample ``idx`` (a list or an array both work).
        targets: indexable collection of integer labels aligned with ``data``.
        tokenizer: callable that maps one text to a sequence of integer
            token ids.
    """

    def __init__(self, data, targets, tokenizer):
        self.data = data
        self.targets = targets
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return sample ``idx`` as ``{"text": long tensor of token ids, "target": long tensor}``."""
        # plain integer indexing: works for lists as well as arrays
        # (data[idx, :] raised TypeError for a list of texts)
        text = self.data[idx]
        target = self.targets[idx]

        # tokenize with the tokenizer handed to this dataset, not with
        # whatever global happens to be called `tokenizer`
        input_ids = self.tokenizer(text)
        # the tokenizer can return something like [101, 12, 23, 44, ....] where 101 can be the <s> etc.

        # returns a dict of tensors
        return {
            "text": torch.tensor(input_ids, dtype=torch.long),
            "target": torch.tensor(target, dtype=torch.long),
        }

### Dataset class for basic Vision Problems

In [8]:
# we read the image from the paths here
class CustomDataset:
    """Dataset for basic vision problems: reads one image from disk per access.

    Args:
        image_paths: indexable collection of image file paths.
        targets: indexable collection of integer labels aligned with
            ``image_paths``.
        augmentations: optional callable invoked as
            ``augmentations(image=image)`` that returns a mapping with an
            ``"image"`` entry (the convention used by
            https://albumentations.ai/). ``None`` disables augmentation.
    """

    def __init__(self, image_paths, targets, augmentations=None):
        self.image_paths = image_paths
        self.targets = targets
        self.augmentations = augmentations

    def __len__(self):
        """Return the number of images."""
        # this class stores paths, not a `data` attribute
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return sample ``idx`` as ``{"image": float tensor (c x h x w), "target": long tensor}``.

        Raises:
            FileNotFoundError: if the image at ``image_paths[idx]`` cannot be read.
        """
        target = self.targets[idx]
        path = self.image_paths[idx]

        image = cv2.imread(str(path))
        if image is None:
            # cv2.imread signals a missing/unreadable file by returning None
            # instead of raising, which would otherwise fail later in cvtColor
            raise FileNotFoundError(f"could not read image: {path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # opencv reads in BGR format so we need to convert them

        # applying augmentations
        # you can use https://albumentations.ai/
        if self.augmentations is not None:
            augmented = self.augmentations(image=image)
            image = augmented["image"]

        # pytorch expects images in channel first format -> c x h x w
        image = np.transpose(image, (2, 0, 1)).astype(np.float32)

        return {
            # float, matching the float32 array above; torch.long would
            # truncate any fractional pixel values produced by augmentations
            "image": torch.tensor(image, dtype=torch.float),
            "target": torch.tensor(target, dtype=torch.long),
        }

### Dataloaders

A neural network is fed data in batches rather than all at once; a DataLoader wraps a dataset and yields those batches.

In [2]:
# a dataset class is how samples are handed to your model
class CustomDataset:
    """Map-style dataset over a feature array and a matching array of labels.

    Indexing with an integer returns one sample as a dict of tensors.
    """

    def __init__(self, data, targets):
        # `data` is indexed as data[idx, :] below, so it has to support
        # 2-D indexing (a NumPy array does); `targets` is indexed as targets[idx]
        self.data, self.targets = data, targets

    def __len__(self):
        """Number of samples, taken from the length of ``data``."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return sample ``idx`` as ``{"sample": float tensor, "target": long tensor}``."""
        row = self.data[idx, :]
        label = self.targets[idx]

        # keyword form builds the same dict, with the same key order
        return dict(
            sample=torch.tensor(row, dtype=torch.float),
            target=torch.tensor(label, dtype=torch.long),
        )

In [3]:
# run `?make_classification` to see the available options
# fixed random_state: every Restart & Run All produces the same synthetic samples
data, targets = make_classification(n_samples=1000, random_state=42)

In [4]:
dataset = CustomDataset(data, targets)

In [5]:
len(dataset)

1000

In [6]:
# ?torch.utils.data.DataLoader

In [7]:
# Creating the dataloader: wraps `dataset` and yields it in batches of 4 samples.
# num_workers=0 loads the data in the main process.
# NOTE(review): worker processes (num_workers > 0) are known to hang or fail in
# notebooks on some platforms (e.g. Windows) when the dataset class is defined in
# the notebook itself — confirm for your setup before raising it.

train_loader = torch.utils.data.DataLoader(dataset, batch_size=4, num_workers=0)

In [8]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x26c49841370>

**train_loader** is now an iterable that supplies the data in batches instead of sending it all at once

In [9]:
# Named `batch` rather than `data` so the loop does not overwrite the `data`
# feature array created by make_classification.
# Each batch is a dict with the same keys as a single dataset item.
for batch in train_loader:
    print(batch["sample"].shape)
    print(batch["target"].shape)
    # the first batch is enough to check the shapes
    break

torch.Size([4, 20])
torch.Size([4])


In [10]:
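# Sketch of a training loop (kept commented out: `model`, `criterion` and
# `optimizer` are not defined in this notebook)
# for epoch in range(10):
#     for batch in train_loader:
#         x = batch["sample"]
#         y = batch["target"]
#         outputs = model(x)
#
#         # calculate loss
#         # loss = criterion(outputs, y)
#
#         # optimizer.zero_grad()
#         # loss.backward()
#         # optimizer.step()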