# Dataset

### Customize your dataset
A custom dataset subclasses `Dataset` and implements two basic methods:
- `__len__`: length of the dataset
- `__getitem__`: get the $i$-th item of the dataset

In [None]:
import pandas as pd
from torch.utils.data import Dataset


class Example(Dataset):
    """Map-style dataset backed by a CSV file.

    Each row of the CSV is one sample: every column except the last is
    treated as the feature vector, and the last column is the label.
    """

    def __init__(self, file_name):
        """Load the whole CSV at ``file_name`` into memory as a NumPy array."""
        self.data = pd.read_csv(file_name).to_numpy()

    def __len__(self) -> int:
        """Return the number of samples (rows) in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(feature, label)`` pair for the sample at row ``idx``.

        ``feature`` holds every column of that row except the last;
        ``label`` is the value in the last column.
        """
        # Select only the requested row; returning the full array here would
        # make every "sample" a copy of the entire dataset.
        row = self.data[idx]
        feature = row[:-1]
        label = row[-1]
        return feature, label

### Load data with built-in functions

For example, suppose your images are stored in a folder with one subfolder per class. Then you can load them with `ImageFolder` from `torchvision.datasets`.

In [None]:
# Use ImageFolder to create dataset(s)
from torchvision import datasets
from torchvision.transforms import transforms

# Preprocessing pipeline applied to each image as it is loaded.
data_transform = transforms.Compose(
    [
        # Bring every image to a common 64x64 size.
        transforms.Resize(size=(64, 64)),
        # Horizontal flip with probability p (0.5 = flip half of the time).
        transforms.RandomHorizontalFlip(p=0.5),
        # Convert to a torch.Tensor; pixel values in 0..255 are rescaled
        # to the range 0.0..1.0.
        transforms.ToTensor(),
    ]
)

# Build the dataset from the image folder.
train_data = datasets.ImageFolder(
    root=train_dir,  # target folder of images
    transform=data_transform,  # applied to the data (images)
    target_transform=None,  # applied to the labels (none needed here)
)

# DataLoader

In [None]:
from torch.utils.data import DataLoader

# Wrap the dataset in a DataLoader that serves it in mini-batches of 64
# samples, with shuffling enabled.
train_dataloader = DataLoader(train_data, batch_size=64, shuffle=True)

In [None]:
# Walk through the loader one mini-batch at a time and report tensor shapes.
for features, labels in train_dataloader:
    print(f"Features shape: {features.shape}, Labels shape: {labels.shape}")
    # For the ImageFolder dataset with 64x64 resize and batch_size=64, expect
    # roughly (batch, channels, 64, 64) for features and (batch,) for labels;
    # the final batch may contain fewer than 64 samples.