## PyTorch Dataloaders

[Code](https://github.com/priyammaz/PyTorch-Adventures/blob/main/PyTorch%20Basics/PyTorch%20DataLoaders/DataLoaders.ipynb)

In [None]:
import torch
import torch.nn as nn
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import ImageFolder  # Stream data from images stored in folders
from tqdm import tqdm

import os  # Allows to access files
from PIL import Image  # Allows us to Load Images
from collections import Counter  # Utility function to give us the counts of unique items in an iterable

## Image Data

In [2]:
class DogsVsCats(Dataset):
    """Binary image dataset read from a ``Cat``/``Dog`` pair of sub-folders.

    Each sample is an ``(image, label)`` pair. Dogs are labelled ``0`` and
    cats ``1``; the label is decided by the sub-folder a file was listed
    from, never by matching text inside the file path.

    Args:
        path_to_folder: Directory containing the ``Cat`` and ``Dog`` folders.
        transform: Optional callable applied to the RGB ``PIL`` image. When
            omitted, the default pipeline resizes to 224x224, randomly flips
            horizontally, converts to a tensor and normalizes with the
            commonly used ImageNet channel statistics.
    """

    # Only files with these extensions are treated as images, so stray
    # non-image files inside the class folders are skipped instead of
    # crashing ``Image.open`` later on.
    IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp")

    def __init__(self, path_to_folder, transform=None):
        super().__init__()

        self.dog_label = 0
        self.cat_label = 1

        path_to_cat_files = self._list_images(os.path.join(path_to_folder, "Cat"))
        path_to_dog_files = self._list_images(os.path.join(path_to_folder, "Dog"))

        # ``training_files[i]`` and ``labels[i]`` describe the same sample.
        self.training_files = path_to_cat_files + path_to_dog_files
        self.labels = [self.cat_label] * len(path_to_cat_files) + [self.dog_label] * len(path_to_dog_files)

        if transform is None:
            transform = transforms.Compose(
                [
                    transforms.Resize([224, 224]),
                    transforms.RandomHorizontalFlip(p=0.5),
                    transforms.ToTensor(),
                    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
                ]
            )
        self.transform = transform

    @classmethod
    def _list_images(cls, folder):
        """Return the sorted full paths of the image files directly inside ``folder``."""
        # Sorting makes the sample order independent of the arbitrary order
        # in which ``os.listdir`` returns entries.
        return sorted(
            os.path.join(folder, name)
            for name in os.listdir(folder)
            if name.lower().endswith(cls.IMAGE_EXTENSIONS)
        )

    def __len__(self):
        """Return the number of image files indexed at construction time."""
        return len(self.training_files)

    def __getitem__(self, index):
        """Load, convert to RGB and transform the image at ``index``.

        Returns:
            A ``(image, label)`` tuple, where ``image`` is the output of
            ``self.transform`` and ``label`` is ``dog_label`` or ``cat_label``.
        """
        path_to_image = self.training_files[index]
        label = self.labels[index]

        # The context manager closes the underlying file as soon as the pixel
        # data has been copied into the converted image.
        with Image.open(path_to_image) as raw_image:
            image = raw_image.convert("RGB")

        image = self.transform(image)
        return image, label


# Root folder that holds the "Cat" and "Dog" image sub-folders.
path_to_folder = "./catsanddogs/PetImages/"
dataset = DogsVsCats(path_to_folder)

# Sanity check: pull only the first (image, label) pair and display it.
sample = next(iter(dataset), None)
if sample is not None:
    print(sample)

(tensor([[[ 0.8447,  0.8618,  0.8447,  ...,  1.3755,  1.3584,  1.3070],
         [ 0.9303,  0.9132,  0.8789,  ...,  1.3755,  1.3584,  1.3413],
         [ 0.9988,  0.9474,  0.8618,  ...,  1.3242,  1.3413,  1.3242],
         ...,
         [ 0.1083,  0.4508,  0.4337,  ..., -0.7993, -1.0562, -1.0219],
         [ 0.3138,  0.4679,  0.5193,  ..., -0.7650, -0.9877, -0.9877],
         [ 0.0056,  0.1426,  0.1426,  ..., -0.7137, -0.9534, -0.9534]],

        [[ 0.0651,  0.1001,  0.1001,  ...,  1.0105,  0.9930,  0.9405],
         [ 0.1527,  0.1702,  0.1352,  ...,  1.0105,  0.9930,  0.9755],
         [ 0.2577,  0.2227,  0.1527,  ...,  0.9580,  0.9755,  0.9580],
         ...,
         [ 1.0455,  1.4657,  1.4832,  ..., -0.8627, -1.1253, -1.1078],
         [ 1.1155,  1.2381,  1.2731,  ..., -0.8452, -1.1078, -1.1253],
         [ 0.7304,  0.8179,  0.7829,  ..., -0.7927, -1.0903, -1.1078]],

        [[-0.1487, -0.0964, -0.0964,  ...,  0.5834,  0.5485,  0.4962],
         [-0.0615, -0.0267, -0.0615,  ...,  

In [None]:
# Serve the dataset in shuffled mini-batches of 16 samples.
dogs_vs_cats_loader = DataLoader(dataset, shuffle=True, batch_size=16)

# Fetch a single batch and report the shapes of its image and label tensors.
first_batch = next(iter(dogs_vs_cats_loader), None)
if first_batch is not None:
    images, labels = first_batch
    print(images.shape)
    print(labels.shape)

torch.Size([16, 3, 224, 224])
torch.Size([16])


In [4]:
# Hold out 10% of the samples for evaluation; the test size is computed as the
# remainder so the two lengths always add up to len(dataset).
num_train_samples = int(0.9 * len(dataset))
num_test_samples = len(dataset) - num_train_samples

train_dataset, test_dataset = torch.utils.data.random_split(dataset, [num_train_samples, num_test_samples])

# Shuffle only the training batches. Evaluation does not benefit from a random
# order, and a fixed order keeps predictions aligned with test_dataset indices.
dataloader_train = DataLoader(train_dataset, batch_size=16, shuffle=True)
dataloader_test = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [14]:
# torchvision's ImageFolder builds the same kind of dataset directly from the
# folder layout, using the sub-folder names as class names.
dataset = ImageFolder(path_to_folder)
print(dataset.classes)

['Cat', 'Dog']


## NLP Data

In [3]:
path_to_data = "./aclImdb/train"

# Reviews are stored as one text file each, split into "pos" and "neg" folders.
path_to_pos_folder = os.path.join(path_to_data, "pos")
path_to_neg_folder = os.path.join(path_to_data, "neg")

path_to_pos_txt = [os.path.join(path_to_pos_folder, file) for file in os.listdir(path_to_pos_folder)]
path_to_neg_txt = [os.path.join(path_to_neg_folder, file) for file in os.listdir(path_to_neg_folder)]

training_files = path_to_pos_txt + path_to_neg_txt

# Gather the first line of every review in a list and join once at the end:
# growing a string with "+=" inside the loop can copy the whole buffer on each
# iteration. readline() returns "" for an empty file instead of raising.
review_texts = []

for file in tqdm(training_files):
    with open(file, "r", encoding="utf-8") as f:
        review_texts.append(f.readline())

all_text = "".join(review_texts)

100%|██████████| 25000/25000 [01:17<00:00, 324.66it/s]


In [5]:
# Frequency of every distinct character in the concatenated corpus.
unique_counts = dict(Counter(all_text))

# The vocabulary keeps characters that occur more than 1500 times, in sorted
# order, followed by the two special tokens for unknown characters and padding.
characters = sorted(char for char, count in unique_counts.items() if count > 1500)
characters += ["<unk>", "<pad>"]

# Lookup tables in both directions: character -> id and id -> character.
character_to_index = {char: position for position, char in enumerate(characters)}
index_to_character = dict(enumerate(characters))

In [11]:
class IMDBDataset(Dataset):
    """Character-level sentiment dataset read from ``pos``/``neg`` sub-folders.

    Each sample is a ``(token_ids, label)`` pair built from the first line of
    one review file. Positive reviews are labelled ``1`` and negative ones
    ``0``; the label is decided by the sub-folder a file was listed from,
    never by matching text inside the file path.

    Args:
        path_to_data: Directory containing the ``pos`` and ``neg`` folders.
        tokenizer: Optional mapping from character to integer id that must
            contain an ``"<unk>"`` entry. Defaults to the module-level
            ``character_to_index`` table.
    """

    def __init__(self, path_to_data, tokenizer=None):
        super().__init__()

        self.pos_level = 1
        self.neg_level = 0

        path_to_pos_txt = self._list_files(os.path.join(path_to_data, "pos"))
        path_to_neg_txt = self._list_files(os.path.join(path_to_data, "neg"))

        # ``training_files[i]`` and ``labels[i]`` describe the same sample.
        self.training_files = path_to_pos_txt + path_to_neg_txt
        self.labels = [self.pos_level] * len(path_to_pos_txt) + [self.neg_level] * len(path_to_neg_txt)

        self.tokenizer = character_to_index if tokenizer is None else tokenizer

    @staticmethod
    def _list_files(folder):
        """Return the sorted full paths of the entries directly inside ``folder``."""
        # Sorting makes the sample order independent of the arbitrary order
        # in which ``os.listdir`` returns entries.
        return sorted(os.path.join(folder, name) for name in os.listdir(folder))

    def __len__(self):
        """Return the number of review files indexed at construction time."""
        return len(self.training_files)

    def __getitem__(self, idx):
        """Tokenize the review at ``idx`` character by character.

        Returns:
            A ``(sample, label)`` tuple, where ``sample`` is a 1-D ``long``
            tensor of character ids and ``label`` is ``pos_level`` or
            ``neg_level``.
        """
        path_to_text = self.training_files[idx]
        with open(path_to_text, "r", encoding="utf-8") as f:
            # Only the first line is used; readline() yields "" for an empty
            # file instead of raising.
            text = f.readline()

        # Characters missing from the vocabulary map to the "<unk>" id.
        unk_index = self.tokenizer["<unk>"]
        tokenized = [self.tokenizer.get(char, unk_index) for char in text]

        # An explicit dtype keeps empty reviews integer-typed like all others.
        sample = torch.tensor(tokenized, dtype=torch.long)
        return sample, self.labels[idx]

In [13]:
path_to_data = "./aclImdb/train"
dataset = IMDBDataset(path_to_data)

# Sanity check: report the token count and label of the first review only.
first_item = next(iter(dataset), None)
if first_item is not None:
    sample, label = first_item
    print(len(sample), label)

284 1


## Dynamic Padding

In [15]:
# Three all-ones sequences of different lengths (10, 8 and 2 elements).
a, b, c = (torch.ones(length) for length in (10, 8, 2))

# Right-pad the shorter sequences with 999 up to the longest length, stacking
# them along a leading batch dimension.
padded = nn.utils.rnn.pad_sequence([a, b, c], batch_first=True, padding_value=999)
print(padded)


tensor([[  1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.],
        [  1.,   1.,   1.,   1.,   1.,   1.,   1.,   1., 999., 999.],
        [  1.,   1., 999., 999., 999., 999., 999., 999., 999., 999.]])


In [16]:
def data_collator(batch):
    """Collate ``(text, label)`` pairs into one padded batch.

    Args:
        batch: Sequence of ``(text, label)`` pairs, where each ``text`` is a
            1-D tensor of token ids.

    Returns:
        A ``(texts, labels)`` tuple: ``texts`` stacks the sequences with the
        batch dimension first, right-padded with the ``"<pad>"`` id up to the
        longest sequence in the batch; ``labels`` is a tensor of the labels in
        the same order.
    """
    sequences = [text for text, _ in batch]
    labels = torch.tensor([label for _, label in batch])

    pad_index = character_to_index["<pad>"]
    texts = nn.utils.rnn.pad_sequence(sequences, padding_value=pad_index, batch_first=True)

    return texts, labels

In [18]:
# Batches of 4 reviews, padded per batch by the custom collate function.
loader = DataLoader(dataset, collate_fn=data_collator, batch_size=4)

# Display the padded token-id matrix of the first batch only.
first_batch = next(iter(loader), None)
if first_batch is not None:
    texts, labels = first_batch
    print(texts)

tensor([[32, 65, 68,  ..., 79, 79, 79],
        [28, 59, 76,  ..., 79, 79, 79],
        [27,  0, 69,  ..., 79, 79, 79],
        [35, 70,  4,  ..., 55, 69, 10]])
