# Preprocessing in a CPU environment with PyTorch

In [1]:
import os
import numpy as np
from torch.utils.data import Dataset, DataLoader
import time
import matplotlib.pyplot as plt
import torch
from torchvision import transforms

class CustomDataset(Dataset):
    """Map-style dataset over ``part_*`` NumPy files stored in one directory.

    Every directory entry whose name starts with ``"part_"`` is treated as one
    sample. Labels come from ``labels.npy`` in the same directory, which is
    loaded with pickling enabled and unwrapped with ``.item()``; the resulting
    object is indexed by sample file name.
    """

    def __init__(self, data_dir):
        """Index the sample files and load the label mapping.

        Args:
            data_dir: Directory containing the ``part_*`` files and
                ``labels.npy``.
        """
        self.data_dir = data_dir
        # os.listdir returns entries in arbitrary order; sorting makes the
        # index -> sample mapping reproducible across runs and machines.
        self.file_list = sorted(
            f for f in os.listdir(data_dir) if f.startswith("part_")
        )
        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only
        # load labels.npy files from a trusted source.
        self.labels = np.load(
            os.path.join(data_dir, "labels.npy"), allow_pickle=True
        ).item()

    def __len__(self):
        """Return the number of sample files found in ``data_dir``."""
        return len(self.file_list)

    def __getitem__(self, idx):
        """Load one sample from disk.

        Args:
            idx: Position of the sample in the sorted file list.

        Returns:
            A ``(image, label)`` tuple: the array stored in the sample file
            and the label registered under that file's name.
        """
        file_name = self.file_list[idx]
        image = np.load(os.path.join(self.data_dir, file_name))
        label = self.labels[file_name]
        return image, label

# Benchmark: time how long it takes to iterate the dataset for several epochs
# while applying the augmentation pipeline on the CPU.
data_dir = "data/train_small_npy"

dataset = CustomDataset(data_dir)
dataloader = DataLoader(dataset, batch_size=48, shuffle=True, num_workers=8)

num_epochs = 10

# Random flips, a rotation of up to +/-90 degrees, then a random 500x500 crop.
custom_transform = transforms.Compose([
    transforms.RandomVerticalFlip(0.5),
    transforms.RandomHorizontalFlip(0.5),
    transforms.RandomRotation(90),
    transforms.RandomCrop((500, 500))
])

# perf_counter is a monotonic, high-resolution clock; unlike time.time() it is
# not affected by system clock adjustments during a long-running benchmark.
start_time = time.perf_counter()
for epoch in range(num_epochs):
    epoch_start = time.perf_counter()
    for images, labels in dataloader:
        # The transform is applied image by image in this loop, i.e. after the
        # DataLoader has already produced the batch. The result is discarded:
        # only the elapsed time matters here.
        # NOTE(review): the /255.0 scaling presumably assumes 8-bit pixel
        # values - confirm against the stored arrays.
        transformed_images = torch.stack([custom_transform(image)/255.0 for image in images])

    epoch_end = time.perf_counter()
    epoch_time = epoch_end - epoch_start
    print(f"Epoch {epoch+1} done in {epoch_time} seconds.")
end_time = time.perf_counter()

total_time = end_time - start_time

print(f"Total time taken: {total_time} seconds")
print(f"Total time per epoch: {total_time/num_epochs} seconds")

Epoch 1 done in 285.60568165779114 seconds.
Epoch 2 done in 251.79779887199402 seconds.
Epoch 3 done in 242.05861067771912 seconds.
Epoch 4 done in 260.11944103240967 seconds.
Epoch 5 done in 243.05597758293152 seconds.
Epoch 6 done in 234.65044236183167 seconds.
Epoch 7 done in 250.83137035369873 seconds.
Epoch 8 done in 245.9380533695221 seconds.
Epoch 9 done in 247.9998722076416 seconds.
Epoch 10 done in 246.38864469528198 seconds.
Total time taken: 2508.447569131851 seconds
Total time per epoch: 250.8447569131851 seconds
