# Image preprocessing
Optionally, we can resize all the images in advance, so that they are ready for training and each epoch runs much faster.

In [5]:
import os
import torch
import pandas as pd
from tqdm.notebook import tqdm
from torchvision import transforms, utils
from torchvision.io import read_image
from torchvision.utils import save_image
from torchvision.transforms import v2

# Top-level folders used by this notebook.
METADATA_DIRECTORY = 'metadata'
DATASET_DIRECTORY = 'dataset'

# Folder that receives the resized copies of the images.
PREPROCESSED_DIRECTORY = os.path.join(DATASET_DIRECTORY, 'preprocessed')

# CSV that is loaded as input, and CSV that is written once preprocessing is done.
DATASET_RAW_FILE = os.path.join(METADATA_DIRECTORY, 'dataset.csv')
OUTPUT_FILE = os.path.join(METADATA_DIRECTORY, 'dataset-preprocessed.csv')

### Prepare directory

In [6]:
# Create the output folder together with any missing parents. exist_ok=True
# makes the call a no-op when the folder is already there, so no separate
# existence check (which could race with another process) is needed.
os.makedirs(PREPROCESSED_DIRECTORY, exist_ok=True)

### Load dataset .csv

In [7]:
# Load the raw dataset table; its 'path' column is rewritten further below.
df = pd.read_csv(filepath_or_buffer=DATASET_RAW_FILE)


### Load images, resize and save

In [16]:
# Preprocessing pipeline applied to every image, in order:
#   1. resize to 224 x 224,
#   2. convert to float32 with value scaling enabled.
_resize_steps = [
    v2.Resize(size=(224, 224)),
    v2.ToDtype(dtype=torch.float32, scale=True),
]
resize_transform = v2.Compose(_resize_steps)

def resize_image(path):
    """Resize one image and store the result in PREPROCESSED_DIRECTORY.

    The image is read from ``path``, passed through ``resize_transform`` and
    saved under the same file name inside ``PREPROCESSED_DIRECTORY``. When the
    target file already exists the image is not processed again, so an
    interrupted run can simply be restarted.

    Note that only the base name of ``path`` is kept: source files with the
    same name in different folders map to the same output file.

    Args:
        path: Location of the source image file.

    Returns:
        Path of the preprocessed image inside ``PREPROCESSED_DIRECTORY``.
    """
    new_path = os.path.join(PREPROCESSED_DIRECTORY, os.path.basename(path))
    if os.path.exists(new_path):
        # Already converted earlier; skip the expensive read/resize/write.
        return new_path

    image = resize_transform(read_image(path))
    if image.shape[0] == 4:
        # Four channels means RGB + alpha (channel-first layout). Keep the
        # three colour channels and drop alpha. The earlier slice [1:4]
        # discarded red and kept alpha as if it were a colour channel.
        image = image[:3]
    save_image(image, new_path)
    return new_path

In [17]:
# Register the progress_* methods on pandas objects so the long run over
# all rows shows a progress bar.
tqdm.pandas()

# Resize every image and point the 'path' column at the preprocessed copy.
preprocessed_paths = df['path'].progress_map(resize_image)
df['path'] = preprocessed_paths

  0%|          | 0/125287 [00:00<?, ?it/s]

Corrupt JPEG data: 32765 extraneous bytes before marker 0xd9


### Save dataset .csv

In [18]:
# Persist the table with the updated 'path' column. With pandas' default
# settings the DataFrame index is written as an additional first column.
df.to_csv(path_or_buf=OUTPUT_FILE)