In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Extra non "data science / image processing" libraries
from pathlib import Path
from tqdm.notebook import tqdm

# Global variables
from global_variables import (
    RAW_DATA_FOLDER,
    DATA_FOLDER,
)

# Every image is shrunk by this factor along both axes. The resized dimensions
# are truncated with int(), so e.g. a (2604, 4624) image becomes (325, 578)
# after resizing and (578, 325) once the 90-degree rotation has been applied.
DOWNSCALE_FACTOR = 8

In [None]:
import cv2


def process_image(
    img: np.ndarray,
    *,
    downscale_factor: float | int,
) -> np.ndarray:
    """Downscale an image and then rotate it 90 degrees clockwise.

    Args:
        img: Image array to process.
        downscale_factor: Divisor applied to the image's height and width.

    Returns:
        The resized and rotated image.
    """
    # Resize first, then rotate the already-downscaled array.
    return _fix_exif_rotation(_resize_image(img, downscale_factor))


def _resize_image(img: np.ndarray, downscale_factor: float | int) -> np.ndarray:
    new_height = int(img.shape[0] / downscale_factor)
    new_width = int(img.shape[1] / downscale_factor)
    return cv2.resize(img, (new_width, new_height))


def _fix_exif_rotation(img: np.ndarray) -> np.ndarray:
    """Return the image rotated 90 degrees clockwise.

    The rotation is applied unconditionally: no EXIF metadata is read here,
    so every input is assumed to need the same correction.
    """
    rotated = cv2.rotate(img, cv2.ROTATE_90_CLOCKWISE)
    return rotated

In [None]:
def validate_processing(
    img: np.ndarray,
    processed_img: np.ndarray,
    *,
    downscale_factor: float | int,
) -> None:
    assert processed_img.shape[1] == int(img.shape[0] / downscale_factor)
    assert processed_img.shape[0] == int(img.shape[1] / downscale_factor)
    assert processed_img.shape[2] == img.shape[2]

In [None]:
def save_image(
    img: np.ndarray,
    filename: Path,
) -> None:
    """Write an image to disk, creating missing parent directories.

    Args:
        img: Image array to save.
        filename: Destination path of the image file.
    """
    output_dir = filename.parent
    # exist_ok makes repeated saves into the same directory safe.
    output_dir.mkdir(parents=True, exist_ok=True)
    plt.imsave(filename, img)

In [None]:
from concurrent.futures import ProcessPoolExecutor

def process_images(
    source_folder: Path,
    target_folder: Path,
    *,
    downscale_factor: float | int,
) -> None:
    """Process every image under ``source_folder`` into ``target_folder``.

    ``source_folder`` is expected to contain one sub-folder per group of
    images. Each image is read, passed through ``process_image`` and saved
    under the same sub-folder and file name inside ``target_folder``.

    Hidden entries (names starting with ".", such as ``.DS_Store`` or
    ``.ipynb_checkpoints``), stray files next to the sub-folders and nested
    directories inside them are skipped instead of aborting the run.

    Args:
        source_folder: Root folder holding the sub-folders of raw images.
        target_folder: Root folder that receives the processed images.
        downscale_factor: Divisor applied to each image's height and width.
    """
    # Sorting makes the processing order reproducible; iterdir() order is
    # arbitrary.
    folders = sorted(
        entry
        for entry in source_folder.iterdir()
        if entry.is_dir() and not entry.name.startswith(".")
    )
    for folder in tqdm(folders, desc="folders"):
        files = sorted(
            entry
            for entry in folder.iterdir()
            if entry.is_file() and not entry.name.startswith(".")
        )
        # leave=False removes each finished inner bar so they do not pile up.
        for file in tqdm(files, desc=folder.name, leave=False):
            img = plt.imread(file)
            processed_img = process_image(
                img, downscale_factor=downscale_factor
            )
            target_file = target_folder / folder.name / file.name
            save_image(processed_img, target_file)

### Test on Sample Image

In [None]:
# Pick a deterministic sample: the first visible file (in sorted order) of the
# first visible sub-folder. Hidden entries such as .DS_Store are ignored.
sample_folder = next(
    entry
    for entry in sorted(RAW_DATA_FOLDER.iterdir())
    if entry.is_dir() and not entry.name.startswith(".")
)
sample_img_file = next(
    entry
    for entry in sorted(sample_folder.iterdir())
    if entry.is_file() and not entry.name.startswith(".")
)
sample_img_file

In [None]:
# Load the raw sample, show it, and print its array shape for reference.
sample_img = plt.imread(sample_img_file)
plt.imshow(sample_img)
print("Shape: ", sample_img.shape)

In [None]:
# Run the same processing that the full batch uses on the single sample, then
# show the result and print its shape to compare against the raw image.
processed_sample_img = process_image(
    sample_img, downscale_factor=DOWNSCALE_FACTOR
)
plt.imshow(processed_sample_img)
print("Shape: ", processed_sample_img.shape)

### Process all images

In [None]:
# Convert the whole raw dataset with the same factor used for the sample above.
process_images(
    source_folder=RAW_DATA_FOLDER,
    target_folder=DATA_FOLDER,
    downscale_factor=DOWNSCALE_FACTOR,
)