In [None]:
import os
import cv2
import numpy as np

In [None]:
# Input and output locations used by the preprocessing cells below.
image_dir = "amazon_train_images"  # original images are read from here
preprocessed_dir = "amazon_train_preprocessed_images"  # preprocessed images are written here

# Create the output folder up front; an already-existing folder is not an error.
os.makedirs(preprocessed_dir, exist_ok=True)

In [None]:
def resize_image(image_path, target_size=(1024, 1024)):
    """Load an image, scale it to fit ``target_size`` and pad it with white.

    The image is scaled uniformly (aspect ratio preserved) so that it fits
    inside the target canvas, then centred on a white canvas of exactly the
    target size.

    Args:
        image_path: Path of the image file, loaded with ``cv2.imread``.
        target_size: ``(height, width)`` of the output canvas in pixels.

    Returns:
        A ``uint8`` array of shape ``(target_size[0], target_size[1], 3)``
        holding the resized image centred on a white background.

    Raises:
        ValueError: If the file cannot be read as an image.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports an unreadable/missing file by returning None
        # instead of raising, so fail here with a message naming the file.
        raise ValueError(f"Could not read image: {image_path}")

    target_h, target_w = target_size[0], target_size[1]
    h, w = img.shape[:2]

    # Single scaling factor for both axes keeps the aspect ratio
    scale = min(target_h / h, target_w / w)

    # Clamp each side to at least one pixel: for extreme aspect ratios the
    # truncation below would otherwise yield 0, which cv2.resize rejects.
    new_w = max(1, int(w * scale))
    new_h = max(1, int(h * scale))

    # Cubic interpolation, chosen by the original author to limit text blurring
    resized_img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
    resized_h, resized_w = resized_img.shape[:2]

    # White canvas of the target size
    padded_img = np.full((target_h, target_w, 3), 255, dtype=np.uint8)

    # Top-left corner that centres the resized image on the canvas
    x_offset = (target_w - resized_w) // 2
    y_offset = (target_h - resized_h) // 2

    padded_img[y_offset:y_offset + resized_h, x_offset:x_offset + resized_w] = resized_img

    return padded_img

In [None]:
# Gather the name of every ".jpg" entry in the source folder
image_files = [
    entry
    for entry in os.listdir(image_dir)
    if entry.endswith(".jpg")
]

In [None]:
# Order the list in place by the integer value of each name's leading index;
# names follow the pattern "{index}_{group_id}.jpg"
image_files.sort(key=lambda name: int(name.partition('_')[0]))

In [None]:
# Loop through all images and preprocess them
failed_files = []  # source filenames whose preprocessed output could not be written

for filename in image_files:
    # Extract the index from the filename
    index = filename.split('_')[0]

    # Full path of the image
    image_path = os.path.join(image_dir, filename)

    # Preprocess the image (resize and pad)
    preprocessed_image = resize_image(image_path, target_size=(512, 512))

    # Save the preprocessed image with just the index as the file name
    new_filename = f"{index}.jpg"
    save_path = os.path.join(preprocessed_dir, new_filename)

    # cv2.imwrite signals failure by returning False rather than raising,
    # so only report success when the write actually happened.
    if cv2.imwrite(save_path, preprocessed_image):
        print(f"Preprocessed & saved: {new_filename}")
    else:
        failed_files.append(filename)
        print(f"Failed to save: {new_filename}")

# Surface write failures at the end so they are not lost in the per-file log
if failed_files:
    print(f"{len(failed_files)} image(s) could not be saved: {failed_files}")