# Importing the libraries

In [1]:
import os
import warnings

import cv2
import numpy as np
import pandas as pd

# Mounting the drive

In [2]:
from google.colab import drive

# Mount Google Drive so the dataset paths under /content/drive resolve.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Defining paths to train and test images and annotations

In [3]:
# Shared roots on the mounted Google Drive; every path below derives from
# these so relocating the dataset only requires editing one place.
drive_root = "/content/drive/MyDrive"
vindr_folder = f"{drive_root}/VinDr"
annotations_folder = f"{vindr_folder}/annotations"

# Paths to train files
train_folder = f"{vindr_folder}/train_png"  # source train images
output_train_folder = f"{drive_root}/resized_train_images"  # destination for resized train images
train_annotations_file = f"{annotations_folder}/processed_train_annotations.csv"  # processed train annotations file

# Paths to test files
test_folder = f"{vindr_folder}/test_png"  # source test images
output_test_folder = f"{drive_root}/resized_test_images"  # destination for resized test images
test_annotations_file = f"{annotations_folder}/processed_test_annotations.csv"  # processed test annotations file

# Function to return all image paths from a specified folder

In [4]:
# Function to return all image paths from a specified folder
def get_all_images_from_folder(base_folder):
    """Recursively collect the paths of all image files under ``base_folder``.

    A file counts as an image when its name ends in ``.png``, ``.jpg`` or
    ``.jpeg``. The comparison ignores case, so ``SCAN.PNG`` is found as well.

    Args:
        base_folder: Root directory to walk; sub-directories are included.

    Returns:
        list[str]: Sorted list of matching paths, each built by joining the
        containing directory with the file name. Empty when nothing matches.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')
    image_paths = [
        os.path.join(subdir, file)
        for subdir, _, files in os.walk(base_folder)  # yields (dirpath, dirnames, filenames)
        for file in files
        if file.lower().endswith(image_extensions)
    ]
    # os.walk order depends on the file system; sort so repeated runs agree.
    image_paths.sort()
    return image_paths

# Function to resize images and update annotations

In [5]:
# Function to resize images and update annotations
def resize_images_and_update_annotations(images_paths, annotations, output_folder, target_size):
    """Letterbox images onto a square canvas and remap their bounding boxes.

    Every readable image is scaled with its aspect ratio preserved so that it
    fits inside a ``target_size`` x ``target_size`` square, centred on a
    zero-filled canvas of that size and written to ``output_folder`` under its
    original file name. Annotation rows whose ``image_id`` equals the file
    name without extension are carried over, with box coordinates mapped into
    the padded image.

    Args:
        images_paths: Iterable of image file paths to process.
        annotations: DataFrame with the columns ``image_id``, ``class_name``,
            ``x_min``, ``y_min``, ``x_max``, ``y_max`` and ``class_id``.
        output_folder: Directory for the padded images; created if missing.
        target_size: Side length, in pixels, of the square output images.

    Returns:
        list[dict]: One record per annotation row of every image that was
        read and saved, in processing order. ``image_id`` holds the file name
        *with* its extension. Rows with any missing box coordinate keep
        ``None`` for all four coordinates; the others hold the rescaled
        coordinates truncated to ``int``.

    Warns:
        UserWarning: When an image cannot be read or its padded copy cannot
            be written. That image is skipped and contributes no records.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Split the annotations per image once, instead of filtering the whole
    # frame again for every image.
    annotations_by_image = {
        image_key: rows
        for image_key, rows in annotations.groupby('image_id', sort=False)
    }
    box_columns = ('x_min', 'y_min', 'x_max', 'y_max')
    updated_annotations = []

    for image_path in images_paths:
        image_name = os.path.basename(image_path)
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread signals failure by returning None rather than raising.
            warnings.warn(f"Could not read image, skipping: {image_path}")
            continue

        original_height, original_width = img.shape[:2]

        # One scale factor for both axes keeps the aspect ratio.
        scale = min(target_size / original_width, target_size / original_height)
        # Clamp to 1 px so extreme aspect ratios never request a 0-sized resize.
        new_width = max(1, int(original_width * scale))
        new_height = max(1, int(original_height * scale))

        resized_img = cv2.resize(img, (new_width, new_height), interpolation=cv2.INTER_LINEAR)

        # Centre the resized image on a zero-filled square canvas.
        padded_img = np.zeros((target_size, target_size, 3), dtype=np.uint8)
        x_offset = (target_size - new_width) // 2
        y_offset = (target_size - new_height) // 2
        padded_img[y_offset:y_offset + new_height, x_offset:x_offset + new_width] = resized_img

        output_path = os.path.join(output_folder, image_name)
        if not cv2.imwrite(output_path, padded_img):
            # Do not emit annotations for an image that was never saved.
            warnings.warn(f"Could not write resized image, skipping: {output_path}")
            continue

        # Annotations are keyed by the file name without its extension.
        image_id = os.path.splitext(image_name)[0]
        image_annotations = annotations_by_image.get(image_id)
        if image_annotations is None:
            continue

        for _, row in image_annotations.iterrows():
            if any(pd.isna(row[column]) for column in box_columns):
                # Rows without a complete box keep empty coordinates.
                x_min = y_min = x_max = y_max = None
            else:
                # Apply the same scale and padding offset used for the pixels.
                x_min = int(row['x_min'] * scale + x_offset)
                y_min = int(row['y_min'] * scale + y_offset)
                x_max = int(row['x_max'] * scale + x_offset)
                y_max = int(row['y_max'] * scale + y_offset)

            updated_annotations.append({
                'image_id': image_name,
                'class_name': row['class_name'],
                'x_min': x_min,
                'y_min': y_min,
                'x_max': x_max,
                'y_max': y_max,
                'class_id': row['class_id']
            })

    return updated_annotations

In [None]:
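# NOTE: dead cell - commented-out copy of the loop body of resize_images_and_update_annotations; kept for reference only.
# for image_path in images_paths: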
#     image_name = os.path.basename(image_path)
#     img = cv2.imread(image_path)

#     if img is not None:
#         original_height, original_width = img.shape[:2]

#         # Calculate scaling factors
#         scale = min(target_size / original_width, target_size / original_height)
#         new_width = int(original_width * scale)
#         new_height = int(original_height * scale)

#         # Resize image
#         resized_img = cv2.resize(img, (new_width, new_height), interpolation=cv2.INTER_LINEAR)

#         # Create a padded image
#         padded_img = np.zeros((target_size, target_size, 3), dtype=np.uint8)
#         x_offset = (target_size - new_width) // 2
#         y_offset = (target_size - new_height) // 2
#         padded_img[y_offset:y_offset + new_height, x_offset:x_offset + new_width] = resized_img

#         # Save the padded image
#         cv2.imwrite(os.path.join(output_folder, image_name), padded_img)

#         # Update bounding boxes
#         # Get the image name without the extension
#         image_id = os.path.splitext(image_name)[0]

#         # Use this stripped name for comparison
#         image_annotations = annotations[annotations['image_id'] == image_id]

#         for _, row in image_annotations.iterrows():
#             # Check if bounding box values are None (NaN), and handle accordingly
#             if pd.isna(row['x_min']) or pd.isna(row['y_min']) or pd.isna(row['x_max']) or pd.isna(row['y_max']):
#                 # If any of the bounding box values are None, retain None in the updated annotations
#                 updated_annotations.append({
#                     'image_id': image_name,  # Use full image name here
#                     'class_name': row['class_name'],
#                     'x_min': None,
#                     'y_min': None,
#                     'x_max': None,
#                     'y_max': None,
#                     'class_id': row['class_id']
#                 })
#             else:
#                 x_min = int(row['x_min'] * scale + x_offset)
#                 y_min = int(row['y_min'] * scale + y_offset)
#                 x_max = int(row['x_max'] * scale + x_offset)
#                 y_max = int(row['y_max'] * scale + y_offset)

#                 updated_annotations.append({
#                     'image_id': image_name,
#                     'class_name': row['class_name'],
#                     'x_min': x_min,
#                     'y_min': y_min,
#                     'x_max': x_max,
#                     'y_max': y_max,
#                     'class_id': row['class_id']
#                 })

# Resizing train data

In [None]:
# Load the processed train annotations CSV (train_annotations_file) into a DataFrame
train_annotations = pd.read_csv(train_annotations_file)

In [None]:
# Target YOLO size: side length in pixels of the square images, passed as
# target_size to resize_images_and_update_annotations
target_size = 1024

In [None]:
# Get the paths of all train images under train_folder (sub-folders included)
train_images_paths = get_all_images_from_folder(train_folder)

In [None]:
# Resize train images into output_train_folder and collect the remapped annotations
updated_train_annotations = resize_images_and_update_annotations(
    images_paths=train_images_paths,
    annotations=train_annotations,
    output_folder=output_train_folder,
    target_size=target_size,
)

In [None]:
# Save updated train annotations as a CSV without the index column
resized_train_annotations_file = "/content/drive/MyDrive/VinDr/annotations/resized_train_annotations.csv"
df_updated_train_annotations = pd.DataFrame(updated_train_annotations)
df_updated_train_annotations.to_csv(resized_train_annotations_file, index=False)

# Resizing test data

In [6]:
# Load the processed test annotations CSV (test_annotations_file) into a DataFrame
test_annotations = pd.read_csv(test_annotations_file)

In [7]:
# Get the paths of all test images under test_folder (sub-folders included)
test_images_paths = get_all_images_from_folder(test_folder)

In [8]:
# Resize test images into output_test_folder and collect the remapped annotations
updated_test_annotations = resize_images_and_update_annotations(
    images_paths=test_images_paths,
    annotations=test_annotations,
    output_folder=output_test_folder,
    target_size=1024,  # same square size as the train split (target_size = 1024)
)

In [9]:
# Save updated test annotations as a CSV without the index column
resized_test_annotations_file = "/content/drive/MyDrive/VinDr/annotations/resized_test_annotations.csv"
df_updated_test_annotations = pd.DataFrame(updated_test_annotations)
df_updated_test_annotations.to_csv(resized_test_annotations_file, index=False)