In [7]:
import os
from PIL import Image
from tqdm import tqdm
import pandas as pd
from ast import literal_eval
import warnings # For handling potential PIL warnings
import math # NEW IMPORT for floor/ceil rounding

# --- CONFIGURATION CONSTANTS ---
# Side length (pixels) of the annotated core box before any padding is added.
CORE_PATCH_DIMENSION = 50
# Extra context (pixels) added on every side of the core box.
PADDING = 5
# Final patch side length. Derived rather than hard-coded so it cannot drift
# from the two values above: 50 (original) + 5 (left) + 5 (right) = 60.
TARGET_PATCH_DIMENSION = CORE_PATCH_DIMENSION + 2 * PADDING
# --------------------------------

def _parse_bbox(raw_bbox):
    """Return the bbox as a sequence, decoding it first when it is stored as a string."""
    # Only strings are handed to literal_eval; any other value (e.g. a list)
    # is used as-is instead of relying on literal_eval raising for it.
    if isinstance(raw_bbox, str):
        return literal_eval(raw_bbox)
    return raw_bbox


def _padded_crop_box(bbox, padding):
    """Return the integer (left, top, right, bottom) box for `bbox` grown by `padding` pixels."""
    xmin, ymin, xmax, ymax = (float(value) for value in bbox[:4])
    # Round outward (floor the top-left corner, ceil the bottom-right corner) so
    # fractional coordinates never shrink the box, then grow every side.
    return (
        math.floor(xmin) - padding,
        math.floor(ymin) - padding,
        math.ceil(xmax) + padding,
        math.ceil(ymax) + padding,
    )


def preprocess_and_save_patches_optimized(
    df_merged: pd.DataFrame,
    source_image_dir: str = 'images/',
    target_patch_dir: str = 'cropped_images/',  # Recommended new name
    overwrite: bool = False,
):
    """
    Crop one padded patch per annotation row and save it as a PNG.

    Rows are grouped by source image so that each `<file_id>.tiff` is opened
    and decoded once, then every bbox belonging to it is cropped from the
    in-memory RGB copy. Each bbox is rounded outward to integer pixels, grown
    by PADDING on every side, and the crop is resized to
    TARGET_PATCH_DIMENSION x TARGET_PATCH_DIMENSION if it does not already
    have that size.

    Args:
        df_merged: Annotation table with an `image_id` column (castable to int)
            and a `bbox` column holding `[xmin, ymin, xmax, ymax]`, either as a
            sequence or as its string representation. NOTE: this frame is
            modified in place -- `file_id` (zero-padded image id) and
            `patch_id` (copy of the index) columns are added to it.
        source_image_dir: Directory containing the `<file_id>.tiff` images.
        target_patch_dir: Directory the `<patch_id>.png` files are written to;
            created if missing.
        overwrite: When False (default), a patch whose file already exists is
            skipped so an interrupted run can be resumed. Set to True to
            regenerate patches, e.g. after changing the padding or patch size.

    Returns:
        The same `df_merged` object, now carrying `file_id` and `patch_id`.

    Per-image and per-patch errors are reported through the progress bar and
    counted; they do not abort the run.
    """
    target_size = (TARGET_PATCH_DIMENSION, TARGET_PATCH_DIMENSION)
    print(f"Starting optimized pre-processing and saving of {TARGET_PATCH_DIMENSION}x{TARGET_PATCH_DIMENSION} image patches...")

    os.makedirs(target_patch_dir, exist_ok=True)

    # 1. Prepare helper columns
    df_merged['file_id'] = df_merged['image_id'].astype(int).apply(lambda x: f"{x:03d}")
    df_merged['patch_id'] = df_merged.index

    # patch_id doubles as the output filename, so duplicate index labels would
    # make several rows compete for the same PNG.
    if not df_merged.index.is_unique:
        warnings.warn(
            "df_merged has duplicate index labels; rows sharing a label map to the "
            "same patch file. Consider reset_index(drop=True) before calling.",
            stacklevel=2,
        )

    # 2. Group the rows by source image so each image is decoded only once
    grouped_df = df_merged.groupby('file_id')

    total_images = len(grouped_df)
    total_patches = len(df_merged)

    # 3. Outer loop: iterate through each unique source image
    tqdm_images = tqdm(
        grouped_df,
        total=total_images,
        desc=f"Processing {total_patches} patches from {total_images} images"
    )

    saved_count = 0
    skipped_count = 0
    failed_count = 0

    for file_id, image_group in tqdm_images:

        # --- A. Load the image ONCE ---
        original_img_name = os.path.join(source_image_dir, f"{file_id}.tiff")

        try:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", Image.DecompressionBombWarning)
                # The context manager releases the file handle as soon as the
                # pixels have been decoded into the RGB copy.
                with Image.open(original_img_name) as source_image:
                    current_image = source_image.convert('RGB')

        except FileNotFoundError:
            tqdm_images.write(f"\nWarning: Original image not found for ID {file_id} at path: {original_img_name}")
            failed_count += len(image_group)
            continue
        except Exception as e:
            tqdm_images.write(f"\nError loading image {file_id}: {e}")
            failed_count += len(image_group)
            continue

        # --- B. Inner loop: process all patches for this loaded image ---
        for patch_id, raw_bbox in zip(image_group['patch_id'], image_group['bbox']):
            patch_filename = os.path.join(target_patch_dir, f"{patch_id}.png")

            # Resume support: keep existing patches unless a rebuild was requested.
            if not overwrite and os.path.exists(patch_filename):
                skipped_count += 1
                continue

            try:
                # 1. Cropping coordinates, PIL order: (left, top, right, bottom)
                crop_box = _padded_crop_box(_parse_bbox(raw_bbox), PADDING)

                # 2. Crop from the image currently in memory
                cropped_image = current_image.crop(crop_box)

                # 3. Size guarantee. Pillow's crop returns an image the size of
                # the requested box even where the box extends past the image
                # edge (that area is filled with black), so a mismatch here
                # means the rounded bbox was not the expected core size -- e.g.
                # fractional coordinates such as 6672.5..6722.5 round outward
                # to 51 px. Resizing keeps every saved patch the same shape.
                if cropped_image.size != target_size:
                    cropped_image = cropped_image.resize(target_size, Image.Resampling.BILINEAR)

                # 4. Save the patch
                cropped_image.save(patch_filename)
                saved_count += 1

            except Exception as e:
                tqdm_images.write(f"Error processing patch {patch_id} from image {file_id}: {e}")
                failed_count += 1

    print(f"\n✅ Optimized pre-processing complete. Patches ({TARGET_PATCH_DIMENSION}x{TARGET_PATCH_DIMENSION}) saved to:", target_patch_dir)
    print(f"Patches saved: {saved_count}, skipped (already present): {skipped_count}, failed: {failed_count}")

    return df_merged

In [8]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
# `image_to_df` comes from the project's `loaders` module, which is not shown here.
from loaders import image_to_df

# Build the annotation table. Per the preview printed below, each row holds one
# bbox plus its image_id and metadata columns (labels, category_id, Tumor, Scanner, ...).
df_merged = image_to_df()
print(df_merged.head())

                             bbox     labels  category_id  image_id  \
0          [4336, 346, 4386, 396]     [2, 2]            2         1   
1            [756, 872, 806, 922]     [2, 2]            2         1   
2          [270, 4044, 320, 4094]     [2, 2]            2         1   
3  [6672.5, 706.5, 6722.5, 756.5]  [2, 1, 2]            2         1   
4          [1872, 319, 1922, 369]     [2, 2]            2         2   

                 Tumor       Scanner       Origin Species  
0  human breast cancer  Hamamatsu XR  UMC Utrecht   Human  
1  human breast cancer  Hamamatsu XR  UMC Utrecht   Human  
2  human breast cancer  Hamamatsu XR  UMC Utrecht   Human  
3  human breast cancer  Hamamatsu XR  UMC Utrecht   Human  
4  human breast cancer  Hamamatsu XR  UMC Utrecht   Human  


In [9]:
# Extract and save one patch per annotation row. The recorded run below took about
# 17 minutes for 26286 patches from 503 images. The call also adds `file_id` and
# `patch_id` columns to df_merged and returns that same frame.
# NOTE(review): the recorded output reports patches saved to
# 'cropped_images_padded_60x60/', which differs from the default target_patch_dir
# in the function definition above -- confirm which directory downstream code expects.
df_processed = preprocess_and_save_patches_optimized(df_merged)
# index=False: the index is not written separately; `patch_id` already carries it as a column.
df_processed.to_csv('processed_annotations_with_patch_id.csv', index=False)

Starting optimized pre-processing and saving of 60x60 image patches...


Processing 26286 patches from 503 images: 100%|███████████████████████████████████████| 503/503 [17:18<00:00,  2.07s/it]



✅ Optimized pre-processing complete. Patches (60x60) saved to: cropped_images_padded_60x60/
