# Image Processing with Labels

This notebook processes patient images by adding labels based on the information in an Excel file.

In [1]:
import os
import pandas as pd
from PIL import Image
from svglib.svglib import svg2rlg
from reportlab.graphics import renderPM
import io

# Set up paths
path = "C:/Users/janni/OneDrive/Dokumente/PostDoc/Projects/Patho Prompt Injection/First_Dataset"
input_file = os.path.join(path, "Patient_Metadata_long.xlsx")
output_folder = os.path.join(path, "output_images")
os.makedirs(output_folder, exist_ok=True)

In [2]:
def create_label_dict(path):
    def svg_path(filename):
        return os.path.join(path, f"{filename}.svg")
    
    label_dict = {
        "0/1": svg_path("0_1"),
        "1/1": svg_path("1_1"),
        "0/2": svg_path("0_2"),
        "1/2": svg_path("1_2"),
        "2/2": svg_path("2_2"),
        "0/3": svg_path("0_3"),
        "1/3": svg_path("1_3"),
        "0/4": svg_path("0_4"),
        "1/4": svg_path("1_4"),
        "2/4": svg_path("2_4"),
        "0/5": svg_path("0_5"),
        "1/5": svg_path("1_5"),
        "pT1": svg_path("pT1"),
        "pT2": svg_path("pT2"),
        "pT3": svg_path("pT3"),
        "BRAF mut": svg_path("BRAF_mut"),
        "wildtype": svg_path("wildtype"),
        "RAS mut": svg_path("RAS_mut")
    }
    
    missing_files = [label for label, file_path in label_dict.items() if not os.path.exists(file_path)]
    if missing_files:
        print(f"Warning: The following SVG files are missing: {', '.join(missing_files)}")
    else:
        print("All SVG files found successfully.")
    
    return label_dict

# Create the label dictionary
label_dict = create_label_dict(path)

All SVG files found successfully.


In [3]:
import numpy as np
from PIL import Image
import io
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def find_whitest_area(image, grid_size=(3, 3)):
    """Find the whitest area in the image using a grid-based approach."""
    # Convert image to numpy array if it's not already
    if isinstance(image, Image.Image):
        image = np.array(image)
    
    # If image is RGBA, convert to RGB by removing alpha channel
    if image.shape[2] == 4:
        image = image[:,:,:3]
    
    # Calculate the "whiteness" of each pixel (higher value = whiter)
    whiteness = np.mean(image, axis=2)
    
    height, width = whiteness.shape
    cell_height, cell_width = height // grid_size[0], width // grid_size[1]
    
    max_whiteness = 0
    best_cell = (0, 0)
    
    for i in range(grid_size[0]):
        for j in range(grid_size[1]):
            cell = whiteness[i*cell_height:(i+1)*cell_height, j*cell_width:(j+1)*cell_width]
            cell_whiteness = np.mean(cell)
            if cell_whiteness > max_whiteness:
                max_whiteness = cell_whiteness
                best_cell = (i, j)
    
    # Calculate the position for the label within the whitest cell
    x = best_cell[1] * cell_width
    y = best_cell[0] * cell_height
    
    return x, y, cell_width, cell_height

In [4]:
import cairosvg

def process_image(row, label_dict, base_path, output_folder, label_width_pct=0.2, label_height_pct=0.1, label_x_pct=0.05, label_y_pct=0.05, use_whitespace=False):
    image_path = f"{base_path}/{row['Study_ID']}.png"
    output_path = os.path.join(output_folder, f"{row['Study_ID']}_{row['Label_Type']}.png")
   
    try:
        # Open the base image
        base_image = Image.open(image_path).convert('RGBA')
    except FileNotFoundError:
        logger.error(f"Image file not found: {image_path}")
        return
    except Exception as e:
        logger.error(f"Error opening image {image_path}: {str(e)}")
        return
    
    if row['Label_Type'] != 'none':
        prompt = row['True_Prompt'] if row['Label_Type'] == 'true' else row['False_Prompt']
        svg_path = label_dict.get(prompt)
       
        if svg_path and os.path.exists(svg_path):
            try:
                label_width = max(1, int(base_image.width * label_width_pct))
                label_height = max(1, int(base_image.height * label_height_pct))

                png_data = cairosvg.svg2png(url=svg_path, output_width=label_width, output_height=label_height)
                label_image = Image.open(io.BytesIO(png_data)).convert('RGBA')
               
                new_image = base_image.copy()
               
                if use_whitespace:
                    label_x, label_y, _, _ = find_whitest_area(base_image)
                    logger.info(f"Whitest area found for {row['Study_ID']} at ({label_x}, {label_y})")
                else:
                    label_x = int(base_image.width * label_x_pct)
                    label_y = int(base_image.height * label_y_pct)

                # Ensure label fits within the image
                label_x = min(label_x, base_image.width - label_width)
                label_y = min(label_y, base_image.height - label_height)

                new_image.paste(label_image, (label_x, label_y), label_image)
                logger.info(f"Label added successfully to {row['Study_ID']} at position ({label_x}, {label_y})")
            except Exception as e:
                logger.error(f"Error adding label to {row['Study_ID']}: {str(e)}")
                new_image = base_image
        else:
            logger.warning(f"SVG file not found for prompt '{prompt}' for {row['Study_ID']}")
            new_image = base_image
    else:
        new_image = base_image
   
    try:
        new_image.save(output_path, 'PNG')
        logger.info(f"Processed: {output_path}")
    except Exception as e:
        logger.error(f"Error saving processed image {output_path}: {str(e)}")
        
def process_images(df, label_dict, base_path, output_folder, limit=None, label_width_pct=0.2, label_height_pct=0.1, label_x_pct=0.05, label_y_pct=0.05, use_whitespace=False):
    for index, row in df.iterrows():
        if limit is not None and index >= limit:
            break
       
        process_image(row, label_dict, base_path, output_folder, label_width_pct, label_height_pct, label_x_pct, label_y_pct, use_whitespace)
   
    logger.info(f"Image processing complete. Processed {index + 1} images.")

In [None]:
df = pd.read_excel(input_file)
limit = None  # Set to None to process all images
process_images(df, label_dict, path, output_folder, limit=limit, 
               label_width_pct=0.3, label_height_pct=0.2, 
               label_x_pct=0.05, label_y_pct=0.05, 
               use_whitespace=True)


In [5]:
df = pd.read_excel(input_file)
limit = 2  # Set to None to process all images
process_images(df, label_dict, path, output_folder, limit=limit, 
               label_width_pct=0.3, label_height_pct=0.2, 
               label_x_pct=0.05, label_y_pct=0.05, 
               use_whitespace=True)


INFO:__main__:Whitest area found for LN_1_1 at (1422, 846)
INFO:__main__:Label added successfully to LN_1_1 at position (1422, 846)
INFO:__main__:Processed: C:/Users/janni/OneDrive/Dokumente/PostDoc/Projects/Patho Prompt Injection/First_Dataset\output_images\LN_1_1_true.png
INFO:__main__:Whitest area found for LN_1_1 at (1422, 846)
INFO:__main__:Label added successfully to LN_1_1 at position (1422, 846)
INFO:__main__:Processed: C:/Users/janni/OneDrive/Dokumente/PostDoc/Projects/Patho Prompt Injection/First_Dataset\output_images\LN_1_1_false.png
INFO:__main__:Image processing complete. Processed 3 images.
