In [1]:
import openslide
import os
import numpy as np
import cv2
from PIL import Image

def extract_patches(svs_file, file_name, output_dir, patch_size=256, stride=256):
    """Tile a whole-slide image into square patches and save the non-background ones.

    The slide is walked on a regular grid at level 0. Every tile is converted
    to RGB, screened with ``_is_background`` and, when it passes, written to
    ``output_dir`` as ``{file_name}_patch_{i}.png`` where ``i`` counts the
    saved patches starting at 0. A one-line summary is printed at the end.

    Args:
        svs_file: Path of the slide, handed to ``openslide.OpenSlide``.
        file_name: Prefix used for the saved patch file names.
        output_dir: Destination directory; created when it does not exist.
        patch_size: Side length of each square patch, in pixels.
        stride: Step between patch origins, in pixels. A value smaller than
            ``patch_size`` produces overlapping patches.

    Raises:
        ValueError: If ``patch_size`` or ``stride`` is not positive.
    """
    if patch_size <= 0 or stride <= 0:
        raise ValueError("patch_size and stride must be positive")

    os.makedirs(output_dir, exist_ok=True)

    patch_count = 0
    # The context manager releases the slide handle even when reading or
    # saving a patch raises; the handle was previously never closed.
    with openslide.OpenSlide(svs_file) as slide:
        slide_width, slide_height = slide.dimensions  # level-0 size
        # NOTE(review): tiles on the right/bottom edge can extend past the
        # slide; presumably OpenSlide pads them and the black/combined
        # thresholds in _is_background reject most of them - confirm.
        for y in range(0, slide_height, stride):
            for x in range(0, slide_width, stride):
                patch = slide.read_region(
                    (x, y), 0, (patch_size, patch_size)
                ).convert("RGB")

                if _is_background(patch):
                    continue

                patch.save(os.path.join(output_dir, f"{file_name}_patch_{patch_count}.png"))
                patch_count += 1

    print(f"Extracted {patch_count} patches from {svs_file}")


def _is_background(patch, white_threshold=0.8, black_threshold=0.8, combined_threshold=0.9):
    """Tell whether a patch is mostly white, mostly black, or a mix of both.

    Args:
        patch: PIL image; it is converted to RGB and then to grayscale.
        white_threshold: Maximum tolerated fraction of pixels brighter than 200.
        black_threshold: Maximum tolerated fraction of pixels darker than 50.
        combined_threshold: Maximum tolerated fraction of white plus black
            pixels (catches boundary patches that mix the two).

    Returns:
        True when any of the three thresholds is exceeded.
    """
    gray_patch = cv2.cvtColor(np.array(patch.convert("RGB")), cv2.COLOR_RGB2GRAY)

    white_fraction = np.count_nonzero(gray_patch > 200) / gray_patch.size
    black_fraction = np.count_nonzero(gray_patch < 50) / gray_patch.size

    return bool(
        white_fraction > white_threshold
        or black_fraction > black_threshold
        or (white_fraction + black_fraction) > combined_threshold
    )


In [2]:
# Slides are read from ./Dataset/normal; their patches go to ./Dataset_patches/normal
folder_path = "./Dataset/normal"
output_dir = "./Dataset_patches/normal"
# Walk the folder in sorted order so reruns process slides deterministically.
# Sub-directories and anything that is not an .svs file (e.g. Thumbs.db) are
# skipped; OpenSlide would presumably fail to open them and abort the loop.
for filename in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, filename)
    name_without_ext, extension = os.path.splitext(filename)
    if extension.lower() != ".svs" or not os.path.isfile(file_path):
        continue
    extract_patches(file_path, name_without_ext, output_dir)

Extracted 1863 patches from ./Dataset/normal\0be77aef-ccad-47a4-bcba-1f550a2c4c6b.svs
Extracted 1747 patches from ./Dataset/normal\TCGA-GM-A2D9-11A-01-TS1.79145174-9304-4CDA-8900-9A6A3ACC2B4B.svs


In [3]:
# Slides are read from ./Dataset/tumor; their patches go to ./Dataset_patches/tumor
folder_path = "./Dataset/tumor"
output_dir = "./Dataset_patches/tumor"
# Walk the folder in sorted order so reruns process slides deterministically.
# Sub-directories and anything that is not an .svs file (e.g. Thumbs.db) are
# skipped; OpenSlide would presumably fail to open them and abort the loop.
for filename in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, filename)
    name_without_ext, extension = os.path.splitext(filename)
    if extension.lower() != ".svs" or not os.path.isfile(file_path):
        continue
    extract_patches(file_path, name_without_ext, output_dir)

Extracted 3474 patches from ./Dataset/tumor\TCGA-A2-A3XV-01A-02-TSB.FF8434E6-B703-43FE-AC0A-AE53131F1EC6.svs
Extracted 4592 patches from ./Dataset/tumor\TCGA-A7-A3IZ-01A-01-TSA.EFEAFBBB-4EED-4894-BAFD-77AAC5783062.svs
