In [5]:
# Hole Analysis Code - Optimized for Binary Masks
# This code analyzes hole TIFF images that are binary masks (0s and 1s)
# Follows the same structure as the original cell analysis code
# Enhanced with proper binary mask handling and additional hole shape metrics

import pandas as pd
import numpy as np
import os
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
import re
from skimage import measure, filters
import warnings
warnings.filterwarnings('ignore')

# Processing parameters (adjustable); these are read by analyze_holes_in_image().
MIN_HOLE_SIZE = 3          # Minimum hole size in pixels (changed from 10 to capture smaller holes)
MAX_HOLE_FRACTION = 0.9    # Maximum hole size as fraction of total image
GAUSSIAN_KERNEL = (3, 3)   # Gaussian blur kernel size
GAUSSIAN_SIGMA = 0.5       # Gaussian blur standard deviation
MORPH_KERNEL_SIZE = 3      # Morphological operations kernel size
BRIDGE_KERNEL_SIZE = 2     # Small kernel for bridging nearby pixels

# Mount Google Drive - This will require authorization
try:
    drive.mount('/content/drive')
except Exception as e:
    print(f"Error mounting Google Drive: {e}")
    print("Please ensure you are running this in a Google Colab environment and authorize access.")
    # Stop here if drive mounting fails, as further operations will fail.
    # SystemExit is raised explicitly instead of calling exit(): exit() is an
    # interactive-shell helper and is not guaranteed to halt the running code
    # at this point, whereas raising SystemExit always does.
    raise SystemExit(1)
else:
    print("Google Drive mounted successfully.")

# Define the base path and folder names
# IMPORTANT: Verify this base_path is correct after mounting your drive!
base_path = "/content/drive/MyDrive/knowledge/University/Master/Thesis/Segmented/"
folders = ["Static-x40", "Static-x20", "1.4Pa-x40", "1.4Pa-x20"]
subfolder_name = "Holes"  # Sub-directory of each folder that holds the hole masks

# List to store individual hole data (one dict per detected hole)
all_data = []

print("\nStarting hole analysis process...")

def extract_metadata_from_filename(filename):
    """
    Pull the pressure and magnification tokens out of an image filename.

    Expected format: denoised_0Pa_A1_20dec21_40x_L2RA_FlatA_seq007_Cadherins_regional_segmented.tif

    Returns a ``(pressure, magnification)`` tuple, e.g. ``("0Pa", "x40")``.
    When no pressure token is present the first element is "Unknown"; when no
    magnification token is present the second element is "unknown".
    """
    # Pressure: digits with an optional decimal part, directly followed by "Pa"
    # (0Pa, 1.4Pa, ...). Only the first occurrence is used.
    found_pressure = re.search(r'(\d+\.?\d*)Pa', filename)
    if found_pressure:
        pressure = found_pressure.group(1) + "Pa"
    else:
        pressure = "Unknown"

    # Magnification: digits directly followed by "x" (20x, 40x, ...), reported
    # with the "x" moved to the front.
    found_magnification = re.search(r'(\d+)x', filename)
    if found_magnification:
        magnification = "x" + found_magnification.group(1)
    else:
        magnification = "unknown"

    return pressure, magnification

def analyze_holes_in_image(image_path):
    """
    Analyze holes in a binary mask TIFF image (0s and 1s).

    The image is read as grayscale, normalised to a 0/255 mask, lightly
    blurred and morphologically closed so that nearby pixels merge, and then
    split into connected components. Every component that passes the size
    filters is measured.

    Relies on the module-level settings MIN_HOLE_SIZE, MAX_HOLE_FRACTION,
    GAUSSIAN_KERNEL, GAUSSIAN_SIGMA, MORPH_KERNEL_SIZE and BRIDGE_KERNEL_SIZE.

    Args:
        image_path: Path of the mask image, passed to ``cv2.imread``.

    Returns:
        A list with one dict per accepted hole. Keys: ``hole_id`` (component
        label), ``area`` (pixel count), ``perimeter``, ``circularity``,
        ``equivalent_diameter``, ``aspect_ratio``, ``solidity``, ``extent``,
        ``bounding_width`` and ``bounding_height``. An empty list is returned
        when the image cannot be read or when any exception occurs; the error
        is printed rather than propagated.
    """
    try:
        # Read image as grayscale
        img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            print(f"Warning: Could not read image {image_path}")
            return []

        # Verify this is a binary mask
        unique_vals = np.unique(img)
        print(f"      Unique pixel values: {unique_vals}")

        # Handle binary masks (0s and 1s)
        if img.max() <= 1:
            # Already 0/1 binary mask
            binary_mask = (img * 255).astype(np.uint8)
        elif set(unique_vals).issubset({0, 255}):
            # Already 0/255 binary mask
            binary_mask = img
        elif len(unique_vals) <= 3:
            # Likely binary with possible different values
            # Assume max value represents holes
            binary_mask = (img == img.max()).astype(np.uint8) * 255
        else:
            print(f"      Warning: Image doesn't appear to be a binary mask. Found {len(unique_vals)} unique values.")
            # Try to threshold anyway
            _, binary_mask = cv2.threshold(img, img.max()//2, 255, cv2.THRESH_BINARY)

        # --- PREPROCESSING TO CONNECT NEARBY PIXELS ---
        print(f"      Applying preprocessing to connect nearby pixels...")

        # 1. Gaussian blur to smooth and connect nearby regions
        # Use small kernel to avoid over-smoothing
        blurred = cv2.GaussianBlur(binary_mask, GAUSSIAN_KERNEL, GAUSSIAN_SIGMA)

        # 2. Re-threshold after blurring to get back to binary
        _, smoothed_mask = cv2.threshold(blurred, 127, 255, cv2.THRESH_BINARY)

        # 3. Morphological closing to connect nearby pixels and fill small gaps
        # Closing = dilation followed by erosion, fills holes and connects nearby objects
        kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (MORPH_KERNEL_SIZE, MORPH_KERNEL_SIZE))
        closed_mask = cv2.morphologyEx(smoothed_mask, cv2.MORPH_CLOSE, kernel)

        # 4. Additional step: Small dilation followed by erosion to bridge 1-2 pixel gaps
        # This is more aggressive connection for very close pixels
        kernel_small = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (BRIDGE_KERNEL_SIZE, BRIDGE_KERNEL_SIZE))
        dilated = cv2.dilate(closed_mask, kernel_small, iterations=1)
        final_mask = cv2.erode(dilated, kernel_small, iterations=1)

        # Count connected components before processing, for the comparison log below
        _, labels_original = cv2.connectedComponents(binary_mask)
        original_components = np.max(labels_original)

        print(f"      Preprocessing complete. Original components: {original_components}")

        # Find connected components in processed mask (holes are white pixels = 255).
        # The "WithStats" variant also returns each component's pixel count and
        # bounding box, so the size filters can run before any per-hole mask is
        # built and contours can be traced on a small window instead of on the
        # whole image for every label.
        num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(final_mask)

        print(f"      Found {num_labels-1} connected components (potential holes)")

        hole_data = []
        image_rows, image_cols = img.shape[0], img.shape[1]
        total_image_pixels = image_rows * image_cols

        for label_id in range(1, num_labels):  # Skip background (label 0)
            # Area = number of pixels carrying this label
            area = int(stats[label_id, cv2.CC_STAT_AREA])

            # Quality filters for binary masks:
            # Skip very small holes (likely noise or artifacts) - using adjustable parameter
            if area < MIN_HOLE_SIZE:
                continue

            # Skip extremely large holes (likely inverted background) - using adjustable parameter
            if area > total_image_pixels * MAX_HOLE_FRACTION:
                continue

            # Window around the component: its bounding box grown by one pixel
            # and clamped to the image, so the window edge only coincides with
            # the component where the image edge itself does.
            left = int(stats[label_id, cv2.CC_STAT_LEFT])
            top = int(stats[label_id, cv2.CC_STAT_TOP])
            box_w = int(stats[label_id, cv2.CC_STAT_WIDTH])
            box_h = int(stats[label_id, cv2.CC_STAT_HEIGHT])
            row_start = max(top - 1, 0)
            row_stop = min(top + box_h + 1, image_rows)
            col_start = max(left - 1, 0)
            col_stop = min(left + box_w + 1, image_cols)

            # Comparing against label_id keeps neighbouring components that
            # intrude into the window out of this hole's mask.
            hole_mask_uint8 = (labels[row_start:row_stop, col_start:col_stop] == label_id).astype(np.uint8)

            # Calculate perimeter using contour detection. Contour coordinates
            # are relative to the window; every metric below depends only on
            # the contour's shape, not on its position.
            contours, _ = cv2.findContours(hole_mask_uint8, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            if not contours:
                continue

            # Use the largest contour if multiple exist
            contour = max(contours, key=cv2.contourArea)
            perimeter = cv2.arcLength(contour, True)

            if perimeter == 0:
                continue

            # Calculate shape metrics
            circularity = 4 * np.pi * area / (perimeter ** 2)
            equivalent_diameter = np.sqrt(4 * area / np.pi)

            # Bounding box for aspect ratio (only width and height are needed)
            _, _, w, h = cv2.boundingRect(contour)
            aspect_ratio = max(w, h) / min(w, h) if min(w, h) > 0 else 1

            # Solidity (area/convex hull area)
            # NOTE(review): `area` is a pixel count while `hull_area` is the
            # area of the hull polygon, so the ratio may exceed 1 for small
            # holes - confirm this is acceptable for the analysis.
            hull = cv2.convexHull(contour)
            hull_area = cv2.contourArea(hull)
            solidity = area / hull_area if hull_area > 0 else 0

            # Extent (area/bounding box area)
            extent = area / (w * h) if (w * h) > 0 else 0

            hole_data.append({
                'hole_id': label_id,
                'area': area,
                'perimeter': perimeter,
                'circularity': circularity,
                'equivalent_diameter': equivalent_diameter,
                'aspect_ratio': aspect_ratio,
                'solidity': solidity,
                'extent': extent,
                'bounding_width': w,
                'bounding_height': h
            })

        print(f"      ✅ Found {len(hole_data)} valid holes after filtering (min size: {MIN_HOLE_SIZE} pixels)")
        print(f"      📊 Processed components: {original_components} → {num_labels-1} → {len(hole_data)} final holes")
        return hole_data

    except Exception as e:
        # Best-effort: one unreadable or malformed image must not abort the
        # whole batch, so the error is reported and the image yields no holes.
        print(f"Error analyzing image {image_path}: {e}")
        return []

# Loop through each folder and process the hole images.
# Every hole found is tagged with its source file/folder plus the pressure and
# magnification of the experiment, then appended to all_data.
for folder in folders:
    folder_path = os.path.join(base_path, folder, subfolder_name)
    print(f"Attempting to load data from: {folder_path}")

    parent_path = os.path.join(base_path, folder)
    if not os.path.exists(parent_path):
        print(f"Warning: Folder '{folder}' not found at '{parent_path}'. Skipping.")
        continue

    # Check if the Holes subfolder exists
    if not os.path.exists(folder_path):
        print(f"Warning: Holes subfolder not found at '{folder_path}'. Skipping.")
        continue

    # Pressure and magnification implied by the folder name. They are the same
    # for every file in the folder, so they are worked out once here; None
    # means "not encoded in the folder name, fall back to the filename".
    if "Static" in folder or "flow3" in folder:
        folder_pressure = '0Pa'
    elif "1.4Pa" in folder:
        folder_pressure = '1.4Pa'
    else:
        folder_pressure = None

    if "x40" in folder:
        folder_magnification = 'x40'
    elif "x20" in folder:
        folder_magnification = 'x20'
    else:
        folder_magnification = None

    try:
        # Get all TIFF files in the Holes folder. os.listdir() returns entries
        # in arbitrary order, so sort them to make the order of rows in the
        # combined output reproducible between runs.
        tiff_files = sorted(
            f for f in os.listdir(folder_path) if f.lower().endswith(('.tif', '.tiff'))
        )

        if not tiff_files:
            print(f"Warning: No TIFF files found in '{folder_path}'. Skipping.")
            continue

        print(f"Found {len(tiff_files)} TIFF files in {folder}")

        total_holes = 0  # holes contributed by this folder

        # Process each TIFF file
        for filename in tiff_files:
            file_path = os.path.join(folder_path, filename)

            # Analyze holes in this image
            holes = analyze_holes_in_image(file_path)

            # Prefer the folder-level metadata; consult the filename only for
            # whatever the folder name did not provide. The filename parser
            # already returns "Unknown"/"unknown" when it finds nothing.
            pressure = folder_pressure
            magnification_source = folder_magnification
            if pressure is None or magnification_source is None:
                file_pressure, file_magnification = extract_metadata_from_filename(filename)
                if pressure is None:
                    pressure = file_pressure
                if magnification_source is None:
                    magnification_source = file_magnification

            # Add metadata to each hole
            for hole in holes:
                hole.update({
                    'filename': filename,
                    'folder': folder,
                    'pressure': pressure,
                    'magnification_source': magnification_source,
                    'file_path': file_path
                })
                all_data.append(hole)
            total_holes += len(holes)

        print(f"Successfully processed {len(tiff_files)} images from: {folder}")
        # pressure / magnification_source here are those of the last file processed
        print(f"  Folder: {folder} -> Pressure: {pressure}, Magnification: {magnification_source}, Total holes: {total_holes}")

    except Exception as e:
        # Keep going with the remaining folders if one of them fails
        print(f"Error processing folder {folder_path}: {e}")

# Create DataFrame from all hole data, rescale x40 measurements so they are
# comparable with x20, print/save descriptive statistics per pressure, and
# save the combined per-hole table.
if all_data:
    combined_df = pd.DataFrame(all_data)
    print("\nCombined DataFrame Info (before adjustments):")
    combined_df.info(verbose=True, show_counts=True)

    # --- Apply magnification adjustments ---
    print("\nApplying magnification adjustments to hole measurements...")

    # Ensure columns exist and are numeric before adjustment
    numeric_columns = ['area', 'perimeter', 'equivalent_diameter']
    for col in numeric_columns:
        if col in combined_df.columns:
            combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce')
        else:
            print(f"Warning: '{col}' column not found. Cannot apply adjustment.")

    # Identify rows from x40 magnification
    is_x40 = combined_df['magnification_source'] == 'x40'
    num_x40_rows = is_x40.sum()
    print(f"Found {num_x40_rows} holes from x40 magnification datasets for adjustment.")

    if num_x40_rows > 0:
        if 'area' in combined_df.columns:
            original_area_sum_x40 = combined_df.loc[is_x40, 'area'].sum()
            # The division produces floats; convert the whole column first so
            # the masked assignment does not have to upcast an integer column
            # implicitly (deprecated in recent pandas).
            combined_df['area'] = combined_df['area'].astype(float)
            combined_df.loc[is_x40, 'area'] = combined_df.loc[is_x40, 'area'] / 4.0
            adjusted_area_sum_x40 = combined_df.loc[is_x40, 'area'].sum()
            print(f"  Hole area for x40 data adjusted (divided by 4). Original sum: {original_area_sum_x40}, Adjusted sum: {adjusted_area_sum_x40}")

        # Adjust linear measurements (divide by 2 for x40 vs x20)
        linear_measurements = ['perimeter', 'equivalent_diameter', 'bounding_width', 'bounding_height']
        for measurement in linear_measurements:
            if measurement in combined_df.columns:
                original_sum = combined_df.loc[is_x40, measurement].sum()
                # Same reason as for 'area': the bounding-box columns hold integers
                combined_df[measurement] = combined_df[measurement].astype(float)
                combined_df.loc[is_x40, measurement] = combined_df.loc[is_x40, measurement] / 2.0
                adjusted_sum = combined_df.loc[is_x40, measurement].sum()
                print(f"  {measurement.capitalize()} for x40 data adjusted (divided by 2). Original sum: {original_sum:.1f}, Adjusted sum: {adjusted_sum:.1f}")

        print(f"  Shape metrics (circularity, aspect_ratio, solidity, extent) remain unchanged as they are dimensionless.")
    # ---------------------------------------

    # Everything below groups or filters by 'pressure', so check for it before
    # the first use. SystemExit is raised explicitly rather than calling the
    # interactive-shell helper exit(), which is not meant for use in programs.
    if 'pressure' not in combined_df.columns:
        print("Error: 'pressure' column not found in the combined data. Cannot proceed with analysis.")
        raise SystemExit(1)

    print("\nCombined DataFrame Head (first 5 rows after adjustments):")
    print(combined_df[['filename', 'folder', 'pressure', 'magnification_source', 'area', 'perimeter']].head())

    print("\nUnique values in 'pressure' column after combining:")
    print(combined_df['pressure'].unique())
    print("\nValue counts for 'pressure':")
    print(combined_df['pressure'].value_counts())

    # Define the columns for analysis (enhanced for hole analysis)
    analysis_columns = [
        'area',  # Now adjusted
        'perimeter',  # Now adjusted
        'equivalent_diameter',  # Now adjusted
        'circularity',
        'aspect_ratio',
        'solidity',
        'extent',
        'bounding_width',  # Now adjusted
        'bounding_height'  # Now adjusted
    ]

    # Keep only the two pressure conditions being compared
    combined_df_filtered = combined_df[
        combined_df['pressure'].isin(['0Pa', '1.4Pa'])
    ].copy()

    print(f"\nShape of combined_df: {combined_df.shape}")
    print(f"Shape of combined_df_filtered: {combined_df_filtered.shape}")

    if combined_df_filtered.empty:
        print("DataFrame is empty after filtering for pressure. Check your data.")
    else:
        print("\nValue counts for 'pressure' in filtered data:")
        print(combined_df_filtered['pressure'].value_counts())

        print("\nConverting analysis columns to numeric type (if not already)...")
        # Iterate over a copy: removing from a list while looping over that
        # same list makes the loop skip the element after each removal.
        for col in list(analysis_columns):
            if col in combined_df_filtered.columns:
                # If already numeric due to earlier conversion, this won't harm
                combined_df_filtered[col] = pd.to_numeric(combined_df_filtered[col], errors='coerce')
            else:
                print(f"  Warning: Analysis column '{col}' not found in the filtered DataFrame. It will be skipped.")
                analysis_columns.remove(col)  # remove if truly missing

        valid_analysis_columns = [
            col for col in analysis_columns
            if col in combined_df_filtered.columns
            and pd.api.types.is_numeric_dtype(combined_df_filtered[col])
        ]

        if not valid_analysis_columns:
            print("\nNo valid numeric columns found for analysis after filtering and type conversion. Cannot generate statistics.")
        else:
            print(f"\nPerforming descriptive analysis on (adjusted) columns: {valid_analysis_columns}")
            try:
                descriptive_stats = combined_df_filtered.groupby(['pressure'])[valid_analysis_columns].agg(['mean', 'median', 'std', 'min', 'max', 'count'])
                print("\n--- Descriptive Statistics by Pressure (with x40 adjustments) ---")
                print(descriptive_stats)

                output_stats_path = os.path.join(base_path, "../Analysis/Holes/descriptive_stats_by_pressure_holes_adjusted.csv")
                os.makedirs(os.path.dirname(output_stats_path), exist_ok=True)
                descriptive_stats.to_csv(output_stats_path)
                print(f"\nDescriptive statistics (with adjustments) saved to: {output_stats_path}")

            except Exception as e:
                # Statistics are optional; the combined table is still saved below
                print(f"Error during groupby or aggregation: {e}")

    # The full (unfiltered) per-hole table is saved regardless of the statistics outcome
    output_combined_file_path = os.path.join(base_path, "../Analysis/Holes/combined_hole_data_adjusted.csv")
    os.makedirs(os.path.dirname(output_combined_file_path), exist_ok=True)
    combined_df.to_csv(output_combined_file_path, index=False)
    print(f"\nCombined hole data (with adjustments and new columns) saved to: {output_combined_file_path}")

else:
    print("\nNo hole data was successfully loaded. Please check file paths and ensure Google Drive is mounted correctly.")
    print(f"  Expected base path for your files: {base_path}")
    print(f"  Expected subfolders: {folders}")
    print(f"  Expected subfolder name within each: {subfolder_name}")
    print("  Expected file types: .tif, .tiff")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully.

Starting hole analysis process...
Attempting to load data from: /content/drive/MyDrive/knowledge/University/Master/Thesis/Segmented/Static-x40/Holes
Attempting to load data from: /content/drive/MyDrive/knowledge/University/Master/Thesis/Segmented/Static-x20/Holes
Attempting to load data from: /content/drive/MyDrive/knowledge/University/Master/Thesis/Segmented/1.4Pa-x40/Holes
Found 13 TIFF files in 1.4Pa-x40
      Unique pixel values: [0 1]
      Applying preprocessing to connect nearby pixels...
      Preprocessing complete. Original components: 1
      Found 1 connected components (potential holes)
      ✅ Found 1 valid holes after filtering (min size: 3 pixels)
      📊 Processed components: 1 → 1 → 1 final holes
      Unique pixel values: [0 1]
      Applying preprocessing to connect nearby pixels...
      Preprocessing c