In [1]:
# nuclei_analysis.py
import os
import argparse
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from skimage import measure, io
from scipy import stats
from pathlib import Path

# Global matplotlib defaults for every figure produced by this script:
# a 10x8 inch canvas at 100 dpi, drawn with the seaborn white-grid style.
plt.rcParams.update({
    'figure.figsize': (10, 8),
    'figure.dpi': 100,
})
plt.style.use('seaborn-v0_8-whitegrid')

# --- Helper Functions ---

def extract_pressure_from_filename(filename):
    """
    Extracts pressure condition (e.g., '0Pa', '1.4Pa') from a filename.

    The pressure must appear as a number immediately followed by 'Pa' and an
    underscore, e.g. 'sample_0Pa_mask.tif' or 'sample_1.4Pa_mask.tif'.
    The unit is matched case-insensitively but is always returned as 'Pa',
    so '1.4pa_' and '1.4PA_' end up in the same group as '1.4Pa_' instead of
    being treated as separate pressure conditions.

    Returns:
        The pressure label (number + 'Pa'), or "Unknown" if the filename
        contains no such token.
    """
    match = re.search(r'(\d+(?:\.\d+)?)Pa_', filename, re.IGNORECASE)
    if match is None:
        return "Unknown"
    # Re-attach a canonical unit rather than echoing the matched casing.
    return f"{match.group(1)}Pa"

def analyze_nuclei_properties(mask_path, pressure_condition):
    """
    Analyzes a single nuclei mask file to extract properties of each nucleus.

    The mask is read from disk, connected regions are labeled, and one record
    is produced per labeled region.

    Args:
        mask_path: Path to the mask image file.
        pressure_condition: Label stored unchanged in every returned record.

    Returns:
        A list of dicts, one per nucleus, with the keys 'filename',
        'pressure_condition', 'nucleus_id', 'area', 'perimeter',
        'eccentricity', 'circularity', 'aspect_ratio', 'solidity',
        'equivalent_diameter', 'centroid_row' and 'centroid_col'.
        An empty list is returned if the file cannot be read or the mask is
        not a 2D image.
    """
    mask_name = Path(mask_path).name

    try:
        mask = io.imread(mask_path)
    except Exception as e:
        # Best effort: one unreadable file must not abort a whole batch.
        print(f"Error reading mask file {mask_path}: {e}")
        return []

    # The measurements below (perimeter, 4-value bbox, row/col centroid) are
    # only defined for 2D masks; skip anything else instead of crashing.
    if mask.ndim != 2:
        print(f"Warning: Mask {mask_name} has shape {mask.shape}; expected a 2D image. Skipping.")
        return []

    # Integer masks of any width can be labeled directly. Casting e.g. uint32
    # or int64 label images down to uint16 would wrap label ids above 65535,
    # so only non-integer masks (float, bool) are converted.
    if not np.issubdtype(mask.dtype, np.integer):
        print(f"Warning: Mask {mask_name} is of type {mask.dtype}. Converting to uint16 for labeling.")
        mask = mask.astype(np.uint16)

    labeled_nuclei = measure.label(mask, connectivity=mask.ndim)

    nuclei_data = []
    for prop in measure.regionprops(labeled_nuclei):
        area = prop.area
        perimeter = prop.perimeter
        # Isoperimetric quotient; defined as 0 when the perimeter is 0.
        circularity = (4 * np.pi * area) / (perimeter ** 2) if perimeter > 0 else 0

        # Aspect ratio of the axis-aligned bounding box (height / width).
        minr, minc, maxr, maxc = prop.bbox
        height = maxr - minr
        width = maxc - minc
        aspect_ratio = height / width if width > 0 else 0

        centroid_row, centroid_col = prop.centroid

        nuclei_data.append({
            'filename': mask_name,
            'pressure_condition': pressure_condition,
            'nucleus_id': prop.label,
            'area': area,
            'perimeter': perimeter,
            'eccentricity': prop.eccentricity,
            'circularity': circularity,
            'aspect_ratio': aspect_ratio,
            'solidity': prop.solidity,
            'equivalent_diameter': prop.equivalent_diameter,
            'centroid_row': centroid_row,
            'centroid_col': centroid_col,
        })
    return nuclei_data

def plot_feature_distributions(df, feature_name, base_output_dir, dataset_identifier=""):
    """
    Draws a notched boxplot of one feature per pressure condition and saves it.

    The image is written to
    base_output_dir/plots/distributions/<dataset_identifier>_<feature_name>_distribution.png;
    the target directory is created if it does not exist yet.
    """
    readable_feature = feature_name.replace("_", " ").title()
    if dataset_identifier:
        title_prefix = f"{dataset_identifier} "
    else:
        title_prefix = ""

    fig, ax = plt.subplots()
    sns.boxplot(x='pressure_condition', y=feature_name, data=df, palette="viridis", notch=True, ax=ax)
    ax.set_title(f'{title_prefix}{readable_feature} by Pressure Condition')
    ax.set_xlabel("Pressure Condition")
    ax.set_ylabel(readable_feature)

    distributions_dir = Path(base_output_dir) / "plots" / "distributions"
    plot_filename = distributions_dir / f"{dataset_identifier}_{feature_name}_distribution.png"
    distributions_dir.mkdir(parents=True, exist_ok=True)

    fig.tight_layout()
    fig.savefig(plot_filename)
    plt.close(fig)
    print(f"  Saved distribution plot: {plot_filename}")

def plot_feature_histograms(df, feature_name, base_output_dir, dataset_identifier=""):
    """
    Generates and saves histograms of a feature, faceted by pressure condition.

    One histogram panel (30 bins, with a KDE overlay) is drawn per pressure
    condition; the panels do not share axes.
    Saves plots to base_output_dir/plots/histograms/, creating the directory
    if needed.
    """
    # FacetGrid creates its own figure, so no plt.figure() call is made here:
    # an extra figure would stay open after this function returns.
    g = sns.FacetGrid(df, col="pressure_condition", height=4, aspect=1.2, sharey=False, sharex=False)
    g.map(sns.histplot, feature_name, kde=True, bins=30)
    title_prefix = f"{dataset_identifier} " if dataset_identifier else ""
    g.set_titles(f"{title_prefix}{{col_name}}")  # Include dataset_identifier in facet titles
    g.set_xlabels(feature_name.replace("_", " ").title())

    plot_filename = Path(base_output_dir) / "plots" / "histograms" / f"{dataset_identifier}_{feature_name}_histograms.png"
    os.makedirs(plot_filename.parent, exist_ok=True)
    # tight_layout is applied to the grid's own figure rather than via pyplot.
    g.fig.tight_layout()
    g.savefig(plot_filename)
    # Close exactly the figure that was drawn, not "whatever is current".
    plt.close(g.fig)
    print(f"  Saved histogram plot: {plot_filename}")

def perform_statistical_tests(df, feature_name, pressure_groups, base_output_dir, dataset_identifier=""):
    """
    Performs a two-group statistical test on one feature.

    The pressure conditions present in ``df`` are ordered by their numeric
    value (so '2Pa' comes before '10Pa'; labels without a leading number,
    such as 'Unknown', come last) and the first two are compared.
    If both groups pass a Shapiro-Wilk normality check (p > 0.05) a t-test is
    used, with equal variances assumed only when Levene's test gives
    p > 0.05; otherwise a two-sided Mann-Whitney U test is used.

    Args:
        df: DataFrame with a 'pressure_condition' column and ``feature_name``.
        feature_name: Column to compare between the two groups.
        pressure_groups: Known pressure labels; only its length is used, to
            skip the test when fewer than two groups exist.
        base_output_dir: Unused; kept for interface compatibility.
        dataset_identifier: Stored in the 'dataset' field of the result.

    Returns:
        A list with one result dict, or an empty list when fewer than two
        conditions are available. When either group has fewer than three
        non-missing values the dict is marked 'Insufficient data', because
        the Shapiro-Wilk test needs at least three observations.
    """
    min_samples_per_group = 3  # lower bound required by stats.shapiro

    if len(pressure_groups) < 2:
        print(f"  Skipping stats for {feature_name}: Less than two pressure groups found.")
        return []

    def _numeric_first(label):
        # Sort key: leading number first, non-numeric labels afterwards.
        text = str(label)
        leading_number = re.match(r'\d+(?:\.\d+)?', text)
        if leading_number:
            return (0, float(leading_number.group()), text)
        return (1, 0.0, text)

    unique_conditions = sorted(df['pressure_condition'].unique(), key=_numeric_first)

    if not unique_conditions:
        print(f"  Skipping stats for {feature_name}: No pressure conditions found in the data.")
        return []
    if len(unique_conditions) < 2:
        print(f"  Skipping stats for {feature_name}: Only one pressure condition ('{unique_conditions[0]}') found.")
        return []

    # Only the first two distinct conditions are compared (e.g., 0Pa vs 1.4Pa).
    group1_name, group2_name = unique_conditions[0], unique_conditions[1]
    comparison = f"{group1_name}_vs_{group2_name}"

    group1_data = df[df['pressure_condition'] == group1_name][feature_name].dropna()
    group2_data = df[df['pressure_condition'] == group2_name][feature_name].dropna()

    if len(group1_data) < min_samples_per_group or len(group2_data) < min_samples_per_group:
        print(f"  Skipping stats for {feature_name} ({group1_name} vs {group2_name}): Insufficient data.")
        return [{
            'dataset': dataset_identifier, 'feature': feature_name, 'comparison': comparison,
            'statistic_type': 'N/A', 'statistic_value': np.nan, 'p_value': np.nan, 'result': 'Insufficient data'
        }]

    _, normality1_p = stats.shapiro(group1_data)
    _, normality2_p = stats.shapiro(group2_data)

    if normality1_p > 0.05 and normality2_p > 0.05:
        _, levene_p = stats.levene(group1_data, group2_data)
        equal_var = levene_p > 0.05
        stat_val, p_val = stats.ttest_ind(group1_data, group2_data, equal_var=equal_var)
        test_type = f"T-test (equal_var={equal_var})"
    else:
        stat_val, p_val = stats.mannwhitneyu(group1_data, group2_data, alternative='two-sided')
        test_type = "Mann-Whitney U"

    print(f"  Performed {test_type} for {feature_name} ({group1_name} vs {group2_name}): p={p_val:.4f}")
    return [{
        'dataset': dataset_identifier, 'feature': feature_name, 'comparison': comparison,
        'group1_mean': group1_data.mean(), 'group1_std': group1_data.std(), 'group1_N': len(group1_data),
        'group2_mean': group2_data.mean(), 'group2_std': group2_data.std(), 'group2_N': len(group2_data),
        'statistic_type': test_type, 'statistic_value': stat_val, 'p_value': p_val,
        'result': 'Significant' if p_val < 0.05 else 'Not Significant'
    }]

# --- Main Function ---
def main(input_dir_str, specified_analysis_parent_dir_str=None):
    """
    Main function to orchestrate the nuclei analysis.

    Collects the mask files in the input directory, extracts per-nucleus
    properties from each, writes the aggregated table to CSV, and, when
    pressure conditions could be read from the filenames, produces plots and
    statistical tests per feature.

    input_dir_str: Path to the directory containing nuclei masks (e.g., .../Segmented/DatasetName/Nuclei).
    specified_analysis_parent_dir_str: Optional. Path to the parent directory for analysis results
                                       (e.g., .../Analysis/DatasetName/). If None, it's derived.

    Returns None. Problems such as a missing input directory, no mask files,
    or no extracted nuclei are reported on stdout and end the run early.
    """
    # Make the path absolute (without following symlinks) so that the parent
    # directory names used below exist even for a relative input like "Nuclei".
    input_dir = Path(os.path.abspath(input_dir_str))
    if not input_dir.is_dir():
        # Checked before any output directory is created.
        print(f"Input directory '{input_dir}' does not exist or is not a directory.")
        return

    # Determine the dataset identifier (e.g., "Static-x20")
    # Assumes input_dir is like .../ProjectRoot/Segmented/DatasetIdentifier/MaskType/
    dataset_identifier = input_dir.parent.name

    # Determine the parent directory for this script's outputs
    if specified_analysis_parent_dir_str:
        analysis_parent_dir = Path(specified_analysis_parent_dir_str)
    else:
        # Derive analysis_parent_dir, assuming standard project structure:
        # Input:  .../ProjectName/Segmented/DatasetIdentifier/MasksForNuclei/
        # Output: .../ProjectName/Analysis/DatasetIdentifier/
        project_root_candidate = input_dir.parent.parent.parent  # Expected: .../ProjectName/

        if input_dir.parent.parent.name.lower() != "segmented":
            # The same location is used either way; only warn that the layout is unexpected.
            print(f"Warning: Input directory structure '{input_dir_str}' does not strictly match '.../Project/Segmented/DatasetIdentifier/MaskType'.")
            print(f"         Attempting to place 'Analysis/{dataset_identifier}' relative to '{project_root_candidate}'.")
        analysis_parent_dir = project_root_candidate / "Analysis" / dataset_identifier

    # Define the specific output directory for this script's results
    script_output_dir = analysis_parent_dir / "Nuclei_Analysis_Output"

    print(f"--- Starting Nuclei Analysis for dataset: {dataset_identifier} ---")
    print(f"Input directory: {input_dir}")
    print(f"Script specific output directory: {script_output_dir}")

    # Create script's main output directory (e.g., .../Analysis/DatasetName/Nuclei_Analysis_Output/)
    os.makedirs(script_output_dir, exist_ok=True)
    # Subdirectories like "plots" and "tables" will be created by helper functions *inside* script_output_dir

    # Sorted for a reproducible processing order; extension match ignores case (.TIF, .Tiff, ...).
    mask_files = sorted(
        f for f in os.listdir(input_dir)
        if f.lower().endswith(('.tif', '.tiff')) and 'mask' in f.lower()
    )
    if not mask_files:
        print(f"No mask files found in '{input_dir}'. Please check file names (e.g., ending with _mask.tif).")
        return
    print(f"Found {len(mask_files)} potential nuclei mask files.")

    all_nuclei_data = []
    pressure_conditions_found = set()
    for mask_file in mask_files:
        mask_path = input_dir / mask_file
        pressure = extract_pressure_from_filename(mask_file)
        if pressure != "Unknown":
            pressure_conditions_found.add(pressure)
        print(f"Processing: {mask_file} (Pressure: {pressure})")
        all_nuclei_data.extend(analyze_nuclei_properties(str(mask_path), pressure))

    if not all_nuclei_data:
        print("No nuclei data extracted. Exiting.")
        return

    df_all_nuclei = pd.DataFrame(all_nuclei_data)

    # Save aggregated data to script_output_dir/tables/
    tables_subdir = script_output_dir / "tables"
    os.makedirs(tables_subdir, exist_ok=True)
    aggregated_csv_path = tables_subdir / f"{dataset_identifier}_all_nuclei_properties.csv"
    df_all_nuclei.to_csv(aggregated_csv_path, index=False)
    print(f"\nSaved aggregated nuclei properties to: {aggregated_csv_path}")

    features_to_analyze = ['area', 'perimeter', 'eccentricity', 'circularity', 'aspect_ratio', 'solidity', 'equivalent_diameter']
    print("\n--- Generating Plots and Statistics ---")
    all_stats_results = []

    if not pressure_conditions_found:
        print("No pressure conditions identified from filenames. Cannot perform comparative analysis.")
    else:
        for feature in features_to_analyze:
            if feature not in df_all_nuclei.columns:
                print(f"Warning: Feature '{feature}' not found in the DataFrame. Skipping.")
                continue

            print(f"Analyzing feature: {feature}")
            # Pass script_output_dir as the base for plots (it will create subdirs like plots/distributions)
            plot_feature_distributions(df_all_nuclei, feature, script_output_dir, dataset_identifier)
            plot_feature_histograms(df_all_nuclei, feature, script_output_dir, dataset_identifier)
            if len(pressure_conditions_found) >= 2:
                stats_for_feature = perform_statistical_tests(
                    df_all_nuclei, feature, list(pressure_conditions_found), script_output_dir, dataset_identifier
                )
                all_stats_results.extend(stats_for_feature)

    if all_stats_results:
        df_stats = pd.DataFrame(all_stats_results)
        # Save stats to script_output_dir/tables/
        stats_csv_path = tables_subdir / f"{dataset_identifier}_statistical_tests.csv"
        df_stats.to_csv(stats_csv_path, index=False)
        print(f"\nSaved statistical test results to: {stats_csv_path}")
    else:
        print("\nNo statistical tests were performed or results generated (e.g., only one pressure group found).")

    print("\n--- Nuclei Analysis Finished ---")

if __name__ == '__main__':
    # Command-line entry point.
    #
    # Example usage from command line:
    # python nuclei_analysis.py "/path/to/MyProject/Segmented/Static-x20/Nuclei"
    # (This will try to save to "/path/to/MyProject/Analysis/Static-x20/Nuclei_Analysis_Output/")
    #
    # Or specifying the parent output directory:
    # python nuclei_analysis.py "/path/to/MyProject/Segmented/Static-x20/Nuclei" --output_parent_dir "/path/to/MyCustomAnalysisLocation/Static-x20"
    # (This will save to "/path/to/MyCustomAnalysisLocation/Static-x20/Nuclei_Analysis_Output/")
    #
    # NOTE(review): parse_args() reads the process arguments, so executing this
    # block inside a notebook kernel (which is launched with its own "-f"
    # argument) makes argparse exit with "unrecognized arguments: -f".
    # In a notebook, call main(...) directly instead.
    cli_description = (
        "Perform nuclei analysis from mask images. \n"
        "Assumes an input directory structure like .../Project/Segmented/DatasetName/NucleiMasks/ \n"
        "and will output to .../Project/Analysis/DatasetName/Nuclei_Analysis_Output/ (derived automatically), \n"
        "or to a user-specified parent analysis directory."
    )
    input_dir_help = "Directory containing nuclei mask .tif files (e.g., .../Segmented/MyDataset/Nuclei)."
    output_parent_help = (
        "Optional. Parent directory for the analysis results (e.g., .../Analysis/MyDataset/). "
        "If not provided, it's derived based on the input_dir structure."
    )

    parser = argparse.ArgumentParser(description=cli_description)
    parser.add_argument("input_dir", type=str, help=input_dir_help)
    parser.add_argument("--output_parent_dir", type=str, default=None, help=output_parent_help)

    args = parser.parse_args()
    main(args.input_dir, args.output_parent_dir)


usage: colab_kernel_launcher.py [-h] [--output_parent_dir OUTPUT_PARENT_DIR]
                                input_dir
colab_kernel_launcher.py: error: unrecognized arguments: -f


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
