Authors: Antoine A. Ruzette
Date: 2025-02-21

This notebook processes cell measurement tables exported from QuPath to plot the spatial distribution of cell-level pixel intensity in relation to a modelled stromal border. It also supports the comparison of confusion matrices between threshold- and machine learning-based cell classification.

Contains the code to plot data from images containing four channels: DAPI (nuclei), FITC (cytokeratin), TRITC (fibronectin) and CY5 (Ki67).

In [None]:
! pip install fitter==1.6.0 ipykernel==6.17.1 matplotlib==3.7.2 natsort==8.4.0 numpy==1.26.4 pandas==2.2.2 scipy==1.11.4 seaborn==0.13.2 setuptools==75.0.0

In [1]:
import os
import pandas as pd
import numpy as np
import csv
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import natsort
import re
from matplotlib.ticker import LogFormatterSciNotation
from fitter import Fitter, get_common_distributions, get_distributions

# Colorblind-friendly palette, given as hex RGB strings.
# Order: blue, orange, green, pink, brown, purple, gray, red, yellow.
CB_palette = ['#377eb8', '#ff7f00', '#4daf4a',
                '#f781bf', '#a65628', '#984ea3',
                '#999999', '#e41a1c', '#dede00']

## Back-end functions

In [2]:
def load_and_preprocess_files(
    folder_path,
    file_paths,
    expected_columns,
    essential_columns=("DAPI", "Ki67_647"),
    outlier_columns=("DAPI", "Ki67_647", "KER_488", "FN_568"),
    outlier_quantiles=(0.01, 0.99),
):
    """
    Load measurement CSV files and trim per-column outliers.

    Each file is processed independently; a file that cannot be read, lacks
    an essential column, or ends up empty is reported and skipped so that one
    bad file does not abort the whole batch.

    Args:
        folder_path (str): Path to the folder containing CSV files.
        file_paths (list): List of file names inside ``folder_path``.
        expected_columns (dict): Maps a logical key (e.g. ``"DAPI"``) to a
            list of candidate CSV header names; the first candidate present
            in a file is used for that file.
        essential_columns (sequence): Logical keys that must be resolved for
            a file to be kept. Defaults to ``("DAPI", "Ki67_647")``.
        outlier_columns (sequence): Logical keys whose columns are trimmed to
            the quantile window. Keys not resolved in a file are ignored.
        outlier_quantiles (tuple): ``(lower, upper)`` quantiles, inclusive,
            computed per file on the untrimmed data. Defaults to the
            1st and 99th percentiles.

    Returns:
        list: One filtered DataFrame per accepted file, each with an extra
            ``"Image"`` column holding ``"<position>G"`` where position is
            the 1-based index of the file in ``file_paths``.
        dict: Logical key -> CSV header mapping of the LAST accepted file
            (empty if no file was accepted).
    """
    dfs = []
    final_column_mapping = {}
    lower_q, upper_q = outlier_quantiles

    for idx, image in enumerate(file_paths):
        print(f"\n🔹 Processing: {image}")
        file_path = os.path.join(folder_path, image)

        # Read only the header: enough to learn which columns exist.
        try:
            available_columns = pd.read_csv(file_path, nrows=0).columns.tolist()
        except Exception as e:  # best-effort batch load: report and move on
            print(f"❌ Error loading {image}: {e}")
            continue
        print(f"✅ Available Columns: {available_columns}")

        # Resolve each logical key to the first candidate header that exists.
        available = set(available_columns)
        column_mapping = {}
        for key, possible_names in expected_columns.items():
            match = next((name for name in possible_names if name in available), None)
            if match is None:
                print(f"⚠️ Warning: {key} column not found in {image}. Skipping.")
            else:
                column_mapping[key] = match

        if not column_mapping:
            print(f"⚠️ Skipping {image} as no expected columns were found.")
            continue

        # Reject the file BEFORE the full read if an essential column is
        # missing, so unusable files are never loaded in full.
        missing_essential = [col for col in essential_columns if col not in column_mapping]
        if missing_essential:
            print(f"⚠️ Skipping {image} due to missing essential columns: {missing_essential}")
            continue

        # Load only the columns that were resolved.
        try:
            df = pd.read_csv(file_path, usecols=list(column_mapping.values()))
        except Exception as e:  # best-effort batch load: report and move on
            print(f"❌ Error loading selected columns in {image}: {e}")
            continue

        if df.empty:
            print(f"⚠️ Skipping {image} as it contains no rows.")
            continue

        # Quantile window per present column, computed on the untrimmed data.
        outlier_limits = {}
        for key in outlier_columns:
            if key in column_mapping:
                col_name = column_mapping[key]
                outlier_limits[col_name] = (
                    df[col_name].quantile(lower_q),
                    df[col_name].quantile(upper_q),
                )

        print(f"📊 Outlier Thresholds: {outlier_limits}")

        # A row is kept only if it lies inside the window of EVERY trimmed
        # column (bounds inclusive; NaN values fail the test and are dropped).
        keep = pd.Series(True, index=df.index)
        for col, (low, high) in outlier_limits.items():
            keep &= df[col].between(low, high)

        # Explicit copy so adding the "Image" column never touches ``df``.
        df_no_outlier = df.loc[keep].copy()

        if df_no_outlier.empty:
            print(f"⚠️ Skipping {image} as it became empty after outlier removal.")
            continue

        # Add a column to track which image this data comes from
        df_no_outlier["Image"] = f"{idx+1}G"

        dfs.append(df_no_outlier)

        # Save column mapping for later use (last accepted file wins)
        final_column_mapping = column_mapping

    if dfs:
        return dfs, final_column_mapping

    print("⚠️ No valid data loaded.")
    return [], {}
    

# Helper function to generate floating point ranges
def frange(start, stop, step):
    """Yield evenly spaced values from ``start`` up to and including ``stop``.

    Values are computed as ``start + i * step`` rather than by repeated
    addition, so rounding error does not accumulate along the sequence. An
    endpoint that is only missed because of floating-point rounding (e.g.
    ``frange(0, 0.3, 0.1)``) is still produced, and is clamped so no yielded
    value ever exceeds ``stop``.

    Args:
        start: First value of the range.
        stop: Inclusive upper bound.
        step: Positive increment between consecutive values.

    Yields:
        The successive values; nothing if ``stop < start``.

    Raises:
        ValueError: If ``step`` is not strictly positive (raised when the
            generator is first advanced). A non-positive step would
            otherwise never reach ``stop``.
    """
    if not step > 0:
        raise ValueError("step must be a positive number")
    if stop < start:
        return

    # Number of whole steps that fit; the small slack recovers an endpoint
    # whose ratio lands just below an integer because of rounding.
    n_steps = int(float((stop - start) / step) + 1e-9)
    for i in range(n_steps + 1):
        yield min(start + i * step, stop)


# Cell measurement distributions for each channel

## Linear scale distributions

In [None]:
# --- Input location: folder holding the exported tables, and the files to read
folder_path = "your/folder/path"
file_paths = ["data1.csv", "data2.csv"]

# --- Logical measurement key -> candidate CSV header names for that measurement
expected_columns = dict(
    Class=["Class"],
    DAPI=["DAPI: Nucleus: Median"],
    KER_488=["FITC KER: Cytoplasm: Median"],
    Ki67_647=["CY5 Ki67: Nucleus: Max"],
    FN_568=["TRITC FN: Cell: Median"],
    Nucleus_Area=["Nucleus: Area µm^2"],
)

# --- Histogram bin width per channel (same units as the measured intensity)
bin_sizes = dict(DAPI=50, Ki67_647=5, KER_488=75, FN_568=40)

# --- Fixed y-axis tick positions per channel
y_ticks = dict(
    DAPI=[0, 0.005, 0.01, 0.015],
    KER_488=[0, 0.01, 0.02, 0.03, 0.04],
    Ki67_647=[0, 0.005, 0.01, 0.015, 0.02],
    FN_568=[0, 0.01, 0.02, 0.03, 0.04, 0.05],
)

# --- Read every file, resolve its columns and drop outliers
dfs_data_list, column_mapping = load_and_preprocess_files(
    folder_path,
    file_paths,
    expected_columns,
)

# Plot one probability-normalised histogram per channel (2x2 grid), overlaying
# all loaded images on the same axes. Nothing is drawn if no data was loaded.
if dfs_data_list:
    fig, axs = plt.subplots(2, 2, figsize=(30, 26))

    plot_keys = ["DAPI", "Ki67_647", "KER_488", "FN_568"]  # Use same keys as column_mapping
    axes_positions = [(0, 0), (0, 1), (1, 0), (1, 1)]
    axis_limits = {
        "DAPI": (0, 20000),
        "Ki67_647": (0, 3000),
        "KER_488": (0, 20000),
        "FN_568": (0, 8000)
    }

    # Descriptive labels for axes
    axis_labels = {
        "DAPI": "DAPI median nuclear intensity, a.u.",
        "Ki67_647": "Ki67 max nuclear intensity, a.u.",
        "KER_488": "Cytokeratin median cytoplasmic intensity, a.u.",
        "FN_568": "Fibronectin median cellular intensity, a.u."
    }

    # Becomes True as soon as at least one histogram has been drawn
    plotted_something = False

    # NOTE(review): each histogram is given label=image_label but no legend is
    # ever drawn; call ax.legend() on the axes if per-image identification is
    # wanted in the figure.
    for df in dfs_data_list:
        image_label = df["Image"].iloc[0]  # Get image label

        for key, (row, col) in zip(plot_keys, axes_positions):
            column = column_mapping.get(key)
            if column is None or column not in df.columns:
                continue  # this channel is not available for this image

            sns.histplot(
                df[column],
                binwidth=bin_sizes.get(key, 10),  # Use specified bin size, default to 10 if not found
                kde=True,
                line_kws={"linewidth": 5},
                ax=axs[row, col],
                label=image_label,
                alpha=0.5,
                stat="probability"  # Normalize histogram to show frequency instead of absolute count
            )
            plotted_something = True

    # Use linear scales on both axes; set font sizes, ticks and tick formats
    for key, (row, col) in zip(plot_keys, axes_positions):
        ax = axs[row, col]
        ax.set_yscale('linear')
        ax.set_xscale('linear')
        ax.set_ylabel('Probability', fontsize=45)
        ax.set_xlabel(axis_labels[key], fontsize=45)
        ax.tick_params(axis='both', labelsize=45)

        # Limit x-axis range and place 5 evenly spaced ticks across it
        if key in axis_limits:
            x_min, x_max = axis_limits[key]
            ax.set_xlim(x_min, x_max)
            ax.set_xticks([x_min + i * (x_max - x_min) / 4 for i in range(5)])

        # Set y-ticks if defined
        if key in y_ticks:
            ax.set_yticks(y_ticks[key])
            ax.set_ylim(bottom=min(y_ticks[key]), top=max(y_ticks[key]))

        # Format y-tick labels to 2 significant digits. A formatter (rather
        # than fixed label strings) stays correct even when the tick
        # positions are chosen automatically.
        ax.yaxis.set_major_formatter("{x:.2g}")

    if plotted_something:
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(os.path.join(folder_path, "exploratory_histograms.png"), dpi=300)
        plt.show()
    else:
        # Data was loaded but none of the plot channels were present:
        # do not write an empty figure to disk.
        plt.close(fig)
        print("⚠️ No plots generated due to lack of valid data.")
else:
    print("⚠️ No plots generated due to lack of valid data.")
