Authors: Antoine A. Ruzette, Simon F. Nørrelykke
Date: 2024-07-21

This notebook plots the correlation coefficients between cell pixel intensity and the signed distance to stroma for two partitions: inside a stromal region (distance < 0) and outside a stromal region (distance > 0), as defined by the modelled stromal border. It supports plotting results from parameter screens in QuPath. 

Contains the code to plot data from pNDRG1 images only. 

In [None]:
! pip install "fitter>=1.6.0" "ipykernel>=6.29.5" "matplotlib>=3.10.0" "natsort>=8.4.0" "numpy>=2.2.3" "pandas>=2.2.3" "scipy>=1.15.2" "seaborn>=0.13.2" "setuptools>=75.8.0"

In [10]:
import os
import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from matplotlib.pyplot import ScalarFormatter

# Colorblind-friendly palette (hex RGB strings), one colour per line
CB_palette = [
    '#377eb8',  # blue
    '#ff7f00',  # orange
    '#4daf4a',  # green
    '#f781bf',  # pink
    '#a65628',  # brown
    '#984ea3',  # purple
    '#999999',  # grey
    '#e41a1c',  # red
    '#dede00',  # yellow
]

In [None]:
def bootstrap_sem(data, n_bootstrap=500, random_state=None):
    """Estimate the standard error of the median by bootstrapping.

    The data are resampled with replacement (same size as the input)
    ``n_bootstrap`` times; the median of every resample is taken and the
    standard deviation of those medians is returned.

    Parameters
    ----------
    data : pandas.Series
        Observations to resample (anything exposing ``.sample`` and ``.median``).
    n_bootstrap : int, default 500
        Number of bootstrap resamples.
    random_state : int, array-like, numpy.random.Generator or None, default None
        Seed (or generator) used for the resampling. With ``None`` the
        resampling relies on pandas' default, i.e. NumPy's global random
        state, exactly as before this parameter existed.

    Returns
    -------
    float
        Population standard deviation (``ddof=0``) of the bootstrap medians.
    """
    # One generator shared by all resamples, so a single seed fixes the whole
    # bootstrap while successive draws still differ from each other.
    rng = None if random_state is None else np.random.default_rng(random_state)
    medians = [
        data.sample(frac=1, replace=True, random_state=rng).median()
        for _ in range(n_bootstrap)
    ]
    return np.std(medians)

# Path to the folder containing CSV files
folder_path = "path/to/your/folder"

# List the CSV files once, sorted so that the load order (and anything derived
# from it) does not depend on the filesystem's directory ordering
csv_files = sorted(file for file in os.listdir(folder_path) if file.endswith('.csv'))
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {folder_path!r}.")

# Create color mapping for each CSV file.
# The plots index this palette at positions 0 and 3, so request at least four
# colours even when fewer CSV files are present (avoids an IndexError).
color_palette = sns.color_palette("husl", max(len(csv_files), 4))
image_colors = dict(zip(csv_files, color_palette))

# Read every CSV file; strip leading/trailing whitespace from the column names
# per file so that headers differing only in whitespace line up when concatenated
dfs = [
    pd.read_csv(os.path.join(folder_path, file)).rename(columns=str.strip)
    for file in csv_files
]

# Concatenate all DataFrames into a single DataFrame
df_stroma = pd.concat(dfs, ignore_index=True)

# Print the column names to check
print("Column names in the DataFrame after stripping whitespace:")
print(df_stroma.columns)

# Check that the grouping columns and the correlation columns are all present
required_columns = [
    'sigma',
    'TRITC FN',
    'pearsonInsideStromaKerPNDRG1',
    'pearsonOutsideStromaKerPNDRG1',
]
missing_columns = [col for col in required_columns if col not in df_stroma.columns]
if missing_columns:
    raise KeyError(f"Column(s) {missing_columns} not present in the DataFrame.")

# Correlation columns summarised for every screened parameter value
corr_columns = ['pearsonInsideStromaKerPNDRG1', 'pearsonOutsideStromaKerPNDRG1']

# Seed NumPy's global random state, which pandas' `.sample` falls back on when
# no `random_state` is passed, so the bootstrapped error bars are reproducible
np.random.seed(0)

# Group the data by 'sigma' and calculate the median
grouped_sigma = df_stroma.groupby('sigma')
avg_corr_sigma = grouped_sigma[corr_columns].median()

# Calculate the SEM for each group using bootstrapping.
# `agg` on the selected columns hands each column of each group to
# bootstrap_sem as a Series, without touching the grouping column
# (unlike `groupby.apply`, whose use of grouping columns is deprecated).
sem_corr_sigma = grouped_sigma[corr_columns].agg(bootstrap_sem)

# Group the data by the fibronectin threshold column 'TRITC FN' and calculate the median
grouped_FN_568 = df_stroma.groupby('TRITC FN')
avg_corr_FN_568 = grouped_FN_568[corr_columns].median()

# Calculate the SEM for each group using bootstrapping
sem_corr_FN_568 = grouped_FN_568[corr_columns].agg(bootstrap_sem)

# Print median value range for sigma plot
inside_range_sigma = avg_corr_sigma["pearsonInsideStromaKerPNDRG1"].min(), avg_corr_sigma["pearsonInsideStromaKerPNDRG1"].max()
outside_range_sigma = avg_corr_sigma["pearsonOutsideStromaKerPNDRG1"].min(), avg_corr_sigma["pearsonOutsideStromaKerPNDRG1"].max()

print(f"[Sigma] Inside stroma median correlation spans from {inside_range_sigma[0]:.2f} to {inside_range_sigma[1]:.2f}")
print(f"[Sigma] Outside stroma median correlation spans from {outside_range_sigma[0]:.2f} to {outside_range_sigma[1]:.2f}")

def plot_sensitivity(avg_corr, sem_corr, ylabel, xlim, legend_loc, filename):
    """Plot median correlations (x) against a screened parameter (y) and save the figure.

    Parameters
    ----------
    avg_corr : pandas.DataFrame
        Median correlations indexed by the parameter value, with columns
        'pearsonInsideStromaKerPNDRG1' and 'pearsonOutsideStromaKerPNDRG1'.
    sem_corr : pandas.DataFrame
        Error-bar half-widths with the same columns as ``avg_corr``.
    ylabel : str
        Label of the y-axis (the screened parameter).
    xlim : tuple of float
        Lower and upper limit of the correlation axis.
    legend_loc : str
        Matplotlib legend location.
    filename : str
        Name of the PNG file written into ``folder_path``.

    Returns
    -------
    (matplotlib.figure.Figure, matplotlib.axes.Axes)
    """
    fig, ax = plt.subplots(figsize=(10, 9))

    # (column, marker, colour, legend label) for the two partitions;
    # color_palette must hold at least four colours for index 3 to exist
    partitions = [
        ("pearsonInsideStromaKerPNDRG1", '^', color_palette[0], 'Inside stroma'),
        ("pearsonOutsideStromaKerPNDRG1", 'o', color_palette[3], 'Outside stroma'),
    ]
    for column, marker, color, label in partitions:
        ax.errorbar(avg_corr[column], avg_corr.index, xerr=sem_corr[column],
                    fmt=marker, color=color, label=label, markersize=10, capsize=5)
    ax.axvline(x=0, color='black', ls='--')
    ax.legend(prop={'size': 20}, loc=legend_loc)
    ax.set_xlim(*xlim)

    # Each axis gets its own formatter instance (a formatter is bound to the
    # axis it is set on). Values outside 10^-2..10^2 use scientific notation.
    formatter = ScalarFormatter(useMathText=True)
    formatter.set_powerlimits((-2, 2))
    ax.yaxis.set_major_formatter(formatter)
    ax.tick_params(axis='both', which='major', labelsize=30)
    ax.tick_params(axis='both', which='minor', labelsize=30)
    offset_text = ax.yaxis.get_offset_text()
    offset_text.set_fontsize(30)
    offset_text.set_color('black')

    ax.set_ylabel(ylabel, fontsize=30)
    ax.set_xlabel('Correlation', fontsize=30)

    fig.tight_layout()
    fig.savefig(os.path.join(folder_path, filename), dpi=300)
    plt.show()
    return fig, ax


# First plot: sensitivity to the smoothing 'sigma'
fig1, ax1 = plot_sensitivity(
    avg_corr_sigma,
    sem_corr_sigma,
    ylabel=r'Smoothing sigma $\sigma$, a.u.',
    xlim=(-0.10, 0.15),
    legend_loc='lower left',
    filename='sensitivity_stroma_annotation_sigma.png',
)

# Second plot: sensitivity to the fibronectin intensity threshold ('TRITC FN')
fig2, ax2 = plot_sensitivity(
    avg_corr_FN_568,
    sem_corr_FN_568,
    ylabel='Fibronectin intensity threshold, a.u.',
    xlim=(-0.05, 0.2),
    legend_loc='lower right',
    filename='sensitivity_stroma_annotation_FN_568.png',
)

## Wilcoxon and paired t-test to compare the two distributions

In [9]:
from scipy.stats import wilcoxon

# Wilcoxon signed-rank tests on the paired median correlations
# (inside vs. outside stroma, one pair per screened parameter value).
inside_col = "pearsonInsideStromaKerPNDRG1"
outside_col = "pearsonOutsideStromaKerPNDRG1"

# Sigma screen
inside_sigma, outside_sigma = avg_corr_sigma[inside_col], avg_corr_sigma[outside_col]
wilcoxon_sigma = wilcoxon(inside_sigma, outside_sigma, alternative='two-sided')
stat_sigma, pvalue_sigma = wilcoxon_sigma.statistic, wilcoxon_sigma.pvalue
print(f"[Sigma] Wilcoxon p-value: {pvalue_sigma:.3e}")

# Fibronectin threshold (FN_568) screen
inside_FN_568, outside_FN_568 = avg_corr_FN_568[inside_col], avg_corr_FN_568[outside_col]
wilcoxon_FN_568 = wilcoxon(inside_FN_568, outside_FN_568, alternative='two-sided')
stat_FN568, pvalue_FN_568 = wilcoxon_FN_568.statistic, wilcoxon_FN_568.pvalue
print(f"[FN_568] Wilcoxon p-value: {pvalue_FN_568:.3e}")


[Sigma] Wilcoxon p-value: 9.766e-04
[FN_568] Wilcoxon p-value: 1.953e-03


In [20]:
from scipy.stats import ttest_rel

# Paired t-tests on the same inside/outside median correlations

# Sigma screen
ttest_sigma = ttest_rel(inside_sigma, outside_sigma)
t_stat_sigma, pval_ttest_sigma = ttest_sigma.statistic, ttest_sigma.pvalue
print(f"[Sigma] Paired t-test p-value: {pval_ttest_sigma:.4e}")

# Fibronectin threshold (FN_568) screen
ttest_FN = ttest_rel(inside_FN_568, outside_FN_568)
t_stat_FN, pval_ttest_FN = ttest_FN.statistic, ttest_FN.pvalue
print(f"[FN_568] Paired t-test p-value: {pval_ttest_FN:.4e}")


[Sigma] Paired t-test p-value: 9.6627e-08
[FN_568] Paired t-test p-value: 2.5919e-03
