Create partial correlation tables


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statannot import add_stat_annotation
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
from statannotations.Annotator import Annotator
import itertools

# ignore warnings for easier plotting
import warnings

warnings.filterwarnings("ignore")

from load_data import load_data, return_asterisks_p

In [3]:
# Display order of the categories of each hue variable.
# Helper lookup so that figures list the categories in a fixed order.
map_hue_color = dict(
    G_norm=["G_z<=0", "G_z>0"],
    Corr_norm=["C_z<=0", "C_z>0"],
    disease=["HC", "MS"],
    GROUP=["HC", "CIS", "RRMS", "SPMS", "PPMS"],
    CENTER=["CLINIC", "MAINZ", "MILAN", "NAPLES", "OSLO", "LONDON", "AMSTERDAM"],
)

# Merge the progressive groups (SPMS and PPMS) under a single "PMS" label;
# every other group maps to itself.
mapping_prog = {
    group: ("PMS" if group in ("SPMS", "PPMS") else group)
    for group in map_hue_color["GROUP"]
}

# NOTE(review): the frame returned by load_data() is discarded, because
# `df_merged` is rebound by read_csv on the next statement. The call is kept
# as-is since it may be wanted for its printed summary or other side effects
# - TODO confirm and drop it if it is not needed.
df_merged = load_data()

# NOTE(review): hard-coded absolute path to a local file; consider a
# configurable data directory so the notebook runs on other machines.
df_merged = pd.read_csv("/home/gerard/VHIR/MAGNIMS_DEFINITIVE_RESULTS/df_merged_combat.csv")

print(df_merged.columns.values)

# Quotient columns: each q_* column is a ratio column divided by a
# network column (SC_spl_full or SC_eff_full).
for quotient_col, numerator_col, denominator_col in (
    ("q_Comm_spl", "Comm_ratio", "SC_spl_full"),
    ("q_Comm_eff", "Comm_ratio", "SC_eff_full"),
    ("q_CC_spl", "CC_ratio_area", "SC_spl_full"),
    ("q_CC_eff", "CC_ratio_area", "SC_eff_full"),
):
    df_merged[quotient_col] = df_merged[numerator_col] / df_merged[denominator_col]

print(len(df_merged))

Initial length: 730
disease
HC    216
MS    514
dtype: int64
QC N: 5
QC Y: 724
LONDON2: 27
['SubjID' 'DOB' 'SEX' 'GROUP' 'ONSET_DATE' 'MR_DATE' 'TREATMENT_AT_MRI'
 'TREATMENT' 'EDSS' 'YED' 'SDMT' 'RAO_SRTS' 'RAO_SRTR' 'RAO_SRTD'
 'RAO_10_36_SRTR' 'RAO_10_36_SRTD' 'RAO_WLG' 'RAO_PASAT' 'CENTER'
 'FMRI_TR' 'FMRI_NSCANS' 'FMRI_SCANTIME' 'AGE' 'DD' 'BICAMS_CVLT2'
 'BICAMS_BVMTR' 'NPS_x' '9HPT_dH' '9HPT_ndH' 'T25FW' 'motor_x'
 'Datecognitiveevaluation' 'TypeRAO' 'RAO_PASAT2sec'
 'RAO_WLGperseveration' 'RAO_WLGintrusion' 'Right9HPT' 'Left9HPT'
 'MFIS_TOTAL' 'MFIS_PHYS_FUNCT' 'MFIS_COGN_FUNCT' 'MFIS_SOCIAL_FUNCT'
 'BDI_II' '9HPT_average' 'notes_NPS' '9HPT_dH_' 'DWI_PROT' 'SCANNER'
 'SRTL_sum' 'PASAT2' 'WRDLG' 'QC' 'CENTER2' 'GM' 'GMF' 'WM' 'WMF' 'CSF'
 'CSFF' 'LV' 'LVF' 'BPF' 'RDwm_mean' 'RDwm_std' 'RDlwm_mean' 'RDlwm_std'
 'RDl_mean' 'RDl_std' 'RD_gm' 'RD_gmwm' 'Meta_x' 'Meta_noreg'
 'Unnamed: 21' 'Unnamed: 22' 'Unnamed: 23' 'FAwm_mean' 'FAwm_std'
 'FAlwm_mean' 'FAlwm_std' 'FAl_mean' 'FAl_st

In [7]:
def benjamini_hochberg_correction(p_values, alpha=0.05):
    """
    Perform Benjamini-Hochberg FDR correction on a list of p-values.

    Parameters:
    - p_values: list or numpy array of p-values
    - alpha: significance level

    Returns:
    - critical_values: BH critical values (rank * alpha / m), ordered by
      ascending p-value rank (NOT in the order of the input)
    - reject: boolean array, in input order, indicating which null
      hypotheses are rejected by the step-up procedure
    - corrected_p_values: BH-adjusted p-values, in input order, capped at 1
    """
    p_values = np.asarray(p_values, dtype=float)
    m = len(p_values)
    if m == 0:
        # Nothing to correct; return empty arrays of the expected dtypes.
        return (
            np.array([], dtype=float),
            np.array([], dtype=bool),
            np.array([], dtype=float),
        )

    sorted_indices = np.argsort(p_values)
    sorted_p_values = p_values[sorted_indices]
    ranks = np.arange(1, m + 1)
    critical_values = ranks * alpha / m

    # Step-up rule: find the largest rank whose p-value is below its critical
    # value and reject every hypothesis up to (and including) that rank.
    below = np.flatnonzero(sorted_p_values <= critical_values)
    reject = np.zeros(m, dtype=bool)
    if below.size:
        reject[sorted_indices[: below[-1] + 1]] = True

    # Adjusted p-values: p_(i) * m / i for EVERY hypothesis, then a running
    # minimum taken from the largest p-value downwards so the adjusted values
    # are monotone in the raw p-values. Without this, a non-rejected test could
    # be reported with an "adjusted" p-value below alpha.
    scaled = sorted_p_values * m / ranks
    adjusted_sorted = np.minimum.accumulate(scaled[::-1])[::-1]

    corrected_p_values = np.empty(m, dtype=float)
    # Ensure p-values don't exceed 1 and scatter back to the input order.
    corrected_p_values[sorted_indices] = np.minimum(1, adjusted_sorted)
    return critical_values, reject, corrected_p_values


def bonferroni_correction(p_values, alpha=0.05):
    """
    Bonferroni-correct a collection of p-values.

    Each p-value is multiplied by the number of tests and capped at 1; a
    null hypothesis is rejected when its corrected p-value is strictly
    below alpha.

    Parameters:
    - p_values: list or numpy array of p-values
    - alpha: significance level

    Returns:
    - corrected_p_values: numpy array of Bonferroni corrected p-values
    - reject: boolean array indicating which null hypotheses are rejected
    """
    n_tests = len(p_values)
    scaled = np.array(p_values) * n_tests
    # Cap at 1 so the result is still a valid p-value.
    corrected_p_values = np.minimum(1, scaled)
    reject = corrected_p_values < alpha
    return corrected_p_values, reject

def partial_corr(df, dependent_vars, cs=False):
    """
    Generate a table of partial correlations between selected biomarkers and dependent variables.

    For every (biomarker, variable) pair, both columns are regressed on
    ``AGE + C(SEX)`` with OLS and the Pearson correlation between the two
    residual series is computed. The p-values of all pairs are then
    Bonferroni-corrected together.

    Parameters:
    - df: DataFrame holding the AGE and SEX columns, the biomarker columns
      listed in ``structural_dmg_vars`` below, and every column named in
      ``dependent_vars``
    - dependent_vars: list of column names to correlate with each biomarker
    - cs: unused; kept so existing callers that pass it keep working

    Returns:
    - DataFrame with one row per entry of ``dependent_vars`` and one column
      per biomarker; each cell is the string
      ``"r=<r>, p=<corrected p><asterisks>"``
    """
    structural_dmg_vars = ["GMF", "WMF", "LVF", "BPF", "EDSS", "SDMT"]
    covariates = ["AGE", "SEX"]

    r_values = []
    p_values = []
    for biomarker in structural_dmg_vars:
        for xvar in dependent_vars:
            # Drop rows missing anything either regression needs. Including
            # the covariates keeps both residual series on the same rows;
            # otherwise OLS silently drops extra rows and the residuals
            # would no longer line up.
            df_pair = df.dropna(subset=[biomarker, xvar, *covariates])

            resid_biomarker = smf.ols(
                f"{biomarker} ~ AGE + C(SEX)", data=df_pair
            ).fit().resid
            resid_xvar = smf.ols(
                f"{xvar} ~ AGE + C(SEX)", data=df_pair
            ).fit().resid

            r, p = stats.pearsonr(resid_biomarker, resid_xvar)
            r_values.append(r)
            p_values.append(p)

    # Apply Bonferroni correction across every biomarker/variable pair
    corrected_p_values, _ = bonferroni_correction(p_values)

    # Build one record per biomarker; the flat r/p lists are consumed in the
    # same (biomarker, xvar) order in which they were filled above.
    stats_iter = iter(zip(r_values, corrected_p_values))
    records = []
    for biomarker in structural_dmg_vars:
        record = {"Biomarker": biomarker}
        for xvar in dependent_vars:
            r, p = next(stats_iter)
            ast = return_asterisks_p(p)
            record[xvar] = f"r={r:.2f}, p={p:.1e}{ast}"
        records.append(record)

    # Construct the frame once (DataFrame.append no longer exists in
    # pandas >= 2.0) with an explicit column order.
    df_results_glm = pd.DataFrame(
        records, columns=["Biomarker", *dependent_vars]
    )
    return df_results_glm.set_index("Biomarker").T

# Columns correlated against the biomarkers: first the metrics as stored in
# df_merged, then the derived q_* quotient columns.
metric_columns = ["Comm_ratio", "SC_spl_full", "SC_eff_full", "CC_ratio_area"]
quotient_columns = ["q_Comm_spl", "q_Comm_eff", "q_CC_spl", "q_CC_eff"]

df_results = partial_corr(df_merged, metric_columns, False)
df_results_q = partial_corr(df_merged, quotient_columns, False)

# Sanity check: number of rows in each table.
for table in (df_results, df_results_q):
    print(len(table))

4
4


In [8]:
## Create LaTeX tables

# Human-readable labels for the rows (index) of df_results.
# NOTE(review): "Commisural" looks like a misspelling of "Commissural"; the
# label is kept unchanged here because it is emitted in the LaTeX table.
rename_dict_index = {
    "Comm_ratio": "Commisural ratio",
    "SC_spl_full": "Mean shortest path length",
    "SC_eff_full": "Mean efficiency",
    "CC_ratio_area": "Corpus callosum area ratio",
}

# Row order of the final table: Commisural ratio, Corpus callosum area ratio,
# Mean shortest path length, Mean efficiency.
row_order = [
    rename_dict_index[column]
    for column in ("Comm_ratio", "CC_ratio_area", "SC_spl_full", "SC_eff_full")
]

# Relabel, then reorder the rows.
df_results = df_results.rename(index=rename_dict_index).reindex(row_order)

# print latex
print(df_results.to_latex(escape=False))

\begin{tabular}{lllllll}
\toprule
Biomarker &                   GMF &                    WMF &                    LVF &                    BPF &                   EDSS &                   SDMT \\
\midrule
Commisural ratio           &  r=0.23, p=1.8e-08*** &   r=0.51, p=1.3e-45*** &  r=-0.41, p=1.4e-28*** &   r=0.44, p=2.1e-32*** &   r=-0.16, p=8.2e-03** &   r=0.27, p=1.1e-09*** \\
Corpus callosum area ratio &    r=-0.05, p=1.0e+00 &   r=0.33, p=3.0e-17*** &  r=-0.22, p=1.0e-07*** &   r=0.16, p=4.6e-04*** &  r=-0.20, p=3.0e-04*** &   r=0.33, p=2.1e-14*** \\
Mean shortest path length  &     r=0.00, p=1.0e+00 &   r=0.39, p=1.2e-25*** &   r=-0.14, p=5.7e-03** &   r=0.24, p=7.4e-09*** &    r=-0.15, p=3.0e-02* &   r=0.22, p=5.1e-06*** \\
Mean efficiency            &    r=-0.06, p=1.0e+00 &  r=-0.46, p=8.0e-36*** &    r=0.15, p=2.3e-03** &  r=-0.31, p=1.9e-15*** &    r=0.18, p=1.5e-03** &  r=-0.24, p=1.9e-07*** \\
\bottomrule
\end{tabular}



In [9]:
## Create LaTeX tables

# Human-readable labels for the rows (index) of df_results_q.
# NOTE(review): "Commisural" looks like a misspelling of "Commissural"; the
# label is kept unchanged here because it is emitted in the LaTeX table.
rename_dict_index = {
    "q_Comm_spl": "Commisural ratio / SPL",
    "q_Comm_eff": "Commisural ratio / Eff.",
    "q_CC_spl": "CC area ratio / SPL",
    "q_CC_eff": "CC area ratio / Eff.",
}

# Row order of the final table: Commisural ratio / SPL,
# Commisural ratio / Eff., CC area ratio / SPL, CC area ratio / Eff.
row_order = [
    rename_dict_index[column]
    for column in ("q_Comm_spl", "q_Comm_eff", "q_CC_spl", "q_CC_eff")
]

# Relabel, then reorder the rows.
df_results_q = df_results_q.rename(index=rename_dict_index).reindex(row_order)

# print latex
print(df_results_q.to_latex(escape=False))

\begin{tabular}{lllllll}
\toprule
Biomarker &                   GMF &                   WMF &                    LVF &                   BPF &                   EDSS &                  SDMT \\
\midrule
Commisural ratio / SPL  &  r=0.26, p=1.7e-10*** &  r=0.40, p=1.9e-27*** &  r=-0.40, p=1.1e-26*** &  r=0.39, p=2.6e-25*** &     r=-0.12, p=2.3e-01 &  r=0.22, p=6.2e-06*** \\
Commisural ratio / Eff. &  r=0.20, p=3.9e-06*** &  r=0.54, p=1.2e-53*** &  r=-0.38, p=1.5e-23*** &  r=0.44, p=8.6e-33*** &   r=-0.18, p=1.2e-03** &  r=0.29, p=8.7e-11*** \\
CC area ratio / SPL     &    r=-0.05, p=1.0e+00 &     r=0.03, p=1.0e+00 &     r=-0.10, p=1.8e-01 &    r=-0.01, p=1.0e+00 &     r=-0.09, p=1.0e+00 &   r=0.15, p=7.6e-03** \\
CC area ratio / Eff.    &    r=-0.01, p=1.0e+00 &  r=0.47, p=1.4e-37*** &  r=-0.23, p=1.6e-08*** &  r=0.27, p=2.0e-11*** &  r=-0.22, p=1.6e-05*** &  r=0.36, p=2.6e-17*** \\
\bottomrule
\end{tabular}

