Create partial correlation tables


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statannot import add_stat_annotation
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
from statannotations.Annotator import Annotator
import itertools

# ignore warnings for easier plotting
import warnings

warnings.filterwarnings("ignore")

from load_data import load_data, return_asterisks_p

In [None]:
# Category order for each grouping column, so that hue levels always appear
# in the same order in the figures.
map_hue_color = {
    "G_norm": ["G_z<=0", "G_z>0"],
    "Corr_norm": ["C_z<=0", "C_z>0"],
    "disease": ["HC", "MS"],
    "GROUP": ["HC", "CIS", "RRMS", "SPMS", "PPMS"],
    "CENTER": ["CLINIC", "MAINZ", "MILAN", "NAPLES", "OSLO", "LONDON", "AMSTERDAM"],
}

# Collapse both progressive phenotypes (SPMS, PPMS) into a single "PMS" label;
# every other group maps to itself.
mapping_prog = {
    group: "PMS" if group in ("SPMS", "PPMS") else group
    for group in ("HC", "CIS", "RRMS", "SPMS", "PPMS")
}

# NOTE(review): the frame returned by load_data() is replaced on the next
# statement by the CSV read from a hard-coded absolute path -- confirm which
# of the two sources is the intended one.
df_merged = load_data()

df_merged = pd.read_csv("/home/gerard/VHIR/MAGNIMS_DEFINITIVE_RESULTS/df_merged_combat.csv")

print(df_merged.columns.values)

# Quotient measures: each callosal measure divided by each global network
# measure, stored as (new column, numerator column, denominator column).
for new_col, numerator, denominator in (
    ("q_Comm_spl", "Comm_ratio", "SC_spl_full"),
    ("q_Comm_eff", "Comm_ratio", "SC_eff_full"),
    ("q_CC_spl", "CC_ratio_area", "SC_spl_full"),
    ("q_CC_eff", "CC_ratio_area", "SC_eff_full"),
):
    df_merged[new_col] = df_merged[numerator] / df_merged[denominator]

print(len(df_merged))

In [None]:
def benjamini_hochberg_correction(p_values, alpha=0.05):
    """
    Perform Benjamini-Hochberg FDR correction on a list of p-values.

    Parameters:
    - p_values: list or numpy array of p-values
    - alpha: significance level (target false discovery rate)

    Returns:
    - critical_values: BH critical values ``k * alpha / m`` for ranks
      ``k = 1..m`` (ordered by rank, i.e. aligned with the *sorted* p-values)
    - reject: boolean array, in the original input order, indicating which
      null hypotheses are rejected by the step-up procedure
    - corrected_p_values: BH-adjusted p-values in the original input order
    """
    p_values = np.asarray(p_values, dtype=float)
    m = len(p_values)
    if m == 0:
        # Nothing to correct; avoid dividing by m == 0 below.
        return np.zeros(0), np.zeros(0, dtype=bool), np.zeros(0)

    sorted_indices = np.argsort(p_values)
    sorted_p_values = p_values[sorted_indices]
    ranks = np.arange(1, m + 1)
    critical_values = ranks * alpha / m

    # Step-up rule: find the largest rank whose p-value is under its critical
    # value and reject every hypothesis up to and including that rank.
    below = np.nonzero(sorted_p_values <= critical_values)[0]
    reject = np.zeros(m, dtype=bool)
    if below.size:
        reject[sorted_indices[: below[-1] + 1]] = True

    # Adjusted p-value of rank k is min over j >= k of p_(j) * m / j. The
    # running minimum from the largest rank downwards enforces monotonicity,
    # so every test gets an adjusted value that agrees with `reject`.
    # np.fmin is used so a NaN p-value does not propagate to the other tests.
    scaled = sorted_p_values * m / ranks
    adjusted_sorted = np.fmin.accumulate(scaled[::-1])[::-1]

    corrected_p_values = np.empty(m)
    # Scatter back to the input order and cap at 1.
    corrected_p_values[sorted_indices] = np.minimum(1, adjusted_sorted)
    return critical_values, reject, corrected_p_values


def bonferroni_correction(p_values, alpha=0.05):
    """
    Apply the Bonferroni multiple-comparison correction.

    Each p-value is multiplied by the number of tests and capped at 1; a
    hypothesis is rejected when its corrected p-value is strictly below alpha.

    Parameters:
    - p_values: list or numpy array of p-values
    - alpha: significance level

    Returns:
    - corrected_p_values: numpy array of Bonferroni corrected p-values
    - reject: boolean array indicating which null hypotheses are rejected
    """
    n_tests = len(p_values)
    scaled = np.array(p_values) * n_tests
    # Cap at 1 so the result is still a valid probability.
    corrected_p_values = np.minimum(scaled, 1)
    return corrected_p_values, corrected_p_values < alpha

def partial_corr(df, dependent_vars, cs=False):
    """
    Generate a table of partial correlations between selected biomarkers and dependent variables.

    For every (biomarker, dependent variable) pair, both variables are
    regressed on ``AGE + C(SEX)`` with OLS and the Pearson correlation of the
    two residual series is computed. The p-values of all pairs produced by
    one call are then Bonferroni-corrected together.

    Parameters:
    - df: DataFrame containing the biomarker columns (GMF, WMF, LVF, BPF,
      EDSS, SDMT), the columns named in ``dependent_vars``, and AGE and SEX
    - dependent_vars: list of column names to correlate with each biomarker
    - cs: unused; kept so existing callers keep working

    Returns:
    - DataFrame with one row per dependent variable and one column per
      biomarker; each cell is the string ``"r=<r>, p=<corrected p><suffix>"``
      where the suffix comes from ``return_asterisks_p`` (presumably
      significance asterisks -- confirm against load_data).
    """
    structural_dmg_vars = ["GMF", "WMF", "LVF", "BPF", "EDSS", "SDMT"]
    covariates = ["AGE", "SEX"]

    r_values = []
    p_values = []
    for biomarker in structural_dmg_vars:
        for xvar in dependent_vars:
            # Drop rows missing either variable OR a covariate, so both
            # regressions are fitted on the same rows and the two residual
            # vectors line up one-to-one.
            df_pair = df.dropna(subset=[biomarker, xvar, *covariates])

            biomarker_resid = smf.ols(
                f"{biomarker} ~ AGE + C(SEX)", data=df_pair
            ).fit().resid
            xvar_resid = smf.ols(
                f"{xvar} ~ AGE + C(SEX)", data=df_pair
            ).fit().resid

            r, p = stats.pearsonr(
                np.asarray(biomarker_resid), np.asarray(xvar_resid)
            )
            r_values.append(r)
            p_values.append(p)

    # Apply Bonferroni correction across every test run in this call
    corrected_p_values, _ = bonferroni_correction(p_values)

    # Format each cell from the numeric r and the corrected p-value; cells are
    # consumed in the same biomarker-major order in which they were computed.
    cells = iter(
        f"r={r:.2f}, p={p:.1e}{return_asterisks_p(p)}"
        for r, p in zip(r_values, corrected_p_values)
    )
    rows = [
        {"Biomarker": biomarker, **{xvar: next(cells) for xvar in dependent_vars}}
        for biomarker in structural_dmg_vars
    ]

    # Build the frame in one go (DataFrame.append no longer exists in
    # pandas >= 2.0) and transpose so dependent variables become the rows.
    df_results_glm = pd.DataFrame(rows, columns=["Biomarker", *dependent_vars])
    df_results_glm = df_results_glm.set_index("Biomarker")
    return df_results_glm.T

# Partial-correlation table for the raw callosal and network measures.
df_results = partial_corr(
    df_merged,
    dependent_vars=["Comm_ratio", "SC_spl_full", "SC_eff_full", "CC_ratio_area"],
    cs=False,
)

# Same table for the quotient measures (callosal measure / network measure).
df_results_q = partial_corr(
    df_merged,
    dependent_vars=["q_Comm_spl", "q_Comm_eff", "q_CC_spl", "q_CC_eff"],
    cs=False,
)

# Sanity check: number of rows in each table.
print(len(df_results))
print(len(df_results_q))

In [None]:
## LaTeX table for the raw measures

# Display labels for the table rows (the index of df_results).
rename_dict_index = dict(
    Comm_ratio="Commisural ratio",
    SC_spl_full="Mean shortest path length",
    SC_eff_full="Mean efficiency",
    CC_ratio_area="Corpus callosum area ratio",
)

# Relabel the rows, then put them in presentation order: both callosal
# measures first, followed by the two global network measures.
df_results = df_results.rename(index=rename_dict_index).reindex(
    [
        "Commisural ratio",
        "Corpus callosum area ratio",
        "Mean shortest path length",
        "Mean efficiency",
    ]
)

# Emit the table as LaTeX; escape=False leaves the cell text unescaped.
print(df_results.to_latex(escape=False))

In [None]:
## LaTeX table for the quotient measures

# Display labels for the table rows (the index of df_results_q).
rename_dict_index = dict(
    q_Comm_spl="Commisural ratio / SPL",
    q_Comm_eff="Commisural ratio / Eff.",
    q_CC_spl="CC area ratio / SPL",
    q_CC_eff="CC area ratio / Eff.",
)

# Relabel the rows, then order them: commissural-ratio quotients first,
# followed by the corpus-callosum-area quotients.
df_results_q = df_results_q.rename(index=rename_dict_index).reindex(
    [
        "Commisural ratio / SPL",
        "Commisural ratio / Eff.",
        "CC area ratio / SPL",
        "CC area ratio / Eff.",
    ]
)

# Emit the table as LaTeX; escape=False leaves the cell text unescaped.
print(df_results_q.to_latex(escape=False))