Create partial correlation tables


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statannot import add_stat_annotation
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
from statannotations.Annotator import Annotator
import itertools

# ignore warnings for easier plotting
import warnings

warnings.filterwarnings("ignore")

from load_data import load_data, return_asterisks_p

In [6]:
# Fixed category orderings per grouping column, so that hue levels are always
# drawn in the same order in the figures.
map_hue_color = dict(
    G_norm=["G_z<=0", "G_z>0"],
    Corr_norm=["C_z<=0", "C_z>0"],
    disease=["HC", "MS"],
    GROUP=["HC", "CIS", "RRMS", "SPMS", "PPMS"],
    CENTER=[
        "CLINIC",
        "MAINZ",
        "MILAN",
        "NAPLES",
        "OSLO",
        "LONDON",
        "AMSTERDAM",
    ],
)

# Collapse the two progressive phenotypes (SPMS, PPMS) into a single "PMS"
# label; every other group maps to itself.
mapping_prog = dict(
    HC="HC",
    CIS="CIS",
    RRMS="RRMS",
    SPMS="PMS",
    PPMS="PMS",
)

df_merged = load_data()
print(df_merged.columns.values)

# Quotient metrics: each new column is <numerator column> / <denominator column>.
_quotient_columns = {
    "q_Comm_spl": ("Comm_ratio", "SC_spl_full"),
    "q_Comm_eff": ("Comm_ratio", "SC_eff_full"),
    "q_CC_spl": ("CC_ratio_area", "SC_spl_full"),
    "q_CC_eff": ("CC_ratio_area", "SC_eff_full"),
}
for _new_col, (_numerator, _denominator) in _quotient_columns.items():
    df_merged[_new_col] = df_merged[_numerator] / df_merged[_denominator]

print(len(df_merged))

Initial length: 730
disease
HC    216
MS    514
dtype: int64
QC N: 5
QC Y: 724
LONDON2: 27
['SubjID' 'DOB' 'SEX' 'GROUP' 'ONSET_DATE' 'MR_DATE' 'TREATMENT_AT_MRI'
 'TREATMENT' 'EDSS' 'YED' 'SDMT' 'RAO_SRTS' 'RAO_SRTR' 'RAO_SRTD'
 'RAO_10_36_SRTR' 'RAO_10_36_SRTD' 'RAO_WLG' 'RAO_PASAT' 'CENTER'
 'FMRI_TR' 'FMRI_NSCANS' 'FMRI_SCANTIME' 'AGE' 'DD' 'BICAMS_CVLT2'
 'BICAMS_BVMTR' 'NPS_x' '9HPT_dH' '9HPT_ndH' 'T25FW' 'motor_x'
 'Datecognitiveevaluation' 'TypeRAO' 'RAO_PASAT2sec'
 'RAO_WLGperseveration' 'RAO_WLGintrusion' 'Right9HPT' 'Left9HPT'
 'MFIS_TOTAL' 'MFIS_PHYS_FUNCT' 'MFIS_COGN_FUNCT' 'MFIS_SOCIAL_FUNCT'
 'BDI_II' '9HPT_average' 'notes_NPS' '9HPT_dH_' 'DWI_PROT' 'SCANNER'
 'SRTL_sum' 'PASAT2' 'WRDLG' 'QC' 'CENTER2' 'GM' 'GMF' 'WM' 'WMF' 'CSF'
 'CSFF' 'LV' 'LVF' 'BPF' 'RDwm_mean' 'RDwm_std' 'RDlwm_mean' 'RDlwm_std'
 'RDl_mean' 'RDl_std' 'RD_gm' 'RD_gmwm' 'Meta_x' 'Meta_noreg'
 'Unnamed: 21' 'Unnamed: 22' 'Unnamed: 23' 'FAwm_mean' 'FAwm_std'
 'FAlwm_mean' 'FAlwm_std' 'FAl_mean' 'FAl_st

In [3]:
## 4.2 Direct partial correlations between selected biomarkers and G/cs
## Here it is part of what we want to do with 5.2 without correction, so no need
def partial_corr(df, dependent_vars, cs=False, biomarkers=None):
    """Tabulate partial correlations between biomarkers and other variables.

    For every (biomarker, dependent variable) pair, both variables are
    regressed with OLS on ``AGE + C(SEX) + C(CENTER)`` and the Pearson
    correlation between the two residual series is computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. Must contain the columns ``AGE``, ``SEX``, ``CENTER``,
        every biomarker and every name in ``dependent_vars``.
    dependent_vars : list of str
        Column names to correlate against each biomarker.
    cs : bool, optional
        Unused; kept only so existing callers that pass it keep working.
    biomarkers : list of str, optional
        Biomarker columns to analyse. Defaults to
        ``["GMF", "WMF", "LVF", "BPF", "EDSS", "SDMT"]``.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``dependent_vars`` and one column per biomarker
        (columns axis named ``"Biomarker"``). Each cell is a string of the
        form ``"r=<r>, p=<p><asterisks>"``.
    """
    covariates = ["AGE", "SEX", "CENTER"]
    if biomarkers is None:
        biomarkers = ["GMF", "WMF", "LVF", "BPF", "EDSS", "SDMT"]

    # Collect one record per biomarker and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and row-by-row growth is
    # quadratic anyway.
    records = []
    for biomarker in biomarkers:
        record = {"Biomarker": biomarker}
        for xvar in dependent_vars:
            # Drop rows with a missing covariate too. Otherwise the formula
            # fit silently discards them, the residuals no longer cover every
            # row, and the correlation comes out as NaN.
            df_pair = df.dropna(subset=[biomarker, xvar, *covariates])

            # Residuals are kept as local Series instead of being written
            # back onto the dropna() result (chained-assignment hazard).
            biomarker_resid = smf.ols(
                f"{biomarker} ~ AGE + C(SEX) + C(CENTER)", data=df_pair
            ).fit().resid
            xvar_resid = smf.ols(
                f"{xvar} ~ AGE + C(SEX) + C(CENTER)", data=df_pair
            ).fit().resid

            # NOTE(review): pearsonr's p-value does not account for the
            # degrees of freedom used up by the covariate regressions, so it
            # is slightly optimistic for a true partial correlation.
            r, p = stats.pearsonr(biomarker_resid, xvar_resid)
            ast = return_asterisks_p(p)
            record[xvar] = f"r={r:.2f}, p={p:.1e}{ast}"
        records.append(record)

    # Explicit columns keep the layout stable even when no rows were produced.
    df_results_glm = pd.DataFrame(
        records, columns=["Biomarker", *dependent_vars]
    ).set_index("Biomarker")
    return df_results_glm.T


# Partial-correlation table for the raw structural / functional metrics.
df_results = partial_corr(
    df_merged,
    dependent_vars=[
        "Comm_ratio",
        "SC_spl_full",
        "SC_eff_full",
        "CC_ratio_area",
        "FC_spl_full",
        "FC_eff_full",
    ],
    cs=False,
)

# Same analysis for the quotient metrics added to df_merged earlier.
df_results_q = partial_corr(
    df_merged,
    dependent_vars=["q_Comm_spl", "q_Comm_eff", "q_CC_spl", "q_CC_eff"],
    cs=False,
)

# Row counts of both tables, one per line.
print(len(df_results), len(df_results_q), sep="\n")

6
4


In [4]:
## Create LaTeX tables

# Row (index) labels for the LaTeX table, listed in display order.
# to_latex(escape=False) writes labels verbatim, so the two FC rows, which
# keep their column names, need their underscores escaped by hand: a bare
# "_" in LaTeX text mode does not compile.
rename_dict_index = {
    "Comm_ratio": "Commisural ratio",
    "CC_ratio_area": "Corpus callosum area ratio",
    "SC_spl_full": "Mean shortest path length",
    "SC_eff_full": "Mean efficiency",
    "FC_spl_full": r"FC\_spl\_full",
    "FC_eff_full": r"FC\_eff\_full",
}

# Relabel the rows (no in-place mutation), then put them in display order:
# Commisural ratio, Corpus callosum area ratio, Mean shortest path length,
# Mean efficiency, followed by the two FC metrics.
df_results = df_results.rename(index=rename_dict_index).reindex(
    [
        "Commisural ratio",
        "Corpus callosum area ratio",
        "Mean shortest path length",
        "Mean efficiency",
        r"FC\_spl\_full",
        r"FC\_eff\_full",
    ]
)

# print latex
print(df_results.to_latex(escape=False))

\begin{tabular}{lllllll}
\toprule
Biomarker &                   GMF &                    WMF &                    LVF &                    BPF &                   EDSS &                   SDMT \\
\midrule
Commisural ratio           &  r=0.21, p=3.9e-08*** &   r=0.48, p=8.0e-42*** &  r=-0.44, p=7.6e-35*** &   r=0.40, p=3.3e-28*** &  r=-0.17, p=2.0e-04*** &   r=0.25, p=2.9e-09*** \\
Corpus callosum area ratio &    r=-0.01, p=8.4e-01 &   r=0.35, p=6.5e-21*** &  r=-0.23, p=4.0e-10*** &   r=0.20, p=1.0e-07*** &  r=-0.16, p=4.9e-04*** &   r=0.33, p=2.5e-15*** \\
Mean shortest path length  &    r=-0.01, p=7.6e-01 &   r=0.38, p=3.2e-25*** &    r=-0.09, p=2.3e-02* &   r=0.22, p=6.5e-09*** &   r=-0.13, p=3.5e-03** &   r=0.17, p=3.6e-05*** \\
Mean efficiency            &    r=-0.06, p=1.1e-01 &  r=-0.47, p=1.2e-39*** &   r=0.13, p=7.9e-04*** &  r=-0.31, p=3.2e-17*** &   r=0.19, p=1.9e-05*** &  r=-0.23, p=4.4e-08*** \\
FC_spl_full                &   r=-0.09, p=2.0e-02* &     r=-0.02, p=6.0e-01 &  

In [5]:
## Create LaTeX tables

# Row (index) labels for the quotient metrics; the dict order is also the
# order in which the rows appear in the table.
rename_dict_index = {
    "q_Comm_spl": "Commisural ratio / SPL",
    "q_Comm_eff": "Commisural ratio / Eff.",
    "q_CC_spl": "CC area ratio / SPL",
    "q_CC_eff": "CC area ratio / Eff.",
}

# Relabel the rows, then order them: Commisural ratio / SPL,
# Commisural ratio / Eff., CC area ratio / SPL, CC area ratio / Eff.
row_order_q = list(rename_dict_index.values())
df_results_q = df_results_q.rename(index=rename_dict_index).reindex(row_order_q)

# Emit the table as LaTeX source; cell text is written verbatim.
print(df_results_q.to_latex(escape=False))

\begin{tabular}{lllllll}
\toprule
Biomarker &                   GMF &                   WMF &                    LVF &                   BPF &                   EDSS &                  SDMT \\
\midrule
Commisural ratio / SPL  &  r=0.25, p=2.3e-11*** &  r=0.41, p=5.3e-30*** &  r=-0.42, p=1.4e-30*** &  r=0.38, p=7.6e-26*** &   r=-0.12, p=6.4e-03** &  r=0.22, p=1.3e-07*** \\
Commisural ratio / Eff. &  r=0.16, p=2.5e-05*** &  r=0.47, p=7.5e-40*** &  r=-0.43, p=4.9e-32*** &  r=0.37, p=9.8e-24*** &  r=-0.19, p=2.9e-05*** &  r=0.23, p=5.2e-08*** \\
CC area ratio / SPL     &     r=0.00, p=9.4e-01 &     r=0.06, p=1.1e-01 &  r=-0.14, p=2.2e-04*** &     r=0.04, p=3.2e-01 &     r=-0.06, p=1.8e-01 &  r=0.18, p=2.5e-05*** \\
CC area ratio / Eff.    &    r=-0.01, p=8.5e-01 &  r=0.46, p=2.2e-37*** &  r=-0.25, p=2.1e-11*** &  r=0.27, p=8.3e-13*** &  r=-0.19, p=1.5e-05*** &  r=0.33, p=6.3e-16*** \\
\bottomrule
\end{tabular}

