## LINEAR MODELS
Compare values of hemispheres to structural values

In [3]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from statannot import add_stat_annotation
import glob
import statsmodels.api as sm
import statsmodels.formula.api as smf
from functools import reduce

# ignore warnings for easier plotting
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Locate and read every input table.
# Both roots are empty strings in this file, so the paths below start at "/";
# NOTE(review): presumably meant to be filled in per machine - confirm.
base_dir_linux = ''
base_dir_win = ''

# Graph-metric tables live under <root>/graph_values.
graph_dir = f'{base_dir_linux}/graph_values'
csv_cc = f'{graph_dir}/cc.csv'
# Alternative structural table, kept for reference:
# csv_hemis = f'{base_dir_win}/graph_values/graph_intrainter_cort_G_SC.csv'
csv_hemis = f'{graph_dir}/graph_dti_G_SC.csv'
csv_hemis_FC = f'{graph_dir}/graph_G_FC.csv'
csv_hemis_nodes = f'{graph_dir}/graph_nodes_SC.csv'

# Subject-level tables sit directly under the root.
csv_total = f'{base_dir_linux}/data_total.csv'
extracted_values_path = f'{base_dir_linux}/extracted_values.csv'

# One DataFrame per table, read in the same order as before.
df_cc = pd.read_csv(csv_cc)
df_hemis = pd.read_csv(csv_hemis)
df_hemis_FC = pd.read_csv(csv_hemis_FC)
df_extracted = pd.read_csv(extracted_values_path)
df_hemis_nodes = pd.read_csv(csv_hemis_nodes)
df_total = pd.read_csv(csv_total)

In [5]:
# Merge the per-subject tables and derive the composite biomarkers.
# pd.merge is called with its default join type on (SubjID, CENTER).
data_frames = [df_total, df_cc, df_hemis, df_extracted, df_hemis_FC]
df_merged = reduce(
    lambda left, right: pd.merge(left, right, on=["SubjID", "CENTER"]),
    data_frames,
)

# Keep only rows flagged QC == "Y".  The explicit copy makes df_merged an
# independent frame, so the column assignments below are not written onto a
# filtered slice of the merge result (chained assignment).
df_merged = df_merged[df_merged.QC == "Y"].copy()

# Binary group label: everything that is not "HC" is labelled "MS".
df_merged['disease'] = np.where(df_merged['GROUP'] == 'HC', "HC", "MS")

# Left/right averages of the hemispheric graph metrics.
df_merged["Full_hemis"] = (df_merged["SC_L_avg_spl"] + df_merged["SC_R_avg_spl"]) / 2
df_merged["FC_spl_full"] = (df_merged["FC_L_avg_spl"] + df_merged["FC_R_avg_spl"]) / 2
df_merged["FC_eff_full"] = (df_merged["FC_L_efficiency"] + df_merged["FC_R_efficiency"]) / 2

# Sum of the five corpus callosum segments, and that sum divided by the
# estimated total intracranial volume.
df_merged["Full_CC"] = (
    df_merged["CC_Posterior"]
    + df_merged["CC_Mid_Posterior"]
    + df_merged["CC_Central"]
    + df_merged["CC_Mid_Anterior"]
    + df_merged["CC_Anterior"]
)
df_merged["Comm_ratio_approx"] = df_merged["Full_CC"] / df_merged["EstimatedTotalIntraCranialVol"]

# sqrt(area) / cbrt(volume): both terms are brought to the same power of
# length before taking the ratio.
df_merged["CC_ratio_area"] = np.sqrt(df_merged["CC_Sag_area"]) / (df_merged["BrainSegVol"]**(1./3.))

# NOTE: despite their names, the next two columns hold log10 of the square
# root / cube root, and "TIV_cubicroot" is computed from BrainSegVol.
df_merged["CC_Sag_area_sqrt"] = np.log10(np.sqrt(df_merged["CC_Sag_area"]))
df_merged["TIV_cubicroot"] = np.log10(df_merged["BrainSegVol"]**(1./3.))


In [None]:
# Normality analysis
# Shapiro-Wilk for every variable except EDSS and SDMT, which go through
# statsmodels' kstest_normal instead.
from scipy import stats
from statsmodels import stats as st

list_of_variables_to_test = ["Full_hemis", "Comm_ratio", "CC_ratio_area", "EDSS", "SDMT", "BPF", "LVF", "RDlwm_mean", "FAlwm_mean"]
for x in list_of_variables_to_test:
    print(x)
    # Test only the observed values: a missing entry would otherwise
    # propagate into the statistic and make the result uninformative.
    observed = df_merged[x].dropna().values
    if x not in ["EDSS", "SDMT"]:
        test = stats.shapiro(observed)
    else:
        test = st.diagnostic.kstest_normal(observed)
    print(test)

In [None]:
# Bivariate density of CC_ratio_area (x) against RDlwm_mean (y),
# coloured by the "disease" column.
sns.kdeplot(
    data=df_merged,
    x="CC_ratio_area",
    y="RDlwm_mean",
    hue="disease",
)

In [None]:
"""
ANALYSIS of the data
Biomarkers to analyze:

Full_CC
Comm_ratio
CC_ratio_area
Full_hemis
FC_eff_full
FC_spl_full

TODO: explain what each variable is.
"""
### Scatterplot G
## Compare different values
# For every variable in `variables`, regress the covariates out of both that
# variable and `vary`, then plot one set of residuals against the other:
# pooled, split by disease, and split by center x disease.

from scipy import stats
sns.set_style("darkgrid")
varx = "dummy"
vary = "FC_spl_full"

# Columns that enter the adjustment models; rows missing any of them are dropped.
covariates = ["AGE", "SEX", "CENTER2"]
# Short x-axis labels for the diffusion metrics; other variables keep their name.
axis_labels = {"RDwm_mean": "RD", "RDlwm_mean": "RD", "FAlwm_mean": "FA"}


def annotate(data, **kws):
    """Write Pearson's r and p-value of the residual columns on the current axes.

    ``data`` is the frame seaborn passes for one facet.  The ``<name>_C``
    columns are located through the module-level ``varx`` and ``vary``; any
    extra keyword arguments are ignored.
    """
    r, p = stats.pearsonr(data[f"{varx}_C"], data[f"{vary}_C"])
    ax = plt.gca()
    ax.text(.05, .8, 'r={:.2f}, p={:.2g}'.format(r, p),
            transform=ax.transAxes)


def add_residuals(frame, adjustment):
    """Store the OLS residuals of ``varx`` and ``vary`` in ``frame`` as ``<name>_C``."""
    for column in (varx, vary):
        fit = smf.ols(f'{column} ~ {adjustment}', data=frame).fit()
        frame[f"{column}_C"] = fit.resid


def residual_plot(frame, **facet_kws):
    """Draw an annotated lmplot of the two residual columns and return the grid."""
    grid = sns.lmplot(data=frame, x=f"{varx}_C", y=f"{vary}_C", **facet_kws)
    grid.map_dataframe(annotate)
    grid.set_axis_labels(axis_labels.get(varx, varx), vary)
    plt.tight_layout()
    return grid


# variables = ["EDSS", "SDMT", "DD", "GMF", "BPF", "WMF", "LVF", "RDlwm_mean", "FAlwm_mean"]
variables = ["EDSS", "SDMT", "BPF", "LVF", "RDlwm_mean", "FAlwm_mean"]
sns.set(font_scale = 1.5)

for varx in variables:
    # Work on a complete-case copy: df_merged itself is left untouched and no
    # NaN residual can reach pearsonr in annotate().
    data_to_plot = df_merged.dropna(subset=[varx, vary] + covariates).copy()
    if varx == "LVF":
        data_to_plot = data_to_plot[data_to_plot.LVF < 1.0].copy()

    # Pooled plot, adjusted for age, sex and center.
    add_residuals(data_to_plot, 'AGE + C(SEX) + C(CENTER2)')
    g = residual_plot(data_to_plot)

    # Same residuals, one panel per disease group (not drawn for LVF).
    if varx in ["DD", "SDMT", "EDSS", "GMF", "BPF", "WMF", "RDlwm_mean", "FAlwm_mean"]:
        g = residual_plot(data_to_plot, col="disease", hue="disease", hue_order=["HC", "MS"], col_order=["HC", "MS"], sharex=True, sharey=True)

    # Per-center panels: center is the facet, so it is left out of the adjustment.
    add_residuals(data_to_plot, 'AGE + C(SEX)')
    g = residual_plot(data_to_plot, col="CENTER2", row="disease", hue="disease", hue_order=["HC", "MS"], row_order=["HC", "MS"])




In [None]:
"""
Full_CC
Comm_ratio
CC_ratio_area
Full_hemis
FC_eff_full
FC_spl_full

"""

# For each structural-damage biomarker and each clinical score, test whether
# `variable_to_test_for` explains the score beyond the biomarker and the
# covariates, and compute the matching partial correlation.
from scipy import stats

structural_dmg_vars = ["BPF", "GMF", "WMF", "LVF", "RDwm_mean", "FAwm_mean"]
dependent_vars = ["EDSS", "SDMT"]
variable_to_test_for = "FC_spl_full"
type_of_model = "glm" # "logit" or "glm"

# Covariate columns and the formula fragment that adjusts for them.
covariates = ["AGE", "SEX", "CENTER2"]
adjustment = "AGE + C(SEX) + C(CENTER2)"

result_rows = []
for biomarker in structural_dmg_vars:

    results_to_save = {"Biomarker": biomarker}
    for xvar in dependent_vars:
        # Complete cases for every column used below, on an independent copy,
        # so all fits and the correlation see exactly the same subjects.
        model_columns = [xvar, variable_to_test_for, biomarker] + covariates
        df_figure = df_merged.dropna(subset=model_columns).copy()

        # Same LVF cut-off as the scatterplot cells.  The original test was on
        # xvar, which is never "LVF" here, so the filter could not trigger.
        if biomarker == "LVF":
            df_figure = df_figure[df_figure.LVF < 1.0].copy()

        # Rescale the response to [0, 1] before a logit fit.
        if type_of_model == "logit":
            lowest, highest = df_figure[xvar].min(), df_figure[xvar].max()
            df_figure[xvar] = (df_figure[xvar] - lowest) / (highest - lowest)

        formula = f'{xvar} ~ {variable_to_test_for} + {biomarker} + {adjustment}'

        if type_of_model == "logit":
            results = smf.logit(formula, data=df_figure).fit()
        elif type_of_model == "glm":
            results = smf.glm(formula, data=df_figure).fit()
        else:
            raise ValueError(f"Unknown type_of_model: {type_of_model!r}")
        tstat = results.tvalues[variable_to_test_for] # test statistic of the tested variable
        pval = results.pvalues[variable_to_test_for] # p-value of the tested variable

        # Partial correlation: residualise both the tested variable and the
        # clinical score on the biomarker plus covariates, then correlate.
        nuisance = f'{biomarker} + {adjustment}'
        resid_tested = smf.ols(f'{variable_to_test_for} ~ {nuisance}', data=df_figure).fit().resid
        resid_score = smf.ols(f'{xvar} ~ {nuisance}', data=df_figure).fit().resid
        r, p = stats.pearsonr(resid_tested, resid_score)

        # Report both statistics; previously the model result was overwritten
        # by the correlation string and never reached the table.
        results_to_save[xvar] = f"t={tstat:.3f}, p={pval:.3f} | r={r:.3f}, p={p:.3f}"

    result_rows.append(results_to_save)

# Build the table in one go (DataFrame.append no longer exists in pandas 2.x).
df_results_glm = pd.DataFrame(result_rows, columns=["Biomarker"] + dependent_vars)
display(df_results_glm)


In [None]:
"""
Full_CC
Comm_ratio_approx
CC_ratio_area
Full_hemis
FC_eff_full
FC_spl_full

"""
from scipy import stats

print(df_merged["CENTER2"])
# direct linear model: each clinical / structural variable is regressed on
# each graph or corpus callosum measure, adjusted for age, sex and center.
structural_dmg_vars = ["EDSS", "SDMT", "BPF", "GMF", "WMF", "LVF", "RDwm_mean", "FAwm_mean"]
dependent_vars = ["Full_hemis", "Comm_ratio", "CC_ratio_area", "FC_eff_full", "FC_spl_full"]

## linear models accounting for structural dmg
# structural_dmg_vars = ["BPF", "GMF", "WMF", "LVF", "RDwm_mean", "FAwm_mean"]
# dependent_vars = ["EDSS", "SDMT"]
variable_to_test_for = "CC_ratio_area"  # assigned for parity with the other cells; not read here
type_of_model = "glm" # "logit" or "glm"

# Covariate columns and the formula fragment that adjusts for them.
covariates = ["AGE", "SEX", "CENTER2"]
adjustment = "AGE + C(SEX) + C(CENTER2)"

result_rows = []
for biomarker in structural_dmg_vars:
    results_to_save = {"Biomarker": biomarker}
    for xvar in dependent_vars:
        # Complete cases on an independent copy, so the model, both residual
        # fits and the correlation all use the same subjects.
        df_figure = df_merged.dropna(subset=[biomarker, xvar] + covariates).copy()

        # A logit fit needs its response in [0, 1]; the response here is
        # `biomarker` (the original rescaled the predictor columns instead).
        if type_of_model == "logit":
            lowest, highest = df_figure[biomarker].min(), df_figure[biomarker].max()
            df_figure[biomarker] = (df_figure[biomarker] - lowest) / (highest - lowest)

        formula = f'{biomarker} ~ {xvar} + {adjustment}'

        if type_of_model == "logit":
            results = smf.logit(formula, data=df_figure).fit()
        elif type_of_model == "glm":
            # The "glm" option is fitted with ordinary least squares in this cell.
            results = smf.ols(formula, data=df_figure).fit()
        else:
            raise ValueError(f"Unknown type_of_model: {type_of_model!r}")
        tstat = results.tvalues[xvar] # test statistic of the predictor
        pval = results.pvalues[xvar] # p-value of the predictor

        # Covariate-adjusted correlation between response and predictor.
        resid_biomarker = smf.ols(f'{biomarker} ~ {adjustment}', data=df_figure).fit().resid
        resid_predictor = smf.ols(f'{xvar} ~ {adjustment}', data=df_figure).fit().resid
        r, p = stats.pearsonr(resid_biomarker, resid_predictor)

        # Report both statistics; previously the model result was overwritten
        # by the correlation string and never reached the table.
        results_to_save[xvar] = f"t={tstat:.3f}, p={pval:.3f} | r={r:.3f}, p={p:.3f}"

    result_rows.append(results_to_save)

# Build the table in one go (DataFrame.append no longer exists in pandas 2.x).
df_results_glm = pd.DataFrame(result_rows, columns=["Biomarker"] + dependent_vars)
display(df_results_glm)


In [None]:
"""
ANALYSIS of the data
Biomarkers to analyze:

Full_CC
Comm_ratio
CC_ratio_area
Full_hemis
FC_eff_full
FC_spl_full

TODO: explain what each variable is.
"""
### Scatterplot G
## Compare different values
# For every variable in `variables`, regress the covariates out of both that
# variable and `vary`, then plot one set of residuals against the other:
# pooled, split by disease, and split by center x disease.

from scipy import stats
sns.set_style("darkgrid")
varx = "dummy"
vary = "Full_hemis"

# Columns that enter the adjustment models; rows missing any of them are dropped.
covariates = ["AGE", "SEX", "CENTER2"]
# Short x-axis labels for the diffusion metrics; other variables keep their name.
axis_labels = {"RDwm_mean": "RD", "RDlwm_mean": "RD", "FAlwm_mean": "FA"}


def annotate(data, **kws):
    """Write Pearson's r and p-value of the residual columns on the current axes.

    ``data`` is the frame seaborn passes for one facet.  The ``<name>_C``
    columns are located through the module-level ``varx`` and ``vary``; any
    extra keyword arguments are ignored.
    """
    r, p = stats.pearsonr(data[f"{varx}_C"], data[f"{vary}_C"])
    ax = plt.gca()
    ax.text(.05, .8, 'r={:.2f}, p={:.2g}'.format(r, p),
            transform=ax.transAxes)


def add_residuals(frame, adjustment):
    """Store the OLS residuals of ``varx`` and ``vary`` in ``frame`` as ``<name>_C``."""
    for column in (varx, vary):
        fit = smf.ols(f'{column} ~ {adjustment}', data=frame).fit()
        frame[f"{column}_C"] = fit.resid


def residual_plot(frame, **facet_kws):
    """Draw an annotated lmplot of the two residual columns and return the grid."""
    grid = sns.lmplot(data=frame, x=f"{varx}_C", y=f"{vary}_C", **facet_kws)
    grid.map_dataframe(annotate)
    grid.set_axis_labels(axis_labels.get(varx, varx), vary)
    plt.tight_layout()
    return grid


# variables = ["EDSS", "SDMT", "DD", "GMF", "BPF", "WMF", "LVF", "RDlwm_mean", "FAlwm_mean"]
variables = ["FC_eff_full", "FC_spl_full"]
sns.set(font_scale = 1.5)

for varx in variables:
    # Work on a complete-case copy: df_merged itself is left untouched and no
    # NaN residual can reach pearsonr in annotate().
    data_to_plot = df_merged.dropna(subset=[varx, vary] + covariates).copy()
    if varx == "LVF":
        data_to_plot = data_to_plot[data_to_plot.LVF < 1.0].copy()

    # Pooled plot, adjusted for age, sex and center.
    add_residuals(data_to_plot, 'AGE + C(SEX) + C(CENTER2)')
    g = residual_plot(data_to_plot)

    # Same residuals, one panel per disease group.
    if varx in ["FC_eff_full", "FC_spl_full"]:
        g = residual_plot(data_to_plot, col="disease", hue="disease", hue_order=["HC", "MS"], col_order=["HC", "MS"], sharex=True, sharey=True)

    # Per-center panels: center is the facet, so it is left out of the adjustment.
    add_residuals(data_to_plot, 'AGE + C(SEX)')
    g = residual_plot(data_to_plot, col="CENTER2", row="disease", hue="disease", hue_order=["HC", "MS"], row_order=["HC", "MS"])




In [None]:
"""
Full_CC
Comm_ratio_approx
CC_ratio_area
Full_hemis
FC_eff_full
FC_spl_full

"""
print(df_merged["CENTER2"])
# direct linear model: each functional measure is regressed on each
# structural measure, adjusted for age, sex and center.
structural_dmg_vars = ["FC_eff_full", "FC_spl_full"]
dependent_vars = ["Full_hemis", "Comm_ratio", "CC_ratio_area"]

## linear models accounting for structural dmg
# structural_dmg_vars = ["BPF", "GMF", "WMF", "LVF", "RDwm_mean", "FAwm_mean"]
# dependent_vars = ["EDSS", "SDMT"]
variable_to_test_for = "CC_ratio_area"  # assigned for parity with the other cells; not read here
type_of_model = "glm" # "logit" or "glm"

result_rows = []
for biomarker in structural_dmg_vars:

    results_to_save = {"Biomarker": biomarker}
    for xvar in dependent_vars:
        # Independent copy so the optional rescaling below never writes onto
        # a slice of df_merged.
        df_figure = df_merged.dropna(subset=[biomarker]).copy()

        # A logit fit needs its response in [0, 1]; the response here is
        # `biomarker` (the original rescaled the predictor columns instead).
        if type_of_model == "logit":
            lowest, highest = df_figure[biomarker].min(), df_figure[biomarker].max()
            df_figure[biomarker] = (df_figure[biomarker] - lowest) / (highest - lowest)

        formula = f'{biomarker} ~ {xvar} + AGE + C(SEX) + C(CENTER2)'

        if type_of_model == "logit":
            results = smf.logit(formula, data=df_figure).fit()
        elif type_of_model == "glm":
            # The "glm" option is fitted with ordinary least squares in this cell.
            results = smf.ols(formula, data=df_figure).fit()
        else:
            raise ValueError(f"Unknown type_of_model: {type_of_model!r}")
        tstat = results.tvalues[xvar] # test statistic of the predictor
        pval = results.pvalues[xvar] # p-value of the predictor

        results_to_save[xvar] = f"t={tstat:.3f}, p={pval:.3f}"

    result_rows.append(results_to_save)

# Build the table in one go (DataFrame.append no longer exists in pandas 2.x).
df_results_glm = pd.DataFrame(result_rows, columns=["Biomarker"] + dependent_vars)
display(df_results_glm)


In [None]:
"""
Full_CC
Comm_ratio
CC_ratio_area
Full_hemis
FC_eff_full
FC_spl_full

"""
### PER CENTER
# Within each center, regress every clinical / structural variable on
# `variable_to_test_for` (adjusted for age and sex) and compute the matching
# covariate-adjusted correlation.
from scipy import stats

structural_dmg_vars = ["EDSS", "SDMT", "BPF", "GMF", "WMF", "LVF", "RDwm_mean", "FAwm_mean"]
dependent_var = "SDMT" #"EDSS", "SDMT"  (not read by the models below)
variable_to_test_for = "Full_hemis"
type_of_model = "glm" # "logit" or "glm"

# Covariate columns and the formula fragment that adjusts for them; center is
# the grouping variable here, so it is not part of the adjustment.
covariates = ["AGE", "SEX"]
adjustment = "AGE + C(SEX)"

center_rows = []
# Skip a missing center label: comparing against NaN selects no rows.
for center in df_merged.CENTER2.dropna().unique():

    results_to_save = {"CENTER": center}
    df_center = df_merged[df_merged.CENTER2 == center]
    for biomarker in structural_dmg_vars:

        # Complete cases on an independent copy, so the model, both residual
        # fits and the correlation all use the same subjects.
        model_columns = [biomarker, variable_to_test_for] + covariates
        df_figure = df_center.dropna(subset=model_columns).copy()

        # A logit fit needs its response in [0, 1]; the response here is
        # `biomarker` (the original rescaled `dependent_var` instead).
        if type_of_model == "logit":
            lowest, highest = df_figure[biomarker].min(), df_figure[biomarker].max()
            df_figure[biomarker] = (df_figure[biomarker] - lowest) / (highest - lowest)

        formula = f'{biomarker} ~ {variable_to_test_for} + {adjustment}'
        if type_of_model == "logit":
            results = smf.logit(formula, data=df_figure).fit()
        elif type_of_model == "glm":
            # The "glm" option is fitted with ordinary least squares in this cell.
            results = smf.ols(formula, data=df_figure).fit()
        else:
            raise ValueError(f"Unknown type_of_model: {type_of_model!r}")

        tstat = results.tvalues[variable_to_test_for] # test statistic of the tested variable
        pval = results.pvalues[variable_to_test_for] # p-value of the tested variable

        # Covariate-adjusted correlation between biomarker and tested variable.
        resid_biomarker = smf.ols(f'{biomarker} ~ {adjustment}', data=df_figure).fit().resid
        resid_tested = smf.ols(f'{variable_to_test_for} ~ {adjustment}', data=df_figure).fit().resid
        r, p = stats.pearsonr(resid_biomarker, resid_tested)

        # Report both statistics; previously the model result was overwritten
        # by the correlation string and never reached the table.
        results_to_save[biomarker] = f"t={tstat:.3f}, p={pval:.3f} | r={r:.3f}, p={p:.3f}"

    center_rows.append(results_to_save)

# Build the table in one go (DataFrame.append no longer exists in pandas 2.x);
# the columns argument also fixes the biomarker order.
df_results_c = pd.DataFrame(center_rows, columns=["CENTER"] + structural_dmg_vars)
# set CENTER as the index
df_results_c = df_results_c.set_index("CENTER")
# fixed row order; centers not listed here are dropped by reindex and listed
# centers without data become empty rows
df_results_c = df_results_c.reindex(["CLINIC", "MAINZ", "OSLO", "MILAN", "NAPLES", "LONDON", "LONDON2", "AMSTERDAM"])
display(df_results_c.T)