In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statannot import add_stat_annotation
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
from statannotations.Annotator import Annotator
import itertools 

# ignore warnings for easier plotting
import warnings
warnings.filterwarnings("ignore")

from load_data import load_data

In [8]:
# Display order of the categories of each grouping column, so that figures
# always lay the levels out in the intended sequence.
map_hue_color = dict(
    EDSS_group=["EDSS<=3", "EDSS>3"],
    SDMT_group=["SDMT>=40", "SDMT<40"],
    G_norm=["G_z<=0", "G_z>0"],
    Corr_norm=["C_z<=0", "C_z>0"],
    disease=["HC", "MS"],
    # Alternative GROUP ordering without CIS: ["HC", "RRMS", "SPMS", "PPMS"]
    GROUP=["HC", "CIS", "RRMS", "SPMS", "PPMS"],
    CENTER=["CLINIC", "MAINZ", "MILAN", "NAPLES", "OSLO", "LONDON", "AMSTERDAM"],
    groups=["HC", "EDSS<=3", "EDSS>3", "SDMT>=40", "SDMT<40"],
)

# Merge the two progressive phenotypes (SPMS, PPMS) into a single "PMS"
# label; every other group keeps its own name.
mapping_prog = {
    **{label: label for label in ("HC", "CIS", "RRMS")},
    "SPMS": "PMS",
    "PPMS": "PMS",
}

# Load the working table through the project-local `load_data` helper and
# print its column names as a quick check of what is available downstream.
df_merged = load_data()
print(df_merged.columns.values)

['SubjID' 'DOB' 'SEX' 'GROUP' 'ONSET_DATE' 'MR_DATE' 'TREATMENT_AT_MRI'
 'TREATMENT' 'EDSS' 'YED' 'SDMT' 'RAO_SRTS' 'RAO_SRTR' 'RAO_SRTD'
 'RAO_10_36_SRTR' 'RAO_10_36_SRTD' 'RAO_WLG' 'RAO_PASAT' 'CENTER'
 'FMRI_TR' 'FMRI_NSCANS' 'FMRI_SCANTIME' 'AGE' 'DD' 'BICAMS_CVLT2'
 'BICAMS_BVMTR' 'NPS_x' '9HPT_dH' '9HPT_ndH' 'T25FW' 'motor_x'
 'Datecognitiveevaluation' 'TypeRAO' 'RAO_PASAT2sec'
 'RAO_WLGperseveration' 'RAO_WLGintrusion' 'Right9HPT' 'Left9HPT'
 'MFIS_TOTAL' 'MFIS_PHYS_FUNCT' 'MFIS_COGN_FUNCT' 'MFIS_SOCIAL_FUNCT'
 'BDI_II' '9HPT_average' 'notes_NPS' '9HPT_dH_' 'DWI_PROT' 'SCANNER'
 'SRTL_sum' 'PASAT2' 'WRDLG' 'QC' 'CENTER2' 'GM' 'GMF' 'WM' 'WMF' 'CSF'
 'CSFF' 'LV' 'LVF' 'BPF' 'RDwm_mean' 'RDwm_std' 'RDlwm_mean' 'RDlwm_std'
 'RDl_mean' 'RDl_std' 'RD_gm' 'RD_gmwm' 'Meta_x' 'Meta_noreg'
 'Unnamed: 21' 'Unnamed: 22' 'Unnamed: 23' 'FAwm_mean' 'FAwm_std'
 'FAlwm_mean' 'FAlwm_std' 'FAl_mean' 'FAl_std' 'FA_gm' 'FA_gmwm'
 'CC_Sag_area' 'CC_Posterior' 'CC_Mid_Posterior' 'CC_Central'
 'CC_Mid

In [9]:
## Compute all quotients
to_correct_for = 'AGE + C(CENTER) + C(SEX)'

# Regress the nuisance covariates out of each clinical score and write the
# residuals, shifted by the fitted intercept, back into the same column.
for score in ("EDSS", "SDMT"):
    results = smf.ols(f'{score} ~ {to_correct_for}', data=df_merged).fit()
    df_merged[score] = results.resid + results.params.Intercept

# maybe we need to normalize it beforehand

x1 = "q_Comm_spl"
x2 = "q_Comm_eff"
x3 = "q_CC_spl"
x4 = "q_CC_eff"

# Each quotient is one ratio column divided by one full-SC metric column.
for quotient, numerator, denominator in (
    (x1, "Comm_ratio", "SC_spl_full"),
    (x2, "Comm_ratio", "SC_eff_full"),
    (x3, "CC_ratio_area", "SC_spl_full"),
    (x4, "CC_ratio_area", "SC_eff_full"),
):
    df_merged[quotient] = df_merged[numerator] / df_merged[denominator]

# Apply the same covariate correction to every quotient, in x1..x4 order so
# `results` ends up holding the fit for x4.
for quotient in (x1, x2, x3, x4):
    results = smf.ols(f'{quotient} ~ {to_correct_for}', data=df_merged).fit()
    df_merged[quotient] = results.resid + results.params.Intercept


In [10]:
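# Build the analysis frames: label every row with a single "groups" category
# (HC plus the two subgroups of one clinical score) by slicing the table per
# category and concatenating the slices.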
def create_dataframe_categories(df, type="EDSS"):
    """
    Return the healthy controls plus the two subgroups of one clinical
    score, labelled in a new ``groups`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``GROUP`` column and the ``EDSS_group`` or
        ``SDMT_group`` column matching ``type``. It is not modified.
    type : {"EDSS", "SDMT"}
        Which score's subgroups to stack under the controls. (The name
        shadows the builtin; it is kept for backward compatibility.)

    Returns
    -------
    pandas.DataFrame
        Concatenation, in this order, of: rows with ``GROUP == "HC"``
        (labelled ``"HC"``), then for EDSS the ``"EDSS<=3"`` and ``"EDSS>3"``
        rows, or for SDMT the non-HC ``"SDMT<40"`` and ``"SDMT>=40"`` rows.
        Original index values are preserved.

    Raises
    ------
    ValueError
        If ``type`` is neither ``"EDSS"`` nor ``"SDMT"``.
    """

    var_name = "groups"

    # Fail early with a clear message instead of reaching the return
    # statement with no output frame defined.
    if type not in ("EDSS", "SDMT"):
        raise ValueError(f"type must be 'EDSS' or 'SDMT', got {type!r}")

    # `.assign` returns a new frame, so labels are never written into a
    # slice of the caller's dataframe (no chained-assignment warning).
    df_HC = df[df.GROUP == "HC"].assign(**{var_name: "HC"})

    if type == "EDSS":
        parts = [
            df[df.EDSS_group == label].assign(**{var_name: label})
            for label in ("EDSS<=3", "EDSS>3")
        ]
    else:
        # For the SDMT, remember not to include HC
        not_HC = df.GROUP != "HC"
        parts = [
            df[(df.SDMT_group == label) & not_HC].assign(**{var_name: label})
            for label in ("SDMT<40", "SDMT>=40")
        ]

    # Concatenate all of the dfs
    return pd.concat([df_HC] + parts)

# One frame per clinical score: HC plus the two SDMT subgroups, and HC plus
# the two EDSS subgroups, each labelled in the "groups" column.
df_sdmt = create_dataframe_categories(df_merged, "SDMT")
df_edss = create_dataframe_categories(df_merged, "EDSS")

In [11]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

## RUN ANOVA
def basic_analysis(df, title, to_plot):
    """
    Print a type-II ANOVA table and Tukey HSD comparisons for each variable.

    For every column name in ``to_plot`` an OLS model
    ``<variable> ~ C(groups) + AGE + C(CENTER) + C(SEX)`` is fitted and its
    type-II ANOVA table printed, followed by a pairwise Tukey HSD test
    (alpha = 0.05) of the variable across the levels of ``groups``. The
    Tukey test uses the column values as they are; the covariates enter
    only the OLS/ANOVA model.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``groups``, ``AGE``, ``CENTER``, ``SEX`` and every
        column listed in ``to_plot``. It is not modified.
    title : str
        Heading printed above each ANOVA table.
    to_plot : iterable of str
        Names of the dependent variables to analyse.

    Returns
    -------
    None
        Results are printed to stdout.
    """
    # Loop-invariant: the nuisance covariates are the same for every variable.
    to_correct_for = 'AGE + C(CENTER) + C(SEX)'

    for x in to_plot:

        # Drop rows missing THIS variable only. Rebinding `df` here would make
        # the row loss cumulative, silently shrinking the sample of every
        # later variable by the missing values of the earlier ones.
        df_x = df.dropna(subset=[x])

        moore_lm = smf.ols(f'{x} ~ C(groups) + {to_correct_for}', data=df_x).fit()
        # print(moore_lm.summary())
        table = sm.stats.anova_lm(moore_lm, typ=2)
        print('-----------------------------------------------------')
        print(x)
        print('-----------------------------------------------------')

        print(title)
        print(table)

        tukey = pairwise_tukeyhsd(endog=df_x[x],
                          groups=df_x['groups'],
                          alpha=0.05)
        print(tukey)
        print('-----------------------------------------------------')




In [12]:
### SDMT GROUPS
### EDSS GROUPS

# variables
variables = ["q_Comm_spl", "q_Comm_eff", "q_CC_spl", "q_CC_eff"]

# Each title names the grouping actually present in the frame passed in, so
# the printed heading matches the groups listed in the Tukey table below it.
basic_analysis(df_sdmt, "ANOVA SDMT", variables)
basic_analysis(df_edss, "ANOVA EDSS", variables)

-----------------------------------------------------
q_Comm_spl
-----------------------------------------------------
ANOVA EDSS
              sum_sq     df          F        PR(>F)
C(groups)   2.180183    2.0  28.600922  1.159259e-12
Residual   26.451015  694.0        NaN           NaN
  Multiple Comparison of Means - Tukey HSD, FWER=0.05  
 group1  group2  meandiff p-adj   lower   upper  reject
-------------------------------------------------------
     HC  SDMT<40  -0.1728    0.0 -0.2275 -0.1181   True
     HC SDMT>=40  -0.0804    0.0 -0.1199 -0.0409   True
SDMT<40 SDMT>=40   0.0924 0.0001  0.0421  0.1428   True
-------------------------------------------------------
-----------------------------------------------------
-----------------------------------------------------
q_Comm_eff
-----------------------------------------------------
ANOVA EDSS
                 sum_sq     df          F        PR(>F)
C(groups)  5.886274e-08    2.0  28.686683  1.070964e-12
Residual   7.120158e-07