## Small script that checks if SDMT values are correlated with YED values

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statannot import add_stat_annotation
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
from statannotations.Annotator import Annotator
import itertools 

# Silence every warning category so the cell outputs stay uncluttered.
# NOTE(review): a blanket "ignore" also hides deprecation / future warnings
# raised by the libraries used below -- consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

from load_data import load_data, return_asterisks_p

In [2]:
# Global seaborn look for every figure produced by this notebook.
sns.set(
    style='darkgrid',
    context='talk',
    palette='Set1',
    font="serif",
)

In [3]:
# Level orderings for each grouping variable, used so that hue levels always
# appear in the same order in the figures.
_edss_levels = ["EDSS<=3", "EDSS>3"]
_sdmt_levels = ["SDMT>=40", "SDMT<40"]

map_hue_color = {
    "EDSS_group": list(_edss_levels),
    "SDMT_group": list(_sdmt_levels),
    "G_norm": ['G_z<=0', 'G_z>0'],
    "Corr_norm": ['C_z<=0', 'C_z>0'],
    "disease": ["HC", "MS"],
    # An ordering without "CIS" was used previously: HC, RRMS, SPMS, PPMS.
    "GROUP": ["HC", "CIS", "RRMS", "SPMS", "PPMS"],
    "CENTER": ["CLINIC", "MAINZ", "MILAN", "NAPLES", "OSLO", "LONDON", "AMSTERDAM"],
    # Controls first, then both EDSS levels, then both SDMT levels.
    "groups": ["HC", *_edss_levels, *_sdmt_levels],
}

# Collapse the two progressive phenotypes (SPMS, PPMS) into a single "PMS"
# label; every other group maps to itself.
mapping_prog = {
    group: ("PMS" if group in ("SPMS", "PPMS") else group)
    for group in map_hue_color["GROUP"]
}

# Load the merged table through the project helper, then report its row count
# and column index as a quick sanity check.
df_merged = load_data()
print(df_merged.shape[0])
print(df_merged.columns)


Initial length: 730
disease
HC    216
MS    514
dtype: int64
QC N: 5
QC Y: 724
LONDON2: 27
697
Index(['SubjID', 'DOB', 'SEX', 'GROUP', 'ONSET_DATE', 'MR_DATE',
       'TREATMENT_AT_MRI', 'TREATMENT', 'EDSS', 'YED',
       ...
       'SC_eff_full', 'FC_spl_full', 'FC_eff_full', 'FC_entropy_full',
       'FC_integration_full', 'Full_CC', 'Comm_ratio_approx', 'CC_ratio_area',
       'CC_Sag_area_sqrt', 'TIV_cubicroot'],
      dtype='object', length=143)


In [4]:
# Restrict the cohort to MS patients whose YED entry is present and is a plain
# whole number, then cast YED to int.
#
# The masks are built on the string form of YED so that the cell can be re-run
# safely: once YED has been cast to int, calling `.isnumeric()` on each value
# would raise AttributeError.
is_ms = df_merged["disease"] == "MS"

yed_text = df_merged["YED"].astype(str)
# `isdecimal` (not `isnumeric`) matches exactly what int() can parse; characters
# such as fractions or superscripts pass `isnumeric` but would break the cast.
has_integer_yed = df_merged["YED"].notnull() & yed_text.str.isdecimal()

# Explicit copy so the column assignment below writes to an independent frame
# instead of a slice of the loaded data.
df_merged = df_merged.loc[is_ms & has_integer_yed].copy()

# change yed to numeric
df_merged["YED"] = df_merged["YED"].astype(int)

# How many patients remain, per center?
print(df_merged.groupby("CENTER").size())


CENTER
AMSTERDAM    173
LONDON        37
MILAN         56
NAPLES        51
OSLO          58
dtype: int64


In [8]:
# Check whether SDMT is significantly associated with YED using an ordinary
# least-squares linear regression (same model specification as in the paper).
model = 'SDMT ~ YED'
# Adjusted model: AGE as covariate, CENTER and SEX as categorical terms.
model_more = 'SDMT ~ YED + AGE + C(CENTER) + C(SEX)'
results = smf.ols(formula=model_more, data=df_merged).fit()

# p-value first, then the t statistic, of the YED coefficient.
for yed_stat in (results.pvalues, results.tvalues):
    print(yed_stat["YED"])

print(results.summary())

0.005673219020182783
2.7825300463279223
                            OLS Regression Results                            
Dep. Variable:                   SDMT   R-squared:                       0.281
Model:                            OLS   Adj. R-squared:                  0.267
Method:                 Least Squares   F-statistic:                     20.43
Date:                Fri, 12 May 2023   Prob (F-statistic):           3.71e-23
Time:                        10:51:00   Log-Likelihood:                -1431.9
No. Observations:                 374   AIC:                             2880.
Df Residuals:                     366   BIC:                             2911.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------

In [11]:
# Repeat the regression separately for each center. CENTER is dropped from the
# formula because it is constant within each subset.
model = 'SDMT ~ YED'
model_more = 'SDMT ~ YED + AGE + C(SEX)'

# Collect one record per center and build the table once at the end.
# `DataFrame.append` was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
records_glm = []

for center in df_merged["CENTER"].unique():
    df_center = df_merged[df_merged["CENTER"] == center]
    results = smf.ols(model_more, data=df_center).fit()

    pval = results.pvalues["YED"]

    # Significance marker appended to the formatted p-value.
    ast = return_asterisks_p(pval)
    str_save = f"{pval:.2f}{ast}"

    dict_to_append = {'CENTER': center, "t": results.tvalues["YED"], "pval": str_save}
    records_glm.append(dict_to_append)

# Explicit column list keeps the layout stable (and `set_index` valid) even if
# no center was processed.
df_results_glm = pd.DataFrame(records_glm, columns=["CENTER", "t", "pval"])
df_results_glm = df_results_glm.set_index('CENTER')

print(df_results_glm.to_latex(escape=False))

\begin{tabular}{lrl}
\toprule
{} &         t &  pval \\
CENTER    &           &       \\
\midrule
MILAN     &  0.902732 &  0.37 \\
NAPLES    &  1.910250 &  0.06 \\
OSLO      &  0.962597 &  0.34 \\
LONDON    & -0.288726 &  0.77 \\
AMSTERDAM &  1.592308 &  0.11 \\
\bottomrule
\end{tabular}



In [9]:
# Pooled model over all centers, with CENTER entered as a categorical term.
model_more = 'SDMT ~ YED + AGE + C(CENTER) + C(SEX)'
results = smf.ols(formula=model_more, data=df_merged).fit()

# p-value first, then the t statistic, of the YED coefficient.
for yed_stat in (results.pvalues, results.tvalues):
    print(yed_stat["YED"])


0.005673219020182783
2.7825300463279223
