In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from statannot import add_stat_annotation
import itertools
import glob

import statsmodels.api as sm
import statsmodels.formula.api as smf


In [None]:
# Input tables, read from the current working directory.
csv_total = 'data_total.csv'
csv_values = 'extracted_values.csv'

df_total = pd.read_csv(csv_total)
df_values = pd.read_csv(csv_values)

# A notebook cell only renders its *last* bare expression, so the first
# preview has to be displayed explicitly or it is silently discarded.
display(df_total.head())
df_values.head()

In [None]:
# Join the two tables on subject id and acquisition centre
# (pandas' default inner join: only rows present in both tables are kept).
merge_keys = ["SubjID", "CENTER"]
df_merged = df_total.merge(df_values, on=merge_keys)
df_merged.head()

In [4]:
# Create integer-coded versions of the categorical columns in df_merged.
#
# `.map` with an explicit dict is used instead of `.replace(to_replace=[...], value=[...])`:
# `replace` on a string column relies on pandas silently downcasting the
# object result to int, which is deprecated (pandas >= 2.2) and would leave the
# *_int columns as object dtype, i.e. categorical in the formula API.
# Behavioural note: a label that is not listed below now becomes NaN instead
# of being passed through unchanged as a string.
GROUP_CODES = {'HC': 0, 'RRMS': 1, 'SPMS': 2, 'PPMS': 3}
CENTER_CODES = {'CLINIC': 0, 'MAINZ': 1, 'MILAN': 2, 'NAPLES': 3, 'OSLO': 4}
SEX_CODES = {'F': 0, 'M': 1}

df_merged["GROUP_int"] = df_merged['GROUP'].map(GROUP_CODES)
df_merged["CENTER_int"] = df_merged['CENTER'].map(CENTER_CODES)
df_merged["SEX_int"] = df_merged['SEX'].map(SEX_CODES)

# NOTE(review): hard-coded absolute, machine-specific path; consider moving it
# to a configurable data directory. Left unchanged so downstream consumers of
# this file keep working.
df_merged.to_csv('/home/extop/GERARD/DATA/MAGNIMS2021/merged_analysis.csv')

In [None]:
# Inspect the GMF column of the merged table.
df_merged.loc[:, "GMF"]

In [None]:
# Alternative formula kept for reference:
# model = 'RDl_mean ~ SEX_int + AGE + CENTER_int + GROUP_int'
model = 'EDSS ~ SEX + AGE + CENTER + DD + GMF'

# Mean-centre AGE. This overwrites the column in df_merged, so every later
# cell works with the centred values.
age_mean = df_merged["AGE"].mean()
df_merged["AGE"] = df_merged["AGE"] - age_mean

print(df_merged["GMF"])

# Ordinary least squares fit through the statsmodels formula interface.
results = smf.ols(model, data=df_merged).fit()
results.summary()

## Linear models
Create one results table for each of two models: one fitted on all subjects, and one fitted on patients only (the HC group excluded).

Use the paradigm explained by Jaume:
* The dependent variable should be the value that we want to explain.
* Categorical variables should not be entered as categories; convert them to numeric codes first. (Open question: is this correct?)




In [None]:
## Base OLS
# For each candidate value, fit GROUP_int ~ SEX_int + AGE + CENTER_int + <value>
# and collect the t- and p-values of every regressor in one table
# (rows: candidate value; columns: (regressor, statistic)).
values_to_compare = ["GMF", "WMF", "BPF", "RDl_mean", "Meta"]
col_names = ["SEX", "AGE", "CENTER", "VALUE"]

cols = pd.MultiIndex.from_product([col_names, ['T-value', 'P-value']])
df_results_1 = pd.DataFrame(index=values_to_compare, columns=cols)

for val in values_to_compare:
    model = f'GROUP_int ~ SEX_int + AGE + CENTER_int + {val}'
    results = smf.ols(model, data=df_merged).fit()

    # Map each table column to the model term it reports. Looking the
    # statistics up by term name (instead of by integer position) avoids the
    # deprecated positional fallback of Series[int] and fails loudly if a term
    # is missing or was expanded, rather than silently reading the wrong row.
    term_for_column = dict(zip(col_names, ["SEX_int", "AGE", "CENTER_int", val]))
    for c, term in term_for_column.items():
        df_results_1.loc[val, (c, "T-value")] = results.tvalues[term]
        df_results_1.loc[val, (c, "P-value")] = results.pvalues[term]

df_results_1 = df_results_1.astype(float).round(3)
# "VALUE" refers to the row label (the candidate value); in all cases the
# dependent variable is GROUP_int.
df_results_1


In [None]:
### SDMT AND EDSS WITHOUT HC
# Work on an explicit copy: assigning a column on a filtered slice of
# df_merged triggers SettingWithCopyWarning and is not guaranteed to behave
# consistently across pandas versions.
df_merged_patients = df_merged[df_merged.GROUP != "HC"].copy()
# Shift the group codes down by one now that the HC rows (code 0) are dropped.
df_merged_patients["GROUP_int"] = df_merged_patients["GROUP_int"] - 1

## Base OLS
values_to_compare = ["GMF", "WMF", "BPF", "RDl_mean", "Meta"]
dependent = ["EDSS", "SDMT"]
col_names = ["SEX.M", "AGE", "CENTER", "DD", "VALUE"]

cols = pd.MultiIndex.from_product([col_names, ['T-value', 'P-value']])

df_results_edss = pd.DataFrame(index=values_to_compare, columns=cols)
df_results_sdmt = pd.DataFrame(index=values_to_compare, columns=cols)

# One result table per dependent variable, filled in place below.
table_for_dependent = dict(zip(dependent, (df_results_edss, df_results_sdmt)))

for val in values_to_compare:
    # Table column -> model term. Statistics are read by term name rather than
    # by integer position, which is deprecated for label-indexed Series and
    # would silently misalign if the design matrix changed.
    term_for_column = dict(zip(col_names, ["SEX_int", "AGE", "CENTER_int", "DD", val]))

    for dep, table in table_for_dependent.items():
        model = f'{dep} ~ SEX_int + AGE + CENTER_int + DD + {val}'
        results = smf.ols(model, data=df_merged_patients).fit()
        for c, term in term_for_column.items():
            table.loc[val, (c, "T-value")] = results.tvalues[term]
            table.loc[val, (c, "P-value")] = results.pvalues[term]


df_results_edss = df_results_edss.astype(float).round(3)
df_results_edss


In [None]:
# Cast the SDMT result table to floats, round to three decimals and show it.
df_results_sdmt = df_results_sdmt.astype("float64").round(decimals=3)
df_results_sdmt

## MR metrics and values
Extract the MR-derived values and analyze how they relate to the cognitive and
degenerative measures.



## T1 VALUES

In [None]:
## Compare different values
# One row of four annotated boxplots per value: by GROUP, CENTER, EDSS bin and SEX.
values_to_compare = ["GMF", "WMF", "BPF"]
sns.set(style="whitegrid")

group_order = ["HC", "RRMS", "SPMS", "PPMS"]
centers = ["CLINIC", "MAINZ", "MILAN", "NAPLES", "OSLO"]

# Bin EDSS once (it does not depend on the plotted value). Rows with a missing
# EDSS stay missing: `NaN < 3` is False, so a plain np.where would file them
# under "EDSS>=3".
edss = df_merged['EDSS']
df_merged['EDSSbin'] = pd.Series(
    np.where(edss < 3, "EDSS<3", "EDSS>=3"), index=df_merged.index
).where(edss.notna())

# (x column, box order, pairs to test). order=None keeps the default category
# order for BOTH the boxplot and the annotation, so the two always agree.
# Pairs are materialised as lists so they can be reused for every value.
panels = [
    ("GROUP", group_order, list(itertools.combinations(group_order, 2))),
    ("CENTER", None, list(itertools.combinations(centers, 2))),
    ("EDSSbin", None, [("EDSS<3", "EDSS>=3")]),
    ("SEX", None, [("M", "F")]),
]

# draw boxplots
for val in values_to_compare:
    fig, axes = plt.subplots(1, 4, figsize=(25,5))
    plt.suptitle("Analyzing " + str(val), y=1.6)

    for ax, (x_col, order, pairs) in zip(axes, panels):
        sns.boxplot(data=df_merged, x=x_col, order=order, y=val, ax=ax)
        # The annotation must receive the same `order` as the boxplot,
        # otherwise the significance brackets can be drawn over the wrong boxes.
        add_stat_annotation(ax, data=df_merged, x=x_col, y=val, order=order,
                            box_pairs=pairs,
                            test='t-test_ind', comparisons_correction=None,
                            text_format='star', loc='outside', verbose=1)


In [None]:
## Compare different values
# Scatter each T1 value against EDSS, SDMT and DD; every target gets two
# panels, one coloured by GROUP and one coloured/marked by CENTER.
values_to_compare = ["GMF", "WMF", "BPF"]
sns.set_theme(style="white")

for val in values_to_compare:
    fig, (ax1, ax2, ax3, ax4, ax5, ax6) = plt.subplots(1, 6, figsize=(35,5))
    plt.suptitle("Analyzing cognitive and disability with " + str(val))

    panel_pairs = ((ax1, ax2), (ax3, ax4), (ax5, ax6))
    for y_col, (group_ax, center_ax) in zip(("EDSS", "SDMT", "DD"), panel_pairs):
        # left panel of the pair: coloured by diagnostic group
        sns.scatterplot(
            data=df_merged, x=val, y=y_col,
            hue="GROUP", palette="magma",
            hue_order=["HC", "RRMS", "SPMS", "PPMS"],
            alpha=0.8, s=75, ax=group_ax,
        )
        # right panel of the pair: coloured and marked by centre
        sns.scatterplot(
            data=df_merged, x=val, y=y_col,
            hue="CENTER", style="CENTER", palette="muted",
            alpha=0.8, s=75, ax=center_ax,
        )


In [None]:
## RD VALUES
## Compare different values
## TODO: the values look normally distributed; find tests suited to comparing
## groups of normal distributions.
values_to_compare = ["RDwm_mean","RDlwm_mean", "RDl_mean", "RD_gm", "RD_gmwm"]

# Set the theme BEFORE creating any figure: seaborn styles only apply to axes
# created afterwards, so setting it inside the loop left the first figure with
# whatever theme the previous cell had selected.
sns.set(style="whitegrid")

# Treat an RDl_mean of exactly zero as missing. A single .loc assignment is
# used because the chained form df[col][mask] = value writes to a temporary
# and does not reliably update df_merged (never under Copy-on-Write).
# NOTE(review): the OLS cells that use RDl_mean run earlier in the notebook and
# therefore still see the zeros - confirm whether that is intended.
df_merged.loc[df_merged["RDl_mean"] == 0, "RDl_mean"] = np.nan

group_order = ["HC", "RRMS", "SPMS", "PPMS"]
centers = ["CLINIC", "MAINZ", "MILAN", "NAPLES", "OSLO"]

# Bin EDSS once (loop-invariant). Rows with a missing EDSS stay missing:
# `NaN < 3` is False, so a plain np.where would file them under "EDSS>=3".
edss = df_merged['EDSS']
df_merged['EDSSbin'] = pd.Series(
    np.where(edss < 3, "EDSS<3", "EDSS>=3"), index=df_merged.index
).where(edss.notna())

# (x column, box order, pairs to test). order=None keeps the default category
# order for both the boxplot and the annotation, so the two always agree.
panels = [
    ("GROUP", group_order, list(itertools.combinations(group_order, 2))),
    ("CENTER", None, list(itertools.combinations(centers, 2))),
    ("EDSSbin", None, [("EDSS<3", "EDSS>=3")]),
    ("SEX", None, [("M", "F")]),
]

# draw boxplots: one row of four panels (GROUP, CENTER, EDSS bin, SEX) per value
for val in values_to_compare:
    fig, axes = plt.subplots(1, 4, figsize=(25,5))
    plt.suptitle("Analyzing " + str(val), y=1.6)

    for ax, (x_col, order, pairs) in zip(axes, panels):
        sns.boxplot(data=df_merged, x=x_col, order=order, y=val, ax=ax)
        add_stat_annotation(ax, data=df_merged, x=x_col, y=val, order=order,
                            box_pairs=pairs,
                            test='t-test_ind', comparisons_correction=None,
                            text_format='star', loc='outside', verbose=1)



In [None]:
# METASTABILITY
# For Meta, just use boxplots.
values_to_compare = ["Meta"]

# Set the theme before creating any figure; seaborn styles only affect axes
# created afterwards.
sns.set(style="whitegrid")

group_order = ["HC", "RRMS", "SPMS", "PPMS"]
centers = ["CLINIC", "MAINZ", "MILAN", "NAPLES", "OSLO"]
y_limits = (0.05, 0.3)  # common y-range applied to every panel

# Bin EDSS once (loop-invariant). Rows with a missing EDSS stay missing:
# `NaN < 3` is False, so a plain np.where would file them under "EDSS>=3".
edss = df_merged['EDSS']
df_merged['EDSSbin'] = pd.Series(
    np.where(edss < 3, "EDSS<3", "EDSS>=3"), index=df_merged.index
).where(edss.notna())

# (x column, box order, pairs to test). order=None keeps the default category
# order for both the boxplot and the annotation, so the two always agree.
panels = [
    ("GROUP", group_order, list(itertools.combinations(group_order, 2))),
    ("CENTER", None, list(itertools.combinations(centers, 2))),
    ("EDSSbin", None, [("EDSS<3", "EDSS>=3")]),
    ("SEX", None, [("M", "F")]),
]

# draw boxplots: one row of four panels (GROUP, CENTER, EDSS bin, SEX) per value
for val in values_to_compare:
    fig, axes = plt.subplots(1, 4, figsize=(25,5))
    plt.suptitle("Analyzing " + str(val), y=1.6)

    for ax, (x_col, order, pairs) in zip(axes, panels):
        # The boxplot gets the same `order` as the annotation. Previously the
        # GROUP boxes were drawn in data order while the brackets assumed
        # HC/RRMS/SPMS/PPMS, so brackets could sit over the wrong boxes.
        sns.boxplot(data=df_merged, x=x_col, order=order, y=val, ax=ax)
        # Fix the y-range before annotating, as the original cell did.
        ax.set(ylim=y_limits)
        add_stat_annotation(ax, data=df_merged, x=x_col, y=val, order=order,
                            box_pairs=pairs,
                            test='t-test_ind', comparisons_correction=None,
                            text_format='star', loc='outside', verbose=1)

In [None]:
# METASTABILITY (Meta_t1_band)
# For Meta_t1_band, just use boxplots.
values_to_compare = ["Meta_t1_band"]

# Set the theme before creating any figure; seaborn styles only affect axes
# created afterwards.
sns.set(style="whitegrid")

group_order = ["HC", "RRMS", "SPMS", "PPMS"]
centers = ["CLINIC", "MAINZ", "MILAN", "NAPLES", "OSLO"]
y_limits = (0.05, 0.3)  # common y-range applied to every panel

# Bin EDSS once (loop-invariant). Rows with a missing EDSS stay missing:
# `NaN < 3` is False, so a plain np.where would file them under "EDSS>=3".
edss = df_merged['EDSS']
df_merged['EDSSbin'] = pd.Series(
    np.where(edss < 3, "EDSS<3", "EDSS>=3"), index=df_merged.index
).where(edss.notna())

# (x column, box order, pairs to test). order=None keeps the default category
# order for both the boxplot and the annotation, so the two always agree.
panels = [
    ("GROUP", group_order, list(itertools.combinations(group_order, 2))),
    ("CENTER", None, list(itertools.combinations(centers, 2))),
    ("EDSSbin", None, [("EDSS<3", "EDSS>=3")]),
    ("SEX", None, [("M", "F")]),
]

# draw boxplots: one row of four panels (GROUP, CENTER, EDSS bin, SEX) per value
for val in values_to_compare:
    fig, axes = plt.subplots(1, 4, figsize=(25,5))
    plt.suptitle("Analyzing " + str(val), y=1.6)

    for ax, (x_col, order, pairs) in zip(axes, panels):
        # The boxplot gets the same `order` as the annotation. Previously the
        # GROUP boxes were drawn in data order while the brackets assumed
        # HC/RRMS/SPMS/PPMS, so brackets could sit over the wrong boxes.
        sns.boxplot(data=df_merged, x=x_col, order=order, y=val, ax=ax)
        # Fix the y-range before annotating, as the original cell did.
        ax.set(ylim=y_limits)
        add_stat_annotation(ax, data=df_merged, x=x_col, y=val, order=order,
                            box_pairs=pairs,
                            test='t-test_ind', comparisons_correction=None,
                            text_format='star', loc='outside', verbose=1)