# Description
Analysis of immunological biomarkers in samples taken at different stages of COVID-19 and in samples grouped by Down syndrome status (Down syndrome, sibling, parent).

## 1. Update data with COVID and DS columns. Add SImAge and IPAge values and accelerations.
## 2. COVID-19 analysis:
- Non-longitudinal analysis:
    - Biomarkers and SImAge acceleration distributions in different COVID-19 groups.
    - Correlation between biomarkers and SImAge acceleration.
- Longitudinal analysis:
    - Trajectories with time points.
    - Repeated measures tests.
## 3. Down syndrome analysis:
- Biomarkers and SImAge acceleration distributions in different groups.

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import plotly.express as px
import statsmodels.formula.api as smf
import plotly.graph_objects as go
from scripts.python.routines.manifest import get_manifest
from scripts.python.routines.plot.save import save_figure
from scripts.python.routines.plot.layout import add_layout
from statsmodels.stats.multitest import multipletests
import plotly.io as pio
pio.kaleido.scope.mathjax = None
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=False)
from scipy.stats import mannwhitneyu, median_test, kruskal, wilcoxon, friedmanchisquare
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.patheffects as path_effects
import random
import pathlib
from tqdm import tqdm
from src.utils.plot.bioinfokit import mhat, volcano
import gseapy as gp
import mygene
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA, TruncatedSVD
from sklearn.decomposition import MiniBatchDictionaryLearning, FastICA
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.manifold import MDS, Isomap, TSNE, LocallyLinearEmbedding
import upsetplot
from matplotlib_venn import venn2, venn2_circles
from itertools import chain
from sklearn.metrics import mean_absolute_error
from scripts.python.routines.plot.colorscales import get_continuous_color
import plotly
from scripts.python.routines.plot.p_value import add_p_value_annotation
from scripts.python.routines.sections import get_sections
from statannotations.Annotator import Annotator
import functools
import matplotlib.lines as mlines
import patchworklib as pw


def conjunction(conditions):
    """Combine ``conditions`` left to right with element-wise logical AND.

    The fold has no initial value, so a single condition is returned as is
    and an empty iterable raises ``TypeError`` from ``functools.reduce``.
    """
    return functools.reduce(
        lambda combined, condition: np.logical_and(combined, condition),
        conditions,
    )


def disjunction(conditions):
    """Combine ``conditions`` left to right with element-wise logical OR.

    The fold has no initial value, so a single condition is returned as is
    and an empty iterable raises ``TypeError`` from ``functools.reduce``.
    """
    return functools.reduce(
        lambda combined, condition: np.logical_or(combined, condition),
        conditions,
    )

# Init data

In [None]:
# Root folder of the dataset; every input and output path below is built from it.
path = "D:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN"

# Main sample table; the first spreadsheet column is used as the index.
df = pd.read_excel(
    f"{path}/data/immuno/df_samples(all_1052_121222)_proc(raw)_imp(fast_knn)_replace(quarter).xlsx",
    index_col=0,
)

# Feature names are taken from the index of the feature sheet.
feats_table = pd.read_excel(f"{path}/data/immuno/feats_con.xlsx", index_col=0)
feats = feats_table.index.values

# Folder that receives all results of this notebook.
path_save = f"{path}/special/052_immuno_covid_down_syndrome"
pathlib.Path(path_save).mkdir(parents=True, exist_ok=True)

## 1. Update data with COVID and DS columns. Add SImAge and IPAge values and accelerations.

In [None]:
# Row labels of the base table; used to pull matching rows from the update sheets.
sample_ids = df.index.values

# --- COVID-19 / Down syndrome annotations ---------------------------------
df_covid_ds = pd.read_excel(f"{path}/data/immuno/update_COVID_DownSyndrome.xlsx", index_col=0)
# Print index labels that are present in one table but missing in the other.
index_diff_base_covid_ds = df.index.difference(df_covid_ds.index)
print(f"index_diff_base_covid_ds: {index_diff_base_covid_ds}")
index_diff_covid_ds_base = df_covid_ds.index.difference(df.index)
print(f"index_diff_covid_ds_base: {index_diff_covid_ds_base}")

# New column name in `df` -> source column name in the update sheet.
# NOTE(review): the source names are copied verbatim; confirm them against the
# spreadsheet header before retyping any of them by hand.
cols = {
    'COVID-19 stage': 'Сovid stage',
    'COVID-19 sample ID': 'ID Covid',
    'Down syndrome status': 'Relation to Down Syndrome'
}
for new_col, old_col in cols.items():
    df.loc[sample_ids, new_col] = df_covid_ds.loc[sample_ids, old_col]

# --- SImAge predictions -----------------------------------------------------
df_simage = pd.read_excel(f"{path}/data/immuno/update_SImAge.xlsx", index_col=0)
index_diff_base_simage = df.index.difference(df_simage.index)
print(f"index_diff_base_simage: {index_diff_base_simage}")
index_diff_simage_base = df_simage.index.difference(df.index)
print(f"index_diff_simage_base: {index_diff_simage_base}")

# New column name in `df` -> source column name in the SImAge sheet.
cols = {
    'SImAge': 'Prediction',
    'SImAge acceleration': 'Prediction error',
    '|SImAge acceleration|': 'Prediction error abs'
}
for new_col, old_col in cols.items():
    df.loc[sample_ids, new_col] = df_simage.loc[sample_ids, old_col]

# Persist the merged table.
df.to_excel(f"{path}/data/immuno/df.xlsx", index_label='index')

## 2. COVID-19 analysis:
- Non-longitudinal analysis:
    - Biomarkers and SImAge acceleration distributions in different COVID-19 groups.
    - Correlation between biomarkers and SImAge acceleration.
- Longitudinal analysis:
    - Trajectories with time points.
    - Repeated measures tests.

### Prepare data, plot count and age distributions

In [None]:
# Output folder for the COVID-19 part of the analysis.
path_curr = f"{path_save}/01_COVID19"
pathlib.Path(f"{path_curr}").mkdir(parents=True, exist_ok=True)

# COVID-19 stage -> plot color. The key order is also used as the group order in the plots.
colors = {
    'Acute': 'crimson',
    'Dynamics': 'gold',
    'Reconvalescent': 'cyan',
    'Reconvalescent after 1 year': 'lime'
}

# Keep only samples whose stage is one of the groups above.
df_covid = df.loc[df['COVID-19 stage'].isin(list(colors.keys())), :]
df_covid.to_excel(f"{path_curr}/df_covid.xlsx", index_label='index')

# Number of samples per COVID-19 stage.
fig = plt.figure(figsize=(3, 4))
sns.set_theme(style='whitegrid', font_scale=1)
countplot = sns.countplot(
    data=df_covid,
    y='COVID-19 stage',
    edgecolor='black',
    palette=colors,
    orient='v',
    order=list(colors.keys())
)
countplot.bar_label(countplot.containers[0])
countplot.set_xlabel("Count")
countplot.set_ylabel("")
countplot.set_title(f"")
plt.savefig(f"{path_curr}/countplot.png", bbox_inches='tight', dpi=400)
plt.savefig(f"{path_curr}/countplot.pdf", bbox_inches='tight')
plt.close(fig)

# Stacked age histogram per stage: 23 bin edges between 5 and 115.
hist_bins = np.linspace(5, 115, 23)
fig = plt.figure(figsize=(6, 4))
sns.set_theme(style='whitegrid')
histplot = sns.histplot(
    data=df_covid,
    hue_order=list(colors.keys())[::-1],
    bins=hist_bins,
    x="Age",
    hue="COVID-19 stage",
    edgecolor='black',
    palette=colors,
    multiple="stack"
)
# Put the legend above the axes in a single row.
sns.move_legend(
    histplot, "lower center",
    bbox_to_anchor=(.5, 1), ncol=4, title="COVID-19 stage", frameon=True,
)
plt.setp(histplot.get_legend().get_texts(), fontsize='7') # for legend text
plt.setp(histplot.get_legend().get_title(), fontsize='10')
plt.savefig(f"{path_curr}/histplot.png", bbox_inches='tight', dpi=400)
plt.savefig(f"{path_curr}/histplot.pdf", bbox_inches='tight')
plt.close(fig)

# Number of rows (time points) per COVID-19 sample ID.
# The Series is named before to_frame() so the column is always 'Time points':
# older pandas names the value_counts() result after the source column, while
# pandas >= 2.0 names it 'count', which made the former rename(columns=...) a
# silent no-op and broke the 'Time points' lookups below.
df_rep_meas = df_covid['COVID-19 sample ID'].value_counts().rename('Time points').to_frame()
df_rep_meas.to_excel(f"{path_curr}/df_rep_meas.xlsx", index_label='index')

# Samples measured at more than one time point and their rows in df_covid.
samples_mtp = df_rep_meas.index[df_rep_meas['Time points'] != 1].values # 'mtp' means multiple time points
df_covid_rep_meas = df_covid.loc[df_covid['COVID-19 sample ID'].isin(samples_mtp), :]
df_covid_rep_meas.to_excel(f"{path_curr}/df_covid_rep_meas.xlsx", index_label='index')

# How many sample IDs have 1, 2, 3, ... time points.
fig = plt.figure(figsize=(3, 4))
sns.set_theme(style='whitegrid', font_scale=1)
countplot = sns.countplot(
    data=df_rep_meas,
    y='Time points',
    edgecolor='black',
    orient='v',
)
countplot.bar_label(countplot.containers[0])
countplot.set_xlabel("Count")
countplot.set_ylabel("")
countplot.set_title(f"")
plt.savefig(f"{path_curr}/rep_meas.png", bbox_inches='tight', dpi=400)
plt.savefig(f"{path_curr}/rep_meas.pdf", bbox_inches='tight')
plt.close(fig)

### Non-longitudinal analysis: statistical tests for immunological biomarkers and SImAge acceleration

#### Mann-Whitney and Kruskal-Wallis

In [None]:
path_curr = f"{path_save}/01_COVID19/non_longitudinal/kw_mw"
pathlib.Path(f"{path_curr}/feats").mkdir(parents=True, exist_ok=True)

# Group order shared by the statistics and the plots.
stage_order = list(colors.keys())
# Mann-Whitney comparisons: column prefix -> the two stages that are compared.
mw_pairs = {
    "mw_01": ('Acute', 'Dynamics'),
    "mw_12": ('Dynamics', 'Reconvalescent'),
    "mw_23": ('Reconvalescent', 'Reconvalescent after 1 year'),
    "mw_03": ('Acute', 'Reconvalescent after 1 year'),
}

df_stat = pd.DataFrame(index=list(feats))

# Descriptive statistics and raw p-values for every feature plus SImAge acceleration
# (the latter is appended to df_stat as an extra row by the .at assignments).
for feat in list(feats) + ['SImAge acceleration']:
    vals = {}
    for group in stage_order:
        group_vals = df_covid.loc[df_covid['COVID-19 stage'] == group, feat].values
        vals[group] = group_vals
        df_stat.at[feat, f"mean_{group}"] = np.mean(group_vals)
        df_stat.at[feat, f"median_{group}"] = np.median(group_vals)
        q75, q25 = np.percentile(group_vals, [75, 25])
        df_stat.at[feat, f"q75_{group}"] = q75
        df_stat.at[feat, f"q25_{group}"] = q25
        df_stat.at[feat, f"iqr_{group}"] = df_stat.at[feat, f"q75_{group}"] - df_stat.at[feat, f"q25_{group}"]
    _, df_stat.at[feat, "kw_pval"] = kruskal(*vals.values())
    for prefix, (first, second) in mw_pairs.items():
        _, df_stat.at[feat, f"{prefix}_pval"] = mannwhitneyu(vals[first], vals[second], alternative='two-sided')

# Benjamini-Hochberg correction over the features only; the SImAge acceleration row
# is not part of the correction and keeps NaN in the *_fdr_bh columns.
for pval_col in ["kw_pval"] + [f"{prefix}_pval" for prefix in mw_pairs]:
    _, df_stat.loc[feats, f"{pval_col}_fdr_bh"], _, _ = multipletests(df_stat.loc[feats, pval_col], 0.05, method='fdr_bh')

df_stat.sort_values(by=["kw_pval_fdr_bh"], ascending=[True], inplace=True)
df_stat.to_excel(f"{path_curr}/kw_mw.xlsx", index_label='Features')

# One annotated violin plot per row of df_stat.
for feat, row in df_stat.iterrows():
    is_simage = feat == 'SImAge acceleration'
    # Features are annotated with FDR-corrected p-values, SImAge acceleration with raw ones.
    suffix = "" if is_simage else "_fdr_bh"

    plt.figure(figsize=(8, 4))
    sns.set_theme(style='whitegrid')
    violin = sns.violinplot(
        data=df_covid,
        x='COVID-19 stage',
        y=feat,
        palette=colors,
        scale='width',
        order=stage_order,
        saturation=0.75,
    )
    violin.set_ylabel(feat)
    kw_value = row[f"kw_pval{suffix}"]
    violin.set_xlabel(f"Kruskal-Wallis p-value: {kw_value:.2e}")

    pval_formatted = []
    for prefix in mw_pairs:
        mw_value = row[f"{prefix}_pval{suffix}"]
        pval_formatted.append(f'{mw_value:.2e}')

    annotator = Annotator(
        violin,
        pairs=list(mw_pairs.values()),
        data=df_covid,
        x='COVID-19 stage',
        y=feat,
        order=stage_order
    )
    annotator.set_custom_annotations(pval_formatted)
    annotator.configure(loc='outside')
    annotator.annotate()

    # Only the SImAge acceleration figure is written to disk here; the per-feature
    # figures are drawn and discarded.
    if is_simage:
        plt.savefig(f"{path_curr}/{feat}.png", bbox_inches='tight')
        plt.savefig(f"{path_curr}/{feat}.pdf", bbox_inches='tight')
    plt.close()

In [None]:
# Rows of df_stat whose label is one of the features, in the current df_stat order.
feats_sorted = df_stat.index[df_stat.index.isin(feats)].values

stage_order = list(colors.keys())
stage_pairs = [
    ('Acute', 'Dynamics'),
    ('Dynamics', 'Reconvalescent'),
    ('Reconvalescent', 'Reconvalescent after 1 year'),
    ('Acute', 'Reconvalescent after 1 year'),
]
# FDR-corrected Mann-Whitney columns, listed in the same order as stage_pairs.
mw_fdr_cols = ["mw_01_pval_fdr_bh", "mw_12_pval_fdr_bh", "mw_23_pval_fdr_bh", "mw_03_pval_fdr_bh"]

axs = {}
pw_rows = []
n_cols = 4
n_rows = int(np.ceil(len(feats_sorted) / n_cols))
for r_id in range(n_rows):
    pw_cols = []
    for rc_id in range(r_id * n_cols, (r_id + 1) * n_cols):
        if rc_id >= len(feats_sorted):
            # No feature left for this cell: fill it with a blank brick so that
            # every row holds n_cols bricks.
            empty_fig = pw.Brick(figsize=(8, 4))
            empty_fig.axis('off')
            pw_cols.append(empty_fig)
            continue

        feat = feats_sorted[rc_id]
        brick = pw.Brick(figsize=(8, 4))
        axs[feat] = brick
        sns.set_theme(style='whitegrid')
        sns.violinplot(
            data=df_covid,
            x='COVID-19 stage',
            y=feat,
            palette=colors,
            scale='width',
            order=stage_order,
            saturation=0.75,
            ax=brick
        )
        brick.set_ylabel(feat)
        brick.set_xlabel(f"Kruskal-Wallis p-value: {df_stat.at[feat, 'kw_pval_fdr_bh']:.2e}")

        pval_formatted = []
        for pval_col in mw_fdr_cols:
            pval = df_stat.at[feat, pval_col]
            pval_formatted.append(f'{pval:.2e}')

        annotator = Annotator(
            brick,
            pairs=stage_pairs,
            data=df_covid,
            x='COVID-19 stage',
            y=feat,
            order=stage_order,
        )
        annotator.set_custom_annotations(pval_formatted)
        annotator.configure(loc='outside')
        annotator.annotate()
        pw_cols.append(brick)

    # Bricks of one row are joined side by side ...
    pw_rows.append(pw.stack(pw_cols, operator="|"))
# ... and the rows are stacked vertically into a single figure.
pw_fig = pw.stack(pw_rows, operator="/")
pw_fig.savefig(f"{path_curr}/feats.pdf")

#### Pearson correlation

In [None]:
path_curr = f"{path_save}/01_COVID19/non_longitudinal/pearson"
pathlib.Path(f"{path_curr}").mkdir(parents=True, exist_ok=True)

# Pearson correlation of every feature with SImAge acceleration, separately per COVID-19 stage.
df_stat = pd.DataFrame(index=list(feats))
for group in colors.keys():
    pathlib.Path(f"{path_curr}/{group}").mkdir(parents=True, exist_ok=True)
    for feat in feats:
        xs = df_covid.loc[df_covid['COVID-19 stage'] == group, feat].values
        ys = df_covid.loc[df_covid['COVID-19 stage'] == group, 'SImAge acceleration'].values
        # Correlation is undefined for a constant input: pearsonr would return NaN, and a
        # NaN p-value would then contaminate the FDR correction of the whole group.
        # Such features are recorded as "no correlation" (0.0, 1.0) instead, the same
        # convention the Down syndrome Pearson cell uses. Empty groups are still passed
        # to pearsonr, which rejects them as before.
        is_constant = len(xs) > 0 and ((xs == xs[0]).all() or (ys == ys[0]).all())
        if is_constant:
            print(f"An input array is constant for {feat} and {group}")
            df_stat.at[feat, f"{group}_corr"], df_stat.at[feat, f"{group}_pval"] = 0.0, 1.0
        else:
            df_stat.at[feat, f"{group}_corr"], df_stat.at[feat, f"{group}_pval"] = stats.pearsonr(xs, ys, alternative='two-sided')

    # Benjamini-Hochberg correction across the features of this group, plus -log10 for plotting.
    _, df_stat[f"{group}_pval_fdr_bh"], _, _ = multipletests(df_stat[f"{group}_pval"], 0.05, method='fdr_bh')
    df_stat[f"{group}_pval_fdr_bh_log"] = -np.log10(df_stat[f"{group}_pval_fdr_bh"].values)

    # Bar colors: the group color for corrected p < 0.05, gray otherwise.
    # The 'Color' column is overwritten for every group, so the exported table
    # holds the colors of the last group only.
    df_stat["Color"] = 'white'
    df_stat.loc[df_stat[f"{group}_pval_fdr_bh"] < 0.05, 'Color'] = colors[group]
    df_stat.loc[df_stat[f"{group}_pval_fdr_bh"] >= 0.05, 'Color'] = 'gray'
    df_stat.sort_values([f"{group}_pval_fdr_bh"], ascending=[True], inplace=True)

    # Bar plot of -log10(corrected p-value) for all features of this group.
    plt.figure(figsize=(14, 4))
    plt.xticks(rotation=90)
    sns.set_theme(style='white')
    barplot = sns.barplot(
        data=df_stat,
        x=df_stat.index,
        y=f"{group}_pval_fdr_bh_log",
        edgecolor='black',
        palette=df_stat['Color'].values,
        dodge=False
    )
    barplot.set_ylabel(r'$-\log_{10}(\mathrm{p-value})$')
    plt.savefig(f"{path_curr}/{group}.png", bbox_inches='tight')
    plt.savefig(f"{path_curr}/{group}.pdf", bbox_inches='tight')
    plt.close()

    # Scatter plot with a regression line for every significant feature of this group.
    for feat in df_stat.index[df_stat['Color'] == colors[group]].values:
        df_fig = df_covid.loc[df_covid['COVID-19 stage'] == group, [feat, 'SImAge acceleration']]
        plt.figure()
        sns.set_theme(style='whitegrid')
        regplot = sns.regplot(
            data=df_fig,
            x=feat,
            y='SImAge acceleration',
            scatter_kws={
                "color": colors[group],
                "alpha":0.75,
                "edgecolor": "black",
                "linewidth": 0.2
            },
            line_kws={"color": "black"}
        )
        plt.savefig(f"{path_curr}/{group}/{feat}.png", bbox_inches='tight', dpi=400)
        plt.savefig(f"{path_curr}/{group}/{feat}.pdf", bbox_inches='tight')
        plt.close()

df_stat.to_excel(f"{path_curr}/pearson.xlsx", index_label='Features')

### Longitudinal analysis: repeated measures tests

In [None]:
# Repeated-measures analysis for samples with several COVID-19 time points.
# For each set of stages ('0-1', '0-1-2', '0-1-2-3') the cell:
#   1. selects the sample IDs that have a value at every stage of the set,
#   2. runs a paired test per feature and for SImAge acceleration,
#   3. draws per-feature trajectory plots and a combined grid of all features.
path_curr = f"{path_save}/01_COVID19/longitudinal/"
pathlib.Path(f"{path_curr}").mkdir(parents=True, exist_ok=True)

# Wide table (sample ID x stage) of SImAge acceleration; NaN marks a missing time point.
df_pivot_nans = df_covid_rep_meas.pivot(index='COVID-19 sample ID', columns='COVID-19 stage', values='SImAge acceleration')
# Set label -> stages that belong to the set.
samples_rep_meas_names_dict = {}
samples_rep_meas_names_dict['0-1'] = ['Acute', 'Dynamics']
samples_rep_meas_names_dict['0-1-2'] = ['Acute', 'Dynamics', 'Reconvalescent']
samples_rep_meas_names_dict['0-1-2-3'] = ['Acute', 'Dynamics', 'Reconvalescent', 'Reconvalescent after 1 year']
# Set label -> sample IDs with a non-null value at every stage of the set.
samples_rep_meas_dict = {}
samples_rep_meas_dict['0-1'] = df_pivot_nans.index[df_pivot_nans[['Acute', 'Dynamics']].notnull().all(1)].values
samples_rep_meas_dict['0-1-2'] = df_pivot_nans.index[df_pivot_nans[['Acute', 'Dynamics', 'Reconvalescent']].notnull().all(1)].values
samples_rep_meas_dict['0-1-2-3'] = df_pivot_nans.index[df_pivot_nans[['Acute', 'Dynamics', 'Reconvalescent', 'Reconvalescent after 1 year']].notnull().all(1)].values

for group in samples_rep_meas_dict:
    pathlib.Path(f"{path_curr}/{group}/feats").mkdir(parents=True, exist_ok=True)
    df_stat = pd.DataFrame(index=list(feats))
    # Paired tests: Wilcoxon for two stages, Friedman chi-square for three or four.
    for feat in list(feats) + ['SImAge acceleration']:
        df_pivot = df_covid_rep_meas.pivot(index='COVID-19 sample ID', columns='COVID-19 stage', values=feat).loc[samples_rep_meas_dict[group], :]
        if group == '0-1':
            res = wilcoxon(
                x=df_pivot.loc[:, 'Acute'].values,
                y=df_pivot.loc[:, 'Dynamics'].values,
                alternative='two-sided'
            )
        elif group == '0-1-2':
            res = friedmanchisquare(
                df_pivot.loc[:, 'Acute'].values,
                df_pivot.loc[:, 'Dynamics'].values,
                df_pivot.loc[:, 'Reconvalescent'].values,
            )
        else:
            res = friedmanchisquare(
                df_pivot.loc[:, 'Acute'].values,
                df_pivot.loc[:, 'Dynamics'].values,
                df_pivot.loc[:, 'Reconvalescent'].values,
                df_pivot.loc[:, 'Reconvalescent after 1 year'].values,
            )
        df_stat.at[feat, "pval"] = res.pvalue
    # Benjamini-Hochberg correction over the features only; the SImAge acceleration
    # row is not included and keeps NaN in 'pval_fdr_bh'.
    _, df_stat.loc[feats, "pval_fdr_bh"], _, _ = multipletests(df_stat.loc[feats, "pval"], 0.05, method='fdr_bh')
    df_stat.sort_values([f"pval_fdr_bh"], ascending=[True], inplace=True)
    df_stat.to_excel(f"{path_curr}/{group}/stat.xlsx", index_label='Features')

    # Individual figures: violin per stage with one trajectory line per sample ID.
    for feat_id, (feat, row) in enumerate(df_stat.iterrows()):
        # Restrict the wide table to the samples and stages of this set, then melt to long form.
        df_pivot = df_covid_rep_meas.pivot(index='COVID-19 sample ID', columns='COVID-19 stage', values=feat).loc[samples_rep_meas_dict[group], samples_rep_meas_names_dict[group]]
        df_melt = df_pivot.melt(
            var_name='COVID-19 stage',
            value_name=feat,
            ignore_index=False
        )
        # The sample ID lives in the index after melt; expose it as a column for `hue`.
        df_melt['COVID-19 sample ID'] = df_melt.index
        plt.figure(figsize=(8, 4))
        sns.set_theme(style='whitegrid')
        violin = sns.violinplot(
            data=df_melt,
            x='COVID-19 stage',
            y=feat,
            palette=colors,
            scale='width',
            order=list(colors.keys()),
            saturation=0.75,
        )
        pointplot = sns.pointplot(
            data=df_melt,
            x='COVID-19 stage',
            y=feat,
            hue='COVID-19 sample ID',
            legend=False
        )
        pointplot.set_xlabel("")
        # Replace any legend with an empty, frameless one.
        plt.legend([],[], frameon=False)
        if feat == 'SImAge acceleration':
            # SImAge acceleration is titled with the raw p-value and is the only figure saved here.
            pointplot.set_title(f"Samples: {samples_rep_meas_dict[group].shape[0]} \n p-value: {row['pval']:.2e}")
            plt.savefig(f"{path_curr}/{group}/{feat}.png", bbox_inches='tight')
            plt.savefig(f"{path_curr}/{group}/{feat}.pdf", bbox_inches='tight')
        else:
            # Features are titled with the FDR-corrected p-value; saving is disabled.
            pointplot.set_title(f"Samples: {samples_rep_meas_dict[group].shape[0]} \n p-value: {row['pval_fdr_bh']:.2e}")
            # plt.savefig(f"{path_curr}/{group}/feats/{feat_id}_{feat}.png", bbox_inches='tight')
            # plt.savefig(f"{path_curr}/{group}/feats/{feat_id}_{feat}.pdf", bbox_inches='tight')
        plt.close()

    # Combined grid (4 columns) with the same plot for every feature, in df_stat order.
    feats_sorted = df_stat.index[df_stat.index.isin(feats)].values
    axs = {}
    pw_rows = []
    n_cols = 4
    n_rows = int(np.ceil(len(feats_sorted) / n_cols))
    for r_id in range(n_rows):
        pw_cols = []
        for c_id in range(n_cols):
            # Flat position of this grid cell in feats_sorted.
            rc_id = r_id * n_cols + c_id
            if rc_id < len(feats_sorted):
                feat = feats_sorted[rc_id]
                df_pivot = df_covid_rep_meas.pivot(
                    index='COVID-19 sample ID',
                    columns='COVID-19 stage',
                    values=feat
                ).loc[samples_rep_meas_dict[group], samples_rep_meas_names_dict[group]]
                df_melt = df_pivot.melt(
                    var_name='COVID-19 stage',
                    value_name=feat,
                    ignore_index=False
                )
                df_melt['COVID-19 sample ID'] = df_melt.index

                axs[feat] = pw.Brick(figsize=(6, 3))
                sns.set_theme(style='whitegrid')
                violin = sns.violinplot(
                    data=df_melt,
                    x='COVID-19 stage',
                    y=feat,
                    palette=colors,
                    scale='width',
                    order=list(colors.keys()),
                    saturation=0.75,
                    ax=axs[feat]
                )
                pointplot = sns.pointplot(
                    data=df_melt,
                    x='COVID-19 stage',
                    y=feat,
                    hue='COVID-19 sample ID',
                    legend=False,
                    ax=axs[feat]
                )
                axs[feat].set_xlabel("")
                axs[feat].legend([],[], frameon=False)
                axs[feat].set_title(f"Samples: {samples_rep_meas_dict[group].shape[0]} \n p-value: {df_stat.at[feat, 'pval_fdr_bh']:.2e}")
                pw_cols.append(axs[feat])
            else:
                # Past the last feature: pad the row with a blank brick.
                empty_fig = pw.Brick(figsize=(6, 3))
                empty_fig.axis('off')
                pw_cols.append(empty_fig)

        # Bricks of one row are joined side by side ...
        pw_rows.append(pw.stack(pw_cols, operator="|"))
    # ... and the rows are stacked vertically into one figure per set.
    pw_fig = pw.stack(pw_rows, operator="/")
    pw_fig.savefig(f"{path_curr}/{group}/feats.pdf")

## 3. Down syndrome analysis:
- Biomarkers and SImAge acceleration distributions in different groups.

### Plot count and age distributions

In [None]:
# Output folder for the Down syndrome part of the analysis.
path_curr = f"{path_save}/02_DownSyndrome"
pathlib.Path(path_curr).mkdir(parents=True, exist_ok=True)

# Down syndrome status -> plot color; the key order is the group order in the plots.
colors = {
    'Parent': 'greenyellow',
    'Sibling': 'deepskyblue',
    'Down Syndrome': 'darkorchid'
}
status_order = list(colors.keys())

# Keep only samples whose status is one of the groups above.
in_ds_groups = df['Down syndrome status'].isin(status_order)
df_ds = df.loc[in_ds_groups, :]
df_ds.to_excel(f"{path_curr}/df_DownSyndrome.xlsx", index_label='index')

# Number of samples per status.
fig = plt.figure(figsize=(3, 4))
sns.set_theme(style='whitegrid', font_scale=1)
countplot = sns.countplot(
    data=df_ds,
    y='Down syndrome status',
    edgecolor='black',
    palette=colors,
    orient='v',
    order=status_order
)
countplot.bar_label(countplot.containers[0])
countplot.set_xlabel("Count")
countplot.set_ylabel("")
countplot.set_title("")
for target, save_kwargs in (("countplot.png", {"dpi": 400}), ("countplot.pdf", {})):
    plt.savefig(f"{path_curr}/{target}", bbox_inches='tight', **save_kwargs)
plt.close(fig)

# Stacked age histogram per status: 23 bin edges between 5 and 115.
hist_bins = np.linspace(5, 115, 23)
fig = plt.figure(figsize=(6, 4))
sns.set_theme(style='whitegrid')
histplot = sns.histplot(
    data=df_ds,
    hue_order=status_order[::-1],
    bins=hist_bins,
    x="Age",
    hue="Down syndrome status",
    edgecolor='black',
    palette=colors,
    multiple="stack"
)
# Put the legend above the axes in a single row.
sns.move_legend(
    histplot, "lower center",
    bbox_to_anchor=(.5, 1), ncol=4, title="Down syndrome status", frameon=True,
)
plt.setp(histplot.get_legend().get_texts(), fontsize='7')
plt.setp(histplot.get_legend().get_title(), fontsize='10')
for target, save_kwargs in (("histplot.png", {"dpi": 400}), ("histplot.pdf", {})):
    plt.savefig(f"{path_curr}/{target}", bbox_inches='tight', **save_kwargs)
plt.close(fig)

#### Mann-Whitney and Kruskal-Wallis

In [None]:
path_curr = f"{path_save}/02_DownSyndrome/kw_mw"
pathlib.Path(f"{path_curr}/feats").mkdir(parents=True, exist_ok=True)

# Group order shared by the statistics and the plots.
status_order = list(colors.keys())
# Mann-Whitney comparisons: column prefix -> the two groups that are compared.
mw_pairs = {
    "mw_p-s": ('Parent', 'Sibling'),
    "mw_s-ds": ('Sibling', 'Down Syndrome'),
    "mw_p-ds": ('Parent', 'Down Syndrome'),
}

df_stat = pd.DataFrame(index=list(feats))

# Descriptive statistics and raw p-values for every feature plus SImAge acceleration
# (the latter is appended to df_stat as an extra row by the .at assignments).
for feat in list(feats) + ['SImAge acceleration']:
    vals = {}
    for group in status_order:
        group_vals = df_ds.loc[df_ds['Down syndrome status'] == group, feat].values
        vals[group] = group_vals
        df_stat.at[feat, f"mean_{group}"] = np.mean(group_vals)
        df_stat.at[feat, f"median_{group}"] = np.median(group_vals)
        q75, q25 = np.percentile(group_vals, [75, 25])
        df_stat.at[feat, f"q75_{group}"] = q75
        df_stat.at[feat, f"q25_{group}"] = q25
        df_stat.at[feat, f"iqr_{group}"] = df_stat.at[feat, f"q75_{group}"] - df_stat.at[feat, f"q25_{group}"]
    _, df_stat.at[feat, "kw_pval"] = kruskal(*vals.values())
    for prefix, (first, second) in mw_pairs.items():
        _, df_stat.at[feat, f"{prefix}_pval"] = mannwhitneyu(vals[first], vals[second], alternative='two-sided')

# Benjamini-Hochberg correction over the features only; the SImAge acceleration row
# is not part of the correction and keeps NaN in the *_fdr_bh columns.
for pval_col in ["kw_pval"] + [f"{prefix}_pval" for prefix in mw_pairs]:
    _, df_stat.loc[feats, f"{pval_col}_fdr_bh"], _, _ = multipletests(df_stat.loc[feats, pval_col], 0.05, method='fdr_bh')

df_stat.sort_values(by=["kw_pval_fdr_bh"], ascending=[True], inplace=True)
df_stat.to_excel(f"{path_curr}/kw_mw.xlsx", index_label='Features')

# One annotated violin plot per row of df_stat.
for feat, row in df_stat.iterrows():
    is_simage = feat == 'SImAge acceleration'
    # Features are annotated with FDR-corrected p-values, SImAge acceleration with raw ones.
    suffix = "" if is_simage else "_fdr_bh"

    plt.figure(figsize=(8, 4))
    sns.set_theme(style='whitegrid')
    violin = sns.violinplot(
        data=df_ds,
        x='Down syndrome status',
        y=feat,
        palette=colors,
        scale='width',
        order=status_order,
        saturation=0.75,
    )
    violin.set_ylabel(feat)
    kw_value = row[f"kw_pval{suffix}"]
    violin.set_xlabel(f"Kruskal-Wallis p-value: {kw_value:.2e}")

    pval_formatted = []
    for prefix in mw_pairs:
        mw_value = row[f"{prefix}_pval{suffix}"]
        pval_formatted.append(f'{mw_value:.2e}')

    annotator = Annotator(
        violin,
        pairs=list(mw_pairs.values()),
        data=df_ds,
        x='Down syndrome status',
        y=feat,
        order=status_order
    )
    annotator.set_custom_annotations(pval_formatted)
    annotator.configure(loc='outside')
    annotator.annotate()

    # Only the SImAge acceleration figure is written to disk here; the per-feature
    # figures are drawn and discarded.
    if is_simage:
        plt.savefig(f"{path_curr}/{feat}.png", bbox_inches='tight', dpi=400)
        plt.savefig(f"{path_curr}/{feat}.pdf", bbox_inches='tight')
    plt.close()

In [None]:
# Rows of df_stat whose label is one of the features, in the current df_stat order.
feats_sorted = df_stat.index[df_stat.index.isin(feats)].values

status_order = list(colors.keys())
status_pairs = [('Parent', 'Sibling'), ('Sibling', 'Down Syndrome'), ('Parent', 'Down Syndrome')]
# FDR-corrected Mann-Whitney columns, listed in the same order as status_pairs.
mw_fdr_cols = ["mw_p-s_pval_fdr_bh", "mw_s-ds_pval_fdr_bh", "mw_p-ds_pval_fdr_bh"]

axs = {}
pw_rows = []
n_cols = 4
n_rows = int(np.ceil(len(feats_sorted) / n_cols))
for r_id in range(n_rows):
    pw_cols = []
    for rc_id in range(r_id * n_cols, (r_id + 1) * n_cols):
        if rc_id >= len(feats_sorted):
            # No feature left for this cell: fill it with a blank brick so that
            # every row holds n_cols bricks.
            empty_fig = pw.Brick(figsize=(6, 3))
            empty_fig.axis('off')
            pw_cols.append(empty_fig)
            continue

        feat = feats_sorted[rc_id]
        brick = pw.Brick(figsize=(6, 3))
        axs[feat] = brick
        sns.set_theme(style='whitegrid')
        sns.violinplot(
            data=df_ds,
            x='Down syndrome status',
            y=feat,
            palette=colors,
            scale='width',
            order=status_order,
            saturation=0.75,
            ax=brick
        )
        brick.set_ylabel(feat)
        brick.set_xlabel(f"Kruskal-Wallis p-value: {df_stat.at[feat, 'kw_pval_fdr_bh']:.2e}")

        pval_formatted = []
        for pval_col in mw_fdr_cols:
            pval = df_stat.at[feat, pval_col]
            pval_formatted.append(f'{pval:.2e}')

        annotator = Annotator(
            brick,
            pairs=status_pairs,
            data=df_ds,
            x='Down syndrome status',
            y=feat,
            order=status_order
        )
        annotator.set_custom_annotations(pval_formatted)
        annotator.configure(loc='outside')
        annotator.annotate()
        pw_cols.append(brick)

    # Bricks of one row are joined side by side ...
    pw_rows.append(pw.stack(pw_cols, operator="|"))
# ... and the rows are stacked vertically into a single figure.
pw_fig = pw.stack(pw_rows, operator="/")
pw_fig.savefig(f"{path_curr}/feats.pdf")

#### Pearson correlation

In [None]:
path_curr = f"{path_save}/02_DownSyndrome/pearson"
pathlib.Path(path_curr).mkdir(parents=True, exist_ok=True)

# Pearson correlation of every feature with SImAge acceleration, separately per status group.
df_stat = pd.DataFrame(index=list(feats))
for group, group_color in colors.items():
    pathlib.Path(f"{path_curr}/{group}").mkdir(parents=True, exist_ok=True)
    in_group = df_ds['Down syndrome status'] == group

    for feat in feats:
        xs = df_ds.loc[in_group, feat].values
        ys = df_ds.loc[in_group, 'SImAge acceleration'].values
        if (xs == xs[0]).all() or (ys == ys[0]).all():
            # Constant input: record "no correlation" instead of calling pearsonr.
            print(f"An input array is constant for {feat} and {group}")
            corr, pval = 0.0, 1.0
        else:
            corr, pval = stats.pearsonr(xs, ys, alternative='two-sided')
        df_stat.at[feat, f"{group}_corr"] = corr
        df_stat.at[feat, f"{group}_pval"] = pval

    # Benjamini-Hochberg correction across the features of this group, plus -log10 for plotting.
    fdr_col = f"{group}_pval_fdr_bh"
    log_col = f"{group}_pval_fdr_bh_log"
    df_stat[fdr_col] = multipletests(df_stat[f"{group}_pval"], 0.05, method='fdr_bh')[1]
    df_stat[log_col] = -np.log10(df_stat[fdr_col].values)

    # Bar colors: the group color for corrected p < 0.05, gray otherwise.
    # The 'Color' column is overwritten for every group.
    df_stat["Color"] = 'white'
    df_stat.loc[df_stat[fdr_col] < 0.05, 'Color'] = group_color
    df_stat.loc[df_stat[fdr_col] >= 0.05, 'Color'] = 'gray'
    df_stat.sort_values(by=[fdr_col], ascending=[True], inplace=True)

    # Bar plot of -log10(corrected p-value) for all features of this group.
    plt.figure(figsize=(14, 4))
    plt.xticks(rotation=90)
    sns.set_theme(style='white')
    barplot = sns.barplot(
        data=df_stat,
        x=df_stat.index,
        y=log_col,
        edgecolor='black',
        palette=df_stat['Color'].values,
        dodge=False
    )
    barplot.set_ylabel(r'$-\log_{10}(\mathrm{p-value})$')
    plt.savefig(f"{path_curr}/{group}.png", bbox_inches='tight')
    plt.savefig(f"{path_curr}/{group}.pdf", bbox_inches='tight')
    plt.close()

    # Scatter plot with a regression line for every significant feature of this group.
    significant_feats = df_stat.index[df_stat['Color'] == group_color].values
    scatter_style = {
        "color": group_color,
        "alpha": 0.75,
        "edgecolor": "black",
        "linewidth": 0.2
    }
    for feat in significant_feats:
        df_fig = df_ds.loc[df_ds['Down syndrome status'] == group, [feat, 'SImAge acceleration']]
        plt.figure()
        sns.set_theme(style='whitegrid')
        regplot = sns.regplot(
            data=df_fig,
            x=feat,
            y='SImAge acceleration',
            scatter_kws=scatter_style,
            line_kws={"color": "black"}
        )
        plt.savefig(f"{path_curr}/{group}/{feat}.png", bbox_inches='tight', dpi=400)
        plt.savefig(f"{path_curr}/{group}/{feat}.pdf", bbox_inches='tight')
        plt.close()

df_stat.to_excel(f"{path_curr}/pearson.xlsx", index_label='Features')

In [None]:
# Index labels of the SImAge mapping table.
samples_simage = pd.read_excel("D:/YandexDisk/Work/pydnameth/draft/06_small_immuno_clocks/df_mapping.xlsx", index_col=0).index.values
simage_ids = set(samples_simage)

# For each status group, report its size and how many of its samples appear in the mapping table.
for group in colors:
    samples_curr = df_ds.index[df_ds['Down syndrome status'] == group].values
    n_shared = len(simage_ids & set(samples_curr))
    print(f"Samples in {group}: {len(samples_curr)}")
    print(f"Samples in {group} intersected with SImAge samples: {n_shared}")