# Debugging autoreload

In [None]:
%load_ext autoreload
%autoreload 2

# Load packages

In [None]:
import pathlib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

from functools import reduce
from plottable import Table, ColumnDefinition
from statsmodels.stats.multitest import multipletests
from scipy.stats import pearsonr, mannwhitneyu, levene, variation

# Calculate scores

## Load data

In [None]:
# Root folder of the cohort data; every score output is written below it.
path = "E:/YandexDisk/bbd/fmba"
path_save = f"{path}/05_scores"
# Create the output folder up front so the first `to_excel` into it cannot fail.
pathlib.Path(path_save).mkdir(parents=True, exist_ok=True)

df = pd.read_excel(f"{path}/pheno_fixed.xlsx", index_col=0)
df.index = df.index.astype(str)

# Age in years (365.25-day years) at the fixed reference date 2024-11-11.
df['дата рождения'] = pd.to_datetime(df['дата рождения'])
df['Age'] = (pd.to_datetime("2024-11-11") - df['дата рождения']) / np.timedelta64(1, 'D') / 365.25

# Smoking flag: 1.0 when the risk-factor text mentions 'Курение табака'
# (tobacco smoking), 0.0 when it does not, NaN when the text is missing.
# `.map` gives an explicit numeric coding instead of relying on the implicit
# (deprecated) dtype downcast that `replace({True: 1, False: 0})` performs.
df['smoking'] = (
    df['терапевт - фактор_риска']
    .str.contains('Курение табака', regex=False)
    .map({True: 1.0, False: 0.0})
)

## SCORE

In [None]:
# Inputs of the SCORE model: systolic blood pressure, total cholesterol,
# smoking flag and age.
cols_for_score = [
    "терапевт - артериальное давление верхнее",
    "биохимический анализ крови - холестерин",
    "smoking",
    "Age",
]
# The textual placeholders 'нет' and 'норма' become NaN, then each input is
# coerced to a numeric dtype.
for col in cols_for_score:
    df[col] = pd.to_numeric(df[col].replace('нет', np.nan).replace('норма', np.nan))

age = df['Age']
chol_centered = df['биохимический анализ крови - холестерин'] - 6.0
sbp_centered = df['терапевт - артериальное давление верхнее'] - 120.0
smoking = df['smoking']

# NOTE(review): one hard-coded coefficient set is applied to every row; no sex
# or region variable enters the formula - confirm this is intended.

# CHD component: baseline survival at the current age and ten years later,
# raised to exp(w), where w is the weighted sum of the risk factors.
chd_s0_age = np.exp(-np.exp(-21.0) * np.power(age - 20.0, 4.62))
chd_s0_age_10 = np.exp(-np.exp(-21.0) * np.power(age - 10.0, 4.62))
chd_w = 0.24 * chol_centered + 0.018 * sbp_centered + 0.71 * smoking
chd_s_age = np.power(chd_s0_age, np.exp(chd_w))
chd_s_age_10 = np.power(chd_s0_age_10, np.exp(chd_w))
# Conditional 10-year survival, then its complement as the 10-year risk.
chd_s10_age = chd_s_age_10 / chd_s_age
chd_risk_10 = 1.0 - chd_s10_age

# Non-CHD component: same construction with its own coefficients.
nchd_s0_age = np.exp(-np.exp(-25.7) * np.power(age - 20.0, 5.47))
nchd_s0_age_10 = np.exp(-np.exp(-25.7) * np.power(age - 10.0, 5.47))
nchd_w = 0.02 * chol_centered + 0.022 * sbp_centered + 0.63 * smoking
nchd_s_age = np.power(nchd_s0_age, np.exp(nchd_w))
nchd_s_age_10 = np.power(nchd_s0_age_10, np.exp(nchd_w))
nchd_s10_age = nchd_s_age_10 / nchd_s_age
nchd_risk_10 = 1.0 - nchd_s10_age

# Total 10-year risk is the sum of both components.
risk_10_age = chd_risk_10 + nchd_risk_10

df['SCORE'] = risk_10_age
df_res_score = df[['SCORE']].copy()

In [None]:
# Dichotomise SCORE: 'High' at or above the threshold, 'Low' below it,
# missing scores stay NaN.
threshold = 0.02
score_values = df_res_score['SCORE']
# Build the labels on a standalone Series and assign the result once.
# `df[col].mask(..., inplace=True)` is a chained in-place write: it warns in
# recent pandas and leaves the frame untouched under Copy-on-Write.
df_res_score[f'SCORE group thr={threshold}'] = (
    score_values.astype(object)
    .mask(score_values >= threshold, 'High')
    .mask(score_values < threshold, 'Low')
)

In [None]:
# Write the SCORE columns to the scores workbook; the SCORE2 cells below read
# this file back, add their columns and save it again.
df_res_score.to_excel(f'{path_save}/df_scores.xlsx')

## SCORE2

### Without HDL cholesterol

In [None]:
# Inputs of the SCORE2 model: systolic blood pressure, total cholesterol,
# smoking flag and age.
cols_for_score = [
    "терапевт - артериальное давление верхнее",
    "биохимический анализ крови - холестерин",
    "smoking",
    "Age",
]
# The textual placeholders 'нет' and 'норма' become NaN, then each input is
# coerced to a numeric dtype.
for col in cols_for_score:
    df[col] = pd.to_numeric(df[col].replace('нет', np.nan).replace('норма', np.nan))

age = df['Age']
sbp = df['терапевт - артериальное давление верхнее']
smoking = df['smoking']

# Centred / scaled predictors used in the interaction terms.
cage = (age - 60.0) / 5.0
csbp = (sbp - 120.0) / 20.0
ctchol = df['биохимический анализ крови - холестерин'] - 6.0
# HDL and diabetes are not used in this variant: both terms are fixed at 0.0.
chdl = 0.0
diabetes = 0.0

# NOTE(review): the coefficients look like one fixed (sex/region-specific)
# SCORE2 parameter set applied to all rows - confirm against the publication.
lincomb = (
    0.3742 * (age - 60.0) / 5.0
    + 0.1458 * ctchol
    - 0.2698 * chdl
    + 0.2777 * (sbp - 120.0) / 20.0
    + 0.6457 * diabetes
    + 0.6012 * smoking
    - 0.0281 * cage * ctchol
    + 0.0426 * cage * chdl
    - 0.0255 * cage * csbp
    - 0.0983 * cage * diabetes
    - 0.0755 * cage * smoking
)

# Uncalibrated 10-year risk from the baseline survival 0.9605 ...
risk_10 = 1.0 - np.power(0.9605, np.exp(lincomb))
# ... then recalibrated on the complementary log-log scale.
risk_10_calib = 1.0 - np.exp(-np.exp(0.5836 + 0.8294 * np.log(-np.log(1.0 - risk_10))))

df['SCORE2 wo HDL'] = risk_10_calib
df_res_score = df[['SCORE2 wo HDL']].copy()

In [None]:
# Dichotomise SCORE2 (without HDL): 'High' at or above the threshold, 'Low'
# below it, missing scores stay NaN.
threshold = 0.05
score_values = df_res_score['SCORE2 wo HDL']
# Build the labels on a standalone Series and assign the result once.
# `df[col].mask(..., inplace=True)` is a chained in-place write: it warns in
# recent pandas and leaves the frame untouched under Copy-on-Write.
df_res_score[f'SCORE2 wo HDL group thr={threshold}'] = (
    score_values.astype(object)
    .mask(score_values >= threshold, 'High')
    .mask(score_values < threshold, 'Low')
)

In [None]:
# Append the SCORE2 (without HDL) columns to the scores workbook.
# Read and write through the same `path_save` location the workbook was
# created in, so the two can never point at different files.
scores_file = f'{path_save}/df_scores.xlsx'
df_scores = pd.read_excel(scores_file, index_col=0)
df_scores.index = df_scores.index.astype(str)
# Drop columns left by an earlier run of this cell: re-running then replaces
# them instead of adding duplicates (which come back as 'name.1' on reload).
df_scores = df_scores.drop(columns=df_res_score.columns, errors='ignore')
df_scores = pd.concat([df_scores, df_res_score], axis=1)
df_scores.to_excel(scores_file)

### With population-average HDL cholesterol

In [None]:
# Inputs of the SCORE2 model: systolic blood pressure, total cholesterol,
# smoking flag and age.
cols_for_score = [
    "терапевт - артериальное давление верхнее",
    "биохимический анализ крови - холестерин",
    "smoking",
    "Age",
]
# The textual placeholders 'нет' and 'норма' become NaN, then each input is
# coerced to a numeric dtype.
for col in cols_for_score:
    df[col] = pd.to_numeric(df[col].replace('нет', np.nan).replace('норма', np.nan))

age = df['Age']
sbp = df['терапевт - артериальное давление верхнее']
smoking = df['smoking']

# HDL is not measured per subject: one assumed value (mmol/L) is used for all.
hdl_assumed = 1.0

# Centred / scaled predictors.
cage = (age - 60.0) / 5.0
csbp = (sbp - 120.0) / 20.0
ctchol = df['биохимический анализ крови - холестерин'] - 6.0
# SCORE2 standardises HDL as (HDL - 1.3) / 0.5. The previous code used
# (1.0 - 1.3) without the division by 0.5, which halved the HDL contribution
# in both the main and the age-interaction term.
chdl = (hdl_assumed - 1.3) / 0.5
# Diabetes status is not used: its terms are fixed at 0.0.
diabetes = 0.0

# NOTE(review): the coefficients look like one fixed (sex/region-specific)
# SCORE2 parameter set applied to all rows - confirm against the publication.
lincomb = (
    0.3742 * (age - 60.0) / 5.0
    + 0.1458 * ctchol
    - 0.2698 * chdl
    + 0.2777 * (sbp - 120.0) / 20.0
    + 0.6457 * diabetes
    + 0.6012 * smoking
    - 0.0281 * cage * ctchol
    + 0.0426 * cage * chdl
    - 0.0255 * cage * csbp
    - 0.0983 * cage * diabetes
    - 0.0755 * cage * smoking
)

# Uncalibrated 10-year risk from the baseline survival 0.9605 ...
risk_10 = 1.0 - np.power(0.9605, np.exp(lincomb))
# ... then recalibrated on the complementary log-log scale.
risk_10_calib = 1.0 - np.exp(-np.exp(0.5836 + 0.8294 * np.log(-np.log(1.0 - risk_10))))

df['SCORE2 aver HDL'] = risk_10_calib
df_res_score = df[['SCORE2 aver HDL']].copy()

In [None]:
# Dichotomise SCORE2 (assumed HDL): 'High' at or above the threshold, 'Low'
# below it, missing scores stay NaN.
threshold = 0.05
score_values = df_res_score['SCORE2 aver HDL']
# Build the labels on a standalone Series and assign the result once.
# `df[col].mask(..., inplace=True)` is a chained in-place write: it warns in
# recent pandas and leaves the frame untouched under Copy-on-Write.
df_res_score[f'SCORE2 aver HDL group thr={threshold}'] = (
    score_values.astype(object)
    .mask(score_values >= threshold, 'High')
    .mask(score_values < threshold, 'Low')
)

In [None]:
# Append the SCORE2 (assumed HDL) columns to the scores workbook.
# Read and write through the same `path_save` location the workbook was
# created in, so the two can never point at different files.
scores_file = f'{path_save}/df_scores.xlsx'
df_scores = pd.read_excel(scores_file, index_col=0)
df_scores.index = df_scores.index.astype(str)
# Drop columns left by an earlier run of this cell: re-running then replaces
# them instead of adding duplicates (which come back as 'name.1' on reload).
df_scores = df_scores.drop(columns=df_res_score.columns, errors='ignore')
df_scores = pd.concat([df_scores, df_res_score], axis=1)
df_scores.to_excel(scores_file)

# Epigenetic clocks

## Load data

In [None]:
path = "E:/YandexDisk/bbd/fmba"
path_pyaging = "E:/YandexDisk/pydnameth/datasets/pyaging"

# --- Phenotypes: only the birth date is needed here -------------------------
df_pheno = pd.read_excel(f"{path}/pheno_fixed.xlsx", index_col=0)
df_pheno.index = df_pheno.index.astype(str)
cols_pheno = ['дата рождения']

# --- Clock estimates and clock metadata --------------------------------------
df_pyaging = pd.read_excel(f"{path}/dnam/processed/pheno.xlsx", index_col=0)
df_pyaging.index = df_pyaging.index.astype(str)
pyaging_meta = pd.read_excel(f"{path_pyaging}/clocks_meta.xlsx", index_col='Clock Name')
pyaging_meta['Clock Name'] = pyaging_meta.index
# Columns arrive named by model ID; switch them to the clock names.
model_id_to_clock = dict(zip(pyaging_meta['Model ID'].values, pyaging_meta['Clock Name'].values))
df_pyaging = df_pyaging.rename(columns=model_id_to_clock)
# Clocks left out of the analysis (removed after the renaming above).
excluded_clocks = ['Knight', 'LeeControl', 'LeeRefinedRobust', 'LeeRobust', 'PedBE', 'RepliTali', 'ENCen100']
pyaging_meta = pyaging_meta.drop(index=excluded_clocks)
# Clocks of type 'Age' are epigenetic ages; every other type is a metric.
epi_ages = pyaging_meta[pyaging_meta['Type'] == 'Age'].index.to_list()
epi_metrics = pyaging_meta[pyaging_meta['Type'] != 'Age'].index.to_list()
df_pyaging = df_pyaging.rename(columns={'Age': 'Chronological Age (Epigenetics)'})
cols_pyaging = ['Chronological Age (Epigenetics)', 'Sex', 'Tissue'] + epi_ages + epi_metrics

# --- Cardiovascular risk scores ----------------------------------------------
df_scores = pd.read_excel(f"{path}/05_scores/df_scores.xlsx", index_col=0)
df_scores.index = df_scores.index.astype(str)
cols_scores = ['SCORE2 wo HDL', 'SCORE2 wo HDL group thr=0.05', 'SCORE2 aver HDL', 'SCORE2 aver HDL group thr=0.05']

# --- Merge on the sample index (only samples present in every table remain) --
dfs = [df_pheno[cols_pheno], df_pyaging[cols_pyaging], df_scores[cols_scores]]
df_epi = dfs[0]
for frame in dfs[1:]:
    df_epi = pd.merge(df_epi, frame, left_index=True, right_index=True)
# Age in years (365.25-day years) at the fixed reference date 2024-11-11.
df_epi['дата рождения'] = pd.to_datetime(df_epi['дата рождения'])
df_epi['Age'] = (pd.to_datetime("2024-11-11") - df_epi['дата рождения']) / np.timedelta64(1, 'D') / 365.25

# Group columns to analyse: group labels, reference ('base') group and plot
# colours. Only the two SCORE2 variants are analysed.
pheno_associations = {
    score_group: {
        'groups': ['Low', 'High'],
        'base': 'Low',
        'colors': {'Low': 'dodgerblue', 'High': 'crimson'},
    }
    for score_group in ('SCORE2 wo HDL group thr=0.05', 'SCORE2 aver HDL group thr=0.05')
}

## Epigenetic ages (categorical score)

In [None]:
# Categorical score groups vs. epigenetic age acceleration.
# Here the age-acceleration reference regression is fitted on ALL samples.
# Outputs per score: statistics table (xlsx), age histogram, one p-value bar
# plot per test, and a grid figure with one panel per clock.
for curr_score in pheno_associations:
    curr_score_name = curr_score.split(' group')[0]
    out_dir = f"{path}/05_scores/{curr_score_name}/categorical/dnam/all"
    # Created once per score (not once per clock); parents=True also creates
    # the intermediate ".../categorical/dnam" folder.
    pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)
    score_vals = pheno_associations[curr_score]['groups']
    score_colors = pheno_associations[curr_score]['colors']
    # Explicit copy: columns are added below, and writing into a bare
    # selection of df_epi triggers pandas' SettingWithCopyWarning.
    df_epi_scores = df_epi.loc[df_epi[curr_score].isin(score_vals)].copy()
    df_stat = pd.DataFrame(index=epi_ages)

    for epiage in epi_ages:
        # Step 1: re-centre the clock. The estimate is replaced by
        # Age + residual of (clock ~ Age) fitted on the selected samples.
        linreg_cx = smf.ols(formula=f"{epiage} ~ Age", data=df_epi_scores).fit()
        df_epi_scores[f"{epiage}_linear_pred_cx"] = linreg_cx.predict(df_epi_scores)
        df_epi_scores[f"{epiage}_acceleration_cx"] = df_epi_scores[epiage] - df_epi_scores[f"{epiage}_linear_pred_cx"]
        df_epi_scores[f"{epiage}"] = df_epi_scores["Age"] + df_epi_scores[f"{epiage}_acceleration_cx"]

        # Step 2: acceleration = residual of the corrected clock against the
        # reference regression (all samples in this cell).
        linreg = smf.ols(formula=f"{epiage} ~ Age", data=df_epi_scores).fit()
        df_epi_scores[f"{epiage}_linear_pred"] = linreg.predict(df_epi_scores)
        df_epi_scores[f"{epiage} acceleration"] = df_epi_scores[epiage] - df_epi_scores[f"{epiage}_linear_pred"]

        # Descriptive statistics of the acceleration per group.
        vals = {}
        for group in score_vals:
            vals[group] = df_epi_scores.loc[df_epi_scores[curr_score] == group, f"{epiage} acceleration"].values
            df_stat.at[epiage, f"Mean {group}"] = np.mean(vals[group])
            df_stat.at[epiage, f"Median {group}"] = np.median(vals[group])
            df_stat.at[epiage, f"Q75 {group}"], df_stat.at[epiage, f"Q25 {group}"] = np.percentile(vals[group], [75, 25])
            df_stat.at[epiage, f"IQR {group}"] = df_stat.at[epiage, f"Q75 {group}"] - df_stat.at[epiage, f"Q25 {group}"]
            df_stat.at[epiage, f"Variation {group}"] = variation(vals[group])

        # Group comparison: location (Mann-Whitney U) and spread (Levene).
        _, df_stat.at[epiage, "mannwhitneyu_pval"] = mannwhitneyu(vals[score_vals[0]], vals[score_vals[1]], alternative='two-sided')
        _, df_stat.at[epiage, "levene_pval"] = levene(vals[score_vals[0]], vals[score_vals[1]])

        # ANCOVA-style model: clock ~ group + Age; keep the p-value(s) of the
        # group term(s). regex=False: the score name is matched literally.
        regcov = smf.ols(formula=f"{epiage} ~ Q('{curr_score}') + Age", data=df_epi_scores).fit()
        reg_sum = regcov.summary2().tables[1]
        pvals_cols = reg_sum.index[reg_sum.index.str.contains(curr_score_name, regex=False)].values
        for pval_col in pvals_cols:
            df_stat.at[epiage, f"ancova_{pval_col}_pval"] = reg_sum.at[pval_col, 'P>|t|']

    # Multiple-testing correction across clocks, three methods per test.
    correction_methods = ('fdr_bh', 'bonferroni', 'simes-hochberg')
    for test_name in ("mannwhitneyu", "levene"):
        for method in correction_methods:
            _, df_stat.loc[epi_ages, f"{test_name}_pval_{method}"], _, _ = multipletests(
                df_stat.loc[epi_ages, f"{test_name}_pval"].values, 0.05, method=method
            )
    # Captured before the corrected ANCOVA columns are added, so it holds only
    # the raw "ancova_..._pval" columns.
    pvals_cols_ancova = df_stat.columns[df_stat.columns.str.contains(curr_score_name, regex=False)].values
    for pval_col in pvals_cols_ancova:
        for method in correction_methods:
            _, df_stat.loc[epi_ages, f"{pval_col}_{method}"], _, _ = multipletests(
                df_stat.loc[epi_ages, pval_col].values, 0.05, method=method
            )
    df_stat.sort_values(["mannwhitneyu_pval"], ascending=[True], inplace=True)
    df_stat.to_excel(f"{out_dir}/ages_corrected.xlsx")

    # Age histogram by group.
    hist_bins = np.linspace(5, 115, 23)
    fig, ax = plt.subplots(figsize=(6, 4))
    histplot = sns.histplot(
        data=df_epi_scores,
        bins=hist_bins,
        edgecolor='k',
        linewidth=1,
        x="Age",
        hue=curr_score,
        palette=score_colors,
        ax=ax
    )
    histplot.set(xlim=(0, 120))
    fig.savefig(f"{out_dir}/hist_age_corrected.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{out_dir}/hist_age_corrected.pdf", bbox_inches='tight')
    plt.close(fig)

    # One bar plot of -log10(FDR-corrected p-value) per statistical test.
    for stat_test in [x.replace('_pval', '') for x in pvals_cols_ancova] + ['mannwhitneyu', 'levene']:
        df_fig = df_stat.sort_values([f"{stat_test}_pval"], ascending=[True])
        df_fig['Features'] = df_fig.index
        df_fig[f'{stat_test}_pval_fdr_bh_log'] = -np.log10(df_fig[f'{stat_test}_pval_fdr_bh'])
        df_fig['color'] = 'pink'
        df_fig.loc[df_fig[f'{stat_test}_pval_fdr_bh'] < 0.05, 'color'] = 'red'
        sns.set_theme(style='ticks')
        fig, ax = plt.subplots(figsize=(3, df_fig.shape[0] * 0.5))
        # seaborn >= 0.13 deprecates `palette` without `hue`; the documented
        # replacement is hue=<the categorical variable> with legend=False.
        sns.barplot(
            data=df_fig,
            y='Features',
            x=f'{stat_test}_pval_fdr_bh_log',
            hue='Features',
            legend=False,
            edgecolor='black',
            palette=df_fig['color'].tolist(),
            ax=ax,
        )
        ax.set_xlabel(r"$-\log_{10}(\mathrm{p-value})$")
        ax.xaxis.tick_top()
        ax.xaxis.set_label_position('top')
        ax.set_ylabel('')
        fig.savefig(f"{out_dir}/ages_pvals_{stat_test}_corrected.png", bbox_inches='tight', dpi=200)
        fig.savefig(f"{out_dir}/ages_pvals_{stat_test}_corrected.pdf", bbox_inches='tight')
        plt.close(fig)

    # Grid figure, one panel per clock. The layout stays 7x4 (24x28 in) for up
    # to 28 clocks and grows by whole rows beyond that instead of raising an
    # IndexError.
    sns.set_theme(style='ticks')
    n_cols = 4
    n_rows = max(7, int(np.ceil(len(df_stat.index) / n_cols)))
    fig = plt.figure(
        figsize=(24, 4 * n_rows),
    )
    subfigs = fig.subfigures(
        nrows=n_rows,
        ncols=n_cols,
    )
    for epiage_id, epiage in enumerate(df_stat.index.values):
        row_id, col_id = divmod(epiage_id, n_cols)

        axs = subfigs[row_id, col_id].subplot_mosaic(
            [
                ['11', '12'],
                ['21', '22'],
            ],
            height_ratios=[1, 4],
            width_ratios=[3, 1.5],
            gridspec_kw={
                "bottom": 0.14,
                "top": 0.95,
                "wspace": 0.37,
                "hspace": 0.01,
            },
        )

        # Small table with the Age-vs-clock Pearson correlation (all samples).
        ds_table = pd.DataFrame(index=[r"Pearson $\rho$", "Pearson p-value"], columns=[epiage])
        rho, pval = pearsonr(df_epi_scores['Age'].values, df_epi_scores[epiage].values)
        ds_table.at[r"Pearson $\rho$", epiage] = f"{rho:0.2f}"
        # Scientific notation: a fixed two-decimal format prints every small
        # p-value as "0.00".
        ds_table.at["Pearson p-value", epiage] = f"{pval:.2e}"
        col_defs = [
            ColumnDefinition(
                name="index",
                title=epiage,
                textprops={"ha": "left"},
                width=4.5,
            ),
            ColumnDefinition(
                name=epiage,
                title='',
                textprops={"ha": "center"},
                width=2.0,
            ),
        ]
        Table(
            ds_table,
            column_definitions=col_defs,
            row_dividers=True,
            footer_divider=False,
            ax=axs['11'],
            textprops={"fontsize": 7},
            row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
            col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
            column_border_kw={"linewidth": 1, "linestyle": "-"},
        ).autoset_fontcolors(colnames=[epiage])

        axs['12'].axis('off')

        # Scatter panel: identity line, regression line (all samples), points.
        xy_min = df_epi_scores[['Age', epiage]].min().min()
        xy_max = df_epi_scores[['Age', epiage]].max().max()
        xy_ptp = xy_max - xy_min
        axis_limits = [xy_min - 0.1 * xy_ptp, xy_max + 0.1 * xy_ptp]
        sns.lineplot(
            x=axis_limits,
            y=axis_limits,
            linestyle='--',
            color='black',
            linewidth=1.0,
            ax=axs['21']
        )
        sns.regplot(
            data=df_epi_scores,
            x='Age',
            y=epiage,
            color='dimgray',
            scatter=False,
            truncate=False,
            ax=axs['21']
        )
        scatter = sns.scatterplot(
            data=df_epi_scores,
            x='Age',
            y=epiage,
            hue=curr_score,
            palette=score_colors,
            linewidth=0.5,
            alpha=0.75,
            edgecolor="k",
            s=20,
            hue_order=list(score_colors.keys()),
            legend=True,
            ax=axs['21'],
        )
        axs['21'].set_xlim(axis_limits[0], axis_limits[1])
        axs['21'].set_ylim(axis_limits[0], axis_limits[1])
        scatter.legend_.set_title(curr_score_name)

        # Violin panel: acceleration by group, FDR-corrected p-values in title.
        sns.violinplot(
            data=df_epi_scores,
            x=curr_score,
            y=f"{epiage} acceleration",
            hue=curr_score,
            palette=score_colors,
            density_norm='width',
            order=score_vals,
            saturation=0.75,
            linewidth=1.0,
            ax=axs['22'],
            legend=False,
            cut=0,
        )
        axs['22'].set_ylabel(f"{epiage} acceleration")
        axs['22'].set_xlabel(curr_score_name)
        mannwhitneyu_pval = df_stat.at[epiage, "mannwhitneyu_pval_fdr_bh"]
        levene_pval = df_stat.at[epiage, "levene_pval_fdr_bh"]
        title = f'Mann-Whitney: {mannwhitneyu_pval:.2e}\nLevene: {levene_pval:.2e}'
        for pval_col in pvals_cols_ancova:
            title += f"\nANCOVA: {df_stat.at[epiage, pval_col + '_fdr_bh']:.2e}"
        axs['22'].set_title(title)

    fig.savefig(f"{out_dir}/ages_distribution_corrected.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{out_dir}/ages_distribution_corrected.pdf", bbox_inches='tight')
    plt.close(fig)

In [None]:
# Categorical score groups vs. epigenetic age acceleration.
# Here the age-acceleration reference regression is fitted on the LOW-RISK
# (base) group only and then applied to every sample.
# Outputs per score: statistics table (xlsx), age histogram, one p-value bar
# plot per test, and a grid figure with one panel per clock.
for curr_score in pheno_associations:
    curr_score_name = curr_score.split(' group')[0]
    out_dir = f"{path}/05_scores/{curr_score_name}/categorical/dnam/low_risk"
    # Created once per score (not once per clock); parents=True also creates
    # the intermediate ".../categorical/dnam" folder.
    pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)
    score_vals = pheno_associations[curr_score]['groups']
    score_val_base = pheno_associations[curr_score]['base']
    score_colors = pheno_associations[curr_score]['colors']
    # Explicit copy: columns are added below, and writing into a bare
    # selection of df_epi triggers pandas' SettingWithCopyWarning.
    df_epi_scores = df_epi.loc[df_epi[curr_score].isin(score_vals)].copy()
    # Rows of the reference (base) group; the row set never changes below.
    is_base = df_epi_scores[curr_score] == score_val_base
    df_stat = pd.DataFrame(index=epi_ages)

    for epiage in epi_ages:
        # Step 1: re-centre the clock. The estimate is replaced by
        # Age + residual of (clock ~ Age) fitted on the selected samples.
        linreg_cx = smf.ols(formula=f"{epiage} ~ Age", data=df_epi_scores).fit()
        df_epi_scores[f"{epiage}_linear_pred_cx"] = linreg_cx.predict(df_epi_scores)
        df_epi_scores[f"{epiage}_acceleration_cx"] = df_epi_scores[epiage] - df_epi_scores[f"{epiage}_linear_pred_cx"]
        df_epi_scores[f"{epiage}"] = df_epi_scores["Age"] + df_epi_scores[f"{epiage}_acceleration_cx"]

        # Step 2: acceleration = residual of the corrected clock against the
        # reference regression fitted on the base group only.
        linreg = smf.ols(formula=f"{epiage} ~ Age", data=df_epi_scores.loc[is_base, :]).fit()
        df_epi_scores[f"{epiage}_linear_pred"] = linreg.predict(df_epi_scores)
        df_epi_scores[f"{epiage} acceleration"] = df_epi_scores[epiage] - df_epi_scores[f"{epiage}_linear_pred"]

        # Descriptive statistics of the acceleration per group.
        vals = {}
        for group in score_vals:
            vals[group] = df_epi_scores.loc[df_epi_scores[curr_score] == group, f"{epiage} acceleration"].values
            df_stat.at[epiage, f"Mean {group}"] = np.mean(vals[group])
            df_stat.at[epiage, f"Median {group}"] = np.median(vals[group])
            df_stat.at[epiage, f"Q75 {group}"], df_stat.at[epiage, f"Q25 {group}"] = np.percentile(vals[group], [75, 25])
            df_stat.at[epiage, f"IQR {group}"] = df_stat.at[epiage, f"Q75 {group}"] - df_stat.at[epiage, f"Q25 {group}"]
            df_stat.at[epiage, f"Variation {group}"] = variation(vals[group])

        # Group comparison: location (Mann-Whitney U) and spread (Levene).
        _, df_stat.at[epiage, "mannwhitneyu_pval"] = mannwhitneyu(vals[score_vals[0]], vals[score_vals[1]], alternative='two-sided')
        _, df_stat.at[epiage, "levene_pval"] = levene(vals[score_vals[0]], vals[score_vals[1]])

        # ANCOVA-style model: clock ~ group + Age; keep the p-value(s) of the
        # group term(s). regex=False: the score name is matched literally.
        regcov = smf.ols(formula=f"{epiage} ~ Q('{curr_score}') + Age", data=df_epi_scores).fit()
        reg_sum = regcov.summary2().tables[1]
        pvals_cols = reg_sum.index[reg_sum.index.str.contains(curr_score_name, regex=False)].values
        for pval_col in pvals_cols:
            df_stat.at[epiage, f"ancova_{pval_col}_pval"] = reg_sum.at[pval_col, 'P>|t|']

    # Multiple-testing correction across clocks, three methods per test.
    correction_methods = ('fdr_bh', 'bonferroni', 'simes-hochberg')
    for test_name in ("mannwhitneyu", "levene"):
        for method in correction_methods:
            _, df_stat.loc[epi_ages, f"{test_name}_pval_{method}"], _, _ = multipletests(
                df_stat.loc[epi_ages, f"{test_name}_pval"].values, 0.05, method=method
            )
    # Captured before the corrected ANCOVA columns are added, so it holds only
    # the raw "ancova_..._pval" columns.
    pvals_cols_ancova = df_stat.columns[df_stat.columns.str.contains(curr_score_name, regex=False)].values
    for pval_col in pvals_cols_ancova:
        for method in correction_methods:
            _, df_stat.loc[epi_ages, f"{pval_col}_{method}"], _, _ = multipletests(
                df_stat.loc[epi_ages, pval_col].values, 0.05, method=method
            )
    df_stat.sort_values(["mannwhitneyu_pval"], ascending=[True], inplace=True)
    df_stat.to_excel(f"{out_dir}/ages_corrected.xlsx")

    # Age histogram by group.
    hist_bins = np.linspace(5, 115, 23)
    fig, ax = plt.subplots(figsize=(6, 4))
    histplot = sns.histplot(
        data=df_epi_scores,
        bins=hist_bins,
        edgecolor='k',
        linewidth=1,
        x="Age",
        hue=curr_score,
        palette=score_colors,
        ax=ax
    )
    histplot.set(xlim=(0, 120))
    fig.savefig(f"{out_dir}/hist_age_corrected.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{out_dir}/hist_age_corrected.pdf", bbox_inches='tight')
    plt.close(fig)

    # One bar plot of -log10(FDR-corrected p-value) per statistical test.
    for stat_test in [x.replace('_pval', '') for x in pvals_cols_ancova] + ['mannwhitneyu', 'levene']:
        df_fig = df_stat.sort_values([f"{stat_test}_pval"], ascending=[True])
        df_fig['Features'] = df_fig.index
        df_fig[f'{stat_test}_pval_fdr_bh_log'] = -np.log10(df_fig[f'{stat_test}_pval_fdr_bh'])
        df_fig['color'] = 'pink'
        df_fig.loc[df_fig[f'{stat_test}_pval_fdr_bh'] < 0.05, 'color'] = 'red'
        sns.set_theme(style='ticks')
        fig, ax = plt.subplots(figsize=(3, df_fig.shape[0] * 0.5))
        # seaborn >= 0.13 deprecates `palette` without `hue`; the documented
        # replacement is hue=<the categorical variable> with legend=False.
        sns.barplot(
            data=df_fig,
            y='Features',
            x=f'{stat_test}_pval_fdr_bh_log',
            hue='Features',
            legend=False,
            edgecolor='black',
            palette=df_fig['color'].tolist(),
            ax=ax,
        )
        ax.set_xlabel(r"$-\log_{10}(\mathrm{p-value})$")
        ax.xaxis.tick_top()
        ax.xaxis.set_label_position('top')
        ax.set_ylabel('')
        fig.savefig(f"{out_dir}/ages_pvals_{stat_test}_corrected.png", bbox_inches='tight', dpi=200)
        fig.savefig(f"{out_dir}/ages_pvals_{stat_test}_corrected.pdf", bbox_inches='tight')
        plt.close(fig)

    # Grid figure, one panel per clock. The layout stays 7x4 (24x28 in) for up
    # to 28 clocks and grows by whole rows beyond that instead of raising an
    # IndexError.
    sns.set_theme(style='ticks')
    n_cols = 4
    n_rows = max(7, int(np.ceil(len(df_stat.index) / n_cols)))
    fig = plt.figure(
        figsize=(24, 4 * n_rows),
    )
    subfigs = fig.subfigures(
        nrows=n_rows,
        ncols=n_cols,
    )
    for epiage_id, epiage in enumerate(df_stat.index.values):
        row_id, col_id = divmod(epiage_id, n_cols)

        axs = subfigs[row_id, col_id].subplot_mosaic(
            [
                ['11', '12'],
                ['21', '22'],
            ],
            height_ratios=[1, 4],
            width_ratios=[3, 1.5],
            gridspec_kw={
                "bottom": 0.14,
                "top": 0.95,
                "wspace": 0.43,
                "hspace": 0.01,
            },
        )

        # Small table with the Age-vs-clock Pearson correlation, computed on
        # the base (low-risk) group only.
        df_base = df_epi_scores.loc[is_base]
        ds_table = pd.DataFrame(index=[r"Pearson $\rho$", "Pearson p-value"], columns=[epiage])
        rho, pval = pearsonr(df_base['Age'].values, df_base[epiage].values)
        ds_table.at[r"Pearson $\rho$", epiage] = f"{rho:0.2f}"
        # Scientific notation: a fixed two-decimal format prints every small
        # p-value as "0.00".
        ds_table.at["Pearson p-value", epiage] = f"{pval:.2e}"
        col_defs = [
            ColumnDefinition(
                name="index",
                title=epiage,
                textprops={"ha": "left"},
                width=4.5,
            ),
            ColumnDefinition(
                name=epiage,
                title='',
                textprops={"ha": "center"},
                width=2.0,
            ),
        ]
        Table(
            ds_table,
            column_definitions=col_defs,
            row_dividers=True,
            footer_divider=False,
            ax=axs['11'],
            textprops={"fontsize": 7},
            row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
            col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
            column_border_kw={"linewidth": 1, "linestyle": "-"},
        ).autoset_fontcolors(colnames=[epiage])

        axs['12'].axis('off')

        # Scatter panel: identity line, regression line of the base group
        # (drawn in the base group's colour), points of all groups.
        xy_min = df_epi_scores[['Age', epiage]].min().min()
        xy_max = df_epi_scores[['Age', epiage]].max().max()
        xy_ptp = xy_max - xy_min
        axis_limits = [xy_min - 0.1 * xy_ptp, xy_max + 0.1 * xy_ptp]
        sns.lineplot(
            x=axis_limits,
            y=axis_limits,
            linestyle='--',
            color='black',
            linewidth=1.0,
            ax=axs['21']
        )
        sns.regplot(
            data=df_base,
            x='Age',
            y=epiage,
            color=score_colors[score_val_base],
            scatter=False,
            truncate=False,
            ax=axs['21']
        )
        scatter = sns.scatterplot(
            data=df_epi_scores,
            x='Age',
            y=epiage,
            hue=curr_score,
            palette=score_colors,
            linewidth=0.5,
            alpha=0.75,
            edgecolor="k",
            s=20,
            hue_order=list(score_colors.keys()),
            legend=True,
            ax=axs['21'],
        )
        axs['21'].set_xlim(axis_limits[0], axis_limits[1])
        axs['21'].set_ylim(axis_limits[0], axis_limits[1])
        scatter.legend_.set_title(curr_score_name)

        # Violin panel: acceleration by group, FDR-corrected p-values in title.
        sns.violinplot(
            data=df_epi_scores,
            x=curr_score,
            y=f"{epiage} acceleration",
            hue=curr_score,
            palette=score_colors,
            density_norm='width',
            order=score_vals,
            saturation=0.75,
            linewidth=1.0,
            ax=axs['22'],
            legend=False,
            cut=0,
        )
        axs['22'].set_ylabel(f"{epiage} acceleration")
        axs['22'].set_xlabel(curr_score_name)
        mannwhitneyu_pval = df_stat.at[epiage, "mannwhitneyu_pval_fdr_bh"]
        levene_pval = df_stat.at[epiage, "levene_pval_fdr_bh"]
        title = f'Mann-Whitney: {mannwhitneyu_pval:.2e}\nLevene: {levene_pval:.2e}'
        for pval_col in pvals_cols_ancova:
            title += f"\nANCOVA: {df_stat.at[epiage, pval_col + '_fdr_bh']:.2e}"
        axs['22'].set_title(title)

    fig.savefig(f"{out_dir}/ages_distribution_corrected.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{out_dir}/ages_distribution_corrected.pdf", bbox_inches='tight')
    plt.close(fig)

## Epigenetic ages (continuous score)

In [None]:
# Here we take all samples for linreg:
# every epigenetic age is regressed on Age using all samples of the compared
# groups, and "<clock> acceleration" is the residual of that fit.
for curr_score in pheno_associations:
    curr_score_name = curr_score.split(' group')[0]
    dir_out = f"{path}/05_scores/{curr_score_name}/continuous/dnam/all"
    pathlib.Path(dir_out).mkdir(parents=True, exist_ok=True)
    score_vals = pheno_associations[curr_score]['groups']
    score_val_base = pheno_associations[curr_score]['base']
    score_colors = pheno_associations[curr_score]['colors']
    # Explicit copy: columns are added below, and writing into a .loc
    # selection of df_epi would raise SettingWithCopyWarning.
    df_epi_scores = df_epi.loc[df_epi[curr_score].isin(score_vals)].copy()
    df_stat = pd.DataFrame(index=epi_ages)

    # --- Per-clock statistics on the age acceleration -----------------------
    for epiage in epi_ages:
        linreg = smf.ols(formula=f"{epiage} ~ Age", data=df_epi_scores).fit()
        df_epi_scores[f"{epiage}_linear_pred"] = linreg.predict(df_epi_scores)
        df_epi_scores[f"{epiage} acceleration"] = df_epi_scores[epiage] - df_epi_scores[f"{epiage}_linear_pred"]
        vals = {}
        for group in score_vals:
            vals[group] = df_epi_scores.loc[df_epi_scores[curr_score] == group, f"{epiage} acceleration"].values
        # Only the first two groups are compared.
        df_stat.at[epiage, "mannwhitneyu_pval"] = mannwhitneyu(vals[score_vals[0]], vals[score_vals[1]], alternative='two-sided').pvalue
        df_stat.at[epiage, "levene_pval"] = levene(vals[score_vals[0]], vals[score_vals[1]]).pvalue
        # ANCOVA: raw clock value ~ risk group + Age; keep the p-value of every group term.
        regcov = smf.ols(formula=f"{epiage} ~ Q('{curr_score}') + Age", data=df_epi_scores).fit()
        reg_sum = regcov.summary2().tables[1]
        # regex=False: the score name is a literal substring, not a pattern.
        pvals_cols = reg_sum.index[reg_sum.index.str.contains(curr_score_name, regex=False)].values
        for pval_col in pvals_cols:
            df_stat.at[epiage, f"ancova_{pval_col}_pval"] = reg_sum.at[pval_col, 'P>|t|']

    # --- Benjamini-Hochberg correction across clocks -------------------------
    # The ANCOVA columns are the only ones whose name contains the score name.
    pvals_cols_ancova = df_stat.columns[df_stat.columns.str.contains(curr_score_name, regex=False)].values
    for pval_col in ["mannwhitneyu_pval", "levene_pval", *pvals_cols_ancova]:
        df_stat.loc[epi_ages, f"{pval_col}_fdr_bh"] = multipletests(df_stat.loc[epi_ages, pval_col].values, 0.05, method='fdr_bh')[1]

    # --- Age histogram of the compared groups --------------------------------
    hist_bins = np.linspace(5, 115, 23)
    fig, ax = plt.subplots(figsize=(6, 4))
    histplot = sns.histplot(
        data=df_epi_scores,
        bins=hist_bins,
        edgecolor='k',
        linewidth=1,
        x="Age",
        hue=curr_score,
        palette=score_colors,
        ax=ax,
    )
    histplot.set(xlim=(0, 120))
    fig.savefig(f"{dir_out}/hist_age_corrected.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{dir_out}/hist_age_corrected.pdf", bbox_inches='tight')
    plt.close(fig)

    # --- One panel per clock: Pearson table, scatter + fit, violins ----------
    sns.set_theme(style='ticks')
    n_cols = 4
    # At least the original 7 rows; more only when there are more than 28 clocks,
    # so the grid can no longer be indexed out of range.
    n_rows = max(7, (len(df_stat.index) + n_cols - 1) // n_cols)
    fig = plt.figure(figsize=(24, 4 * n_rows))
    subfigs = fig.subfigures(nrows=n_rows, ncols=n_cols)
    rho_label = r"Pearson $\rho$"
    for epiage_id, epiage in enumerate(df_stat.index.values):
        row_id, col_id = divmod(epiage_id, n_cols)
        acc_col = f"{epiage} acceleration"

        axs = subfigs[row_id, col_id].subplot_mosaic(
            [
                ['11', '12'],
                ['21', '22'],
            ],
            height_ratios=[1, 4],
            width_ratios=[3, 1.5],
            gridspec_kw={
                "bottom": 0.14,
                "top": 0.95,
                "wspace": 0.37,
                "hspace": 0.01,
            },
        )

        # Top-left: correlation between the continuous score and the acceleration.
        ds_table = pd.DataFrame(index=[rho_label, "Pearson p-value"], columns=[epiage])
        rho, pval = pearsonr(df_epi_scores[curr_score_name].values, df_epi_scores[acc_col].values)
        ds_table.at[rho_label, epiage] = f"{rho:0.2f}"
        ds_table.at["Pearson p-value", epiage] = f"{pval:0.2f}"
        col_defs = [
            ColumnDefinition(name="index", title=epiage, textprops={"ha": "left"}, width=4.5),
            ColumnDefinition(name=epiage, title='', textprops={"ha": "center"}, width=2.0),
        ]
        table = Table(
            ds_table,
            column_definitions=col_defs,
            row_dividers=True,
            footer_divider=False,
            ax=axs['11'],
            textprops={"fontsize": 7},
            row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
            col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
            column_border_kw={"linewidth": 1, "linestyle": "-"},
        ).autoset_fontcolors(colnames=[epiage])

        axs['12'].axis('off')

        # Bottom-left: acceleration vs continuous score with a 10% margin around the data.
        x_min = df_epi_scores[acc_col].min()
        x_max = df_epi_scores[acc_col].max()
        x_ptp = x_max - x_min
        y_min = df_epi_scores[curr_score_name].min()
        y_max = df_epi_scores[curr_score_name].max()
        y_ptp = y_max - y_min
        regplot = sns.regplot(
            data=df_epi_scores,
            x=acc_col,
            y=curr_score_name,
            color='dimgray',
            scatter=False,
            ax=axs['21'],
        )
        scatter = sns.scatterplot(
            data=df_epi_scores,
            x=acc_col,
            y=curr_score_name,
            hue=curr_score,
            palette=score_colors,
            linewidth=0.5,
            alpha=0.75,
            edgecolor="k",
            s=20,
            hue_order=list(score_colors.keys()),
            legend=True,
            ax=axs['21'],
        )
        axs['21'].set_xlim(x_min - 0.1 * x_ptp, x_max + 0.1 * x_ptp)
        axs['21'].set_ylim(y_min - 0.1 * y_ptp, y_max + 0.1 * y_ptp)
        scatter.legend_.set_title(curr_score_name)

        # Bottom-right: acceleration per risk group, titled with FDR-adjusted p-values.
        sns.violinplot(
            data=df_epi_scores,
            x=curr_score,
            y=acc_col,
            hue=curr_score,
            palette=score_colors,
            density_norm='width',
            order=score_vals,
            saturation=0.75,
            linewidth=1.0,
            ax=axs['22'],
            legend=False,
            cut=0,
        )
        axs['22'].set_ylabel(acc_col)
        axs['22'].set_xlabel(curr_score_name)
        mannwhitneyu_pval = df_stat.at[epiage, "mannwhitneyu_pval_fdr_bh"]
        levene_pval = df_stat.at[epiage, "levene_pval_fdr_bh"]
        title = f'Mann-Whitney: {mannwhitneyu_pval:.2e}\nLevene: {levene_pval:.2e}'
        for pval_col in pvals_cols_ancova:
            title += f"\nANCOVA: {df_stat.at[epiage, pval_col + '_fdr_bh']:.2e}"
        axs['22'].set_title(title)

    fig.savefig(f"{dir_out}/ages_distribution_corrected.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{dir_out}/ages_distribution_corrected.pdf", bbox_inches='tight')
    plt.close(fig)

In [None]:
# Here we take samples with low risk for linreg:
# every epigenetic age is regressed on Age using only the base (reference)
# group, and "<clock> acceleration" is the residual from that fit for all samples.
for curr_score in pheno_associations:
    curr_score_name = curr_score.split(' group')[0]
    dir_out = f"{path}/05_scores/{curr_score_name}/continuous/dnam/low_risk"
    pathlib.Path(dir_out).mkdir(parents=True, exist_ok=True)
    score_vals = pheno_associations[curr_score]['groups']
    score_val_base = pheno_associations[curr_score]['base']
    score_colors = pheno_associations[curr_score]['colors']
    # Explicit copy: columns are added below, and writing into a .loc
    # selection of df_epi would raise SettingWithCopyWarning.
    df_epi_scores = df_epi.loc[df_epi[curr_score].isin(score_vals)].copy()
    # Row mask of the base group; it stays valid while columns are being added.
    is_base = df_epi_scores[curr_score] == score_val_base
    df_stat = pd.DataFrame(index=epi_ages)

    # --- Per-clock statistics on the age acceleration -----------------------
    for epiage in epi_ages:
        linreg = smf.ols(formula=f"{epiage} ~ Age", data=df_epi_scores.loc[is_base, :]).fit()
        df_epi_scores[f"{epiage}_linear_pred"] = linreg.predict(df_epi_scores)
        df_epi_scores[f"{epiage} acceleration"] = df_epi_scores[epiage] - df_epi_scores[f"{epiage}_linear_pred"]
        vals = {}
        for group in score_vals:
            vals[group] = df_epi_scores.loc[df_epi_scores[curr_score] == group, f"{epiage} acceleration"].values
        # Only the first two groups are compared.
        df_stat.at[epiage, "mannwhitneyu_pval"] = mannwhitneyu(vals[score_vals[0]], vals[score_vals[1]], alternative='two-sided').pvalue
        df_stat.at[epiage, "levene_pval"] = levene(vals[score_vals[0]], vals[score_vals[1]]).pvalue
        # ANCOVA: raw clock value ~ risk group + Age; keep the p-value of every group term.
        regcov = smf.ols(formula=f"{epiage} ~ Q('{curr_score}') + Age", data=df_epi_scores).fit()
        reg_sum = regcov.summary2().tables[1]
        # regex=False: the score name is a literal substring, not a pattern.
        pvals_cols = reg_sum.index[reg_sum.index.str.contains(curr_score_name, regex=False)].values
        for pval_col in pvals_cols:
            df_stat.at[epiage, f"ancova_{pval_col}_pval"] = reg_sum.at[pval_col, 'P>|t|']

    # --- Benjamini-Hochberg correction across clocks -------------------------
    # The ANCOVA columns are the only ones whose name contains the score name.
    pvals_cols_ancova = df_stat.columns[df_stat.columns.str.contains(curr_score_name, regex=False)].values
    for pval_col in ["mannwhitneyu_pval", "levene_pval", *pvals_cols_ancova]:
        df_stat.loc[epi_ages, f"{pval_col}_fdr_bh"] = multipletests(df_stat.loc[epi_ages, pval_col].values, 0.05, method='fdr_bh')[1]

    # --- Age histogram of the compared groups --------------------------------
    hist_bins = np.linspace(5, 115, 23)
    fig, ax = plt.subplots(figsize=(6, 4))
    histplot = sns.histplot(
        data=df_epi_scores,
        bins=hist_bins,
        edgecolor='k',
        linewidth=1,
        x="Age",
        hue=curr_score,
        palette=score_colors,
        ax=ax,
    )
    histplot.set(xlim=(0, 120))
    fig.savefig(f"{dir_out}/hist_age_corrected.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{dir_out}/hist_age_corrected.pdf", bbox_inches='tight')
    plt.close(fig)

    # --- One panel per clock: Pearson table, scatter + fit, violins ----------
    sns.set_theme(style='ticks')
    n_cols = 4
    # At least the original 7 rows; more only when there are more than 28 clocks,
    # so the grid can no longer be indexed out of range.
    n_rows = max(7, (len(df_stat.index) + n_cols - 1) // n_cols)
    fig = plt.figure(figsize=(24, 4 * n_rows))
    subfigs = fig.subfigures(nrows=n_rows, ncols=n_cols)
    rho_label = r"Pearson $\rho$"
    # Base-group rows, taken once now that every acceleration column exists.
    df_base = df_epi_scores.loc[is_base, :]
    for epiage_id, epiage in enumerate(df_stat.index.values):
        row_id, col_id = divmod(epiage_id, n_cols)
        acc_col = f"{epiage} acceleration"

        axs = subfigs[row_id, col_id].subplot_mosaic(
            [
                ['11', '12'],
                ['21', '22'],
            ],
            height_ratios=[1, 4],
            width_ratios=[3, 1.5],
            gridspec_kw={
                "bottom": 0.14,
                "top": 0.95,
                "wspace": 0.37,
                "hspace": 0.01,
            },
        )

        # Top-left: score vs acceleration correlation within the base group only.
        ds_table = pd.DataFrame(index=[rho_label, "Pearson p-value"], columns=[epiage])
        rho, pval = pearsonr(df_base[curr_score_name].values, df_base[acc_col].values)
        ds_table.at[rho_label, epiage] = f"{rho:0.2f}"
        ds_table.at["Pearson p-value", epiage] = f"{pval:0.2f}"
        col_defs = [
            ColumnDefinition(name="index", title=epiage, textprops={"ha": "left"}, width=4.5),
            ColumnDefinition(name=epiage, title='', textprops={"ha": "center"}, width=2.0),
        ]
        table = Table(
            ds_table,
            column_definitions=col_defs,
            row_dividers=True,
            footer_divider=False,
            ax=axs['11'],
            textprops={"fontsize": 7},
            row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
            col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
            column_border_kw={"linewidth": 1, "linestyle": "-"},
        ).autoset_fontcolors(colnames=[epiage])

        axs['12'].axis('off')

        # Bottom-left: all samples as points, regression line for the base group only;
        # axis limits span all samples with a 10% margin.
        x_min = df_epi_scores[acc_col].min()
        x_max = df_epi_scores[acc_col].max()
        x_ptp = x_max - x_min
        y_min = df_epi_scores[curr_score_name].min()
        y_max = df_epi_scores[curr_score_name].max()
        y_ptp = y_max - y_min
        regplot = sns.regplot(
            data=df_base,
            x=acc_col,
            y=curr_score_name,
            color=score_colors[score_val_base],
            scatter=False,
            ax=axs['21'],
        )
        scatter = sns.scatterplot(
            data=df_epi_scores,
            x=acc_col,
            y=curr_score_name,
            hue=curr_score,
            palette=score_colors,
            linewidth=0.5,
            alpha=0.75,
            edgecolor="k",
            s=20,
            hue_order=list(score_colors.keys()),
            legend=True,
            ax=axs['21'],
        )
        axs['21'].set_xlim(x_min - 0.1 * x_ptp, x_max + 0.1 * x_ptp)
        axs['21'].set_ylim(y_min - 0.1 * y_ptp, y_max + 0.1 * y_ptp)
        scatter.legend_.set_title(curr_score_name)

        # Bottom-right: acceleration per risk group, titled with FDR-adjusted p-values.
        sns.violinplot(
            data=df_epi_scores,
            x=curr_score,
            y=acc_col,
            hue=curr_score,
            palette=score_colors,
            density_norm='width',
            order=score_vals,
            saturation=0.75,
            linewidth=1.0,
            ax=axs['22'],
            legend=False,
            cut=0,
        )
        axs['22'].set_ylabel(acc_col)
        axs['22'].set_xlabel(curr_score_name)
        mannwhitneyu_pval = df_stat.at[epiage, "mannwhitneyu_pval_fdr_bh"]
        levene_pval = df_stat.at[epiage, "levene_pval_fdr_bh"]
        title = f'Mann-Whitney: {mannwhitneyu_pval:.2e}\nLevene: {levene_pval:.2e}'
        for pval_col in pvals_cols_ancova:
            title += f"\nANCOVA: {df_stat.at[epiage, pval_col + '_fdr_bh']:.2e}"
        axs['22'].set_title(title)

    fig.savefig(f"{dir_out}/ages_distribution_corrected.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{dir_out}/ages_distribution_corrected.pdf", bbox_inches='tight')
    plt.close(fig)

## Epigenetic metrics

In [None]:
# For every risk-score grouping: per-group descriptive statistics and group
# tests for each epigenetic metric, an Excel summary, one bar plot of adjusted
# p-values per test, and a grid of per-metric panels.
for curr_score in pheno_associations:
    curr_score_name = curr_score.split(' group')[0]
    dir_out = f"{path}/05_scores/{curr_score_name}/categorical/dnam"
    pathlib.Path(dir_out).mkdir(parents=True, exist_ok=True)
    score_vals = pheno_associations[curr_score]['groups']
    score_val_base = pheno_associations[curr_score]['base']
    score_colors = pheno_associations[curr_score]['colors']
    df_epi_scores = df_epi.loc[df_epi[curr_score].isin(score_vals)]
    df_stat = pd.DataFrame(index=epi_metrics)

    # --- Descriptive statistics and tests per metric -------------------------
    for epi_metric in epi_metrics:
        vals = {}
        for group in score_vals:
            vals[group] = df_epi_scores.loc[df_epi_scores[curr_score] == group, epi_metric].values
            df_stat.at[epi_metric, f"Mean {group}"] = np.mean(vals[group])
            df_stat.at[epi_metric, f"Median {group}"] = np.median(vals[group])
            q75, q25 = np.percentile(vals[group], [75, 25])
            df_stat.at[epi_metric, f"Q75 {group}"] = q75
            df_stat.at[epi_metric, f"Q25 {group}"] = q25
            df_stat.at[epi_metric, f"IQR {group}"] = df_stat.at[epi_metric, f"Q75 {group}"] - df_stat.at[epi_metric, f"Q25 {group}"]
            df_stat.at[epi_metric, f"Variation {group}"] = variation(vals[group])
        # Only the first two groups are compared.
        df_stat.at[epi_metric, "mannwhitneyu_pval"] = mannwhitneyu(vals[score_vals[0]], vals[score_vals[1]], alternative='two-sided').pvalue
        df_stat.at[epi_metric, "levene_pval"] = levene(vals[score_vals[0]], vals[score_vals[1]]).pvalue
        # ANCOVA: metric ~ risk group + Age; keep the p-value of every group term.
        regcov = smf.ols(formula=f"{epi_metric} ~ Q('{curr_score}') + Age", data=df_epi_scores).fit()
        reg_sum = regcov.summary2().tables[1]
        # regex=False: the score name is a literal substring, not a pattern.
        pvals_cols = reg_sum.index[reg_sum.index.str.contains(curr_score_name, regex=False)].values
        for pval_col in pvals_cols:
            df_stat.at[epi_metric, f"ancova_{pval_col}_pval"] = reg_sum.at[pval_col, 'P>|t|']

    # --- Multiple-testing corrections across metrics -------------------------
    # The ANCOVA columns are the only ones whose name contains the score name.
    pvals_cols_ancova = df_stat.columns[df_stat.columns.str.contains(curr_score_name, regex=False)].values
    for pval_col in ["mannwhitneyu_pval", "levene_pval", *pvals_cols_ancova]:
        for correction in ['fdr_bh', 'bonferroni', 'simes-hochberg']:
            df_stat.loc[epi_metrics, f"{pval_col}_{correction}"] = multipletests(df_stat.loc[epi_metrics, pval_col].values, 0.05, method=correction)[1]
    df_stat = df_stat.sort_values(["mannwhitneyu_pval"], ascending=[True])
    df_stat.to_excel(f"{dir_out}/metrics.xlsx")

    # --- Bar plot of -log10(FDR p-value) for every test ----------------------
    for stat_test in [x.replace('_pval', '') for x in pvals_cols_ancova] + ['mannwhitneyu', 'levene']:
        df_fig = df_stat.sort_values([f"{stat_test}_pval"], ascending=[True])
        df_fig['Features'] = df_fig.index
        df_fig[f'{stat_test}_pval_fdr_bh_log'] = -np.log10(df_fig[f'{stat_test}_pval_fdr_bh'])
        # Red marks metrics significant at FDR < 0.05, pink the rest.
        df_fig['color'] = 'pink'
        df_fig.loc[df_fig[f'{stat_test}_pval_fdr_bh'] < 0.05, 'color'] = 'red'
        sns.set_theme(style='ticks')
        fig, ax = plt.subplots(figsize=(3, df_fig.shape[0] * 0.5))
        # seaborn >= 0.13 deprecates `palette` without `hue`; colouring by the
        # categorical variable itself gives one colour per bar, in row order.
        barplot = sns.barplot(
            data=df_fig,
            y='Features',
            x=f'{stat_test}_pval_fdr_bh_log',
            hue='Features',
            edgecolor='black',
            palette=df_fig['color'].tolist(),
            legend=False,
            ax=ax,
        )
        ax.set_xlabel(r"$-\log_{10}(\mathrm{p-value})$")
        ax.xaxis.tick_top()
        ax.xaxis.set_label_position('top')
        ax.set_ylabel('')
        fig.savefig(f"{dir_out}/metrics_pvals_{stat_test}.png", bbox_inches='tight', dpi=200)
        fig.savefig(f"{dir_out}/metrics_pvals_{stat_test}.pdf", bbox_inches='tight')
        plt.close(fig)

    # --- One panel per metric: Pearson table, scatter + fit, violins ---------
    sns.set_theme(style='ticks')
    n_cols = 3
    # At least the original 2 rows; more only when there are more than 6 metrics,
    # so the grid can no longer be indexed out of range.
    n_rows = max(2, (len(df_stat.index) + n_cols - 1) // n_cols)
    fig = plt.figure(figsize=(18, 4 * n_rows))
    subfigs = fig.subfigures(nrows=n_rows, ncols=n_cols)
    rho_label = r"Pearson $\rho$"
    for epi_metric_id, epi_metric in enumerate(df_stat.index.values):
        row_id, col_id = divmod(epi_metric_id, n_cols)

        axs = subfigs[row_id, col_id].subplot_mosaic(
            [
                ['11', '12'],
                ['21', '22'],
            ],
            height_ratios=[1, 4],
            width_ratios=[3, 1.5],
            gridspec_kw={
                "bottom": 0.14,
                "top": 0.95,
                "wspace": 0.45,
                "hspace": 0.01,
            },
        )

        # Top-left: correlation between the continuous score and the metric.
        ds_table = pd.DataFrame(index=[rho_label, "Pearson p-value"], columns=[epi_metric])
        rho, pval = pearsonr(df_epi_scores[curr_score_name].values, df_epi_scores[epi_metric].values)
        ds_table.at[rho_label, epi_metric] = f"{rho:0.2f}"
        ds_table.at["Pearson p-value", epi_metric] = f"{pval:0.2f}"
        col_defs = [
            ColumnDefinition(name="index", title=epi_metric, textprops={"ha": "left"}, width=4.5),
            ColumnDefinition(name=epi_metric, title='', textprops={"ha": "center"}, width=2.0),
        ]
        table = Table(
            ds_table,
            column_definitions=col_defs,
            row_dividers=True,
            footer_divider=False,
            ax=axs['11'],
            textprops={"fontsize": 7},
            row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
            col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
            column_border_kw={"linewidth": 1, "linestyle": "-"},
        ).autoset_fontcolors(colnames=[epi_metric])

        axs['12'].axis('off')

        # Bottom-left: metric vs continuous score with a 10% margin around the data.
        x_min = df_epi_scores[epi_metric].min()
        x_max = df_epi_scores[epi_metric].max()
        x_ptp = x_max - x_min
        y_min = df_epi_scores[curr_score_name].min()
        y_max = df_epi_scores[curr_score_name].max()
        y_ptp = y_max - y_min
        regplot = sns.regplot(
            data=df_epi_scores,
            x=epi_metric,
            y=curr_score_name,
            color='dimgray',
            scatter=False,
            ax=axs['21'],
        )
        scatter = sns.scatterplot(
            data=df_epi_scores,
            x=epi_metric,
            y=curr_score_name,
            hue=curr_score,
            palette=score_colors,
            linewidth=0.5,
            alpha=0.75,
            edgecolor="k",
            s=20,
            hue_order=list(score_colors.keys()),
            legend=True,
            ax=axs['21'],
        )
        axs['21'].set_xlim(x_min - 0.1 * x_ptp, x_max + 0.1 * x_ptp)
        axs['21'].set_ylim(y_min - 0.1 * y_ptp, y_max + 0.1 * y_ptp)
        scatter.legend_.set_title(curr_score_name)

        # Bottom-right: metric per risk group, titled with FDR-adjusted p-values.
        sns.violinplot(
            data=df_epi_scores,
            x=curr_score,
            y=epi_metric,
            hue=curr_score,
            palette=score_colors,
            density_norm='width',
            order=score_vals,
            saturation=0.75,
            linewidth=1.0,
            ax=axs['22'],
            legend=False,
            cut=0,
        )
        axs['22'].set_ylabel(epi_metric)
        axs['22'].set_xlabel(curr_score_name)
        mannwhitneyu_pval = df_stat.at[epi_metric, "mannwhitneyu_pval_fdr_bh"]
        levene_pval = df_stat.at[epi_metric, "levene_pval_fdr_bh"]
        title = f'Mann-Whitney: {mannwhitneyu_pval:.2e}\nLevene: {levene_pval:.2e}'
        for pval_col in pvals_cols_ancova:
            title += f"\nANCOVA: {df_stat.at[epi_metric, pval_col + '_fdr_bh']:.2e}"
        axs['22'].set_title(title)

    fig.savefig(f"{dir_out}/metrics_distribution_corrected.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{dir_out}/metrics_distribution_corrected.pdf", bbox_inches='tight')
    plt.close(fig)

# PhenoAge & CognitiveAge

## Load data

In [None]:
# Root folder of the dataset; every table below is read relative to it.
path = "E:/YandexDisk/bbd/fmba"

# Phenotype table: only the birth date is needed here.
cols_pheno = ['дата рождения']
df_pheno = pd.read_excel(f"{path}/pheno_fixed.xlsx", index_col=0)
df_pheno.index = df_pheno.index.astype(str)

# PhenoAge / CognitiveAge estimates with their own chronological-age columns.
cols_ages = [
    'Chronological Age (PhenoAge)',
    'PhenoAge',
    'Chronological Age (CognitiveAge)',
    'CognitiveAge',
]
df_ages = pd.read_excel(f"{path}/PhenoAge_CognitiveAge/df.xlsx", index_col=0)
df_ages.index = df_ages.index.astype(str)

# Second PhenoAge table; its column is renamed so it does not clash with
# 'PhenoAge' from df_ages after concatenation.
cols_phenoage_nmk = ['PhenoAge nmk']
df_phenoage_nmk = pd.read_excel(f"{path}/03_pheno_age/data_PhenoAge.xlsx", index_col=0)
df_phenoage_nmk.index = df_phenoage_nmk.index.astype(str)
df_phenoage_nmk = df_phenoage_nmk.rename(columns={'PhenoAge': 'PhenoAge nmk'})

# Risk scores (continuous values and thresholded groups) saved earlier.
cols_scores = [
    'SCORE2 wo HDL',
    'SCORE2 wo HDL group thr=0.05',
    'SCORE2 aver HDL',
    'SCORE2 aver HDL group thr=0.05',
]
df_scores = pd.read_excel(f"{path}/05_scores/df_scores.xlsx", index_col=0)
df_scores.index = df_scores.index.astype(str)

# All tables share string sample ids as index, so they are joined column-wise.
dfs = [
    df_pheno[cols_pheno],
    df_ages[cols_ages],
    df_phenoage_nmk[cols_phenoage_nmk],
    df_scores[cols_scores],
]
df = pd.concat(dfs, axis=1)

# Age in years on the fixed reference date 2024-11-11 (365.25 days per year).
df['дата рождения'] = pd.to_datetime(df['дата рождения'])
days_since_birth = (pd.to_datetime("2024-11-11") - df['дата рождения']) / np.timedelta64(1, 'D')
df['Age'] = days_since_birth / 365.25
# The 'PhenoAge nmk' table has no chronological-age column of its own in this
# cell, so the birth-date-derived Age is used for it.
df['Chronological Age (PhenoAge nmk)'] = df['Age']

# Group columns to analyse. Every entry compares 'Low' vs 'High' with 'Low' as
# the reference group. 'SCORE group thr=0.01' (same spec) is currently disabled.
pheno_associations = {
    score_group: {
        'groups': ['Low', 'High'],
        'base': 'Low',
        'colors': {'Low': 'dodgerblue', 'High': 'crimson'},
    }
    for score_group in [
        'SCORE2 wo HDL group thr=0.05',
        'SCORE2 aver HDL group thr=0.05',
    ]
}

## Ages (categorical score)

### Casual acceleration

In [None]:
# Here we take all samples for linreg:
# each biological age is regressed on its own chronological-age column using
# every sample, and "<age> acceleration" is the residual of that fit.
ages = ['PhenoAge', 'PhenoAge nmk', 'CognitiveAge']

for curr_score in pheno_associations:
    curr_score_name = curr_score.split(' group')[0]
    df_stat = pd.DataFrame(index=ages)
    score_vals = pheno_associations[curr_score]['groups']
    score_val_base = pheno_associations[curr_score]['base']
    score_colors = pheno_associations[curr_score]['colors']

    for age in ages:
        dir_out = f"{path}/05_scores/{curr_score_name}/categorical/{age}/casual_acc/all"
        pathlib.Path(dir_out).mkdir(parents=True, exist_ok=True)
        # Chronological age that belongs to this estimator; used for the fit,
        # the correlation table and the scatter plot alike.
        chrono_col = f'Chronological Age ({age})'

        # Explicit copy: columns are added below, and the frame must not be
        # treated as a view of df (SettingWithCopyWarning).
        df_age_scores = df.dropna(subset=[age, curr_score]).copy()

        # --- Acceleration and group tests -------------------------------------
        linreg = smf.ols(formula=f"Q('{age}') ~ Q('{chrono_col}')", data=df_age_scores).fit()
        df_age_scores[f"{age} linear pred"] = linreg.predict(df_age_scores)
        df_age_scores[f"{age} acceleration"] = df_age_scores[age] - df_age_scores[f"{age} linear pred"]
        vals = {}
        for group in score_vals:
            vals[group] = df_age_scores.loc[df_age_scores[curr_score] == group, f"{age} acceleration"].values
        # Only the first two groups are compared.
        df_stat.at[age, "mannwhitneyu_pval"] = mannwhitneyu(vals[score_vals[0]], vals[score_vals[1]], alternative='two-sided').pvalue
        df_stat.at[age, "levene_pval"] = levene(vals[score_vals[0]], vals[score_vals[1]]).pvalue
        # ANCOVA: biological age ~ risk group + Age; keep the p-value of every group term.
        regcov = smf.ols(formula=f"Q('{age}') ~ Q('{curr_score}') + Age", data=df_age_scores).fit()
        reg_sum = regcov.summary2().tables[1]
        # regex=False: the score name is a literal substring, not a pattern.
        pvals_cols = reg_sum.index[reg_sum.index.str.contains(curr_score_name, regex=False)].values
        for pval_col in pvals_cols:
            df_stat.at[age, f"ancova_{pval_col}_pval"] = reg_sum.at[pval_col, 'P>|t|']

        # --- Age histogram of the compared groups ------------------------------
        hist_bins = np.linspace(5, 115, 23)
        fig, ax = plt.subplots(figsize=(6, 4))
        histplot = sns.histplot(
            data=df_age_scores,
            bins=hist_bins,
            edgecolor='k',
            linewidth=1,
            x="Age",
            hue=curr_score,
            palette=score_colors,
            ax=ax,
        )
        histplot.set(xlim=(0, 120))
        fig.savefig(f"{dir_out}/hist_age_corrected.png", bbox_inches='tight', dpi=200)
        fig.savefig(f"{dir_out}/hist_age_corrected.pdf", bbox_inches='tight')
        plt.close(fig)

        # --- Panel: Pearson table, age-vs-age scatter, acceleration violins ----
        fig, axs = plt.subplot_mosaic(
            [
                ['11', '12'],
                ['21', '22'],
            ],
            figsize=(8, 6),
            height_ratios=[1, 4],
            width_ratios=[3, 1.5],
            gridspec_kw={
                "bottom": 0.14,
                "top": 0.95,
                "wspace": 0.33,
                "hspace": 0.01,
            },
        )

        # The correlation is reported for the same x/y pair that is fitted and
        # plotted below (the estimator's chronological age, not the generic
        # 'Age' column), matching the low-risk variant of this analysis.
        rho_label = r"Pearson $\rho$"
        ds_table = pd.DataFrame(index=[rho_label, "Pearson p-value"], columns=[age])
        rho, pval = pearsonr(df_age_scores[chrono_col].values, df_age_scores[age].values)
        ds_table.at[rho_label, age] = f"{rho:0.2f}"
        ds_table.at["Pearson p-value", age] = f"{pval:0.2f}"
        col_defs = [
            ColumnDefinition(name="index", title=age, textprops={"ha": "left"}, width=4.5),
            ColumnDefinition(name=age, title='', textprops={"ha": "center"}, width=2.0),
        ]
        table = Table(
            ds_table,
            column_definitions=col_defs,
            row_dividers=True,
            footer_divider=False,
            ax=axs['11'],
            textprops={"fontsize": 7},
            row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
            col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
            column_border_kw={"linewidth": 1, "linestyle": "-"},
        ).autoset_fontcolors(colnames=[age])

        axs['12'].axis('off')

        # Shared square limits (10% margin) so the dashed identity line is the diagonal.
        xy_min = df_age_scores[[chrono_col, age]].min().min()
        xy_max = df_age_scores[[chrono_col, age]].max().max()
        xy_ptp = xy_max - xy_min
        xy_lim = [xy_min - 0.1 * xy_ptp, xy_max + 0.1 * xy_ptp]
        bisect = sns.lineplot(
            x=xy_lim,
            y=xy_lim,
            linestyle='--',
            color='black',
            linewidth=1.0,
            ax=axs['21'],
        )
        regplot = sns.regplot(
            data=df_age_scores,
            x=chrono_col,
            y=age,
            color='dimgray',
            scatter=False,
            truncate=False,
            ax=axs['21'],
        )
        scatter = sns.scatterplot(
            data=df_age_scores,
            x=chrono_col,
            y=age,
            hue=curr_score,
            palette=score_colors,
            linewidth=0.5,
            alpha=0.75,
            edgecolor="k",
            s=20,
            hue_order=list(score_colors.keys()),
            legend=True,
            ax=axs['21'],
        )
        axs['21'].set_xlabel("Age")
        axs['21'].set_xlim(xy_lim[0], xy_lim[1])
        axs['21'].set_ylim(xy_lim[0], xy_lim[1])
        scatter.legend_.set_title(curr_score_name)

        # Acceleration per risk group; p-values in the title are unadjusted.
        sns.violinplot(
            data=df_age_scores,
            x=curr_score,
            y=f"{age} acceleration",
            hue=curr_score,
            palette=score_colors,
            density_norm='width',
            order=score_vals,
            saturation=0.75,
            linewidth=1.0,
            ax=axs['22'],
            legend=False,
            cut=0,
        )
        axs['22'].set_ylabel(f"{age} acceleration")
        axs['22'].set_xlabel(curr_score_name)
        mannwhitneyu_pval = df_stat.at[age, "mannwhitneyu_pval"]
        levene_pval = df_stat.at[age, "levene_pval"]
        title = f'Mann-Whitney: {mannwhitneyu_pval:.2e}\nLevene: {levene_pval:.2e}'
        for pval_col in pvals_cols:
            title += f"\nANCOVA: {df_stat.at[age, f'ancova_{pval_col}_pval']:.2e}"
        axs['22'].set_title(title)

        fig.savefig(f"{dir_out}/{age}.png", bbox_inches='tight', dpi=200)
        fig.savefig(f"{dir_out}/{age}.pdf", bbox_inches='tight')
        plt.close(fig)

In [None]:
# Here we take samples with low risk for linreg
ages = ['PhenoAge', 'PhenoAge nmk', 'CognitiveAge']
# Loop-invariant histogram bins: 22 bins of width 5 covering 5..115.
hist_bins = np.linspace(5, 115, 23)

# For every score group column and every age estimate:
#   1. fit age ~ chronological age on the baseline group only,
#   2. define acceleration as the residual of that fit for all samples,
#   3. compare acceleration between the first two groups and save the figures.
for curr_score in pheno_associations:
    # Short score name (text before ' group'); used in paths and labels.
    curr_score_name = curr_score.split(' group')[0]
    # NOTE(review): df_stat is rebuilt for every score and is not written to disk in this cell.
    df_stat = pd.DataFrame(index=ages)
    score_vals = pheno_associations[curr_score]['groups']
    score_val_base = pheno_associations[curr_score]['base']
    score_colors = pheno_associations[curr_score]['colors']

    for age in ages:
        out_dir = f"{path}/05_scores/{curr_score_name}/categorical/{age}/casual_acc/low_risk"
        pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)
        chrono_col = f"Chronological Age ({age})"
        pred_col = f"{age} linear pred"
        acc_col = f"{age} acceleration"

        # Explicit copy: columns are added below and must not touch (or warn about) `df`.
        df_age_scores = df.dropna(subset=[age, curr_score]).copy()
        is_base = df_age_scores[curr_score] == score_val_base

        # Reference line is fitted on the baseline group only, then applied to everyone.
        linreg = smf.ols(formula=f"Q('{age}') ~ Q('{chrono_col}')", data=df_age_scores.loc[is_base, :]).fit()
        df_age_scores[pred_col] = linreg.predict(df_age_scores)
        df_age_scores[acc_col] = df_age_scores[age] - df_age_scores[pred_col]

        vals = {
            group: df_age_scores.loc[df_age_scores[curr_score] == group, acc_col].values
            for group in score_vals
        }
        # Only the first two entries of score_vals are compared.
        _, df_stat.at[age, "mannwhitneyu_pval"] = mannwhitneyu(vals[score_vals[0]], vals[score_vals[1]], alternative='two-sided')
        _, df_stat.at[age, "levene_pval"] = levene(vals[score_vals[0]], vals[score_vals[1]])

        # OLS of the age estimate on the group column with Age as covariate
        # (reported as "ANCOVA" in the plot title).
        regcov = smf.ols(formula=f"Q('{age}') ~ Q('{curr_score}') + Age", data=df_age_scores).fit()
        reg_sum = regcov.summary2().tables[1]
        # Literal match: the score name must not be interpreted as a regular expression.
        pvals_cols = reg_sum.index[reg_sum.index.str.contains(curr_score_name, regex=False)].values
        for pval_col in pvals_cols:
            df_stat.at[age, f"ancova_{pval_col}_pval"] = reg_sum.at[pval_col, 'P>|t|']

        # Age distribution of the samples used for this age estimate, split by group.
        fig, ax = plt.subplots(figsize=(6, 4))
        sns.histplot(
            data=df_age_scores,
            bins=hist_bins,
            edgecolor='k',
            linewidth=1,
            x="Age",
            hue=curr_score,
            palette=score_colors,
            ax=ax
        )
        ax.set(xlim=(0, 120))
        fig.savefig(f"{out_dir}/hist_age_corrected.png", bbox_inches='tight', dpi=200)
        fig.savefig(f"{out_dir}/hist_age_corrected.pdf", bbox_inches='tight')
        plt.close(fig)

        # Layout: '11' statistics table, '12' unused, '21' scatter, '22' violins.
        fig, axs = plt.subplot_mosaic(
            [
                ['11', '12'],
                ['21', '22'],
            ],
            figsize=(8, 6),
            height_ratios=[1, 4],
            width_ratios=[3, 1.5],
            gridspec_kw={
                "bottom": 0.14,
                "top": 0.95,
                "wspace": 0.33,
                "hspace": 0.01,
            },
        )

        # Pearson correlation between chronological age and the age estimate, baseline group only.
        df_base = df_age_scores.loc[is_base, :]
        ds_table = pd.DataFrame(index=[r"Pearson $\rho$", "Pearson p-value"], columns=[age])
        rho, pval = pearsonr(df_base[chrono_col].values, df_base[age].values)
        ds_table.at[r"Pearson $\rho$", age] = f"{rho:0.2f}"
        # Scientific notation: fixed-point formatting would show small p-values as "0.00".
        ds_table.at["Pearson p-value", age] = f"{pval:.2e}"
        col_defs = [
            ColumnDefinition(
                name="index",
                title=age,
                textprops={"ha": "left"},
                width=4.5,
            ),
            ColumnDefinition(
                name=age,
                title='',
                textprops={"ha": "center"},
                width=2.0,
            ),
        ]
        Table(
            ds_table,
            column_definitions=col_defs,
            row_dividers=True,
            footer_divider=False,
            ax=axs['11'],
            textprops={"fontsize": 7},
            row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
            col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
            column_border_kw={"linewidth": 1, "linestyle": "-"},
        ).autoset_fontcolors(colnames=[age])

        axs['12'].axis('off')

        # Common limits for both axes, padded by 10% of the data range on each side.
        xy_min = df_age_scores[[chrono_col, age]].min().min()
        xy_max = df_age_scores[[chrono_col, age]].max().max()
        xy_ptp = xy_max - xy_min
        lim_lo = xy_min - 0.1 * xy_ptp
        lim_hi = xy_max + 0.1 * xy_ptp
        # Dashed identity line (age estimate == chronological age).
        sns.lineplot(
            x=[lim_lo, lim_hi],
            y=[lim_lo, lim_hi],
            linestyle='--',
            color='black',
            linewidth=1.0,
            ax=axs['21']
        )
        # Regression line of the baseline group, drawn in that group's colour.
        sns.regplot(
            data=df_base,
            x=chrono_col,
            y=age,
            color=score_colors[score_val_base],
            scatter=False,
            truncate=False,
            ax=axs['21']
        )
        scatter = sns.scatterplot(
            data=df_age_scores,
            x=chrono_col,
            y=age,
            hue=curr_score,
            palette=score_colors,
            linewidth=0.5,
            alpha=0.75,
            edgecolor="k",
            s=20,
            hue_order=list(score_colors.keys()),
            legend=True,
            ax=axs['21'],
        )
        axs['21'].set_xlabel("Age")
        axs['21'].set_xlim(lim_lo, lim_hi)
        axs['21'].set_ylim(lim_lo, lim_hi)
        scatter.legend_.set_title(curr_score_name)

        sns.violinplot(
            data=df_age_scores,
            x=curr_score,
            y=acc_col,
            hue=curr_score,
            palette=score_colors,
            density_norm='width',
            order=score_vals,
            saturation=0.75,
            linewidth=1.0,
            ax=axs['22'],
            legend=False,
            cut=0,
        )
        axs['22'].set_ylabel(acc_col)
        axs['22'].set_xlabel(curr_score_name)
        mannwhitneyu_pval = df_stat.at[age, "mannwhitneyu_pval"]
        levene_pval = df_stat.at[age, "levene_pval"]
        title = f'Mann-Whitney: {mannwhitneyu_pval:.2e}\nLevene: {levene_pval:.2e}'
        for pval_col in pvals_cols:
            title += f"\nANCOVA: {df_stat.at[age, f'ancova_{pval_col}_pval']:.2e}"
        axs['22'].set_title(title)

        fig.savefig(f"{out_dir}/{age}.png", bbox_inches='tight', dpi=200)
        fig.savefig(f"{out_dir}/{age}.pdf", bbox_inches='tight')
        plt.close(fig)

### Epi-like acceleration

In [None]:
# Epi-like acceleration, reference regression fitted on ALL samples.
# For every score group column and every age estimate:
#   1. replace the age estimate by chronological age + residual of age ~ chronological age (all samples),
#   2. refit age ~ chronological age on all samples and take the residual as acceleration,
#   3. compare acceleration between the first two groups and save the figures.
ages = ['PhenoAge', 'PhenoAge nmk', 'CognitiveAge']
# Loop-invariant histogram bins: 22 bins of width 5 covering 5..115.
hist_bins = np.linspace(5, 115, 23)

for curr_score in pheno_associations:
    # Short score name (text before ' group'); used in paths and labels.
    curr_score_name = curr_score.split(' group')[0]
    # NOTE(review): df_stat is rebuilt for every score and is not written to disk in this cell.
    df_stat = pd.DataFrame(index=ages)
    score_vals = pheno_associations[curr_score]['groups']
    score_val_base = pheno_associations[curr_score]['base']
    score_colors = pheno_associations[curr_score]['colors']

    for age in ages:
        out_dir = f"{path}/05_scores/{curr_score_name}/categorical/{age}/epi_like_acc/all"
        pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)
        chrono_col = f"Chronological Age ({age})"
        pred_col = f"{age} linear pred"
        acc_col = f"{age} acceleration"

        # Explicit copy: columns are added/overwritten below and must not touch (or warn about) `df`.
        df_age_scores = df.dropna(subset=[age, curr_score]).copy()

        # Correction step: the age estimate is overwritten (in the local copy only)
        # by chronological age plus the residual of the all-sample fit.
        linreg_cx = smf.ols(formula=f"Q('{age}') ~ Q('{chrono_col}')", data=df_age_scores).fit()
        df_age_scores[f"{age}_linear_pred_cx"] = linreg_cx.predict(df_age_scores)
        df_age_scores[f"{age}_acceleration_cx"] = df_age_scores[age] - df_age_scores[f"{age}_linear_pred_cx"]
        df_age_scores[f"{age}"] = df_age_scores[chrono_col] + df_age_scores[f"{age}_acceleration_cx"]

        # Acceleration: residual of the corrected age estimate against the all-sample fit.
        linreg = smf.ols(formula=f"Q('{age}') ~ Q('{chrono_col}')", data=df_age_scores).fit()
        df_age_scores[pred_col] = linreg.predict(df_age_scores)
        df_age_scores[acc_col] = df_age_scores[age] - df_age_scores[pred_col]

        vals = {
            group: df_age_scores.loc[df_age_scores[curr_score] == group, acc_col].values
            for group in score_vals
        }
        # Only the first two entries of score_vals are compared.
        _, df_stat.at[age, "mannwhitneyu_pval"] = mannwhitneyu(vals[score_vals[0]], vals[score_vals[1]], alternative='two-sided')
        _, df_stat.at[age, "levene_pval"] = levene(vals[score_vals[0]], vals[score_vals[1]])

        # OLS of the age estimate on the group column with Age as covariate
        # (reported as "ANCOVA" in the plot title).
        regcov = smf.ols(formula=f"Q('{age}') ~ Q('{curr_score}') + Age", data=df_age_scores).fit()
        reg_sum = regcov.summary2().tables[1]
        # Literal match: the score name must not be interpreted as a regular expression.
        pvals_cols = reg_sum.index[reg_sum.index.str.contains(curr_score_name, regex=False)].values
        for pval_col in pvals_cols:
            df_stat.at[age, f"ancova_{pval_col}_pval"] = reg_sum.at[pval_col, 'P>|t|']

        # Age distribution of the samples used for this age estimate, split by group.
        fig, ax = plt.subplots(figsize=(6, 4))
        sns.histplot(
            data=df_age_scores,
            bins=hist_bins,
            edgecolor='k',
            linewidth=1,
            x="Age",
            hue=curr_score,
            palette=score_colors,
            ax=ax
        )
        ax.set(xlim=(0, 120))
        fig.savefig(f"{out_dir}/hist_age_corrected.png", bbox_inches='tight', dpi=200)
        fig.savefig(f"{out_dir}/hist_age_corrected.pdf", bbox_inches='tight')
        plt.close(fig)

        # Layout: '11' statistics table, '12' unused, '21' scatter, '22' violins.
        fig, axs = plt.subplot_mosaic(
            [
                ['11', '12'],
                ['21', '22'],
            ],
            figsize=(8, 6),
            height_ratios=[1, 4],
            width_ratios=[3, 1.5],
            gridspec_kw={
                "bottom": 0.14,
                "top": 0.95,
                "wspace": 0.33,
                "hspace": 0.01,
            },
        )

        # Pearson correlation between chronological age and the corrected age estimate, all samples.
        ds_table = pd.DataFrame(index=[r"Pearson $\rho$", "Pearson p-value"], columns=[age])
        rho, pval = pearsonr(df_age_scores[chrono_col].values, df_age_scores[age].values)
        ds_table.at[r"Pearson $\rho$", age] = f"{rho:0.2f}"
        # Scientific notation: fixed-point formatting would show small p-values as "0.00".
        ds_table.at["Pearson p-value", age] = f"{pval:.2e}"
        col_defs = [
            ColumnDefinition(
                name="index",
                title=age,
                textprops={"ha": "left"},
                width=4.5,
            ),
            ColumnDefinition(
                name=age,
                title='',
                textprops={"ha": "center"},
                width=2.0,
            ),
        ]
        Table(
            ds_table,
            column_definitions=col_defs,
            row_dividers=True,
            footer_divider=False,
            ax=axs['11'],
            textprops={"fontsize": 7},
            row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
            col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
            column_border_kw={"linewidth": 1, "linestyle": "-"},
        ).autoset_fontcolors(colnames=[age])

        axs['12'].axis('off')

        # Common limits for both axes, padded by 10% of the data range on each side.
        xy_min = df_age_scores[[chrono_col, age]].min().min()
        xy_max = df_age_scores[[chrono_col, age]].max().max()
        xy_ptp = xy_max - xy_min
        lim_lo = xy_min - 0.1 * xy_ptp
        lim_hi = xy_max + 0.1 * xy_ptp
        # Dashed identity line (age estimate == chronological age).
        sns.lineplot(
            x=[lim_lo, lim_hi],
            y=[lim_lo, lim_hi],
            linestyle='--',
            color='black',
            linewidth=1.0,
            ax=axs['21']
        )
        # Regression line over all samples.
        sns.regplot(
            data=df_age_scores,
            x=chrono_col,
            y=age,
            color='dimgray',
            scatter=False,
            truncate=False,
            ax=axs['21']
        )
        scatter = sns.scatterplot(
            data=df_age_scores,
            x=chrono_col,
            y=age,
            hue=curr_score,
            palette=score_colors,
            linewidth=0.5,
            alpha=0.75,
            edgecolor="k",
            s=20,
            hue_order=list(score_colors.keys()),
            legend=True,
            ax=axs['21'],
        )
        axs['21'].set_xlabel("Age")
        axs['21'].set_xlim(lim_lo, lim_hi)
        axs['21'].set_ylim(lim_lo, lim_hi)
        scatter.legend_.set_title(curr_score_name)

        sns.violinplot(
            data=df_age_scores,
            x=curr_score,
            y=acc_col,
            hue=curr_score,
            palette=score_colors,
            density_norm='width',
            order=score_vals,
            saturation=0.75,
            linewidth=1.0,
            ax=axs['22'],
            legend=False,
            cut=0,
        )
        axs['22'].set_ylabel(acc_col)
        axs['22'].set_xlabel(curr_score_name)
        mannwhitneyu_pval = df_stat.at[age, "mannwhitneyu_pval"]
        levene_pval = df_stat.at[age, "levene_pval"]
        title = f'Mann-Whitney: {mannwhitneyu_pval:.2e}\nLevene: {levene_pval:.2e}'
        for pval_col in pvals_cols:
            title += f"\nANCOVA: {df_stat.at[age, f'ancova_{pval_col}_pval']:.2e}"
        axs['22'].set_title(title)

        fig.savefig(f"{out_dir}/{age}.png", bbox_inches='tight', dpi=200)
        fig.savefig(f"{out_dir}/{age}.pdf", bbox_inches='tight')
        plt.close(fig)

In [None]:
# Epi-like acceleration, reference regression fitted on the baseline (low-risk) group.
# For every score group column and every age estimate:
#   1. replace the age estimate by chronological age + residual of age ~ chronological age (all samples),
#   2. fit age ~ chronological age on the baseline group only and take the residual as acceleration,
#   3. compare acceleration between the first two groups and save the figures.
ages = ['PhenoAge', 'PhenoAge nmk', 'CognitiveAge']
# Loop-invariant histogram bins: 22 bins of width 5 covering 5..115.
hist_bins = np.linspace(5, 115, 23)

for curr_score in pheno_associations:
    # Short score name (text before ' group'); used in paths and labels.
    curr_score_name = curr_score.split(' group')[0]
    # NOTE(review): df_stat is rebuilt for every score and is not written to disk in this cell.
    df_stat = pd.DataFrame(index=ages)
    score_vals = pheno_associations[curr_score]['groups']
    score_val_base = pheno_associations[curr_score]['base']
    score_colors = pheno_associations[curr_score]['colors']

    for age in ages:
        out_dir = f"{path}/05_scores/{curr_score_name}/categorical/{age}/epi_like_acc/low_risk"
        pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)
        chrono_col = f"Chronological Age ({age})"
        pred_col = f"{age} linear pred"
        acc_col = f"{age} acceleration"

        # Explicit copy: columns are added/overwritten below and must not touch (or warn about) `df`.
        df_age_scores = df.dropna(subset=[age, curr_score]).copy()
        # The group column is never modified, so this mask stays valid for the whole iteration.
        is_base = df_age_scores[curr_score] == score_val_base

        # Correction step (all samples): the age estimate is overwritten (in the local copy only)
        # by chronological age plus the residual of the all-sample fit.
        linreg_cx = smf.ols(formula=f"Q('{age}') ~ Q('{chrono_col}')", data=df_age_scores).fit()
        df_age_scores[f"{age}_linear_pred_cx"] = linreg_cx.predict(df_age_scores)
        df_age_scores[f"{age}_acceleration_cx"] = df_age_scores[age] - df_age_scores[f"{age}_linear_pred_cx"]
        df_age_scores[f"{age}"] = df_age_scores[chrono_col] + df_age_scores[f"{age}_acceleration_cx"]

        # Reference line is fitted on the baseline group (corrected values), then applied to everyone.
        linreg = smf.ols(formula=f"Q('{age}') ~ Q('{chrono_col}')", data=df_age_scores.loc[is_base, :]).fit()
        df_age_scores[pred_col] = linreg.predict(df_age_scores)
        df_age_scores[acc_col] = df_age_scores[age] - df_age_scores[pred_col]

        vals = {
            group: df_age_scores.loc[df_age_scores[curr_score] == group, acc_col].values
            for group in score_vals
        }
        # Only the first two entries of score_vals are compared.
        _, df_stat.at[age, "mannwhitneyu_pval"] = mannwhitneyu(vals[score_vals[0]], vals[score_vals[1]], alternative='two-sided')
        _, df_stat.at[age, "levene_pval"] = levene(vals[score_vals[0]], vals[score_vals[1]])

        # OLS of the age estimate on the group column with Age as covariate
        # (reported as "ANCOVA" in the plot title).
        regcov = smf.ols(formula=f"Q('{age}') ~ Q('{curr_score}') + Age", data=df_age_scores).fit()
        reg_sum = regcov.summary2().tables[1]
        # Literal match: the score name must not be interpreted as a regular expression.
        pvals_cols = reg_sum.index[reg_sum.index.str.contains(curr_score_name, regex=False)].values
        for pval_col in pvals_cols:
            df_stat.at[age, f"ancova_{pval_col}_pval"] = reg_sum.at[pval_col, 'P>|t|']

        # Age distribution of the samples used for this age estimate, split by group.
        fig, ax = plt.subplots(figsize=(6, 4))
        sns.histplot(
            data=df_age_scores,
            bins=hist_bins,
            edgecolor='k',
            linewidth=1,
            x="Age",
            hue=curr_score,
            palette=score_colors,
            ax=ax
        )
        ax.set(xlim=(0, 120))
        fig.savefig(f"{out_dir}/hist_age_corrected.png", bbox_inches='tight', dpi=200)
        fig.savefig(f"{out_dir}/hist_age_corrected.pdf", bbox_inches='tight')
        plt.close(fig)

        # Layout: '11' statistics table, '12' unused, '21' scatter, '22' violins.
        fig, axs = plt.subplot_mosaic(
            [
                ['11', '12'],
                ['21', '22'],
            ],
            figsize=(8, 6),
            height_ratios=[1, 4],
            width_ratios=[3, 1.5],
            gridspec_kw={
                "bottom": 0.14,
                "top": 0.95,
                "wspace": 0.33,
                "hspace": 0.01,
            },
        )

        # Pearson correlation between chronological age and the corrected age estimate, baseline group only.
        df_base = df_age_scores.loc[is_base, :]
        ds_table = pd.DataFrame(index=[r"Pearson $\rho$", "Pearson p-value"], columns=[age])
        rho, pval = pearsonr(df_base[chrono_col].values, df_base[age].values)
        ds_table.at[r"Pearson $\rho$", age] = f"{rho:0.2f}"
        # Scientific notation: fixed-point formatting would show small p-values as "0.00".
        ds_table.at["Pearson p-value", age] = f"{pval:.2e}"
        col_defs = [
            ColumnDefinition(
                name="index",
                title=age,
                textprops={"ha": "left"},
                width=4.5,
            ),
            ColumnDefinition(
                name=age,
                title='',
                textprops={"ha": "center"},
                width=2.0,
            ),
        ]
        Table(
            ds_table,
            column_definitions=col_defs,
            row_dividers=True,
            footer_divider=False,
            ax=axs['11'],
            textprops={"fontsize": 7},
            row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
            col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
            column_border_kw={"linewidth": 1, "linestyle": "-"},
        ).autoset_fontcolors(colnames=[age])

        axs['12'].axis('off')

        # Common limits for both axes, padded by 10% of the data range on each side.
        xy_min = df_age_scores[[chrono_col, age]].min().min()
        xy_max = df_age_scores[[chrono_col, age]].max().max()
        xy_ptp = xy_max - xy_min
        lim_lo = xy_min - 0.1 * xy_ptp
        lim_hi = xy_max + 0.1 * xy_ptp
        # Dashed identity line (age estimate == chronological age).
        sns.lineplot(
            x=[lim_lo, lim_hi],
            y=[lim_lo, lim_hi],
            linestyle='--',
            color='black',
            linewidth=1.0,
            ax=axs['21']
        )
        # Regression line of the baseline group, drawn in that group's colour.
        sns.regplot(
            data=df_base,
            x=chrono_col,
            y=age,
            color=score_colors[score_val_base],
            scatter=False,
            truncate=False,
            ax=axs['21']
        )
        scatter = sns.scatterplot(
            data=df_age_scores,
            x=chrono_col,
            y=age,
            hue=curr_score,
            palette=score_colors,
            linewidth=0.5,
            alpha=0.75,
            edgecolor="k",
            s=20,
            hue_order=list(score_colors.keys()),
            legend=True,
            ax=axs['21'],
        )
        axs['21'].set_xlabel("Age")
        axs['21'].set_xlim(lim_lo, lim_hi)
        axs['21'].set_ylim(lim_lo, lim_hi)
        scatter.legend_.set_title(curr_score_name)

        sns.violinplot(
            data=df_age_scores,
            x=curr_score,
            y=acc_col,
            hue=curr_score,
            palette=score_colors,
            density_norm='width',
            order=score_vals,
            saturation=0.75,
            linewidth=1.0,
            ax=axs['22'],
            legend=False,
            cut=0,
        )
        axs['22'].set_ylabel(acc_col)
        axs['22'].set_xlabel(curr_score_name)
        mannwhitneyu_pval = df_stat.at[age, "mannwhitneyu_pval"]
        levene_pval = df_stat.at[age, "levene_pval"]
        title = f'Mann-Whitney: {mannwhitneyu_pval:.2e}\nLevene: {levene_pval:.2e}'
        for pval_col in pvals_cols:
            title += f"\nANCOVA: {df_stat.at[age, f'ancova_{pval_col}_pval']:.2e}"
        axs['22'].set_title(title)

        fig.savefig(f"{out_dir}/{age}.png", bbox_inches='tight', dpi=200)
        fig.savefig(f"{out_dir}/{age}.pdf", bbox_inches='tight')
        plt.close(fig)

## Ages (continuous score)

In [None]:
# Continuous score vs. age acceleration, reference regression fitted on ALL samples.
# For every score group column and every age estimate:
#   1. fit age ~ chronological age on all samples and take the residual as acceleration,
#   2. correlate the score column (named by curr_score_name) with acceleration,
#   3. compare acceleration between the first two groups and save the figures.
ages = ['PhenoAge', 'PhenoAge nmk', 'CognitiveAge']
# Loop-invariant histogram bins: 22 bins of width 5 covering 5..115.
hist_bins = np.linspace(5, 115, 23)

for curr_score in pheno_associations:
    # Text before ' group' in the group column name; also the name of the score column plotted below.
    curr_score_name = curr_score.split(' group')[0]
    # NOTE(review): df_stat is rebuilt for every score and is not written to disk in this cell.
    df_stat = pd.DataFrame(index=ages)
    score_vals = pheno_associations[curr_score]['groups']
    score_val_base = pheno_associations[curr_score]['base']
    score_colors = pheno_associations[curr_score]['colors']

    for age in ages:
        out_dir = f"{path}/05_scores/{curr_score_name}/continuous/{age}/all"
        pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)
        chrono_col = f"Chronological Age ({age})"
        pred_col = f"{age} linear pred"
        acc_col = f"{age} acceleration"

        # Explicit copy: columns are added below and must not touch (or warn about) `df`.
        df_age_scores = df.dropna(subset=[age, curr_score]).copy()

        # Acceleration: residual of the age estimate against the all-sample fit.
        linreg = smf.ols(formula=f"Q('{age}') ~ Q('{chrono_col}')", data=df_age_scores).fit()
        df_age_scores[pred_col] = linreg.predict(df_age_scores)
        df_age_scores[acc_col] = df_age_scores[age] - df_age_scores[pred_col]

        vals = {
            group: df_age_scores.loc[df_age_scores[curr_score] == group, acc_col].values
            for group in score_vals
        }
        # Only the first two entries of score_vals are compared.
        _, df_stat.at[age, "mannwhitneyu_pval"] = mannwhitneyu(vals[score_vals[0]], vals[score_vals[1]], alternative='two-sided')
        _, df_stat.at[age, "levene_pval"] = levene(vals[score_vals[0]], vals[score_vals[1]])

        # OLS of the age estimate on the group column with Age as covariate
        # (reported as "ANCOVA" in the plot title).
        regcov = smf.ols(formula=f"Q('{age}') ~ Q('{curr_score}') + Age", data=df_age_scores).fit()
        reg_sum = regcov.summary2().tables[1]
        # Literal match: the score name must not be interpreted as a regular expression.
        pvals_cols = reg_sum.index[reg_sum.index.str.contains(curr_score_name, regex=False)].values
        for pval_col in pvals_cols:
            df_stat.at[age, f"ancova_{pval_col}_pval"] = reg_sum.at[pval_col, 'P>|t|']

        # Age distribution of the samples used for this age estimate, split by group.
        fig, ax = plt.subplots(figsize=(6, 4))
        sns.histplot(
            data=df_age_scores,
            bins=hist_bins,
            edgecolor='k',
            linewidth=1,
            x="Age",
            hue=curr_score,
            palette=score_colors,
            ax=ax
        )
        ax.set(xlim=(0, 120))
        fig.savefig(f"{out_dir}/hist_age_corrected.png", bbox_inches='tight', dpi=200)
        fig.savefig(f"{out_dir}/hist_age_corrected.pdf", bbox_inches='tight')
        plt.close(fig)

        # Layout: '11' statistics table, '12' unused, '21' scatter, '22' violins.
        fig, axs = plt.subplot_mosaic(
            [
                ['11', '12'],
                ['21', '22'],
            ],
            figsize=(8, 6),
            height_ratios=[1, 4],
            width_ratios=[3, 1.5],
            gridspec_kw={
                "bottom": 0.14,
                "top": 0.95,
                "wspace": 0.33,
                "hspace": 0.01,
            },
        )

        # Pearson correlation between the score and the age acceleration, all samples.
        ds_table = pd.DataFrame(index=[r"Pearson $\rho$", "Pearson p-value"], columns=[age])
        rho, pval = pearsonr(df_age_scores[curr_score_name].values, df_age_scores[acc_col].values)
        ds_table.at[r"Pearson $\rho$", age] = f"{rho:0.2f}"
        # Scientific notation: fixed-point formatting would show small p-values as "0.00".
        ds_table.at["Pearson p-value", age] = f"{pval:.2e}"
        col_defs = [
            ColumnDefinition(
                name="index",
                title=age,
                textprops={"ha": "left"},
                width=4.5,
            ),
            ColumnDefinition(
                name=age,
                title='',
                textprops={"ha": "center"},
                width=2.0,
            ),
        ]
        Table(
            ds_table,
            column_definitions=col_defs,
            row_dividers=True,
            footer_divider=False,
            ax=axs['11'],
            textprops={"fontsize": 7},
            row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
            col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
            column_border_kw={"linewidth": 1, "linestyle": "-"},
        ).autoset_fontcolors(colnames=[age])

        axs['12'].axis('off')

        # Axis limits padded by 10% of the respective data range on each side.
        x_min = df_age_scores[acc_col].min()
        x_max = df_age_scores[acc_col].max()
        x_ptp = x_max - x_min
        y_min = df_age_scores[curr_score_name].min()
        y_max = df_age_scores[curr_score_name].max()
        y_ptp = y_max - y_min
        # Regression line over all samples.
        sns.regplot(
            data=df_age_scores,
            x=acc_col,
            y=curr_score_name,
            color='dimgray',
            scatter=False,
            ax=axs['21']
        )
        scatter = sns.scatterplot(
            data=df_age_scores,
            x=acc_col,
            y=curr_score_name,
            hue=curr_score,
            palette=score_colors,
            linewidth=0.5,
            alpha=0.75,
            edgecolor="k",
            s=20,
            hue_order=list(score_colors.keys()),
            legend=True,
            ax=axs['21'],
        )
        axs['21'].set_xlim(x_min - 0.1 * x_ptp, x_max + 0.1 * x_ptp)
        axs['21'].set_ylim(y_min - 0.1 * y_ptp, y_max + 0.1 * y_ptp)
        scatter.legend_.set_title(curr_score_name)

        sns.violinplot(
            data=df_age_scores,
            x=curr_score,
            y=acc_col,
            hue=curr_score,
            palette=score_colors,
            density_norm='width',
            order=score_vals,
            saturation=0.75,
            linewidth=1.0,
            ax=axs['22'],
            legend=False,
            cut=0,
        )
        axs['22'].set_ylabel(acc_col)
        axs['22'].set_xlabel(curr_score_name)
        mannwhitneyu_pval = df_stat.at[age, "mannwhitneyu_pval"]
        levene_pval = df_stat.at[age, "levene_pval"]
        title = f'Mann-Whitney: {mannwhitneyu_pval:.2e}\nLevene: {levene_pval:.2e}'
        for pval_col in pvals_cols:
            title += f"\nANCOVA: {df_stat.at[age, f'ancova_{pval_col}_pval']:.2e}"
        axs['22'].set_title(title)

        fig.savefig(f"{out_dir}/{age}.png", bbox_inches='tight', dpi=200)
        fig.savefig(f"{out_dir}/{age}.pdf", bbox_inches='tight')
        plt.close(fig)

In [None]:
# Continuous score vs. age acceleration, reference regression fitted on the baseline (low-risk) group.
# For every score group column and every age estimate:
#   1. fit age ~ chronological age on the baseline group only and take the residual as acceleration,
#   2. correlate the score column (named by curr_score_name) with acceleration within the baseline group,
#   3. compare acceleration between the first two groups and save the figures.
ages = ['PhenoAge', 'PhenoAge nmk', 'CognitiveAge']
# Loop-invariant histogram bins: 22 bins of width 5 covering 5..115.
hist_bins = np.linspace(5, 115, 23)

for curr_score in pheno_associations:
    # Text before ' group' in the group column name; also the name of the score column plotted below.
    curr_score_name = curr_score.split(' group')[0]
    # NOTE(review): df_stat is rebuilt for every score and is not written to disk in this cell.
    df_stat = pd.DataFrame(index=ages)
    score_vals = pheno_associations[curr_score]['groups']
    score_val_base = pheno_associations[curr_score]['base']
    score_colors = pheno_associations[curr_score]['colors']

    for age in ages:
        out_dir = f"{path}/05_scores/{curr_score_name}/continuous/{age}/low_risk"
        pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)
        chrono_col = f"Chronological Age ({age})"
        pred_col = f"{age} linear pred"
        acc_col = f"{age} acceleration"

        # Explicit copy: columns are added below and must not touch (or warn about) `df`.
        df_age_scores = df.dropna(subset=[age, curr_score]).copy()
        is_base = df_age_scores[curr_score] == score_val_base

        # Reference line is fitted on the baseline group only, then applied to everyone.
        linreg = smf.ols(formula=f"Q('{age}') ~ Q('{chrono_col}')", data=df_age_scores.loc[is_base, :]).fit()
        df_age_scores[pred_col] = linreg.predict(df_age_scores)
        df_age_scores[acc_col] = df_age_scores[age] - df_age_scores[pred_col]

        vals = {
            group: df_age_scores.loc[df_age_scores[curr_score] == group, acc_col].values
            for group in score_vals
        }
        # Only the first two entries of score_vals are compared.
        _, df_stat.at[age, "mannwhitneyu_pval"] = mannwhitneyu(vals[score_vals[0]], vals[score_vals[1]], alternative='two-sided')
        _, df_stat.at[age, "levene_pval"] = levene(vals[score_vals[0]], vals[score_vals[1]])

        # OLS of the age estimate on the group column with Age as covariate
        # (reported as "ANCOVA" in the plot title).
        regcov = smf.ols(formula=f"Q('{age}') ~ Q('{curr_score}') + Age", data=df_age_scores).fit()
        reg_sum = regcov.summary2().tables[1]
        # Literal match: the score name must not be interpreted as a regular expression.
        pvals_cols = reg_sum.index[reg_sum.index.str.contains(curr_score_name, regex=False)].values
        for pval_col in pvals_cols:
            df_stat.at[age, f"ancova_{pval_col}_pval"] = reg_sum.at[pval_col, 'P>|t|']

        # Age distribution of the samples used for this age estimate, split by group.
        fig, ax = plt.subplots(figsize=(6, 4))
        sns.histplot(
            data=df_age_scores,
            bins=hist_bins,
            edgecolor='k',
            linewidth=1,
            x="Age",
            hue=curr_score,
            palette=score_colors,
            ax=ax
        )
        ax.set(xlim=(0, 120))
        fig.savefig(f"{out_dir}/hist_age_corrected.png", bbox_inches='tight', dpi=200)
        fig.savefig(f"{out_dir}/hist_age_corrected.pdf", bbox_inches='tight')
        plt.close(fig)

        # Layout: '11' statistics table, '12' unused, '21' scatter, '22' violins.
        fig, axs = plt.subplot_mosaic(
            [
                ['11', '12'],
                ['21', '22'],
            ],
            figsize=(8, 6),
            height_ratios=[1, 4],
            width_ratios=[3, 1.5],
            gridspec_kw={
                "bottom": 0.14,
                "top": 0.95,
                "wspace": 0.33,
                "hspace": 0.01,
            },
        )

        # Pearson correlation between the score and the age acceleration, baseline group only.
        # The subset is taken here because it needs the acceleration column computed above.
        df_base = df_age_scores.loc[is_base, :]
        ds_table = pd.DataFrame(index=[r"Pearson $\rho$", "Pearson p-value"], columns=[age])
        rho, pval = pearsonr(df_base[curr_score_name].values, df_base[acc_col].values)
        ds_table.at[r"Pearson $\rho$", age] = f"{rho:0.2f}"
        # Scientific notation: fixed-point formatting would show small p-values as "0.00".
        ds_table.at["Pearson p-value", age] = f"{pval:.2e}"
        col_defs = [
            ColumnDefinition(
                name="index",
                title=age,
                textprops={"ha": "left"},
                width=4.5,
            ),
            ColumnDefinition(
                name=age,
                title='',
                textprops={"ha": "center"},
                width=2.0,
            ),
        ]
        Table(
            ds_table,
            column_definitions=col_defs,
            row_dividers=True,
            footer_divider=False,
            ax=axs['11'],
            textprops={"fontsize": 7},
            row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
            col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
            column_border_kw={"linewidth": 1, "linestyle": "-"},
        ).autoset_fontcolors(colnames=[age])

        axs['12'].axis('off')

        # Axis limits padded by 10% of the respective data range on each side.
        x_min = df_age_scores[acc_col].min()
        x_max = df_age_scores[acc_col].max()
        x_ptp = x_max - x_min
        y_min = df_age_scores[curr_score_name].min()
        y_max = df_age_scores[curr_score_name].max()
        y_ptp = y_max - y_min
        # Regression line of the baseline group, drawn in that group's colour.
        sns.regplot(
            data=df_base,
            x=acc_col,
            y=curr_score_name,
            color=score_colors[score_val_base],
            scatter=False,
            ax=axs['21']
        )
        scatter = sns.scatterplot(
            data=df_age_scores,
            x=acc_col,
            y=curr_score_name,
            hue=curr_score,
            palette=score_colors,
            linewidth=0.5,
            alpha=0.75,
            edgecolor="k",
            s=20,
            hue_order=list(score_colors.keys()),
            legend=True,
            ax=axs['21'],
        )
        axs['21'].set_xlim(x_min - 0.1 * x_ptp, x_max + 0.1 * x_ptp)
        axs['21'].set_ylim(y_min - 0.1 * y_ptp, y_max + 0.1 * y_ptp)
        scatter.legend_.set_title(curr_score_name)

        sns.violinplot(
            data=df_age_scores,
            x=curr_score,
            y=acc_col,
            hue=curr_score,
            palette=score_colors,
            density_norm='width',
            order=score_vals,
            saturation=0.75,
            linewidth=1.0,
            ax=axs['22'],
            legend=False,
            cut=0,
        )
        axs['22'].set_ylabel(acc_col)
        axs['22'].set_xlabel(curr_score_name)
        mannwhitneyu_pval = df_stat.at[age, "mannwhitneyu_pval"]
        levene_pval = df_stat.at[age, "levene_pval"]
        title = f'Mann-Whitney: {mannwhitneyu_pval:.2e}\nLevene: {levene_pval:.2e}'
        for pval_col in pvals_cols:
            title += f"\nANCOVA: {df_stat.at[age, f'ancova_{pval_col}_pval']:.2e}"
        axs['22'].set_title(title)

        fig.savefig(f"{out_dir}/{age}.png", bbox_inches='tight', dpi=200)
        fig.savefig(f"{out_dir}/{age}.pdf", bbox_inches='tight')
        plt.close(fig)