# Debugging autoreload

In [None]:
%load_ext autoreload
%autoreload 2

# Load packages

In [None]:
import plotly.graph_objects as go
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import pathlib
from scipy.stats import mannwhitneyu
from plottable import ColumnDefinition, Table
from statsmodels.stats.multitest import multipletests
from sklearn.metrics import mean_absolute_error
from scipy.stats import mannwhitneyu, variation, levene
import statsmodels.formula.api as smf
from functools import reduce
import pyaging as pya
from tqdm import tqdm
import functools


def conjunction(conditions):
    """Fold an iterable of boolean conditions with element-wise logical AND.

    The conditions are combined pairwise from left to right with
    ``np.logical_and``. A single condition is returned unchanged; an empty
    iterable raises ``TypeError`` (from ``functools.reduce``).
    """
    def _both(accumulated, condition):
        return np.logical_and(accumulated, condition)

    return functools.reduce(_both, conditions)


def disjunction(conditions):
    """Fold an iterable of boolean conditions with element-wise logical OR.

    The conditions are combined pairwise from left to right with
    ``np.logical_or``. A single condition is returned unchanged; an empty
    iterable raises ``TypeError`` (from ``functools.reduce``).
    """
    def _either(accumulated, condition):
        return np.logical_or(accumulated, condition)

    return functools.reduce(_either, conditions)

def add_layout(fig, x_label, y_label, title, font_size=25):
    """Apply the shared layout to a Plotly figure in place.

    Sets the "none" template, a horizontal legend centred just above the
    plot area, the figure title, fixed margins and both axes (built with
    ``get_axis``; ``font_size`` is used for the title, the axis titles and
    the tick labels alike).

    Parameters
    ----------
    fig : figure object exposing ``update_layout``
    x_label, y_label : axis titles
    title : figure title text
    font_size : font size shared by the title and both axes
    """
    legend_style = dict(
        orientation="h",
        yanchor="bottom",
        y=1.01,
        xanchor="center",
        x=0.5,
        itemsizing='constant',
    )
    title_style = dict(text=title, font=dict(size=font_size))
    figure_margin = go.layout.Margin(l=120, r=20, b=80, t=100, pad=0)

    fig.update_layout(
        template="none",
        legend=legend_style,
        title=title_style,
        autosize=True,
        margin=figure_margin,
        showlegend=True,
        xaxis=get_axis(x_label, font_size, font_size),
        yaxis=get_axis(y_label, font_size, font_size),
    )


def make_rgb_transparent(rgb, bg_rgb, alpha):
    """Blend a foreground colour over a background colour, channel by channel.

    Each output channel is ``alpha * foreground + (1 - alpha) * background``.
    Channels are paired with ``zip``, so the result has as many entries as the
    shorter of the two inputs.

    Returns
    -------
    list
        The blended channel values.
    """
    blended = []
    for fg_channel, bg_channel in zip(rgb, bg_rgb):
        blended.append(alpha * fg_channel + (1 - alpha) * bg_channel)
    return blended


def get_axis(title, title_size, tick_size):
    """Build the axis settings dict shared by all Plotly figures.

    The axis title font is given through the nested ``title.font`` attribute.
    The previously used top-level ``titlefont`` attribute is deprecated in
    Plotly and no longer accepted by recent releases, while ``title.font``
    works across versions.

    Parameters
    ----------
    title : axis title text
    title_size : font size of the axis title
    tick_size : font size of the tick labels

    Returns
    -------
    dict
        Keyword settings suitable for ``layout.xaxis`` / ``layout.yaxis``.
    """
    return dict(
        title=dict(
            text=title,
            font=dict(
                color='black',
                size=title_size
            ),
        ),
        autorange=True,
        showgrid=True,
        zeroline=False,
        linecolor='black',
        showline=True,
        gridcolor='gainsboro',
        gridwidth=0.001,
        mirror="allticks",
        ticks='outside',
        showticklabels=True,
        tickangle=0,
        tickfont=dict(
            color='black',
            size=tick_size
        ),
        exponentformat='e',
        showexponent='all'
    )


# DNAm

## Load data

In [None]:
# Load every DNAm-derived table, collect the names of the features to analyse
# and merge everything into one sample-indexed frame `df_epi`.

# Hard-coded, machine-specific locations of the input data; `path` is also
# where the later cells write their tables and figures.
path = "E:/YandexDisk/bbd/fmba/dnam/processed/special_63/funnorm"
path_pyaging = "E:/YandexDisk/pydnameth/datasets/pyaging"
path_epimage = "D:/EpinflammAge"

# Feature-name registries filled in below. `epi_imms` stays empty in this cell.
epi_ages = []
epi_metrics = []
epi_imms = []
epi_scores = []

# Phenotype table; sample IDs are forced to strings so that the indexes of all
# tables can be matched in the merge below.
df_pheno = pd.read_excel(f"{path}/pheno_funnorm.xlsx", index_col=0)
df_pheno.index = df_pheno.index.astype(str)

cols_pheno = [
    'Special Status'
]

# NOTE: this is an alias, not a copy - the in-place rename below therefore also
# renames the columns of `df_pheno`.
df_pyaging = df_pheno
pyaging_meta = pd.read_excel(f"{path_pyaging}/clocks_meta_upd.xlsx", index_col='Clock Name')
pyaging_meta['Clock Name'] = pyaging_meta.index
# Rename columns from the metadata 'Model ID' to the corresponding 'Clock Name'.
df_pyaging.rename(columns=dict(zip(pyaging_meta['Model ID'].values, pyaging_meta['Clock Name'].values)), inplace=True)
# Clocks excluded from the analysis (raises KeyError if any label is missing
# from the metadata index).
pyaging_meta.drop(index=['Knight', 'LeeControl', 'LeeRefinedRobust', 'LeeRobust', 'PedBE', 'RepliTali', 'ENCen100', 'CpGPTGrimAge3', 'CpGPTPCGrimAge3',
                         'GrimAge2ADM', 'GrimAge2B2M', 'GrimAge2CystatinC', 'GrimAge2GDF15', 'GrimAge2Leptin', 'GrimAge2LogA1C', 'GrimAge2LogCRP', 'GrimAge2PackYrs', 'GrimAge2PAI1', 'GrimAge2TIMP1', 
                         'DNAmFitAgeGaitF', 'DNAmFitAgeGaitM', 'DNAmFitAgeGripF', 'DNAmFitAgeGripM', 'DNAmFitAgeVO2Max', 'DNAmIC'], inplace=True)
# Remaining clocks of Type 'Age' are treated as epigenetic ages, all other
# types as epigenetic metrics.
epi_ages += pyaging_meta[pyaging_meta['Type'] == 'Age'].index.to_list()
epi_metrics += pyaging_meta[pyaging_meta['Type'] != 'Age'].index.to_list()
cols_pyaging = ['Age', 'Sex', 'Tissue'] + pyaging_meta[pyaging_meta['Type'] == 'Age'].index.to_list() + pyaging_meta[pyaging_meta['Type'] != 'Age'].index.to_list()

# EpiScores table.
df_episcores = pd.read_csv(f"{path}/episcores_Les_63_funnorm.csv", index_col=0)
df_episcores.index = df_episcores.index.astype(str)
# The pattern 'X' removes EVERY 'X' from the sample IDs, not only a leading one.
# NOTE(review): presumably meant to strip an 'X' prefix added on export - confirm
# that no sample ID legitimately contains an 'X'.
df_episcores.index = df_episcores.index.str.replace('X', '', regex=True)
# Score columns = everything except the listed columns. This is evaluated
# BEFORE the rename below, so the Bernabeu column is excluded under its old name.
selected_cols = df_episcores.columns[~df_episcores.columns.isin(['Sex', 'True Age', 'Epigenetic Age (Zhang)', 'Epigenetic Age (Bernabeu)'])].to_list()
df_episcores.rename(columns={'Epigenetic Age (Bernabeu)': 'Bernabeu'}, inplace=True)
# Tag every score column with an " (EpiScores)" suffix.
df_episcores.rename(columns=dict(zip(selected_cols, [f"{f} (EpiScores)" for f in selected_cols])), inplace=True)
epi_ages += ['Bernabeu']
epi_scores += [f"{f} (EpiScores)" for f in selected_cols]
cols_episcores = ['Bernabeu'] + [f"{f} (EpiScores)" for f in selected_cols]

# EpInflammAge table: the file stores each marker as "<marker>_log"; the
# exponential of that column is stored under the plain marker name.
df_epimage = pd.read_excel(f"{path}/EpInflammAge.xlsx", index_col=0)
df_epimage.index = df_epimage.index.astype(str)
imms_epimage = pd.read_excel(f"{path_epimage}/models/InflammatoryMarkers/InflammatoryMarkers.xlsx", index_col='feature').index.to_list()
for imm in imms_epimage:
    df_epimage[imm] = np.exp(df_epimage[f"{imm}_log"])
epi_ages += ['EpInflammAge']
cols_epimage = ['EpInflammAge'] + imms_epimage

# Samples present in all tables. NOTE(review): `n_cmn` is not referenced again
# in the visible part of the notebook.
n_cmn = df_pheno.index.intersection(df_pyaging.index).intersection(df_episcores.index).intersection(df_epimage.index)

# Index-on-index merge of the selected columns; `pd.merge` defaults to an inner
# join, so only samples present in every table are kept.
dfs = [df_pheno[cols_pheno], df_pyaging[cols_pyaging], df_episcores[cols_episcores], df_epimage[cols_epimage]]
cols_pheno_all = cols_pheno
df_epi = reduce(lambda left,right: pd.merge(left, right, left_index=True, right_index=True), dfs)

# Phenotypes to analyse: the compared groups, the reference ('base') group and
# the plotting colour of each group.
pheno_associations = {
    'Special Status': {
        'groups': ['Control', 'Case'],
        'base': 'Control',
        'colors': {'Control': 'dodgerblue', 'Case': 'crimson'}
    }
}

## Epigenetic ages (corrected)

In [None]:
# Age-corrected epigenetic-age analysis.
# For every phenotype in `pheno_associations` this cell:
#   1. removes the linear age trend of each clock (fitted on all selected
#      samples) and overwrites the clock column with `Age + residual`;
#   2. computes the age acceleration as the residual from a regression fitted
#      on the base group only;
#   3. compares the acceleration between the two groups (Mann-Whitney U,
#      Levene, OLS with Age as covariate) and adjusts the p-values;
#   4. writes the tables and figures into `path`.
df_epi_ages = df_epi[cols_pheno_all + ['Age'] + epi_ages].copy()
for an_col in pheno_associations:
    an_vals = pheno_associations[an_col]['groups']
    an_val_base = pheno_associations[an_col]['base']
    an_colors = pheno_associations[an_col]['colors']

    # Explicit copy: many columns are added to this frame below, which would
    # otherwise trigger pandas' SettingWithCopyWarning on the filtered selection.
    df_epi_ages_ass = df_epi_ages.loc[df_epi_ages[an_col].isin(an_vals)].copy()
    df_epi_ages_stat = pd.DataFrame(index=epi_ages)

    # Formula-safe alias of the phenotype column (no spaces or punctuation).
    an_col_str = an_col.replace(' ', '_')
    an_col_str = an_col_str.replace(',', '')
    an_col_str = an_col_str.replace('.', '')
    an_col_str = an_col_str.replace('-', '_')
    df_epi_ages_ass[an_col_str] = df_epi_ages_ass[an_col]

    pathlib.Path(path).mkdir(parents=True, exist_ok=True)

    epi_ages_mae = {}
    for epiage in epi_ages:
        # NOTE(review): the clock name is interpolated into the formula as is,
        # so it must be a valid formula identifier - confirm for all clocks.

        # Correction: regress the clock on Age over all selected samples and
        # replace the clock by Age + residual.
        linreg_cx = smf.ols(formula=f"{epiage} ~ Age", data=df_epi_ages_ass).fit()
        df_epi_ages_ass[f"{epiage}_linear_pred_cx"] = linreg_cx.predict(df_epi_ages_ass)
        df_epi_ages_ass[f"{epiage}_acceleration_cx"] = df_epi_ages_ass[epiage] - df_epi_ages_ass[f"{epiage}_linear_pred_cx"]
        df_epi_ages_ass[f"{epiage}"] = df_epi_ages_ass["Age"] + df_epi_ages_ass[f"{epiage}_acceleration_cx"]

        # Residual in units of the clock's mean absolute residual, classified
        # as +1 (> 1), -1 (< -1) or 0 otherwise.
        epi_ages_mae[epiage] = np.mean(np.abs(df_epi_ages_ass[f"{epiage}_acceleration_cx"].values))
        df_epi_ages_ass[f"{epiage} acceleration by MAE"] = df_epi_ages_ass[f"{epiage}_acceleration_cx"] / epi_ages_mae[epiage]
        df_epi_ages_ass.loc[:, f"{epiage} acceleration type"] = 0
        df_epi_ages_ass.loc[df_epi_ages_ass[f"{epiage} acceleration by MAE"] > 1.0, f"{epiage} acceleration type"] = 1.0
        df_epi_ages_ass.loc[df_epi_ages_ass[f"{epiage} acceleration by MAE"] < -1.0, f"{epiage} acceleration type"] = -1.0

        # Acceleration: residual of the (corrected) clock from the regression
        # fitted on the base group only.
        linreg = smf.ols(formula=f"{epiage} ~ Age", data=df_epi_ages_ass.loc[df_epi_ages_ass[an_col] == an_val_base, :]).fit()
        df_epi_ages_ass[f"{epiage}_linear_pred"] = linreg.predict(df_epi_ages_ass)
        df_epi_ages_ass[f"{epiage} acceleration"] = df_epi_ages_ass[epiage] - df_epi_ages_ass[f"{epiage}_linear_pred"]

        # Per-group descriptive statistics of the acceleration.
        vals = {}
        for group in an_vals:
            vals[group] = df_epi_ages_ass.loc[df_epi_ages_ass[an_col] == group, f"{epiage} acceleration"].values
            df_epi_ages_stat.at[epiage, f"Mean {group}"] = np.mean(vals[group])
            df_epi_ages_stat.at[epiage, f"Median {group}"] = np.median(vals[group])
            df_epi_ages_stat.at[epiage, f"Q75 {group}"], df_epi_ages_stat.at[epiage, f"Q25 {group}"] = np.percentile(vals[group], [75, 25])
            df_epi_ages_stat.at[epiage, f"IQR {group}"] = df_epi_ages_stat.at[epiage, f"Q75 {group}"] - df_epi_ages_stat.at[epiage, f"Q25 {group}"]
            df_epi_ages_stat.at[epiage, f"Variation {group}"] = variation(vals[group])

        # Group comparison: location (Mann-Whitney U) and spread (Levene).
        _, df_epi_ages_stat.at[epiage, "mannwhitneyu_pval"] = mannwhitneyu(vals[an_vals[0]], vals[an_vals[1]], alternative='two-sided')
        _, df_epi_ages_stat.at[epiage, "levene_pval"] = levene(vals[an_vals[0]], vals[an_vals[1]])

        # OLS with Age as covariate; keep the p-value of every group term.
        regcov = smf.ols(formula=f"{epiage} ~ {an_col_str} + Age", data=df_epi_ages_ass).fit()
        reg_sum = regcov.summary2().tables[1]
        pvals_cols = reg_sum.index[reg_sum.index.str.contains(an_col_str)].values
        for pval_col in pvals_cols:
            df_epi_ages_stat.at[epiage, f"ancova_{pval_col}_pval"] = reg_sum.at[pval_col, 'P>|t|']

    # Per-sample summary over all clocks: sum of the -1/0/+1 labels, converted
    # into a profile with a fixed threshold.
    df_epi_ages_ass["Epigenetic Ages Summary"] = df_epi_ages_ass.loc[:, [f"{epiage} acceleration type" for epiage in epi_ages]].sum(axis=1)
    df_epi_ages_ass["Epigenetic profile"] = 'Neutral'
    epi_profile_thld = 6
    df_epi_ages_ass.loc[df_epi_ages_ass["Epigenetic Ages Summary"] > epi_profile_thld, "Epigenetic profile"] = 'Accelerated aging'
    df_epi_ages_ass.loc[df_epi_ages_ass["Epigenetic Ages Summary"] < -epi_profile_thld, "Epigenetic profile"] = 'Decelerated aging'
    df_epi_ages_ass.to_excel(f"{path}/ages_data_corrected.xlsx")

    # Multiple-testing adjustment across clocks, three methods per test.
    correction_methods = ['fdr_bh', 'bonferroni', 'simes-hochberg']
    for test_name in ['mannwhitneyu', 'levene']:
        raw_pvals = df_epi_ages_stat.loc[epi_ages, f"{test_name}_pval"].values
        for method in correction_methods:
            _, df_epi_ages_stat.loc[epi_ages, f"{test_name}_pval_{method}"], _, _ = multipletests(raw_pvals, 0.05, method=method)
    # Raw ANCOVA p-value columns (selected before their adjusted versions exist).
    pvals_cols_ancova = df_epi_ages_stat.columns[df_epi_ages_stat.columns.str.contains(an_col_str)].values
    for pval_col in pvals_cols_ancova:
        raw_pvals = df_epi_ages_stat.loc[epi_ages, pval_col].values
        for method in correction_methods:
            _, df_epi_ages_stat.loc[epi_ages, f"{pval_col}_{method}"], _, _ = multipletests(raw_pvals, 0.05, method=method)
    # Sort by the last raw ANCOVA p-value column, named explicitly instead of
    # relying on the loop variable leaking out of the loop above.
    if len(pvals_cols_ancova) > 0:
        df_epi_ages_stat.sort_values([pvals_cols_ancova[-1]], ascending=[True], inplace=True)
    df_epi_ages_stat.to_excel(f"{path}/ages_corrected.xlsx")

    # Age histogram of the compared groups.
    hist_bins = np.linspace(5, 115, 23)
    fig, ax = plt.subplots(figsize=(6, 4))
    histplot = sns.histplot(
        data=df_epi_ages_ass,
        bins=hist_bins,
        edgecolor='k',
        linewidth=1,
        x="Age",
        hue=an_col,
        palette=an_colors,
        ax=ax
    )
    histplot.set(xlim=(0, 120))
    fig.savefig(f"{path}/hist_age_corrected.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{path}/hist_age_corrected.pdf", bbox_inches='tight')
    plt.close(fig)

    # One bar chart of -log10(FDR-adjusted p-value) per statistical test.
    for stat_test in [x.replace('_pval', '') for x in pvals_cols_ancova] + ['mannwhitneyu', 'levene']:
        df_fig = df_epi_ages_stat.copy()
        df_fig.sort_values([f"{stat_test}_pval"], ascending=[True], inplace=True)
        df_fig['Features'] = df_fig.index
        df_fig[f'{stat_test}_pval_fdr_bh_log'] = -np.log10(df_fig[f'{stat_test}_pval_fdr_bh'])
        df_fig['color'] = 'pink'
        df_fig.loc[df_fig[f'{stat_test}_pval_fdr_bh'] < 0.05, 'color'] = 'red'
        sns.set_theme(style='ticks')
        fig, ax = plt.subplots(figsize=(3, df_fig.shape[0] * 0.5))
        # seaborn >= 0.13 deprecates `palette` without `hue`: assign the
        # categorical variable to `hue` and switch the legend off.
        sns.barplot(
            data=df_fig,
            y='Features',
            x=f'{stat_test}_pval_fdr_bh_log',
            hue='Features',
            edgecolor='black',
            palette=df_fig['color'].tolist(),
            legend=False,
            ax=ax,
        )
        ax.set_xlabel(r"$-\log_{10}(\mathrm{p-value})$")
        ax.xaxis.tick_top()
        ax.xaxis.set_label_position('top')
        ax.set_ylabel('')
        fig.savefig(f"{path}/ages_pvals_{stat_test}_corrected.png", bbox_inches='tight', dpi=200)
        fig.savefig(f"{path}/ages_pvals_{stat_test}_corrected.pdf", bbox_inches='tight')
        plt.close(fig)

    # Overview figure: one sub-figure per clock (metrics table, Age-vs-clock
    # scatter with the base-group regression line, acceleration violins).
    sns.set_theme(style='ticks')
    # The grid stays 6x6 (figure 36x20) for up to 36 clocks and gains rows -
    # with proportional height - instead of failing with IndexError beyond that.
    n_grid_cols = 6
    n_grid_rows = max(6, int(np.ceil(len(df_epi_ages_stat.index) / n_grid_cols)))
    fig = plt.figure(
        figsize=(36, 20 * n_grid_rows / 6),
        layout="constrained"
    )
    subfigs = fig.subfigures(
        nrows=n_grid_rows,
        ncols=n_grid_cols,
    )
    for epiage_id, epiage in enumerate(df_epi_ages_stat.index.values):
        row_id, col_id = divmod(epiage_id, n_grid_cols)

        axs = subfigs[row_id, col_id].subplot_mosaic(
            [
                ['11', '12'],
                ['21', '22'],
            ],
            height_ratios=[1, 4],
            width_ratios=[3, 1.5],
            gridspec_kw={
                "bottom": 0.14,
                "top": 0.95,
                "wspace": 0.33,
                "hspace": 0.01,
            },
        )

        # Small table with the accuracy metrics of the (corrected) clock.
        ds_table = pd.DataFrame(index=['MAE (from diagonal)', 'MAE (from regression)', fr"Pearson $\rho$", "Bias"], columns=[epiage])
        mae_diag = mean_absolute_error(df_epi_ages_ass['Age'].values, df_epi_ages_ass[epiage].values)
        mae_regr = np.mean(np.abs(df_epi_ages_ass[f"{epiage} acceleration"].values))
        rho, _ = stats.pearsonr(df_epi_ages_ass['Age'].values, df_epi_ages_ass[epiage].values)
        bias = np.mean(df_epi_ages_ass[epiage] - df_epi_ages_ass['Age'])
        ds_table.at['MAE (from diagonal)', epiage] = f"{mae_diag:0.2f}"
        ds_table.at['MAE (from regression)', epiage] = f"{mae_regr:0.2f}"
        ds_table.at[fr"Pearson $\rho$", epiage] = f"{rho:0.2f}"
        ds_table.at["Bias", epiage] = f"{bias:0.2f}"
        col_defs = [
            ColumnDefinition(
                name="index",
                title=epiage,
                textprops={"ha": "left"},
                width=4.5,
            ),
            ColumnDefinition(
                name=epiage,
                title='',
                textprops={"ha": "center"},
                width=2.0,
            ),
        ]
        table = Table(
            ds_table,
            column_definitions=col_defs,
            row_dividers=True,
            footer_divider=False,
            ax=axs['11'],
            textprops={"fontsize": 7},
            row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
            col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
            column_border_kw={"linewidth": 1, "linestyle": "-"},
        ).autoset_fontcolors(colnames=[epiage])

        axs['12'].axis('off')

        # Square axis range shared by x and y, padded by 10% on both sides.
        xy_min = df_epi_ages_ass[['Age', epiage]].min().min()
        xy_max = df_epi_ages_ass[['Age', epiage]].max().max()
        xy_ptp = xy_max - xy_min
        xy_lim = [xy_min - 0.1 * xy_ptp, xy_max + 0.1 * xy_ptp]
        # Identity line (clock == Age).
        sns.lineplot(
            x=xy_lim,
            y=xy_lim,
            linestyle='--',
            color='black',
            linewidth=1.0,
            ax=axs['21']
        )
        # Regression line of the base group only.
        sns.regplot(
            data=df_epi_ages_ass.loc[df_epi_ages_ass[an_col] == an_val_base, :],
            x='Age',
            y=epiage,
            color=an_colors[an_val_base],
            scatter=False,
            truncate=False,
            ax=axs['21']
        )
        sns.scatterplot(
            data=df_epi_ages_ass,
            x='Age',
            y=epiage,
            hue=an_col,
            palette=an_colors,
            linewidth=0.5,
            alpha=0.75,
            edgecolor="k",
            s=20,
            hue_order=list(an_colors.keys()),
            legend=True,
            ax=axs['21'],
        )
        axs['21'].set_xlim(xy_lim[0], xy_lim[1])
        axs['21'].set_ylim(xy_lim[0], xy_lim[1])

        sns.violinplot(
            data=df_epi_ages_ass,
            x=an_col,
            y=f"{epiage} acceleration",
            hue=an_col,
            palette=an_colors,
            density_norm='width',
            order=an_vals,
            saturation=0.75,
            linewidth=1.0,
            ax=axs['22'],
            legend=False,
            cut=0,
        )
        axs['22'].set_ylabel(f"{epiage} acceleration")
        # Panel title: FDR-adjusted p-values of all three tests.
        mannwhitneyu_pval = df_epi_ages_stat.at[epiage, "mannwhitneyu_pval_fdr_bh"]
        levene_pval = df_epi_ages_stat.at[epiage, "levene_pval_fdr_bh"]
        title = ''
        for pval_col in pvals_cols_ancova:
            title += f"ANCOVA: {df_epi_ages_stat.at[epiage, pval_col + '_fdr_bh']:.2e}"
        title += f'\nMann-Whitney: {mannwhitneyu_pval:.2e}\nLevene: {levene_pval:.2e}'
        axs['22'].set_title(title)

    fig.savefig(f"{path}/ages_distribution_corrected.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{path}/ages_distribution_corrected.pdf", bbox_inches='tight')
    plt.close(fig)

## Epigenetic metrics

In [None]:
# Epigenetic metrics (non-age clocks): per-group descriptive statistics,
# group comparison (Mann-Whitney U, Levene, OLS with Age as covariate),
# multiple-testing adjustment, p-value bar charts and violin plots.
df_epi_metrics = df_epi[cols_pheno_all + ['Age'] + epi_metrics].copy()
for an_col in pheno_associations:
    an_vals = pheno_associations[an_col]['groups']
    an_val_base = pheno_associations[an_col]['base']
    an_colors = pheno_associations[an_col]['colors']

    # Explicit copy: a column is added below, which would otherwise trigger
    # pandas' SettingWithCopyWarning on the filtered selection.
    df_epi_metrics_ass = df_epi_metrics.loc[df_epi_metrics[an_col].isin(an_vals)].copy()
    df_epi_metrics_stat = pd.DataFrame(index=epi_metrics)

    # Formula-safe alias of the phenotype column (no spaces or punctuation).
    an_col_str = an_col.replace(' ', '_')
    an_col_str = an_col_str.replace(',', '')
    an_col_str = an_col_str.replace('.', '')
    an_col_str = an_col_str.replace('-', '_')
    df_epi_metrics_ass[an_col_str] = df_epi_metrics_ass[an_col]

    pathlib.Path(path).mkdir(parents=True, exist_ok=True)

    for epi_metric in epi_metrics:
        # Per-group descriptive statistics.
        vals = {}
        for group in an_vals:
            vals[group] = df_epi_metrics_ass.loc[df_epi_metrics_ass[an_col] == group, epi_metric].values
            df_epi_metrics_stat.at[epi_metric, f"Mean {group}"] = np.mean(vals[group])
            df_epi_metrics_stat.at[epi_metric, f"Median {group}"] = np.median(vals[group])
            df_epi_metrics_stat.at[epi_metric, f"Q75 {group}"], df_epi_metrics_stat.at[epi_metric, f"Q25 {group}"] = np.percentile(vals[group], [75, 25])
            df_epi_metrics_stat.at[epi_metric, f"IQR {group}"] = df_epi_metrics_stat.at[epi_metric, f"Q75 {group}"] - df_epi_metrics_stat.at[epi_metric, f"Q25 {group}"]
            df_epi_metrics_stat.at[epi_metric, f"Variation {group}"] = variation(vals[group])

        # Group comparison: location (Mann-Whitney U) and spread (Levene).
        _, df_epi_metrics_stat.at[epi_metric, "mannwhitneyu_pval"] = mannwhitneyu(vals[an_vals[0]], vals[an_vals[1]], alternative='two-sided')
        _, df_epi_metrics_stat.at[epi_metric, "levene_pval"] = levene(vals[an_vals[0]], vals[an_vals[1]])

        # OLS with Age as covariate; keep the p-value of every group term.
        # NOTE(review): the metric name is interpolated into the formula as is,
        # so it must be a valid formula identifier - confirm for all metrics.
        regcov = smf.ols(formula=f"{epi_metric} ~ {an_col_str} + Age", data=df_epi_metrics_ass).fit()
        reg_sum = regcov.summary2().tables[1]
        pvals_cols = reg_sum.index[reg_sum.index.str.contains(an_col_str)].values
        for pval_col in pvals_cols:
            df_epi_metrics_stat.at[epi_metric, f"ancova_{pval_col}_pval"] = reg_sum.at[pval_col, 'P>|t|']

    # Multiple-testing adjustment across metrics, three methods per test.
    correction_methods = ['fdr_bh', 'bonferroni', 'simes-hochberg']
    for test_name in ['mannwhitneyu', 'levene']:
        raw_pvals = df_epi_metrics_stat.loc[epi_metrics, f"{test_name}_pval"].values
        for method in correction_methods:
            _, df_epi_metrics_stat.loc[epi_metrics, f"{test_name}_pval_{method}"], _, _ = multipletests(raw_pvals, 0.05, method=method)
    # Raw ANCOVA p-value columns (selected before their adjusted versions exist).
    pvals_cols_ancova = df_epi_metrics_stat.columns[df_epi_metrics_stat.columns.str.contains(an_col_str)].values
    for pval_col in pvals_cols_ancova:
        raw_pvals = df_epi_metrics_stat.loc[epi_metrics, pval_col].values
        for method in correction_methods:
            _, df_epi_metrics_stat.loc[epi_metrics, f"{pval_col}_{method}"], _, _ = multipletests(raw_pvals, 0.05, method=method)
    # Sort by the last raw ANCOVA p-value column, named explicitly instead of
    # relying on the loop variable leaking out of the loop above.
    if len(pvals_cols_ancova) > 0:
        df_epi_metrics_stat.sort_values([pvals_cols_ancova[-1]], ascending=[True], inplace=True)
    df_epi_metrics_stat.to_excel(f"{path}/metrics.xlsx")

    # One bar chart of -log10(FDR-adjusted p-value) per statistical test.
    for stat_test in [x.replace('_pval', '') for x in pvals_cols_ancova] + ['mannwhitneyu', 'levene']:
        df_fig = df_epi_metrics_stat.copy()
        df_fig.sort_values([f"{stat_test}_pval"], ascending=[True], inplace=True)
        df_fig['Features'] = df_fig.index
        df_fig[f'{stat_test}_pval_fdr_bh_log'] = -np.log10(df_fig[f'{stat_test}_pval_fdr_bh'])
        df_fig['color'] = 'pink'
        df_fig.loc[df_fig[f'{stat_test}_pval_fdr_bh'] < 0.05, 'color'] = 'red'
        sns.set_theme(style='ticks')
        fig, ax = plt.subplots(figsize=(3, df_fig.shape[0] * 0.5))
        # seaborn >= 0.13 deprecates `palette` without `hue`: assign the
        # categorical variable to `hue` and switch the legend off.
        sns.barplot(
            data=df_fig,
            y='Features',
            x=f'{stat_test}_pval_fdr_bh_log',
            hue='Features',
            edgecolor='black',
            palette=df_fig['color'].tolist(),
            legend=False,
            ax=ax,
        )
        ax.set_xlabel(r"$-\log_{10}(\mathrm{p-value})$")
        ax.xaxis.tick_top()
        ax.xaxis.set_label_position('top')
        ax.set_ylabel('')
        fig.savefig(f"{path}/metrics_pvals_{stat_test}.png", bbox_inches='tight', dpi=200)
        fig.savefig(f"{path}/metrics_pvals_{stat_test}.pdf", bbox_inches='tight')
        plt.close(fig)

    # Violin grid: stays 2x3 (figure 12x9) for up to 6 metrics and gains rows -
    # with proportional height - instead of failing with IndexError beyond that.
    n_cols = 3
    n_rows = max(2, int(np.ceil(len(df_epi_metrics_stat.index) / n_cols)))
    fig_width = 12
    fig_height = 9 * n_rows / 2

    sns.set_theme(style='ticks')
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={'wspace': 0.15, 'hspace': 0.15}, layout='constrained')
    for epi_metric_id, epi_metric in enumerate(df_epi_metrics_stat.index.values):
        row_id, col_id = divmod(epi_metric_id, n_cols)

        # Only values strictly between the 2% and 98% quantiles are drawn; the
        # statistics above use all values.
        ql = df_epi_metrics_ass[epi_metric].quantile(0.02)
        qh = df_epi_metrics_ass[epi_metric].quantile(0.98)

        # `hue` + `density_norm` replace the deprecated hue-less `palette` and
        # `scale` arguments (seaborn >= 0.13), matching the ages violin plots.
        sns.violinplot(
            data=df_epi_metrics_ass.loc[(df_epi_metrics_ass[epi_metric] > ql) & (df_epi_metrics_ass[epi_metric] < qh), :],
            x=an_col,
            y=epi_metric,
            hue=an_col,
            palette=an_colors,
            density_norm='width',
            order=an_vals,
            saturation=0.75,
            ax=axs[row_id, col_id],
            legend=False,
            cut=0,
        )
        axs[row_id, col_id].set_ylabel(epi_metric)
        axs[row_id, col_id].ticklabel_format(style='scientific', scilimits=(-1, 1), axis='y', useOffset=True, useMathText=True)
        # Panel title: FDR-adjusted p-values of all three tests.
        mannwhitneyu_pval = df_epi_metrics_stat.at[epi_metric, "mannwhitneyu_pval_fdr_bh"]
        levene_pval = df_epi_metrics_stat.at[epi_metric, "levene_pval_fdr_bh"]
        title = ''
        for pval_col in pvals_cols_ancova:
            title += f"ANCOVA: {df_epi_metrics_stat.at[epi_metric, pval_col + '_fdr_bh']:.2e}"
        title += f'\nMann-Whitney: {mannwhitneyu_pval:.2e}\nLevene: {levene_pval:.2e}'
        axs[row_id, col_id].set_title(title)

    fig.savefig(f"{path}/metrics_distribution.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{path}/metrics_distribution.pdf", bbox_inches='tight')
    plt.close(fig)
    

## Epigenetic scores

In [None]:
# Keep only the scores that are not constant across samples, i.e. whose
# column is not everywhere equal to its first value.
epi_scores_passed = [
    score_name
    for score_name in epi_scores
    if not df_epi[score_name].eq(df_epi[score_name].iloc[0]).all()
]
len(epi_scores_passed)

In [None]:
# EpiScores: per-group descriptive statistics, group comparison (Mann-Whitney U,
# Levene, OLS with Age as covariate), multiple-testing adjustment, p-value bar
# charts and violin plots.
# NOTE(review): this cell iterates over `epi_scores`, not over the non-constant
# subset `epi_scores_passed` computed in the previous cell - confirm intended.
df_epi_scores = df_epi[cols_pheno_all + ['Age'] + epi_scores].copy()
for an_col in pheno_associations:
    an_vals = pheno_associations[an_col]['groups']
    an_val_base = pheno_associations[an_col]['base']
    an_colors = pheno_associations[an_col]['colors']

    # Explicit copy: columns are added below, which would otherwise trigger
    # pandas' SettingWithCopyWarning on the filtered selection.
    df_epi_scores_ass = df_epi_scores.loc[df_epi_scores[an_col].isin(an_vals)].copy()
    df_epi_scores_stat = pd.DataFrame(index=epi_scores)

    # Formula-safe alias of the phenotype column (no spaces or punctuation).
    an_col_str = an_col.replace(' ', '_')
    an_col_str = an_col_str.replace(',', '')
    an_col_str = an_col_str.replace('.', '')
    an_col_str = an_col_str.replace('-', '_')
    df_epi_scores_ass[an_col_str] = df_epi_scores_ass[an_col]

    pathlib.Path(path).mkdir(parents=True, exist_ok=True)

    # Substitutions that turn a score name into a formula-safe column name.
    formula_replacements = [(' ', '_'), (':', '_'), ('%', 'percent'), ('.', '_'), ('-', '_'), ('(', ''), (')', '')]
    for epi_score in epi_scores:
        epi_score_str = epi_score
        for old, new in formula_replacements:
            epi_score_str = epi_score_str.replace(old, new)
        # Duplicate the score under its formula-safe name for the OLS formula.
        df_epi_scores_ass[epi_score_str] = df_epi_scores_ass[epi_score]

        # Per-group descriptive statistics.
        vals = {}
        for group in an_vals:
            vals[group] = df_epi_scores_ass.loc[df_epi_scores_ass[an_col] == group, epi_score].values
            df_epi_scores_stat.at[epi_score, f"Mean {group}"] = np.mean(vals[group])
            df_epi_scores_stat.at[epi_score, f"Median {group}"] = np.median(vals[group])
            df_epi_scores_stat.at[epi_score, f"Q75 {group}"], df_epi_scores_stat.at[epi_score, f"Q25 {group}"] = np.percentile(vals[group], [75, 25])
            df_epi_scores_stat.at[epi_score, f"IQR {group}"] = df_epi_scores_stat.at[epi_score, f"Q75 {group}"] - df_epi_scores_stat.at[epi_score, f"Q25 {group}"]
            df_epi_scores_stat.at[epi_score, f"Variation {group}"] = variation(vals[group])

        # Group comparison: location (Mann-Whitney U) and spread (Levene).
        _, df_epi_scores_stat.at[epi_score, "mannwhitneyu_pval"] = mannwhitneyu(vals[an_vals[0]], vals[an_vals[1]], alternative='two-sided')
        _, df_epi_scores_stat.at[epi_score, "levene_pval"] = levene(vals[an_vals[0]], vals[an_vals[1]])

        # OLS with Age as covariate; keep the p-value of every group term.
        regcov = smf.ols(formula=f"{epi_score_str} ~ {an_col_str} + Age", data=df_epi_scores_ass).fit()
        reg_sum = regcov.summary2().tables[1]
        pvals_cols = reg_sum.index[reg_sum.index.str.contains(an_col_str)].values
        for pval_col in pvals_cols:
            df_epi_scores_stat.at[epi_score, f"ancova_{pval_col}_pval"] = reg_sum.at[pval_col, 'P>|t|']

    # Multiple-testing adjustment across scores, three methods per test.
    correction_methods = ['fdr_bh', 'bonferroni', 'simes-hochberg']
    for test_name in ['mannwhitneyu', 'levene']:
        raw_pvals = df_epi_scores_stat.loc[epi_scores, f"{test_name}_pval"].values
        for method in correction_methods:
            _, df_epi_scores_stat.loc[epi_scores, f"{test_name}_pval_{method}"], _, _ = multipletests(raw_pvals, 0.05, method=method)
    # Raw ANCOVA p-value columns (selected before their adjusted versions exist).
    pvals_cols_ancova = df_epi_scores_stat.columns[df_epi_scores_stat.columns.str.contains(an_col_str)].values
    for pval_col in pvals_cols_ancova:
        raw_pvals = df_epi_scores_stat.loc[epi_scores, pval_col].values
        for method in correction_methods:
            _, df_epi_scores_stat.loc[epi_scores, f"{pval_col}_{method}"], _, _ = multipletests(raw_pvals, 0.05, method=method)
    # Sort by the last raw ANCOVA p-value column, named explicitly instead of
    # relying on the loop variable leaking out of the loop above.
    if len(pvals_cols_ancova) > 0:
        df_epi_scores_stat.sort_values([pvals_cols_ancova[-1]], ascending=[True], inplace=True)
    df_epi_scores_stat.to_excel(f"{path}/scores.xlsx")

    # One bar chart of -log10(FDR-adjusted p-value) per statistical test.
    for stat_test in [x.replace('_pval', '') for x in pvals_cols_ancova] + ['mannwhitneyu', 'levene']:
        df_fig = df_epi_scores_stat.copy()
        df_fig.sort_values([f"{stat_test}_pval"], ascending=[True], inplace=True)
        df_fig['Features'] = df_fig.index
        df_fig[f'{stat_test}_pval_fdr_bh_log'] = -np.log10(df_fig[f'{stat_test}_pval_fdr_bh'])
        df_fig['color'] = 'pink'
        df_fig.loc[df_fig[f'{stat_test}_pval_fdr_bh'] < 0.05, 'color'] = 'red'
        sns.set_theme(style='ticks')
        fig, ax = plt.subplots(figsize=(3, df_fig.shape[0] * 0.5))
        # seaborn >= 0.13 deprecates `palette` without `hue`: assign the
        # categorical variable to `hue` and switch the legend off.
        sns.barplot(
            data=df_fig,
            y='Features',
            x=f'{stat_test}_pval_fdr_bh_log',
            hue='Features',
            edgecolor='black',
            palette=df_fig['color'].tolist(),
            legend=False,
            ax=ax,
        )
        ax.set_xlabel(r"$-\log_{10}(\mathrm{p-value})$")
        ax.xaxis.tick_top()
        ax.xaxis.set_label_position('top')
        ax.set_ylabel('')
        fig.savefig(f"{path}/scores_pvals_{stat_test}.png", bbox_inches='tight', dpi=200)
        fig.savefig(f"{path}/scores_pvals_{stat_test}.pdf", bbox_inches='tight')
        plt.close(fig)

    # Violin grid: stays 8x15 (figure 60x40) for up to 120 scores and gains
    # rows - with proportional height - instead of failing with IndexError.
    n_cols = 15
    n_rows = max(8, int(np.ceil(len(df_epi_scores_stat.index) / n_cols)))
    fig_width = 60
    fig_height = 40 * n_rows / 8

    sns.set_theme(style='ticks')
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={'wspace': 0.15, 'hspace': 0.15}, layout='constrained')
    for epi_score_id, epi_score in enumerate(df_epi_scores_stat.index.values):
        row_id, col_id = divmod(epi_score_id, n_cols)

        # Only values within the 2%-98% quantile range (inclusive) are drawn;
        # the statistics above use all values.
        ql = df_epi_scores_ass[epi_score].quantile(0.02)
        qh = df_epi_scores_ass[epi_score].quantile(0.98)

        # `hue` + `density_norm` replace the deprecated hue-less `palette` and
        # `scale` arguments (seaborn >= 0.13), matching the ages violin plots.
        sns.violinplot(
            data=df_epi_scores_ass.loc[(df_epi_scores_ass[epi_score] >= ql) & (df_epi_scores_ass[epi_score] <= qh), :],
            x=an_col,
            y=epi_score,
            hue=an_col,
            palette=an_colors,
            density_norm='width',
            order=an_vals,
            saturation=0.75,
            ax=axs[row_id, col_id],
            legend=False,
            cut=0,
        )
        axs[row_id, col_id].set_ylabel(epi_score)
        axs[row_id, col_id].ticklabel_format(style='scientific', scilimits=(-1, 1), axis='y', useOffset=True, useMathText=True)
        # Panel title: FDR-adjusted p-values of all three tests.
        mannwhitneyu_pval = df_epi_scores_stat.at[epi_score, "mannwhitneyu_pval_fdr_bh"]
        levene_pval = df_epi_scores_stat.at[epi_score, "levene_pval_fdr_bh"]
        title = ''
        for pval_col in pvals_cols_ancova:
            title += f"ANCOVA: {df_epi_scores_stat.at[epi_score, pval_col + '_fdr_bh']:.2e}"
        title += f'\nMann-Whitney: {mannwhitneyu_pval:.2e}\nLevene: {levene_pval:.2e}'
        axs[row_id, col_id].set_title(title)

    fig.savefig(f"{path}/scores_distribution.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{path}/scores_distribution.pdf", bbox_inches='tight')
    plt.close(fig)

In [None]:
# Association analysis of the epigenetic metrics (`epi_metrics`) against every
# phenotype listed in `pheno_associations`: per-group descriptive statistics,
# Mann-Whitney U, Levene and age-adjusted OLS (ANCOVA) p-values with multiple
# testing correction, an Excel summary, p-value bar plots and a violin-plot grid.
# NOTE(review): `df_epinflammage` is built here, but the loop below reads
# `df_epi_metrics`, which is not defined in this cell - confirm it is created
# elsewhere and is the intended input frame.
df_epinflammage = df_epi[cols_pheno_all + ['Age'] + epi_metrics].copy()

# Phenotype column names are used as terms in the statsmodels formula below,
# so spaces/dashes become underscores and commas/dots are removed.
formula_name_table = str.maketrans({' ': '_', '-': '_', ',': None, '.': None})
correction_methods = ('fdr_bh', 'bonferroni', 'simes-hochberg')

for an_col in pheno_associations:
    an_vals = pheno_associations[an_col]['groups']
    an_val_base = pheno_associations[an_col]['base']
    an_colors = pheno_associations[an_col]['colors']

    # Explicit copy: a column is added below, and writing into a `.loc` slice
    # is chained assignment (SettingWithCopyWarning).
    df_epi_metrics_ass = df_epi_metrics.loc[df_epi_metrics[an_col].isin(an_vals)].copy()
    df_epi_metrics_stat = pd.DataFrame(index=epi_metrics)
    an_col_str = an_col.translate(formula_name_table)
    df_epi_metrics_ass[an_col_str] = df_epi_metrics_ass[an_col]
    pathlib.Path(f"{path}").mkdir(parents=True, exist_ok=True)

    for epi_metric in epi_metrics:
        # Descriptive statistics of the metric within each group.
        vals = {}
        for group in an_vals:
            vals[group] = df_epi_metrics_ass.loc[df_epi_metrics_ass[an_col] == group, epi_metric].values
            q75, q25 = np.percentile(vals[group], [75, 25])
            df_epi_metrics_stat.at[epi_metric, f"Mean {group}"] = np.mean(vals[group])
            df_epi_metrics_stat.at[epi_metric, f"Median {group}"] = np.median(vals[group])
            df_epi_metrics_stat.at[epi_metric, f"Q75 {group}"] = q75
            df_epi_metrics_stat.at[epi_metric, f"Q25 {group}"] = q25
            df_epi_metrics_stat.at[epi_metric, f"IQR {group}"] = q75 - q25
            df_epi_metrics_stat.at[epi_metric, f"Variation {group}"] = variation(vals[group])

        # Two-group tests compare the first two entries of `an_vals`.
        df_epi_metrics_stat.at[epi_metric, "mannwhitneyu_pval"] = mannwhitneyu(
            vals[an_vals[0]], vals[an_vals[1]], alternative='two-sided'
        ).pvalue
        df_epi_metrics_stat.at[epi_metric, "levene_pval"] = levene(vals[an_vals[0]], vals[an_vals[1]]).pvalue

        # OLS of the metric on group + Age; keep the p-value of every coefficient
        # whose name contains the phenotype term.
        regcov = smf.ols(formula=f"{epi_metric} ~ {an_col_str} + Age", data=df_epi_metrics_ass).fit()
        reg_sum = regcov.summary2().tables[1]
        # regex=False: the column name is matched literally, not as a pattern.
        pvals_cols = reg_sum.index[reg_sum.index.str.contains(an_col_str, regex=False)].values
        for pval_col in pvals_cols:
            df_epi_metrics_stat.at[epi_metric, f"ancova_{pval_col}_pval"] = reg_sum.at[pval_col, 'P>|t|']

    # Multiple testing correction across all metrics, one column per method.
    for raw_col in ("mannwhitneyu_pval", "levene_pval"):
        for method in correction_methods:
            df_epi_metrics_stat.loc[epi_metrics, f"{raw_col}_{method}"] = multipletests(
                df_epi_metrics_stat.loc[epi_metrics, raw_col].values, 0.05, method=method
            )[1]
    pvals_cols_ancova = df_epi_metrics_stat.columns[
        df_epi_metrics_stat.columns.str.contains(an_col_str, regex=False)
    ].values
    for pval_col in pvals_cols_ancova:
        for method in correction_methods:
            df_epi_metrics_stat.loc[epi_metrics, f"{pval_col}_{method}"] = multipletests(
                df_epi_metrics_stat.loc[epi_metrics, pval_col].values, 0.05, method=method
            )[1]

    # Sort by the last raw ANCOVA p-value column (what the leaked loop variable
    # used to point at); fall back to Mann-Whitney when the model produced none.
    sort_col = pvals_cols_ancova[-1] if len(pvals_cols_ancova) > 0 else "mannwhitneyu_pval"
    df_epi_metrics_stat.sort_values([sort_col], ascending=[True], inplace=True)
    df_epi_metrics_stat.to_excel(f"{path}/metrics.xlsx")

    # One horizontal bar plot of -log10(FDR-adjusted p-value) per test.
    for stat_test in [x.replace('_pval', '') for x in pvals_cols_ancova] + ['mannwhitneyu', 'levene']:
        df_fig = df_epi_metrics_stat.copy()
        df_fig.sort_values([f"{stat_test}_pval"], ascending=[True], inplace=True)
        df_fig['Features'] = df_fig.index
        df_fig[f'{stat_test}_pval_fdr_bh_log'] = -np.log10(df_fig[f'{stat_test}_pval_fdr_bh'])
        # Red bars mark metrics with adjusted p-value below 0.05, pink the rest.
        df_fig['color'] = 'pink'
        df_fig.loc[df_fig[f'{stat_test}_pval_fdr_bh'] < 0.05, 'color'] = 'red'
        sns.set_theme(style='ticks')
        fig, ax = plt.subplots(figsize=(3, df_fig.shape[0] * 0.5))
        barplot = sns.barplot(
            data=df_fig,
            y='Features',
            x=f'{stat_test}_pval_fdr_bh_log',
            edgecolor='black',
            palette=df_fig['color'].values,
            ax=ax,
        )
        ax.set_xlabel(r"$-\log_{10}(\mathrm{p-value})$")
        ax.xaxis.tick_top()
        ax.xaxis.set_label_position('top')
        ax.set_ylabel('')
        fig.savefig(f"{path}/metrics_pvals_{stat_test}.png", bbox_inches='tight', dpi=200)
        fig.savefig(f"{path}/metrics_pvals_{stat_test}.pdf", bbox_inches='tight')
        plt.close(fig)

    # Grid of violin plots, one panel per metric in sorted order.
    # The grid is fixed at n_rows x n_cols panels.
    n_rows = 2
    n_cols = 3
    fig_width = 12
    fig_height = 9

    sns.set_theme(style='ticks')
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={'wspace':0.15, 'hspace': 0.15}, layout='constrained')
    for epi_metric_id, epi_metric in enumerate(df_epi_metrics_stat.index.values):
        row_id, col_id = divmod(epi_metric_id, n_cols)
        panel = axs[row_id, col_id]

        # Keep only values strictly inside the 2%-98% quantile range.
        ql = df_epi_metrics_ass[epi_metric].quantile(0.02)
        qh = df_epi_metrics_ass[epi_metric].quantile(0.98)

        sns.violinplot(
            data=df_epi_metrics_ass.loc[(df_epi_metrics_ass[epi_metric] > ql) & (df_epi_metrics_ass[epi_metric] < qh), :],
            x=an_col,
            y=epi_metric,
            palette=an_colors,
            scale='width',
            order=an_vals,
            saturation=0.75,
            ax=panel,
            legend=False,
            cut=0,
        )
        panel.set_ylabel(epi_metric)
        panel.ticklabel_format(style='scientific', scilimits=(-1, 1), axis='y', useOffset=True, useMathText=True)
        mannwhitneyu_pval = df_epi_metrics_stat.at[epi_metric, "mannwhitneyu_pval_fdr_bh"]
        levene_pval = df_epi_metrics_stat.at[epi_metric, "levene_pval_fdr_bh"]
        # One title line per test, so several ANCOVA terms do not run together.
        title_lines = [
            f"ANCOVA: {df_epi_metrics_stat.at[epi_metric, pval_col + '_fdr_bh']:.2e}"
            for pval_col in pvals_cols_ancova
        ]
        title_lines.append(f"Mann-Whitney: {mannwhitneyu_pval:.2e}")
        title_lines.append(f"Levene: {levene_pval:.2e}")
        title = "\n".join(title_lines)
        panel.set_title(title)

    fig.savefig(f"{path}/metrics_distribution.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{path}/metrics_distribution.pdf", bbox_inches='tight')
    plt.close(fig)

# EWAS

## Original CpG list

In [None]:
# Read the manifest pickle and the beta-value table; the beta table's index is
# cast to str before it is joined with the phenotype frame.
manifest_file = "E:/YandexDisk/pydnameth/datasets/GPL33022/manifest.pkl"
df_mnfst = pd.read_pickle(manifest_file)
df_cpgs = pd.read_pickle(f"{path}/betas_funnorm.pkl")
df_cpgs.index = df_cpgs.index.astype(str)

# Attach the phenotype columns, add a 0/1 `Female` indicator, run pyaging's
# EPICv2 probe aggregation, then drop the helper columns again.
cols_for_pya = ['Age', 'Sex', 'Tissue']
df_for_pya = df_epi[cols_for_pya].merge(df_cpgs, left_index=True, right_index=True)
df_for_pya['Female'] = df_for_pya['Sex'].eq('F').astype(int)
df_for_pya = pya.pp.epicv2_probe_aggregation(df_for_pya, verbose=True)
df_for_pya.drop(columns=cols_for_pya + ['Female'], inplace=True)

In [None]:
# Rename every CpG column whose name contains '.' or '-' by replacing those
# characters with '_'; the renamed columns are later used as the response term
# in formula strings. `cpgs_to_str_inv_dict` maps the new names back.
# Raw string: '\.' in a plain literal is an invalid escape sequence.
cpgs_to_str = df_for_pya.filter(regex=r'\.|-', axis=1).columns.values
cpg_char_table = str.maketrans('.-', '__')
cpgs_to_str_dict = {cpg: cpg.translate(cpg_char_table) for cpg in tqdm(cpgs_to_str)}
cpgs_to_str_inv_dict = {v: k for k, v in cpgs_to_str_dict.items()}
df_for_pya.rename(columns=cpgs_to_str_dict, inplace=True)
# All CpG column names, captured before the phenotype columns are attached.
cpgs = df_for_pya.columns.values
df_for_pya = pd.concat([df_epi, df_for_pya], axis=1, join="inner")

In [None]:
# EWAS over all CpG columns: for every phenotype in `pheno_associations`,
# compute Mann-Whitney U, Levene and age-adjusted OLS (ANCOVA) p-values per CpG,
# apply multiple testing correction and write the table to Excel.

# Same name sanitisation as the metrics association cell: the phenotype name is
# used as a formula term, so spaces/dashes become '_' and commas/dots are removed.
formula_name_table = str.maketrans({' ': '_', '-': '_', ',': None, '.': None})
ewas_correction_methods = ('fdr_bh', 'bonferroni')

for an_col in pheno_associations:
    an_vals = pheno_associations[an_col]['groups']
    an_val_base = pheno_associations[an_col]['base']
    an_colors = pheno_associations[an_col]['colors']

    # Explicit copy: a column is added below, and writing into a `.loc` slice
    # is chained assignment (SettingWithCopyWarning).
    df_for_pya_ass = df_for_pya.loc[df_for_pya[an_col].isin(an_vals)].copy()
    df_cpgs_stat = pd.DataFrame(index=cpgs)
    an_col_str = an_col.translate(formula_name_table)
    df_for_pya_ass[an_col_str] = df_for_pya_ass[an_col]
    pathlib.Path(f"{path}/{an_col}").mkdir(parents=True, exist_ok=True)

    for cpg in (pbar := tqdm(cpgs)):
        pbar.set_description(f"{cpg}")
        vals = {
            group: df_for_pya_ass.loc[df_for_pya_ass[an_col] == group, cpg].values
            for group in an_vals
        }
        # Two-group tests compare the first two entries of `an_vals`.
        df_cpgs_stat.at[cpg, "mannwhitneyu_pval"] = mannwhitneyu(
            vals[an_vals[0]], vals[an_vals[1]], alternative='two-sided'
        ).pvalue
        df_cpgs_stat.at[cpg, "levene_pval"] = levene(vals[an_vals[0]], vals[an_vals[1]]).pvalue

        # OLS of the CpG on group + Age; keep the p-value of every coefficient
        # whose name contains the phenotype term (literal match, not regex).
        regcov = smf.ols(formula=f"{cpg} ~ {an_col_str} + Age", data=df_for_pya_ass).fit()
        reg_sum = regcov.summary2().tables[1]
        pvals_cols = reg_sum.index[reg_sum.index.str.contains(an_col_str, regex=False)].values
        for pval_col in pvals_cols:
            df_cpgs_stat.at[cpg, f"ancova_{pval_col}_pval"] = reg_sum.at[pval_col, 'P>|t|']

    # Multiple testing correction across all CpGs, one column per method.
    for raw_col in ("mannwhitneyu_pval", "levene_pval"):
        for method in ewas_correction_methods:
            df_cpgs_stat.loc[cpgs, f"{raw_col}_{method}"] = multipletests(
                df_cpgs_stat.loc[cpgs, raw_col].values, 0.05, method=method
            )[1]
    pvals_cols_ancova = df_cpgs_stat.columns[df_cpgs_stat.columns.str.contains(an_col_str, regex=False)].values
    for pval_col in pvals_cols_ancova:
        for method in ewas_correction_methods:
            df_cpgs_stat.loc[cpgs, f"{pval_col}_{method}"] = multipletests(
                df_cpgs_stat.loc[cpgs, pval_col].values, 0.05, method=method
            )[1]

    df_cpgs_stat.sort_values(["mannwhitneyu_pval"], ascending=[True], inplace=True)
    # Restore the CpG names that were altered for formula compatibility.
    df_cpgs_stat.rename(index=cpgs_to_str_inv_dict, inplace=True)
    df_cpgs_stat.to_excel(f"{path}/{an_col}/diff_stat_cpgs_orgn.xlsx")

## Filtered CpG list

In [None]:
# Restrict the EWAS table to the CpGs listed in cpgs_fltd.csv and recompute the
# multiple testing corrections on that subset only.
# NOTE(review): `an_col` here is whatever value the preceding EWAS loop left
# behind, and the ANCOVA column name is hard-coded for one phenotype - confirm
# both match the phenotype being analysed.
pvals_cols = ['mannwhitneyu_pval', 'levene_pval', 'ancova_Special_Status[T.Control]_pval']
df_cpgs_stat_fltd = pd.read_excel(f"{path}/{an_col}/diff_stat_cpgs_orgn.xlsx", index_col=0)
cpgs_fltd = pd.read_csv(f"{path}/cpgs_fltd.csv", index_col=0)
# Keep only the part of each ID before the first underscore.
cpgs_fltd = {cpg.split('_')[0] for cpg in cpgs_fltd.index.tolist()}
# Boolean mask instead of list(set): the row order of the source table is kept,
# so the output no longer depends on set iteration order. The copy makes the
# column assignments below act on an independent frame.
df_cpgs_stat_fltd = df_cpgs_stat_fltd.loc[df_cpgs_stat_fltd.index.isin(cpgs_fltd), :].copy()
for pvals_col in pvals_cols:
    raw_pvals = df_cpgs_stat_fltd[pvals_col].values
    for method in ('fdr_bh', 'bonferroni'):
        df_cpgs_stat_fltd[f"{pvals_col}_{method}"] = multipletests(raw_pvals, 0.05, method=method)[1]
df_cpgs_stat_fltd.to_excel(f"{path}/{an_col}/diff_stat_cpgs_fltd.xlsx")