# Description
In-depth analysis of region-specific, age-associated epigenetic features:
## 1. Intermediate samples (non-Yakutian origin, but have lived their whole lives in Yakutia)
- Age acceleration within the same age groups
## 2. Age acceleration in different age groups between the regions
## 3. Region-specific age-associated epigenetic features:
- Tests with interaction terms to detect age trends that differ between the regions

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import plotly.express as px
import statsmodels.formula.api as smf
import plotly.graph_objects as go
from scripts.python.routines.manifest import get_manifest
from scripts.python.routines.plot.save import save_figure
from scripts.python.routines.plot.layout import add_layout
from statsmodels.stats.multitest import multipletests
import plotly.io as pio
pio.kaleido.scope.mathjax = None
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=False)
from scipy.stats import mannwhitneyu, median_test, kruskal
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.patheffects as path_effects
import random
import pathlib
from tqdm import tqdm
from src.utils.plot.bioinfokit import mhat, volcano
import gseapy as gp
import mygene
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA, TruncatedSVD
from sklearn.decomposition import MiniBatchDictionaryLearning, FastICA
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.manifold import MDS, Isomap, TSNE, LocallyLinearEmbedding
import upsetplot
from matplotlib_venn import venn2, venn2_circles
from itertools import chain
from sklearn.metrics import mean_absolute_error
from scripts.python.routines.plot.colorscales import get_continuous_color
import plotly
from scripts.python.routines.plot.p_value import add_p_value_annotation
from scripts.python.routines.sections import get_sections
import functools
import matplotlib.lines as mlines


def conjunction(conditions):
    """Fold `conditions` into one result with an element-wise logical AND.

    The conditions are combined left to right with `np.logical_and`. A single
    condition is returned unchanged; an empty iterable raises `TypeError`
    (raised by `functools.reduce`, which has no initial value here).
    """
    return functools.reduce(lambda acc, cond: np.logical_and(acc, cond), conditions)


def disjunction(conditions):
    """Fold `conditions` into one result with an element-wise logical OR.

    The conditions are combined left to right with `np.logical_or`. A single
    condition is returned unchanged; an empty iterable raises `TypeError`
    (raised by `functools.reduce`, which has no initial value here).
    """
    return functools.reduce(lambda acc, cond: np.logical_or(acc, cond), conditions)

# Init data

In [None]:
# --- Locations -----------------------------------------------------------------
dataset = "GSEUNN"
path = "D:/YandexDisk/Work/pydnameth/datasets"
datasets_info = pd.read_excel(f"{path}/datasets.xlsx", index_col='dataset')
platform = datasets_info.loc[dataset, 'platform']
manifest = get_manifest(platform, path=path)
# From here on `path` points at the platform-specific sub-folder.
path = f"D:/YandexDisk/Work/pydnameth/datasets/{platform}"
path_save = f"{path}/GSEUNN/special/050_central_vs_yakutia_deep"
pathlib.Path(f"{path_save}").mkdir(parents=True, exist_ok=True)

# 'CHR' is 'chr' without its first three characters (presumably a "chr" prefix).
manifest['CHR'] = manifest['chr'].str[3::]

# Suffix of the methylation-derived columns and of the betas file used below.
dnam_suffix = "_harm"

# --- Phenotype table -----------------------------------------------------------
pheno = pd.read_excel(f"{path}/{dataset}/pheno.xlsx", index_col="index")
pheno.index.name = "index"
# Two sample IDs are excluded explicitly; drop() raises KeyError if either is absent.
pheno.drop(["I64_old", "I1_duplicate"], inplace=True)
# Keep only rows whose Sample_Chronology equals 0.
pheno = pheno.loc[(pheno['Sample_Chronology'] == 0), :]

# Three groups of controls:
#   Central      - Region == 'Central'
#   Yakutia      - Region == 'Yakutia' and Nationality == 'Sakha'
#   Intermediate - Region == 'Yakutia' and Nationality in ('Russian', 'Tatar')
samples_central = pheno.index[(pheno['Region'] == 'Central') & (pheno['Status'] == 'Control')].values
samples_yakutia = pheno.index[(pheno['Region'] == 'Yakutia') & (pheno['Status'] == 'Control') & (pheno['Nationality'] == 'Sakha')].values
samples_intrmdt = pheno.index[(pheno['Region'] == 'Yakutia') & (pheno['Status'] == 'Control') & (pheno['Nationality'].isin(['Russian', 'Tatar']))].values
# Ordered, de-duplicated union of the three groups. A plain set() union would make
# the row order of `pheno`/`df` depend on string-hash randomisation, i.e. differ
# from run to run; dict.fromkeys keeps first-seen order and is reproducible.
samples_all = list(dict.fromkeys([*samples_central, *samples_yakutia, *samples_intrmdt]))
pheno.loc[samples_central, 'Samples origin'] = 'Central'
pheno.loc[samples_yakutia, 'Samples origin'] = 'Yakutia'
pheno.loc[samples_intrmdt, 'Samples origin'] = 'Intermediate'

pheno = pheno.loc[samples_all, :]

# --- Methylation data ----------------------------------------------------------
betas = pd.read_pickle(f"{path}/{dataset}/betas{dnam_suffix}.pkl")
# Column labels of the betas table; used later as the list of CpGs to test.
cpgs = betas.columns.values
# Inner join on the index: only samples present in both tables are kept.
df = pd.merge(pheno, betas, left_index=True, right_index=True)

# --- Lookup tables used by the plotting cells -----------------------------------
# Fill colour per sample-origin group.
colors = {
    "Central": "gold",
    "Yakutia": "lightslategray",
    "Intermediate": "palegreen"
}
# Sample IDs per sample-origin group.
samples = {
    "Central": samples_central,
    "Yakutia": samples_yakutia,
    "Intermediate": samples_intrmdt
}
# {column name in `df`: display name} for the cell-type columns.
cells = {f"{x}{dnam_suffix}": x for x in ["CD8T", "CD4T", "NK", "Bcell", "Mono", "Gran"]}
# {column name in `df`: display name} for the epigenetic-age columns.
ages = {
    f"DNAmAgeHannum{dnam_suffix}": "Hannum",
    f"DNAmAge{dnam_suffix}": "Horvath",
    f"DNAmPhenoAge{dnam_suffix}": "PhenoAge",
    f"DNAmGrimAge{dnam_suffix}": "GrimAge",
}
# The PC-based clocks keep their column name as the display name.
for x in ["PCHorvath1", "PCHorvath2", "PCHannum", "PCPhenoAge", "PCGrimAge"]:
    ages[x] = x

## 1. Intermediate samples (non-Yakutian origin, but have lived their whole lives in Yakutia)

### Histograms for all samples

In [None]:
# Output folder of section 1.
path_local = "01_intermediate_samples"
pathlib.Path(path_save, path_local).mkdir(parents=True, exist_ok=True)

# 23 equally spaced bin edges from 5 to 115, i.e. 5-unit wide bins.
hist_bins = np.linspace(5, 115, 23)

df_fig = df[['Age', 'Samples origin']].copy()

# Stacked age histogram, one colour per sample-origin group.
fig = plt.figure()
sns.set_theme(style='whitegrid')
hist = sns.histplot(
    data=df_fig,
    x="Age",
    hue='Samples origin',
    multiple="stack",
    bins=hist_bins,
    palette=colors,
    edgecolor='k',
    linewidth=1,
)
hist.set_xlim(0, 120)

# Same figure saved as a raster (400 dpi) and as a vector file.
hist_file = f"{path_save}/{path_local}/hist_all_samples"
plt.savefig(f"{hist_file}.png", bbox_inches='tight', dpi=400)
plt.savefig(f"{hist_file}.pdf", bbox_inches='tight')
plt.close(fig)

### Selecting the same age range samples

In [None]:
# Table for this section: chronological age, sample origin, cell-type estimates,
# epigenetic clocks and DunedinPACE, with columns renamed to their display names.
df_fig = df.loc[:, ['Age', 'Samples origin'] + list(cells.keys()) + list(ages.keys()) + ['mPACE']].copy()
df_fig.rename(columns=cells, inplace=True)
df_fig.rename(columns=ages, inplace=True)
df_fig.rename(columns={'mPACE': 'DunedinPACE'}, inplace=True)
# Restricted subset: samples with 90 < Age < 100 (both bounds exclusive).
df_fig_old = df_fig.loc[(df_fig['Age'] > 90) & (df_fig['Age'] < 100), :].copy()

subsets = {
    'full': df_fig,
    'old': df_fig_old
}
# Violin bandwidth is set to (range of the plotted values) / dist_num_bins.
dist_num_bins = 20

# Compared groups. The order defines the trace order of the violins and therefore
# the (0, 1, 2) indices passed to add_p_value_annotation.
origin_groups = ['Central', 'Yakutia', 'Intermediate']


def _values_by_origin(data, column):
    """Return {group: ndarray} with the values of `column` for every group in `origin_groups`."""
    return {
        group: data.loc[data['Samples origin'] == group, column].values
        for group in origin_groups
    }


def _add_group_stats(stat_table, row, vals):
    """Write per-group mean/median/q75/q25/IQR and the Kruskal-Wallis p-value into row `row`.

    `stat_table` is modified in place (rows/columns are created on first use);
    `vals` is the mapping returned by `_values_by_origin`.
    """
    for group, group_vals in vals.items():
        q75, q25 = np.percentile(group_vals, [75, 25])
        stat_table.at[row, f"mean_{group}"] = np.mean(group_vals)
        stat_table.at[row, f"median_{group}"] = np.median(group_vals)
        stat_table.at[row, f"q75_{group}"] = q75
        stat_table.at[row, f"q25_{group}"] = q25
        stat_table.at[row, f"iqr_{group}"] = q75 - q25
    # One Kruskal-Wallis test across all groups.
    _, pval = kruskal(*vals.values())
    stat_table.at[row, "pval"] = pval


def _origin_violins(vals, y_title, kw_pval, num_bins):
    """Build the violin figure comparing the sample-origin groups.

    Parameters:
        vals: mapping returned by `_values_by_origin` (one violin per entry).
        y_title: y-axis title handed to `add_layout`.
        kw_pval: p-value printed in the "KW p-value" annotation above the plot.
        num_bins: divisor of the value range used as the violin bandwidth.

    Returns the figure, including pairwise two-sided Mann-Whitney U p-values
    added through `add_p_value_annotation`.
    """
    fig = go.Figure()
    for group, group_vals in vals.items():
        fig.add_trace(
            go.Violin(
                y=group_vals,
                name=group,
                box_visible=True,
                meanline_visible=True,
                showlegend=False,
                line_color='black',
                fillcolor=colors[group],
                marker=dict(color=colors[group], line=dict(color='black', width=0.3), opacity=0.8),
                points='all',
                pointpos=1.5,
                bandwidth=np.ptp(group_vals) / num_bins,
                opacity=0.8
            )
        )
    add_layout(fig, "", y_title, "")
    # Annotation anchored in paper coordinates, above the plotting area.
    fig.add_annotation(
        dict(
            font=dict(color='black', size=24),
            x=0.5,
            y=1.375,
            showarrow=False,
            text=f"KW p-value: {kw_pval:0.2e}",
            textangle=0,
            yanchor='top',
            xanchor='center',
            xref="paper",
            yref="paper")
    )
    fig.update_layout(title_xref='paper')
    fig.update_layout(legend_font_size=20)
    fig.update_xaxes(autorange=True)
    fig.update_layout(legend={'itemsizing': 'constant'})

    # Pairwise comparisons; the dict keys are pairs of positions in `origin_groups`
    # (presumably interpreted as trace indices by add_p_value_annotation).
    _, pval_01 = mannwhitneyu(vals['Central'], vals['Yakutia'], alternative='two-sided')
    _, pval_02 = mannwhitneyu(vals['Central'], vals['Intermediate'], alternative='two-sided')
    _, pval_12 = mannwhitneyu(vals['Yakutia'], vals['Intermediate'], alternative='two-sided')
    fig = add_p_value_annotation(fig, {(0, 1): pval_01, (1, 2): pval_12, (0, 2): pval_02})

    fig.update_layout(
        violingap=0.35,
        violingroupgap=0.35,
        width=800,
        height=600,
        margin=go.layout.Margin(
            l=120,
            r=50,
            b=70,
            t=150,
            pad=0,
        )
    )
    return fig


# Cells
for subset_name, subset_df in subsets.items():
    out_dir = f"{path_save}/{path_local}/cells/{subset_name}"
    pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)

    # First pass: statistics for every cell type, then BH correction across cell types.
    df_stat = pd.DataFrame()
    for cell in cells.values():
        _add_group_stats(df_stat, cell, _values_by_origin(subset_df, cell))

    _, df_stat["pval_fdr_bh"], _, _ = multipletests(df_stat["pval"], 0.05, method='fdr_bh')
    df_stat.to_excel(f"{out_dir}/stat.xlsx", index=True)

    # Second pass: the figures need the BH-adjusted p-value, hence the separate loop.
    for cell in cells.values():
        vals = _values_by_origin(subset_df, cell)
        fig = _origin_violins(vals, f"{cell}", df_stat.at[cell, 'pval_fdr_bh'], dist_num_bins)
        save_figure(fig, f"{out_dir}/{cell}", scale=2)

# DunedinPACE
for subset_name, subset_df in subsets.items():
    out_dir = f"{path_save}/{path_local}/DunedinPACE/{subset_name}"
    pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)

    # Single feature, so the raw Kruskal-Wallis p-value is stored and plotted (no FDR step).
    df_stat = pd.DataFrame()
    vals = _values_by_origin(subset_df, 'DunedinPACE')
    _add_group_stats(df_stat, 'DunedinPACE', vals)
    df_stat.to_excel(f"{out_dir}/stat.xlsx", index=True)

    fig = _origin_violins(vals, "DunedinPACE", df_stat.at['DunedinPACE', 'pval'], dist_num_bins)
    save_figure(fig, f"{out_dir}/DunedinPACE", scale=2)

# Ages
for subset_name, subset_df in subsets.items():
    out_dir = f"{path_save}/{path_local}/ages/{subset_name}"
    pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)

    df_stat = pd.DataFrame()
    for age in ages.values():
        # Age acceleration = clock value minus the value predicted by a linear
        # regression of the clock on Age fitted on the Central samples of this subset.
        # Both new columns are written into subset_df itself.
        formula = f"{age} ~ Age"
        model = smf.ols(formula=formula, data=subset_df.loc[subset_df['Samples origin'] == 'Central', :]).fit()
        subset_df[f"{age}_linear_pred"] = model.predict(subset_df)
        subset_df[f"{age}Acc"] = subset_df[age] - subset_df[f"{age}_linear_pred"]

        _add_group_stats(df_stat, f"{age}Acc", _values_by_origin(subset_df, f"{age}Acc"))

    # BH correction across the clocks.
    _, df_stat["pval_fdr_bh"], _, _ = multipletests(df_stat["pval"], 0.05, method='fdr_bh')
    df_stat.to_excel(f"{out_dir}/stat.xlsx", index=True)

    for age in ages.values():
        vals = _values_by_origin(subset_df, f"{age}Acc")
        p_val = df_stat.at[f"{age}Acc", 'pval_fdr_bh']
        fig = _origin_violins(vals, f"{age}Acc", p_val, dist_num_bins)
        save_figure(fig, f"{out_dir}/violin_{age}Acc", scale=2)

        # Common axis range for x (Age) and y (clock), padded by 5% on both sides.
        min_val = subset_df[["Age", age]].min().min()
        max_val = subset_df[["Age", age]].max().max()
        shift_val = max_val - min_val
        min_val -= 0.05 * shift_val
        max_val += 0.05 * shift_val

        fig = go.Figure()
        # Identity line y = x.
        fig.add_trace(
            go.Scatter(
                x=[min_val, max_val],
                y=[min_val, max_val],
                showlegend=False,
                name="",
                mode="lines",
                marker_color="black",
                marker=dict(
                    size=8,
                    opacity=0.75,
                    line=dict(
                        color="black",
                        width=0.5
                    )
                )
            )
        )
        # Linear fit of the Central samples (the reference used for the acceleration).
        fig.add_trace(
            go.Scatter(
                x=subset_df.loc[subset_df['Samples origin'] == 'Central', f"Age"].values,
                y=subset_df.loc[subset_df['Samples origin'] == 'Central', f"{age}_linear_pred"].values,
                showlegend=False,
                name="",
                mode="lines",
                line=dict(width=5),
                marker_color=colors['Central'],
                marker=dict(
                    size=8,
                    opacity=0.75,
                    line=dict(
                        color="black",
                        width=0.5
                    )
                )
            )
        )
        # One marker trace per group: clock value against chronological age.
        for group in origin_groups:
            fig.add_trace(
                go.Scatter(
                    x=subset_df.loc[subset_df['Samples origin'] == group, f"Age"].values,
                    y=subset_df.loc[subset_df['Samples origin'] == group, f"{age}"].values,
                    showlegend=True,
                    name=group,
                    mode="markers",
                    line_color=colors[group],
                    marker=dict(
                        size=8,
                        opacity=0.75,
                        line=dict(
                            color='black',
                            width=0.5
                        )
                    )
                )
            )
        add_layout(fig, f"Age", f"{age}", f"")
        fig.update_layout(legend_font_size=20)
        fig.update_layout(legend={'itemsizing': 'constant'})
        fig.update_xaxes(autorange=False)
        fig.update_yaxes(autorange=False)
        fig.update_layout(title_xref='paper')
        fig.update_layout(xaxis_range=[min_val, max_val])
        fig.update_layout(yaxis_range=[min_val, max_val])
        fig.update_layout(
            width=850,
            height=800,
            margin=go.layout.Margin(
                l=100,
                r=50,
                b=100,
                t=50,
                pad=0,
            )
        )
        save_figure(fig, f"{out_dir}/scatter_{age}")

## 2. Age acceleration in different age groups between the regions

### Sliding window

In [None]:
# Output folder for the sliding-window comparison.
path_local = "02_age_acc_in_groups/sliding_window"
pathlib.Path(path_save, path_local).mkdir(parents=True, exist_ok=True)

# Only Central and Yakutia samples take part in this comparison.
fig_columns = ['Age', 'Samples origin', *cells.keys(), *ages.keys(), 'mPACE']
df_fig = (
    df.loc[df['Samples origin'].isin(['Central', 'Yakutia']), fig_columns]
    .copy()
    .rename(columns=cells)
    .rename(columns=ages)
    .rename(columns={'mPACE': 'DunedinPACE'})
)

# Age acceleration of every clock: residual from a linear fit (clock ~ Age)
# estimated on the Central samples and applied to all rows.
for age in ages.values():
    formula = f"{age} ~ Age"
    central_rows = df_fig.loc[df_fig['Samples origin'] == 'Central', :]
    model = smf.ols(formula=formula, data=central_rows).fit()
    df_fig[f"{age}_linear_pred"] = model.predict(df_fig)
    df_fig[f"{age}Acc"] = df_fig[age] - df_fig[f"{age}_linear_pred"]

# Window half-width; window centres are the integer ages for which
# [centre - age_window, centre + age_window] lies inside the observed age range.
age_window = 10
age_min = int(np.floor(df_fig['Age'].min())) + age_window
age_max = int(np.ceil(df_fig['Age'].max())) - age_window

age_cnts = list(range(age_min, age_max + 1))

df_stat = pd.DataFrame(index=age_cnts)
for age in ['DunedinPACE', *ages.values()]:
    # DunedinPACE is compared as is; clocks are compared through their acceleration.
    metric_col = "DunedinPACE" if age == 'DunedinPACE' else f"{age}Acc"

    for age_cnt in age_cnts:
        age_lhs = age_cnt - age_window
        age_rhs = age_cnt + age_window
        # Window bounds are inclusive on both sides.
        in_window = (df_fig['Age'] >= age_lhs) & (df_fig['Age'] <= age_rhs)

        vals_central = df_fig.loc[(df_fig['Samples origin'] == 'Central') & in_window, metric_col].values
        vals_yakutia = df_fig.loc[(df_fig['Samples origin'] == 'Yakutia') & in_window, metric_col].values

        df_stat.at[age_cnt, f"{age}_mean_Central"] = np.mean(vals_central)
        df_stat.at[age_cnt, f"{age}_mean_Yakutia"] = np.mean(vals_yakutia)
        _, pval = mannwhitneyu(vals_central, vals_yakutia, alternative='two-sided')
        df_stat.at[age_cnt, f"{age}_pval"] = pval

    # BH correction across the window centres of this metric.
    _, df_stat[f"{age}_pval_fdr_bh"], _, _ = multipletests(df_stat[f"{age}_pval"], 0.05, method='fdr_bh')

    # 'X' marks windows with adjusted p < 0.05, '.' all others.
    df_stat[f"{age}_sign_marker"] = '.'
    df_stat.loc[df_stat[f"{age}_pval_fdr_bh"] < 0.05, f"{age}_sign_marker"] = 'X'

    fig = plt.figure(figsize=(8, 6))
    sns.set_theme(style='whitegrid')
    legend_handles = []
    # Same drawing sequence for both regions: markers, drop the automatic legend,
    # connecting line, then a manual legend handle.
    for region in ['Central', 'Yakutia']:
        scatter = sns.scatterplot(
            data=df_stat,
            x=df_stat.index.values,
            y=f"{age}_mean_{region}",
            style=f"{age}_sign_marker",
            markers={'.': '.', 'X': 'X'},
            linewidth=0.2,
            alpha=1,
            edgecolor="k",
            color=colors[region],
            s=50,
        )
        scatter.get_legend().remove()
        line = sns.lineplot(
            data=df_stat,
            x=df_stat.index.values,
            y=f"{age}_mean_{region}",
            alpha=0.75,
            color=colors[region],
        )
        legend_handles.append(mlines.Line2D([], [], linestyle='-', color=colors[region],  label=region))
    plt.legend(handles=legend_handles, title="Samples", bbox_to_anchor=(0, 1.02, 1, 0.2), loc="lower left", mode="expand", borderaxespad=0, ncol=3, frameon=False)
    scatter.set_xlabel("Age")
    # The y label is the compared quantity: "<clock>Acc" or "DunedinPACE".
    scatter.set_ylabel(metric_col)
    out_file = f"{path_save}/{path_local}/{age}"
    plt.savefig(f"{out_file}.png", bbox_inches='tight', dpi=400)
    plt.savefig(f"{out_file}.pdf", bbox_inches='tight')
    plt.close()

df_stat.to_excel(f"{path_save}/{path_local}/stat.xlsx", index=True)

### Age groups

In [None]:
path_local = "02_age_acc_in_groups"
pathlib.Path(f"{path_save}/{path_local}").mkdir(parents=True, exist_ok=True)

# All samples are kept here; the linear fit uses the Central ones and the
# comparisons below use Central vs Yakutia only.
df_fig = df.loc[:, ['Age', 'Samples origin'] + list(cells.keys()) + list(ages.keys()) + ['mPACE']].copy()
df_fig.rename(columns=cells, inplace=True)
df_fig.rename(columns=ages, inplace=True)
df_fig.rename(columns={'mPACE': 'DunedinPACE'}, inplace=True)

# Age-group label per sample; a row matching none of the conditions below
# keeps the placeholder 'All' and is not part of any compared group.
df_fig['Age group'] = 'All'
df_fig.loc[df_fig['Age'] < 30, 'Age group'] = 'Age < 30'
df_fig.loc[(df_fig['Age'] < 50) & (df_fig['Age'] >= 30), 'Age group'] = '30 <= Age < 50'
df_fig.loc[(df_fig['Age'] < 70) & (df_fig['Age'] >= 50), 'Age group'] = '50 <= Age < 70'
df_fig.loc[df_fig['Age'] >= 70, 'Age group'] = 'Age >= 70'

# Violin bandwidth is set to (range of the plotted values) / dist_num_bins.
dist_num_bins = 10

age_groups = ['Age < 30', '30 <= Age < 50', '50 <= Age < 70', 'Age >= 70']


def _central_yakutia_values(data, column, age_group):
    """Return (Central values, Yakutia values) of `column` for the rows of one age group."""
    in_group = data['Age group'] == age_group
    vals_central = data.loc[(data['Samples origin'] == 'Central') & in_group, column].values
    vals_yakutia = data.loc[(data['Samples origin'] == 'Yakutia') & in_group, column].values
    return vals_central, vals_yakutia


def _split_violin_figure(data, column, pvals, num_bins):
    """Build split violins of `column`: Central on the left half, Yakutia on the right half.

    Parameters:
        data: table with 'Samples origin', 'Age group' and `column`.
        column: plotted column; also used as the y-axis title.
        pvals: {age group: p-value} printed under the corresponding tick label.
        num_bins: divisor of the value range used as the violin bandwidth.

    One pair of half-violins is drawn per entry of `age_groups`, positioned at
    x = index of the group.
    """
    fig = go.Figure()
    xlabels = []
    for group_id, group in enumerate(age_groups):
        vals_neg, vals_pos = _central_yakutia_values(data, column, group)
        halves = (
            ('Central', vals_neg, 'negative', -1.5),
            ('Yakutia', vals_pos, 'positive', 1.5),
        )
        for region, region_vals, side, pointpos in halves:
            fig.add_trace(
                go.Violin(
                    x=[group_id] * len(region_vals),
                    y=region_vals,
                    name=group,
                    box_visible=True,
                    meanline_visible=True,
                    showlegend=False,
                    line_color='black',
                    fillcolor=colors[region],
                    marker=dict(color=colors[region], line=dict(color='black', width=0.3), opacity=0.8),
                    points='all',
                    bandwidth=np.ptp(region_vals) / num_bins,
                    opacity=0.8,
                    legendgroup=group,
                    scalegroup=group,
                    side=side,
                    scalemode="width",
                    pointpos=pointpos
                )
            )
        xlabels.append(f"{group}<br>p-value: {pvals[group]:0.2e}")

    add_layout(fig, "", column, "")
    fig.update_layout(
        title=dict(xref='paper', x=1.0),
        xaxis=dict(
            tickmode='array',
            tickvals=list(range(len(age_groups))),
            ticktext=xlabels,
            tickfont=dict(size=22),
        ),
    )
    fig.update_layout(
        violingap=0.39,
        violingroupgap=0.39,
        width=1200,
        height=600,
        margin=go.layout.Margin(
            l=100,
            r=50,
            b=80,
            t=30,
            pad=0,
        )
    )
    fig.update_xaxes(autorange=False, range=[-0.5, len(age_groups) - 0.5])
    fig.update_yaxes(autorange=True)
    fig.update_xaxes(tickangle=0)
    return fig


# Mann-Whitney U p-values (Central vs Yakutia) of the age acceleration:
# rows = clocks, columns = age groups.
df_stat = pd.DataFrame()
for age in ages.values():
    # Age acceleration = residual from a linear fit (clock ~ Age) on the Central samples.
    formula = f"{age} ~ Age"
    model = smf.ols(formula=formula, data=df_fig.loc[df_fig['Samples origin'] == 'Central', :]).fit()
    df_fig[f"{age}_linear_pred"] = model.predict(df_fig)
    df_fig[f"{age}Acc"] = df_fig[age] - df_fig[f"{age}_linear_pred"]

    for group in age_groups:
        vals_neg, vals_pos = _central_yakutia_values(df_fig, f"{age}Acc", group)
        stat, pval = mannwhitneyu(vals_neg, vals_pos, alternative='two-sided')
        df_stat.at[age, group] = pval

# BH correction across the clocks, separately for every age group.
for group in age_groups:
    _, df_stat[f"{group}_fdr_bh"], _, _ = multipletests(df_stat[f"{group}"], 0.05, method='fdr_bh')
df_stat.to_excel(f"{path_save}/{path_local}/stat.xlsx", index=True)

# One figure per clock, labelled with the BH-adjusted p-values.
for age in ages.values():
    pvals_fdr = {group: df_stat.at[age, f"{group}_fdr_bh"] for group in age_groups}
    fig = _split_violin_figure(df_fig, f"{age}Acc", pvals_fdr, dist_num_bins)
    save_figure(fig, f"{path_save}/{path_local}/{age}")

# DunedinPACE is compared directly; its labels show the raw (uncorrected)
# Mann-Whitney U p-value of each age group.
pvals_pace = {}
for group in age_groups:
    vals_neg, vals_pos = _central_yakutia_values(df_fig, 'DunedinPACE', group)
    stat, pval = mannwhitneyu(vals_neg, vals_pos, alternative='two-sided')
    pvals_pace[group] = pval
fig = _split_violin_figure(df_fig, 'DunedinPACE', pvals_pace, dist_num_bins)
save_figure(fig, f"{path_save}/{path_local}/DunedinPACE")

## 3. Region-specific age-associated epigenetic features:
- Tests with interaction terms to detect age trends that differ between the regions

In [None]:
path_local = "03_region_age"
pathlib.Path(f"{path_save}/{path_local}/interaction").mkdir(parents=True, exist_ok=True)

# 'Origin' duplicates 'Samples origin' under a name without a space so that it can
# be used inside the model formula. Intermediate samples are excluded.
df['Origin'] = df['Samples origin']
df_curr = df.loc[df['Origin'] != 'Intermediate', :]

# Right-hand side of the per-CpG model: Age, Origin and their interaction.
formula = "Age*C(Origin)"
# Coefficient names whose p-values are collected from the fitted model.
terms = ['Intercept', 'Age', 'C(Origin)[T.Yakutia]', 'Age:C(Origin)[T.Yakutia]']
# Interaction term, i.e. the coefficient of interest for region-specific age trends.
aim = 'Age:C(Origin)[T.Yakutia]'

numeric_cols = ['R2', 'R2_adj'] + [f"{t}_pvalue" for t in terms]
df_stat = pd.DataFrame(index=cpgs, columns=numeric_cols)
# One OLS fit per CpG column of the betas table.
for cpg in tqdm(cpgs, desc='from_formula', total=len(cpgs)):
    df_tmp = df_curr.loc[:, ['Age', 'Origin', cpg]]
    df_stat.at[cpg, 'Gene'] = manifest.loc[cpg, 'Gene']
    model = smf.ols(formula=f"{cpg} ~ {formula}", data=df_tmp).fit()
    df_stat.at[cpg, 'R2'] = model.rsquared
    df_stat.at[cpg, 'R2_adj'] = model.rsquared_adj
    pvals = dict(model.pvalues)
    for t in terms:
        df_stat.at[cpg, f"{t}_pvalue"] = pvals[t]

# The frame was created without data, so its columns have object dtype. Cast the
# numeric ones to float so that the FDR step and the saved table work on real
# numeric arrays ('Gene' and the column order are left untouched).
df_stat = df_stat.astype({col: float for col in numeric_cols})

# BH correction across all CpGs, separately for every model term.
for t in terms:
    _, df_stat[f"{t}_pvalue_fdr_bh"], _, _ = multipletests(df_stat[f"{t}_pvalue"], 0.05, method='fdr_bh')

df_stat.to_excel(f"{path_save}/{path_local}/interaction/stat.xlsx", index_label="CpG")