In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import plotly.express as px
import statsmodels.formula.api as smf
import plotly.graph_objects as go
from scripts.python.routines.manifest import get_manifest
from scripts.python.routines.plot.save import save_figure
from scripts.python.routines.plot.layout import add_layout
from statsmodels.stats.multitest import multipletests
import plotly.io as pio
pio.kaleido.scope.mathjax = None
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=False)
from scipy.stats import mannwhitneyu, median_test
import matplotlib.pyplot as plt
import pathlib
from tqdm import tqdm
from src.utils.plot.bioinfokit import mhat, volcano
import gseapy as gp
import mygene
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA, TruncatedSVD
from sklearn.decomposition import MiniBatchDictionaryLearning, FastICA
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.manifold import MDS, Isomap, TSNE, LocallyLinearEmbedding
import upsetplot as upset
from matplotlib_venn import venn2, venn2_circles
from itertools import chain
from scripts.python.routines.plot.p_value import add_p_value_annotation

# Init DNAm and immuno data

In [None]:
# ---- Dataset and run configuration ------------------------------------------
dataset = "GSEUNN"
# NOTE(review): absolute, machine-specific location -- adjust when running elsewhere.
path = "D:/YandexDisk/Work/pydnameth/datasets"
datasets_info = pd.read_excel(f"{path}/datasets.xlsx", index_col='dataset')
platform = datasets_info.loc[dataset, 'platform']
manifest = get_manifest(platform, path=path)
# Drop the first three characters of the 'chr' values (presumably a "chr" prefix -- confirm).
manifest['CHR'] = manifest['chr'].str[3::]

# Suffix used for the betas file name and for suffixed column names in later cells.
dnam_suffix = "_harm"

# Components of the immunology data file name.
immuno_samples = "all_1052_121222" # "ctrl_415_from_all_1052_121222"
immuno_proc = "raw"
immuno_imp = "fast_knn"
immuno_replace = "quarter"

# Sample-selection modes applied further down in this cell.
select_dnam = 'chronology_0' # "common_with_immuno" "chronology_0"
select_immuno = "all" # "all" "260_ml_draft"

path_save = f"{path}/{platform}/{dataset}/special/034_central_vs_yakutia/tmp"
pathlib.Path(path_save).mkdir(parents=True, exist_ok=True)

# ---- Load immunology data and DNAm phenotypes -------------------------------
df_immuno = pd.read_excel(f"{path}/{platform}/{dataset}/data/immuno/df_samples({immuno_samples})_proc({immuno_proc})_imp({immuno_imp})_replace({immuno_replace}).xlsx", index_col="index")

pheno = pd.read_excel(f"{path}/{platform}/{dataset}/pheno.xlsx", index_col="index")
pheno.index.name = "index"
pheno.drop(["I64_old", "I1_duplicate"], inplace=True)

# Check DNAm only index
index_dnam_only = pheno.index.difference(df_immuno.index)
print(f"DNAm only indexes {len(index_dnam_only)}: {list(index_dnam_only)}")

# Check phenotype differences in Immunology and DNAm data
indexes_common_glob = pheno.index.intersection(df_immuno.index)
is_region_equal_glob = pheno.loc[indexes_common_glob, 'Region'].equals(df_immuno.loc[indexes_common_glob, 'Region'])
is_sex_equal_glob = pheno.loc[indexes_common_glob, 'Sex'].equals(df_immuno.loc[indexes_common_glob, 'Sex'])
is_status_equal_glob = pheno.loc[indexes_common_glob, 'Status'].equals(df_immuno.loc[indexes_common_glob, 'Status'])
age_diff_glob = np.abs(pheno.loc[indexes_common_glob, 'Age'].values - df_immuno.loc[indexes_common_glob, 'Age'].values)
age_diff_max_glob = np.max(age_diff_glob)
print(f"is_region_equal_glob: {is_region_equal_glob}")
print(f"is_sex_equal_glob: {is_sex_equal_glob}")
print(f"is_status_equal_glob: {is_status_equal_glob}")
print(f"age_diff_max_glob: {age_diff_max_glob}")

# ---- Immunology: flag DNAm samples, keep controls, harmonize region name ----
df_immuno['is_dnam'] = False
df_immuno.loc[indexes_common_glob, 'is_dnam'] = True
# Explicit copy so the column assignment below targets an independent frame.
df_immuno = df_immuno.loc[(df_immuno["Status"] == "Control"), :].copy()
# Assign the result back instead of calling `replace(..., inplace=True)` on a
# selected column: the chained in-place form does not update the frame under
# pandas Copy-on-Write.
df_immuno["Region"] = df_immuno["Region"].replace({"Yakutiya": "Yakutia"})
feats_immuno = pd.read_excel(f"{path}/{platform}/{dataset}/data/immuno/immuno_markers_genes.xlsx").loc[:, 'gene'].values.tolist()

# Replace Age in DNAm with the immunology Age for samples present in both
# tables (df_immuno holds controls only at this point).
index_pheno_immuno = pheno.index.intersection(df_immuno.index)
pheno.loc[index_pheno_immuno, 'Age'] = df_immuno.loc[index_pheno_immuno, 'Age']
# Sanity check: after the replacement the maximal difference is expected to be 0.
age_diff = np.abs(pheno.loc[index_pheno_immuno, 'Age'].values - df_immuno.loc[index_pheno_immuno, 'Age'].values)
age_diff_max = np.max(age_diff)
print(f"age_diff_max: {age_diff_max}")

# Immuno selection
if select_immuno == "260_ml_draft":
    df_immuno = df_immuno.loc[(df_immuno["260ai"] == True) | (df_immuno["Region"] == "Yakutia"), :]

# Remove longitudinal samples: indexes containing "(1)", "(2)" or "(3)".
# Raw string: "\(" is an invalid escape sequence in a regular string literal.
df_immuno = df_immuno.loc[~df_immuno.index.str.contains(r"\(1\)|\(2\)|\(3\)"), :]

# ---- DNAm: merge phenotypes with betas, keep controls -----------------------
betas = pd.read_pickle(f"{path}/{platform}/{dataset}/betas{dnam_suffix}.pkl")
feats_dnam = betas.columns.values
df_dnam = pd.merge(pheno, betas, left_index=True, right_index=True)
df_dnam = df_dnam.loc[(df_dnam["Status"] == "Control"), :]
# NOTE(review): every row starts as "Central"; only Yakutia rows whose Residence
# is "City" or "Village" are relabelled, so any other Yakutia row keeps "Central".
df_dnam["Region and residence"] = "Central"
df_dnam.loc[(df_dnam["Region"] == "Yakutia") & (df_dnam["Residence"] == "City"), "Region and residence"] = "Yakutia (City)"
df_dnam.loc[(df_dnam["Region"] == "Yakutia") & (df_dnam["Residence"] == "Village"), "Region and residence"] = "Yakutia (Village)"
# DNAm selection
if select_dnam == "common_with_immuno":
    df_dnam = df_dnam.loc[df_dnam.index.intersection(df_immuno.index).values, :]
elif select_dnam == 'chronology_0':
    df_dnam = df_dnam.loc[df_dnam["Sample_Chronology"] == 0, :]

# ---- Overlap summary after all selections -----------------------------------
index_common = df_dnam.index.intersection(df_immuno.index).values

index_dnam_only = df_dnam.index.difference(df_immuno.index)
print(f"DNAm only indexes {len(index_dnam_only)}: {list(index_dnam_only)}")

n_samples_immuno_central = int((df_immuno["Region"] == "Central").sum())
n_samples_immuno_yakutia = int((df_immuno["Region"] == "Yakutia").sum())
n_samples_dnam_central = int((df_dnam["Region"] == "Central").sum())
n_samples_dnam_yakutia = int((df_dnam["Region"] == "Yakutia").sum())
print(f"n_samples_immuno_central: {n_samples_immuno_central}")
print(f"n_samples_immuno_yakutia: {n_samples_immuno_yakutia}")
print(f"n_samples_dnam_central: {n_samples_dnam_central}")
print(f"n_samples_dnam_yakutia: {n_samples_dnam_yakutia}")

## DNAm and Immuno comparison

### Venn diagrams

In [None]:
# One Immuno-vs-DNAm sample-overlap Venn diagram per region, saved as PNG and PDF.
for venn_region in ["Central", "Yakutia"]:
    samples_immuno = set(df_immuno.index[df_immuno["Region"] == venn_region].values)
    samples_dnam = set(df_dnam.index[df_dnam["Region"] == venn_region].values)

    fig, ax = plt.subplots()
    venn = venn2(
        subsets=(samples_immuno, samples_dnam),
        set_labels=('Immuno', 'DNAm'),
        set_colors=('r', 'g'),
        alpha=0.5
    )
    venn2_circles(subsets=(samples_immuno, samples_dnam))
    # Label entries can be None for regions that are not drawn (e.g. an empty
    # "only in one set" area), so skip those instead of failing on None.
    for text in venn.set_labels:
        if text is not None:
            text.set_fontsize(16)
    for text in venn.subset_labels:
        if text is not None:
            text.set_fontsize(25)
    plt.savefig(f"{path_save}/venn_{venn_region}.png", bbox_inches='tight', dpi=400)
    plt.savefig(f"{path_save}/venn_{venn_region}.pdf", bbox_inches='tight')
    # Close (not just clear) the figure so it is released, as the other plotting cells do.
    plt.close(fig)

### Histograms

In [None]:
# Tag every DNAm participant by whether immunology data exists for the same sample.
participant_cols = ["Age", "Sex", "Region", "Status"]
df_participants = df_dnam.loc[:, participant_cols]
df_participants["Data"] = "DNAm only"
df_participants.loc[index_common, "Data"] = "DNAm and Immuno"

# Params for figure
binrange = [10, 105]
bins = 15
hue_order = ['DNAm only', 'DNAm and Immuno']

# Region -> colors of the two stacked "Data" categories.
hist_palettes = {
    "Central": {
        "DNAm and Immuno": "forestgreen",
        "DNAm only": "lawngreen",
    },
    "Yakutia": {
        "DNAm and Immuno": "royalblue",
        "DNAm only": "deepskyblue",
    },
}

# Stacked age histogram per region, saved as PNG and PDF.
for hist_region, palette in hist_palettes.items():
    is_hist_region = df_participants["Region"] == hist_region
    fig = plt.figure()
    sns.set_theme(style='whitegrid')
    sns.histplot(
        data=df_participants.loc[is_hist_region, :],
        x="Age",
        hue="Data",
        hue_order=hue_order,
        palette=palette,
        multiple="stack",
        bins=bins,
        binrange=binrange
    )
    for extension, save_kwargs in (("png", {"dpi": 400}), ("pdf", {})):
        plt.savefig(f"{path_save}/hist_{hist_region}.{extension}", bbox_inches='tight', **save_kwargs)
    plt.close(fig)
    if hist_region == "Central":
        # Number of Central participants having both DNAm and immunology data.
        n_central_both = df_participants.loc[is_hist_region & (df_participants['Data'] == 'DNAm and Immuno'), :].shape[0]
        print(f"Central DNAm: {n_central_both}")

# DNAm

## Setup

In [None]:
# Age threshold for the "LL" subsets: the LL filters keep Age > ll_thld,
# the NoLL filters keep Age < ll_thld.
# NOTE(review): samples with Age exactly equal to ll_thld fall into neither
# subset -- confirm this is intended.
ll_thld = 85

# Switches for the optional longevity / residence outputs in the cells below.
is_plot_ll = False
is_plot_residence = False

# Group names; their order defines the iteration order of the filter dicts.
_regions = ["Central", "Yakutia"]
_residence_groups = ["Central", "Yakutia (Village)", "Yakutia (City)"]

_is_above_ll = df_dnam["Age"] > ll_thld
_is_below_ll = df_dnam["Age"] < ll_thld

# Colors and boolean row masks (aligned with df_dnam) for every compared group.
problem = {
    "Color": {"Central": "orangered", "Yakutia": "mediumblue"},
    "ColorXTD": {
        "Central": "orangered",
        "Yakutia (Village)": "mediumblue",
        "Yakutia (City)": "lightskyblue",
    },
    "ColorLL": {"Central": "orangered", "Yakutia": "midnightblue"},
    "Filter": {region: df_dnam["Region"] == region for region in _regions},
    "FilterXTD": {group: df_dnam["Region and residence"] == group for group in _residence_groups},
    "FilterLL": {region: (df_dnam["Region"] == region) & _is_above_ll for region in _regions},
    "FilterNoLL": {region: (df_dnam["Region"] == region) & _is_below_ll for region in _regions},
    # Reference group: its mask and its name.
    "BaseFilter": df_dnam["Region"] == "Central",
    "BasePart": "Central"
}

## Create data for R

In [None]:
# Export betas (CpGs in rows) and phenotype tables as pickles for the R scripts.
dir_data_for_R = f"{path_save}/dnam/data_for_R"
pathlib.Path(dir_data_for_R).mkdir(parents=True, exist_ok=True)

cols_pheno_R = ["Age", "Sex", "Region", "Sentrix_ID", "Sentrix_Position"]

# All selected samples.
betas_R = df_dnam.loc[:, feats_dnam].T
betas_R.index.name = "CpG"
betas_R.to_pickle(f"{dir_data_for_R}/betas.pkl")
pheno_R = df_dnam.loc[:, cols_pheno_R]
pheno_R.to_pickle(f"{dir_data_for_R}/pheno.pkl")

# Samples older than the longevity threshold only.
if is_plot_ll:
    is_ll_sample = df_dnam["Age"] > ll_thld
    betas_R_ll = df_dnam.loc[is_ll_sample, feats_dnam].T
    betas_R_ll.index.name = "CpG"
    betas_R_ll.to_pickle(f"{dir_data_for_R}/betas_ll.pkl")
    pheno_R_ll = df_dnam.loc[is_ll_sample, cols_pheno_R]
    pheno_R_ll.to_pickle(f"{dir_data_for_R}/pheno_ll.pkl")

# Yakutia samples only; the phenotype table carries Residence instead of the Sentrix columns.
if is_plot_residence:
    is_yakutia_sample = df_dnam["Region"] == "Yakutia"
    betas_R_yakutia = df_dnam.loc[is_yakutia_sample, feats_dnam].T
    betas_R_yakutia.index.name = "CpG"
    betas_R_yakutia.to_pickle(f"{dir_data_for_R}/betas_yakutia.pkl")
    pheno_R_yakutia = df_dnam.loc[is_yakutia_sample, ["Age", "Sex", "Region", "Residence"]]
    pheno_R_yakutia.to_pickle(f"{dir_data_for_R}/pheno_yakutia.pkl")

## 1. Data description

### Participants figure

In [None]:
path_local = "dnam/01_data_description/participants"
pathlib.Path(f"{path_save}/{path_local}").mkdir(parents=True, exist_ok=True)

# Age-range summary values. The plots in this cell do not use them (they use
# the fixed `hist_bins` edges below); kept as they are notebook-level names.
hist_min = df_dnam.loc[:, "Age"].min()
hist_max = df_dnam.loc[:, "Age"].max()
hist_width = hist_max - hist_min
hist_n_bins = 20
hist_bin_width = hist_width / hist_n_bins

# 23 evenly spaced bin edges from 5 to 115, i.e. 22 bins of width 5.
hist_bins = np.linspace(5, 115, 23)

# ---- Age histogram by region ------------------------------------------------
df_fig = df_dnam.loc[:, ['Age', 'Region']].copy()
# Legend labels "<group>: <number of samples>". Counting True values in the mask
# avoids materializing a filtered copy of the whole df_dnam just to get its length.
dict_keys = {key: f"{key}: {int(mask.sum())}" for key, mask in problem['Filter'].items()}
colors = {dict_keys[key]: val for key, val in problem['Color'].items()}
# Assign back instead of a chained `replace(..., inplace=True)`, which does not
# update the frame under pandas Copy-on-Write.
df_fig['Region'] = df_fig['Region'].replace(dict_keys)
fig = plt.figure()
sns.set_theme(style='whitegrid')
hist = sns.histplot(
    data=df_fig,
    bins=hist_bins,
    edgecolor='k',
    linewidth=1,
    hue_order=[dict_keys["Yakutia"], dict_keys["Central"]],
    x="Age",
    hue="Region",
    palette=colors
)
hist.set(xlim=(0, 120))
plt.savefig(f"{path_save}/{path_local}/hist_region.png", bbox_inches='tight', dpi=400)
plt.savefig(f"{path_save}/{path_local}/hist_region.pdf", bbox_inches='tight')
plt.close(fig)

# ---- Age histogram by region and residence (optional) -----------------------
if is_plot_residence:
    df_fig = df_dnam.loc[:, ['Age', 'Region and residence']].copy()
    dict_keys = {key: f"{key}: {int(mask.sum())}" for key, mask in problem['FilterXTD'].items()}
    colors = {dict_keys[key]: val for key, val in problem['ColorXTD'].items()}
    df_fig['Region and residence'] = df_fig['Region and residence'].replace(dict_keys)
    fig = plt.figure()
    sns.set_theme(style='whitegrid')
    hist = sns.histplot(
        data=df_fig,
        bins=hist_bins,
        edgecolor='k',
        linewidth=1,
        hue_order=[dict_keys["Yakutia (City)"], dict_keys["Yakutia (Village)"], dict_keys["Central"]],
        x="Age",
        hue="Region and residence",
        palette=colors
    )
    hist.set(xlim=(0, 120))
    plt.savefig(f"{path_save}/{path_local}/hist_region_and_residence.png", bbox_inches='tight', dpi=400)
    plt.savefig(f"{path_save}/{path_local}/hist_region_and_residence.pdf", bbox_inches='tight')
    plt.close(fig)

### Features

In [None]:
path_local = "dnam/01_data_description/feats"
pathlib.Path(f"{path_save}/{path_local}").mkdir(parents=True, exist_ok=True)

# DMP_age.csv is read from the data_for_R folder; this notebook does not create
# it (presumably it is produced by an external R step -- confirm), so it must
# exist before this cell runs.
df_dnam_age = pd.read_csv(f"{path_save}/dnam/data_for_R/DMP_age.csv", index_col="CpG")
df_dnam_age["CpG"] = df_dnam_age.index.values
# Marker label "<CpG> (<gene>)". Built with one pass over the two columns rather
# than a row-wise DataFrame.apply(axis=1), which constructs a Series per row.
df_dnam_age['print'] = [f"{cpg} ({gene})" for cpg, gene in zip(df_dnam_age["CpG"], df_dnam_age["gene"])]
# The first five rows in file order are highlighted on the plot.
top_to_hightlight = df_dnam_age["print"].values[0:5]
df_dnam_age['log_pval'] = -np.log10(df_dnam_age["adj.P.Val"])
sns.set_theme(style='whitegrid')
df_dnam_age = df_dnam_age.sort_values(["MAPINFO"], ascending=[True])
# Manhattan plot of adjusted p-values, written into the cell's output folder.
mhat(
    df=df_dnam_age,
    chr='CHR',
    pv='adj.P.Val',
    path=f"{path_save}/{path_local}",
    valpha=1,
    markernames=tuple(top_to_hightlight),
    markeridcol='print',
    gstyle=2,
    dim=(12,4),
    axtickfontsize=8
)

## 2. Cells

In [None]:
path_local = "dnam/02_cells"
pathlib.Path(f"{path_save}/{path_local}").mkdir(parents=True, exist_ok=True)
# Column name in df_dnam (with the DNAm suffix) -> plain cell-type name.
cells = {f"{x}{dnam_suffix}": x for x in ["CD8T", "CD4T", "NK", "Bcell", "Mono", "Gran"]}
df_cells = pd.DataFrame()      # Central vs Yakutia, all samples
df_cells_ll = pd.DataFrame()   # Central vs Yakutia, samples with Age > ll_thld
df_cells_xtd = pd.DataFrame()  # Yakutia (City) vs Yakutia (Village)


def _store_cells_group_stats(df_stats, row, group, values):
    """Write mean, median, q75, q25 and IQR of `values` into row `row` of `df_stats`.

    The columns are named "<statistic>_<group>" and are created on first use.
    """
    df_stats.at[row, f"mean_{group}"] = np.mean(values)
    df_stats.at[row, f"median_{group}"] = np.median(values)
    df_stats.at[row, f"q75_{group}"], df_stats.at[row, f"q25_{group}"] = np.percentile(values, [75, 25])
    df_stats.at[row, f"iqr_{group}"] = df_stats.at[row, f"q75_{group}"] - df_stats.at[row, f"q25_{group}"]


for cell in tqdm(cells):
    # Per-group values: all samples (vals) and the Age > ll_thld subset (lls).
    vals = {}
    lls = {}
    for group, group_filter in problem["Filter"].items():
        vals[group] = df_dnam.loc[group_filter, cell].values
        _store_cells_group_stats(df_cells, cell, group, vals[group])

        lls[group] = df_dnam.loc[problem["FilterLL"][group], cell].values
        _store_cells_group_stats(df_cells_ll, cell, group, lls[group])

    # Yakutia residence groups.
    xtds = {}
    for group in ["Yakutia (City)", "Yakutia (Village)"]:
        xtds[group] = df_dnam.loc[problem["FilterXTD"][group], cell].values
        _store_cells_group_stats(df_cells_xtd, cell, group, xtds[group])

    # Two-sided Mann-Whitney U test between the two groups of each comparison.
    for df_stats, group_values in ((df_cells, vals), (df_cells_ll, lls), (df_cells_xtd, xtds)):
        _, pval = mannwhitneyu(*group_values.values(), alternative='two-sided')
        df_stats.at[cell, "pval"] = pval

# Benjamini-Hochberg correction per table; the longevity and residence tables
# are written to disk only when the matching flag is enabled.
for df_stats, table_name, is_saved in (
    (df_cells, "cells", True),
    (df_cells_ll, "cells_ll", is_plot_ll),
    (df_cells_xtd, "cells_xtd", is_plot_residence),
):
    _, df_stats["pval_fdr_bh"], _, _ = multipletests(df_stats["pval"], 0.05, method='fdr_bh')
    if is_saved:
        df_stats.to_excel(f"{path_save}/{path_local}/{table_name}.xlsx", index=True)

In [None]:
def _cells_violin(values, name, fill_color, n_bins, line_color='black', marker_line_color='black', showlegend=False, **trace_kwargs):
    """Build one violin trace with the styling shared by all plots of this cell.

    values: 1-D array of plotted values; its range divided by `n_bins` is the bandwidth.
    name: trace name.
    fill_color: color of the violin body and of the sample points.
    line_color / marker_line_color: outline colors of the violin and of the points.
    trace_kwargs: extra go.Violin arguments (x, side, legendgroup, ...).
    """
    return go.Violin(
        y=values,
        name=name,
        box_visible=True,
        meanline_visible=True,
        showlegend=showlegend,
        line_color=line_color,
        fillcolor=fill_color,
        marker=dict(color=fill_color, line=dict(color=marker_line_color, width=0.3), opacity=0.8),
        points='all',
        bandwidth=np.ptp(values) / n_bins,
        opacity=0.8,
        **trace_kwargs
    )


def _cells_violin_layout(fig, legend_font_size, legend_y):
    """Apply the legend, size and margin settings shared by all plots of this cell."""
    fig.update_layout(title_xref='paper')
    fig.update_layout(legend_font_size=legend_font_size)
    fig.update_layout(legend={'itemsizing': 'constant'})
    fig.update_layout(
        violingap=0.35,
        violingroupgap=0.35,
        width=850,
        height=600,
        margin=go.layout.Margin(
            l=150,
            r=50,
            b=55,
            t=100,
            pad=0,
        )
    )
    fig.update_layout(legend_y=legend_y)


# Settings of the left / right half of a split violin.
_cells_half_negative = dict(side='negative', scalemode="width", pointpos=-1.5)
_cells_half_positive = dict(side='positive', scalemode="width", pointpos=1.5)

for cell in tqdm(cells):

    # Values per region: all samples (vals) and the Age > ll_thld subset (lls).
    vals = {}
    lls = {}
    for group in problem["Filter"]:
        vals[group] = df_dnam.loc[problem["Filter"][group], cell].values
        print(f"{group}: {len(vals[group])}")
        lls[group] = df_dnam.loc[problem["FilterLL"][group], cell].values
        print(f"{group} ll: {len(lls[group])}")

    # Values per Yakutia residence group.
    xtds = {}
    for group in ["Yakutia (City)", "Yakutia (Village)"]:
        xtds[group] = df_dnam.loc[problem["FilterXTD"][group], cell].values

    # Plot with residence ==========================================================
    if is_plot_residence:
        dist_num_bins = 15
        # The x category of the split Yakutia violin is the City-vs-Village adjusted p-value.
        xtd_pval_label = f"{df_cells_xtd.at[cell, 'pval_fdr_bh']:0.2e}"
        fig = go.Figure()
        fig.add_trace(
            _cells_violin(
                vals['Central'], 'Central', problem["ColorXTD"]['Central'], dist_num_bins,
                showlegend=True,
                x=[''] * len(vals['Central'])
            )
        )
        fig.add_trace(
            _cells_violin(
                xtds["Yakutia (Village)"], "Yakutia (Village)", problem["ColorXTD"]["Yakutia (Village)"], dist_num_bins,
                line_color='deepskyblue',
                marker_line_color='deepskyblue',
                showlegend=True,
                x=[xtd_pval_label] * len(xtds["Yakutia (Village)"]),
                legendgroup="Yakutia (Village)",
                scalegroup="Yakutia",
                **_cells_half_negative
            )
        )
        fig.add_trace(
            _cells_violin(
                xtds["Yakutia (City)"], "Yakutia (City)", problem["ColorXTD"]["Yakutia (City)"], dist_num_bins,
                showlegend=True,
                x=[xtd_pval_label] * len(xtds["Yakutia (City)"]),
                legendgroup="Yakutia (City)",
                scalegroup="Yakutia",
                **_cells_half_positive
            )
        )
        add_layout(fig, "", f"{cells[cell]}", f"")
        _cells_violin_layout(fig, legend_font_size=25, legend_y=1.1)
        # Bracket between the two x positions, annotated with the Central-vs-Yakutia adjusted p-value.
        fig = add_p_value_annotation(fig, {(0, 1): df_cells.at[cell, 'pval_fdr_bh']})
        fig.for_each_annotation(lambda a: a.update(font_size=23))
        fig.update_layout(yaxis=dict(tickfont=dict(size=23)))
        save_figure(fig, f"{path_save}/{path_local}/{cell}_residence")

    # Plot with longevity ==========================================================
    if is_plot_ll:
        dist_num_bins = 20
        fig = go.Figure()
        for group in problem["Filter"]:
            # Left half: all samples of the region; right half: its Age > ll_thld subset.
            fig.add_trace(
                _cells_violin(
                    vals[group], group, problem["Color"][group], dist_num_bins,
                    legendgroup=group,
                    scalegroup=group,
                    **_cells_half_negative
                )
            )
            fig.add_trace(
                _cells_violin(
                    lls[group], group, problem["ColorLL"][group], dist_num_bins,
                    line_color='orange',
                    legendgroup=group,
                    scalegroup=group,
                    **_cells_half_positive
                )
            )
        add_layout(fig, "", f"{cells[cell]}", f"p-value (all): {df_cells.at[cell, 'pval_fdr_bh']:0.2e}<br>p-value ({ll_thld}+): {df_cells_ll.at[cell, 'pval_fdr_bh']:0.2e}")
        _cells_violin_layout(fig, legend_font_size=20, legend_y=1.01)
        save_figure(fig, f"{path_save}/{path_local}/{cell}_ll")

    # Regular plot =======================================================
    dist_num_bins = 15
    fig = go.Figure()
    for group in problem["Filter"]:
        fig.add_trace(_cells_violin(vals[group], group, problem["Color"][group], dist_num_bins))
    add_layout(fig, "", f"{cells[cell]}", f"p-value: {df_cells.at[cell, 'pval_fdr_bh']:0.2e}")
    _cells_violin_layout(fig, legend_font_size=20, legend_y=1.01)
    save_figure(fig, f"{path_save}/{path_local}/{cell}")

## 3. Age Accelerations

In [None]:
path_local = "dnam/03_age_accelerations"
pathlib.Path(f"{path_save}/{path_local}").mkdir(parents=True, exist_ok=True)
# Column name in df_dnam -> display name of the age estimator.
age_types = {f"{x}{dnam_suffix}": x for x in ['DNAmAgeHannum', 'DNAmAge', 'DNAmPhenoAge', 'DNAmGrimAge']}
for x in ["PCHorvath1", "PCHorvath2", "PCHannum", "PCPhenoAge", "PCGrimAge"]:
    age_types[x] = x
df_aas = pd.DataFrame(index=[f"{x}Acc" for x in age_types], columns=["pval", "pval_fdr_bh"])      # Central vs Yakutia, all samples
df_aas_ll = pd.DataFrame(index=[f"{x}Acc" for x in age_types], columns=["pval", "pval_fdr_bh"])   # Central vs Yakutia, Age > ll_thld
df_aas_xtd = pd.DataFrame(index=[f"{x}Acc" for x in age_types], columns=["pval", "pval_fdr_bh"])  # Yakutia (City) vs Yakutia (Village)


def _store_aa_group_stats(df_stats, row, group, values):
    """Write mean, median, q75, q25 and IQR of `values` into row `row` of `df_stats`.

    The columns are named "<statistic>_<group>" and are created on first use.
    """
    df_stats.at[row, f"mean_{group}"] = np.mean(values)
    df_stats.at[row, f"median_{group}"] = np.median(values)
    df_stats.at[row, f"q75_{group}"], df_stats.at[row, f"q25_{group}"] = np.percentile(values, [75, 25])
    df_stats.at[row, f"iqr_{group}"] = df_stats.at[row, f"q75_{group}"] - df_stats.at[row, f"q25_{group}"]


for age_type in tqdm(age_types):
    aa_name = f"{age_type}Acc"

    # Age acceleration = estimator value minus the value predicted by a linear
    # fit "estimator ~ Age" on the base group (problem["BasePart"]) only.
    formula = f"{age_type} ~ Age"
    # Only the two columns used by the formula are passed, which avoids copying
    # every column of df_dnam for each estimator.
    is_base_part = df_dnam["Region"] == problem["BasePart"]
    model = smf.ols(formula=formula, data=df_dnam.loc[is_base_part, [age_type, "Age"]]).fit()
    df_dnam[f"{age_type}_linear_pred"] = model.predict(df_dnam)
    df_dnam[aa_name] = df_dnam[age_type] - df_dnam[f"{age_type}_linear_pred"]

    # Per-group accelerations: all samples (vals) and the Age > ll_thld subset (lls).
    vals = {}
    lls = {}
    for group in problem["Filter"]:
        vals[group] = df_dnam.loc[problem["Filter"][group], aa_name].values
        _store_aa_group_stats(df_aas, aa_name, group, vals[group])
        print(f"{group}: {len(vals[group])}")

        lls[group] = df_dnam.loc[problem["FilterLL"][group], aa_name].values
        _store_aa_group_stats(df_aas_ll, aa_name, group, lls[group])
        print(f"{group} ll: {len(lls[group])}")

    _, pval = mannwhitneyu(*vals.values(), alternative='two-sided')
    df_aas.at[aa_name, "pval"] = pval

    _, pval = mannwhitneyu(*lls.values(), alternative='two-sided')
    df_aas_ll.at[aa_name, "pval"] = pval

    # Yakutia residence groups.
    xtds = {}
    for group in ["Yakutia (City)", "Yakutia (Village)"]:
        xtds[group] = df_dnam.loc[problem["FilterXTD"][group], aa_name].values
        _store_aa_group_stats(df_aas_xtd, aa_name, group, xtds[group])

    _, pval = mannwhitneyu(*xtds.values(), alternative='two-sided')
    df_aas_xtd.at[aa_name, "pval"] = pval

# Benjamini-Hochberg correction per table. The "pval" columns were created empty
# (object dtype), so they are cast to float before the correction. The longevity
# and residence tables are written to disk only when the matching flag is enabled.
_, df_aas["pval_fdr_bh"], _, _ = multipletests(df_aas["pval"].astype(float), 0.05, method='fdr_bh')
df_aas.to_excel(f"{path_save}/{path_local}/aas.xlsx", index=True)

_, df_aas_ll["pval_fdr_bh"], _, _ = multipletests(df_aas_ll["pval"].astype(float), 0.05, method='fdr_bh')
if is_plot_ll:
    df_aas_ll.to_excel(f"{path_save}/{path_local}/aas_ll.xlsx", index=True)

_, df_aas_xtd["pval_fdr_bh"], _, _ = multipletests(df_aas_xtd["pval"].astype(float), 0.05, method='fdr_bh')
if is_plot_residence:
    df_aas_xtd.to_excel(f"{path_save}/{path_local}/aas_xtd.xlsx", index=True)

In [None]:
for age_type in tqdm(age_types):

    vals = {}
    lls = {}
    for group in problem["Filter"]:
        vals[group] = df_dnam.loc[problem["Filter"][group], f"{age_type}Acc"].values
        print(f"{group}: {len(vals[group])}")
        lls[group] = df_dnam.loc[problem["FilterLL"][group], f"{age_type}Acc"].values
        print(f"{group} ll: {len(lls[group])}")

    xtds = {}
    for group in ["Yakutia (City)", "Yakutia (Village)"]:
        xtds[group] = df_dnam.loc[problem["FilterXTD"][group], f"{age_type}Acc"].values

    # Plot with residence ==========================================================
    if is_plot_residence:
        dist_num_bins = 15
        fig = go.Figure()
        fig.add_trace(
            go.Violin(
                x=[''] * len(vals['Central']),
                y=vals['Central'],
                name='Central',
                box_visible=True,
                meanline_visible=True,
                showlegend=True,
                line_color='black',
                fillcolor=problem["ColorXTD"]['Central'],
                marker = dict(color=problem["ColorXTD"]['Central'], line=dict(color='black',width=0.3), opacity=0.8),
                points='all',
                bandwidth = np.ptp(vals['Central']) / dist_num_bins,
                opacity=0.8
            )
        )
        fig.add_trace(
            go.Violin(
                x=[f"{df_aas_xtd.at[f'{age_type}Acc', 'pval_fdr_bh']:0.2e}"] * len(xtds["Yakutia (Village)"]),
                y=xtds["Yakutia (Village)"],
                name="Yakutia (Village)",
                box_visible=True,
                meanline_visible=True,
                showlegend=True,
                line_color='deepskyblue',
                fillcolor=problem["ColorXTD"]["Yakutia (Village)"],
                marker=dict(color=problem["ColorXTD"]["Yakutia (Village)"], line=dict(color='deepskyblue', width=0.3), opacity=0.8),
                points='all',
                bandwidth=np.ptp(xtds["Yakutia (Village)"]) / dist_num_bins,
                opacity=0.8,
                legendgroup="Yakutia (Village)",
                scalegroup="Yakutia",
                side='negative',
                scalemode="width",
                pointpos=-1.5
            )
        )
        fig.add_trace(
            go.Violin(
                x=[f"{df_aas_xtd.at[f'{age_type}Acc', 'pval_fdr_bh']:0.2e}"] * len(xtds["Yakutia (City)"]),
                y=xtds["Yakutia (City)"],
                name="Yakutia (City)",
                box_visible=True,
                meanline_visible=True,
                showlegend=True,
                line_color='black',
                fillcolor=problem["ColorXTD"]["Yakutia (City)"],
                marker=dict(color=problem["ColorXTD"]["Yakutia (City)"], line=dict(color='black',width=0.3), opacity=0.8),
                points='all',
                bandwidth=np.ptp(xtds["Yakutia (City)"]) / dist_num_bins,
                opacity=0.8,
                legendgroup="Yakutia (City)",
                scalegroup="Yakutia",
                scalemode="width",
                side='positive',
                pointpos=1.5
            )
        )
        add_layout(fig, "", f"{age_types[age_type]}Acc", f"")
        fig.update_layout(title_xref='paper')
        fig.update_layout(legend_font_size=25)
        fig.update_layout(legend={'itemsizing': 'constant'})
        fig.update_layout(
            violingap=0.35,
            violingroupgap=0.35,
            width=850,
            height=600,
            margin=go.layout.Margin(
                l=150,
                r=50,
                b=55,
                t=100,
                pad=0,
            )
        )
        fig.update_layout(legend_y=1.1)
        fig = add_p_value_annotation(fig, {(0, 1): df_aas.at[f'{age_type}Acc', 'pval_fdr_bh']})
        fig.for_each_annotation(lambda a: a.update(font_size=23))
        fig.update_layout(yaxis=dict(tickfont=dict(size=23)))
        save_figure(fig, f"{path_save}/{path_local}/violin_{age_type}Acc_residence")

    # Plot with longevity ==========================================================
    if is_plot_ll:
        dist_num_bins = 20
        fig = go.Figure()
        for group in problem["Filter"]:
            fig.add_trace(
                go.Violin(
                    y=vals[group],
                    name=group,
                    box_visible=True,
                    meanline_visible=True,
                    showlegend=False,
                    line_color='black',
                    fillcolor=problem["Color"][group],
                    marker=dict(color=problem["Color"][group], line=dict(color='black',width=0.3), opacity=0.8),
                    points='all',
                    bandwidth=np.ptp(vals[group]) / dist_num_bins,
                    opacity=0.8,
                    legendgroup=group,
                    scalegroup=group,
                    side='negative',
                    scalemode="width",
                    pointpos=-1.5
                )
            )
            fig.add_trace(
                go.Violin(
                    y=lls[group],
                    name=group,
                    box_visible=True,
                    meanline_visible=True,
                    showlegend=False,
                    line_color='orange',
                    fillcolor=problem["ColorLL"][group],
                    marker=dict(color=problem["ColorLL"][group], line=dict(color='black',width=0.3), opacity=0.8),
                    points='all',
                    bandwidth=np.ptp(lls[group]) / dist_num_bins,
                    opacity=0.8,
                    legendgroup=group,
                    scalegroup=group,
                    scalemode="width",
                    side='positive',
                    pointpos=1.5
                )
            )

        add_layout(fig, "", f"{age_types[age_type]}Acc", f"p-value: {df_aas.at[f'{age_type}Acc', 'pval_fdr_bh']:0.2e}<br>p-value ({ll_thld}+): {df_aas_ll.at[f'{age_type}Acc', 'pval_fdr_bh']:0.2e}")
        fig.update_layout(title_xref='paper')
        fig.update_layout(legend_font_size=20)
        fig.update_layout(legend= {'itemsizing': 'constant'})
        fig.update_layout(
            violingap=0.35,
            violingroupgap=0.35,
            width=850,
            height=600,
            margin=go.layout.Margin(
                l=150,
                r=50,
                b=75,
                t=100,
                pad=0,
            )
        )
        fig.update_layout(legend_y=1.01)
        save_figure(fig, f"{path_save}/{path_local}/violin_{age_type}Acc_ll")

    # Regular plot =======================================================
    dist_num_bins = 20
    fig = go.Figure()
    for group in problem["Filter"]:
        fig.add_trace(
            go.Violin(
                y=vals[group],
                name=group,
                box_visible=True,
                meanline_visible=True,
                showlegend=False,
                line_color='black',
                fillcolor=problem["Color"][group],
                marker=dict(color=problem["Color"][group], line=dict(color='black',width=0.3), opacity=0.8),
                points='all',
                bandwidth=np.ptp(vals[group]) / dist_num_bins,
                opacity=0.8,
            )
        )
    add_layout(fig, "", f"{age_types[age_type]}Acc", f"p-value: {df_aas.at[f'{age_type}Acc', 'pval_fdr_bh']:0.2e}")
    fig.update_layout(title_xref='paper')
    fig.update_layout(legend_font_size=20)
    fig.update_layout(legend= {'itemsizing': 'constant'})
    fig.update_layout(
        violingap=0.35,
        violingroupgap=0.35,
        width=850,
        height=600,
        margin=go.layout.Margin(
            l=150,
            r=50,
            b=75,
            t=100,
            pad=0,
        )
    )
    fig.update_layout(legend_y=1.01)
    save_figure(fig, f"{path_save}/{path_local}/violin_{age_type}Acc")

    min_val = df_dnam[["Age", age_type]].min().min()
    max_val = df_dnam[["Age", age_type]].max().max()
    shift_val = max_val - min_val
    min_val -= 0.05 * shift_val
    max_val += 0.05 * shift_val

    # Plot with residence =======================================================
    if is_plot_residence:
        fig = go.Figure()
        fig.add_trace(
            go.Scatter(
                x=[min_val, max_val],
                y=[min_val, max_val],
                showlegend=False,
                name="",
                mode="lines",
                marker_color="black",
                marker=dict(
                    size=8,
                    opacity=0.75,
                    line=dict(
                        color="black",
                        width=0.5
                    )
                )
            )
        )
        fig.add_trace(
            go.Scatter(
                x=df_dnam.loc[df_dnam["Region"] == problem["BasePart"], f"Age"].values,
                y=df_dnam.loc[df_dnam["Region"] == problem["BasePart"], f"{age_type}_linear_pred"].values,
                showlegend=False,
                name="",
                mode="lines",
                line=dict(width=5),
                marker_color=problem["Color"][problem["BasePart"]],
                marker=dict(
                    size=8,
                    opacity=0.75,
                    line=dict(
                        color="black",
                        width=0.5
                    )
                )
            )
        )
        for group in problem["FilterXTD"]:
            fig.add_trace(
                go.Scatter(
                    x=df_dnam.loc[problem["FilterXTD"][group], f"Age"].values,
                    y=df_dnam.loc[problem["FilterXTD"][group], f"{age_type}"].values,
                    showlegend=True,
                    name=group,
                    mode="markers",
                    line_color=problem["ColorXTD"][group],
                    marker=dict(
                        size=8,
                        opacity=0.75,
                        line=dict(
                            color="black",
                            width=0.5
                        )
                    )
                )
            )
        add_layout(fig, f"Age", f"{age_types[age_type]}", f"")
        fig.update_layout(legend_font_size=20)
        fig.update_layout(legend= {'itemsizing': 'constant'})
        fig.update_xaxes(autorange=False)
        fig.update_yaxes(autorange=False)
        fig.update_layout(title_xref='paper')
        fig.update_layout(xaxis_range=[min_val, max_val])
        fig.update_layout(yaxis_range=[min_val, max_val])
        fig.update_layout(
            width=850,
            height=800,
            margin=go.layout.Margin(
                l=100,
                r=50,
                b=100,
                t=50,
                pad=0,
            )
        )
        save_figure(fig, f"{path_save}/{path_local}/scatter_{age_type}_residence")
    # Plot with longevity ==========================================================
    if is_plot_ll:
        fig = go.Figure()
        fig.add_trace(
            go.Scatter(
                x=[min_val, max_val],
                y=[min_val, max_val],
                showlegend=False,
                name="",
                mode="lines",
                marker_color="black",
                marker=dict(
                    size=8,
                    opacity=0.75,
                    line=dict(
                        color="black",
                        width=0.5
                    )
                )
            )
        )
        fig.add_trace(
            go.Scatter(
                x=df_dnam.loc[df_dnam["Region"] == problem["BasePart"], f"Age"].values,
                y=df_dnam.loc[df_dnam["Region"] == problem["BasePart"], f"{age_type}_linear_pred"].values,
                showlegend=False,
                name="",
                mode="lines",
                line=dict(width=5),
                marker_color=problem["Color"][problem["BasePart"]],
                marker=dict(
                    size=8,
                    opacity=0.75,
                    line=dict(
                        color="black",
                        width=0.5
                    )
                )
            )
        )
        for group in problem["Filter"]:
            fig.add_trace(
                go.Scatter(
                    x=df_dnam.loc[problem["FilterNoLL"][group], f"Age"].values,
                    y=df_dnam.loc[problem["FilterNoLL"][group], f"{age_type}"].values,
                    showlegend=True,
                    name=group,
                    mode="markers",
                    line_color=problem["Color"][group],
                    marker=dict(
                        size=8,
                        opacity=0.75,
                        line=dict(
                            color="black",
                            width=0.5
                        )
                    )
                )
            )
            fig.add_trace(
                go.Scatter(
                    x=df_dnam.loc[problem["FilterLL"][group], f"Age"].values,
                    y=df_dnam.loc[problem["FilterLL"][group], f"{age_type}"].values,
                    showlegend=False,
                    name=group,
                    mode="markers",
                    line_color=problem["ColorLL"][group],
                    marker_symbol="x-dot",
                    marker=dict(
                        size=12,
                        opacity=0.75,
                        line=dict(
                            color="orange",
                            width=1.0
                        )
                    )
                )
            )
        add_layout(fig, f"Age", f"{age_types[age_type]}", f"")
        fig.update_layout(legend_font_size=20)
        fig.update_layout(legend= {'itemsizing': 'constant'})
        fig.update_xaxes(autorange=False)
        fig.update_yaxes(autorange=False)
        fig.update_layout(title_xref='paper')
        fig.update_layout(xaxis_range=[min_val, max_val])
        fig.update_layout(yaxis_range=[min_val, max_val])
        fig.update_layout(
            width=850,
            height=800,
            margin=go.layout.Margin(
                l=100,
                r=50,
                b=100,
                t=50,
                pad=0,
            )
        )
        save_figure(fig, f"{path_save}/{path_local}/scatter_{age_type}_ll")

    # Regular plot =======================================================
    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=[min_val, max_val],
            y=[min_val, max_val],
            showlegend=False,
            name="",
            mode="lines",
            marker_color="black",
            marker=dict(
                size=8,
                opacity=0.75,
                line=dict(
                    color="black",
                    width=0.5
                )
            )
        )
    )
    fig.add_trace(
        go.Scatter(
            x=df_dnam.loc[df_dnam["Region"] == problem["BasePart"], f"Age"].values,
            y=df_dnam.loc[df_dnam["Region"] == problem["BasePart"], f"{age_type}_linear_pred"].values,
            showlegend=False,
            name="",
            mode="lines",
            line=dict(width=5),
            marker_color=problem["Color"][problem["BasePart"]],
            marker=dict(
                size=8,
                opacity=0.75,
                line=dict(
                    color="black",
                    width=0.5
                )
            )
        )
    )
    for group in problem["Filter"]:
        fig.add_trace(
            go.Scatter(
                x=df_dnam.loc[problem["Filter"][group], f"Age"].values,
                y=df_dnam.loc[problem["Filter"][group], f"{age_type}"].values,
                showlegend=True,
                name=group,
                mode="markers",
                line_color=problem["Color"][group],
                marker=dict(
                    size=8,
                    opacity=0.75,
                    line=dict(
                        color="black",
                        width=0.5
                    )
                )
            )
        )
    add_layout(fig, f"Age", f"{age_types[age_type]}", f"")
    fig.update_layout(legend_font_size=20)
    fig.update_layout(legend= {'itemsizing': 'constant'})
    fig.update_xaxes(autorange=False)
    fig.update_yaxes(autorange=False)
    fig.update_layout(title_xref='paper')
    fig.update_layout(xaxis_range=[min_val, max_val])
    fig.update_layout(yaxis_range=[min_val, max_val])
    fig.update_layout(
        width=850,
        height=800,
        margin=go.layout.Margin(
            l=100,
            r=50,
            b=100,
            t=50,
            pad=0,
        )
    )
    save_figure(fig, f"{path_save}/{path_local}/scatter_{age_type}")

## 4. ChAMP Region DMPs

### Setup

In [None]:
# Thresholds for calling a CpG a region DMP; both values are also encoded
# in the name of the output folder so runs with other cut-offs do not clash.
pval_lim = 1e-20  # compared against the adjusted p-value
fc_lim = 0.05  # compared against logFC (both directions)

path_local = "dnam/04_DMPs_region/" + f"pval({pval_lim:0.2e})_fc({fc_lim:0.2e})"
pathlib.Path(path_save, path_local).mkdir(parents=True, exist_ok=True)

### Read ChAMP results

In [None]:
# Load the DMP table exported for/by the R step; rows are indexed by CpG id.
df_dmps = pd.read_csv(f"{path_save}/dnam/data_for_R/DMP_region.csv", index_col="CpG")

# Mirror the index into a regular column and order rows by adjusted p-value.
df_dmps = df_dmps.assign(CpG=df_dmps.index.values).sort_values(["adj.P.Val"], ascending=[True])

# Human-readable label "<CpG> (<gene>)" used to annotate points on plots.
df_dmps['print'] = [f"{cpg} ({gene})" for cpg, gene in zip(df_dmps["CpG"], df_dmps["gene"])]

# -log10 of the adjusted p-value.
df_dmps['log_pval'] = np.negative(np.log10(df_dmps["adj.P.Val"]))

### Obtain gene list

In [None]:
# Select CpGs passing both thresholds: adjusted p-value below `pval_lim`
# and |logFC| above `fc_lim`.
is_significant = df_dmps["adj.P.Val"] < pval_lim
is_large_effect = df_dmps["logFC"].abs() > fc_lim

# Filter and sort are chained so the result is a new, independent frame.
# Sorting a boolean-mask slice with inplace=True triggers pandas'
# SettingWithCopyWarning.
df_dmps_selected = (
    df_dmps.loc[is_significant & is_large_effect, :]
    .sort_values(["adj.P.Val"], ascending=[True])
)

# Labels of the two most significant CpGs; highlighted on the plots below.
top_to_hightlight = df_dmps_selected["print"].values[0:2]
df_dmps_selected.to_excel(f"{path_save}/{path_local}/selected.xlsx")

# Collect the genes annotated to the selected CpGs. A manifest entry may hold
# several genes separated by ';'; non-string entries (e.g. NaN) are skipped.
genes_dmps_selected = set()
for cpg in df_dmps_selected.index.values:
    genes_raw = manifest.at[cpg, 'Gene']
    if isinstance(genes_raw, str):
        genes_dmps_selected.update(genes_raw.split(';'))

# Drop placeholders that are not gene names ('' appears when an annotation
# contains consecutive separators).
genes_dmps_selected.difference_update({'non-genic', ' ', ''})

# Sorted so that the exported list is identical between runs (set iteration
# order of strings is not stable across interpreter sessions).
genes_dmps_selected = sorted(genes_dmps_selected)
genes_dmps_df = pd.DataFrame({'gene': genes_dmps_selected})
genes_dmps_df.to_excel(f"{path_save}/{path_local}/genes.xlsx", index=False)
print(f"Number of CpGs: {df_dmps_selected.shape[0]}")
print(f"Number of genes: {genes_dmps_df.shape[0]}")

### Manhattan and volcano plots

In [None]:
# Both plots are written to the same folder and highlight the same labels.
plots_dir = f"{path_save}/{path_local}"
labels_to_mark = tuple(top_to_hightlight)

# Manhattan plot: rows are ordered by MAPINFO before plotting
# (presumably so points follow genomic position - confirm against mhat).
sns.set_theme(style='whitegrid')
df_dmps.sort_values(["MAPINFO"], ascending=[True], inplace=True)
mhat_options = dict(
    chr='CHR',
    pv='adj.P.Val',
    path=plots_dir,
    valpha=1,
    markernames=labels_to_mark,
    markeridcol='print',
    gstyle=2,
    dim=(12, 4),
    axtickfontsize=8,
)
mhat(df=df_dmps, **mhat_options)

# Volcano plot: the same p-value / logFC thresholds are applied to both tails.
sns.set_theme(style='whitegrid')
volcano_colors = (problem["Color"]['Central'], "grey", problem["Color"]['Yakutia'])
volcano_options = dict(
    lfc='logFC',
    pv='adj.P.Val',
    pv_thr=(pval_lim, pval_lim),
    lfc_thr=(fc_lim, fc_lim),
    path=plots_dir,
    genenames=labels_to_mark,
    geneid='print',
    gstyle=2,
    sign_line=True,
    color=volcano_colors,
)
volcano(df=df_dmps, **volcano_options)

### Perform dimensionality reduction

In [None]:
# Output folder for all dimensionality-reduction artefacts.
pathlib.Path(path_save, path_local, "dim_red").mkdir(parents=True, exist_ok=True)

# Features are the selected DMPs; Age/Sex/Region are carried along as metadata.
feats_dim_red = df_dmps_selected["CpG"].values
df_dnam_dim_red = df_dnam.loc[:, [*feats_dim_red, "Age", "Sex", "Region"]]

# Feature matrix fed to the estimators, and the per-sample region labels.
classes_dim_red = df_dnam_dim_red.loc[:, 'Region'].values
data_dim_red = df_dnam_dim_red.loc[:, feats_dim_red].values

In [None]:
# One seed shared by every estimator that accepts `random_state`, so the
# embeddings (and the table/figures built from them) are reproducible
# between runs. IncrementalPCA and Isomap take no `random_state`.
dim_red_seed = 42

print(f"PCA")
pca = PCA(n_components=2, whiten=False, random_state=dim_red_seed)
data_pca = pca.fit_transform(data_dim_red)
df_dnam_dim_red['PC 1'] = data_pca[:, 0]
df_dnam_dim_red['PC 2'] = data_pca[:, 1]

print(f"Incremental PCA")
n_batches = 32
ipca = IncrementalPCA(n_components=2)
# Fit on consecutive row chunks, then project the whole matrix at once.
for ipca_batch in np.array_split(data_dim_red, n_batches):
    ipca.partial_fit(ipca_batch)
data_ipca = ipca.transform(data_dim_red)
df_dnam_dim_red['Incremental PC 1'] = data_ipca[:, 0]
df_dnam_dim_red['Incremental PC 2'] = data_ipca[:, 1]

print(f"Kernel PCA")
kpca = KernelPCA(kernel='rbf', fit_inverse_transform=True, gamma=None, n_components=2, random_state=dim_red_seed)
data_kpca = kpca.fit_transform(data_dim_red)
df_dnam_dim_red['Kernel PC 1'] = data_kpca[:, 0]
df_dnam_dim_red['Kernel PC 2'] = data_kpca[:, 1]

print(f"SVD")
tsvd = TruncatedSVD(n_components=2, algorithm='randomized', n_iter=5, random_state=dim_red_seed)
tsvd.fit(data_dim_red)
data_svd = tsvd.transform(data_dim_red)
df_dnam_dim_red['SVD 1'] = data_svd[:, 0]
df_dnam_dim_red['SVD 2'] = data_svd[:, 1]

print(f"GRP")
GRP = GaussianRandomProjection(n_components=2, eps=0.5, random_state=dim_red_seed)
GRP.fit(data_dim_red)
data_grp = GRP.transform(data_dim_red)
df_dnam_dim_red['Gaussian Random Projection 1'] = data_grp[:, 0]
df_dnam_dim_red['Gaussian Random Projection 2'] = data_grp[:, 1]

print(f"SRP")
SRP = SparseRandomProjection(n_components=2, density='auto', eps=0.5, dense_output=False, random_state=dim_red_seed)
SRP.fit(data_dim_red)
data_srp = SRP.transform(data_dim_red)
df_dnam_dim_red['Sparse Random Projection 1'] = data_srp[:, 0]
df_dnam_dim_red['Sparse Random Projection 2'] = data_srp[:, 1]

print(f"MDS")
mds = MDS(n_components=2, metric=True, random_state=dim_red_seed)
data_mds = mds.fit_transform(data_dim_red)
df_dnam_dim_red['Multi Dimensional Scale 1'] = data_mds[:, 0]
df_dnam_dim_red['Multi Dimensional Scale 2'] = data_mds[:, 1]

print(f"ISOMAP")
isomap = Isomap(n_components=2, n_neighbors=5)
isomap.fit(data_dim_red)
data_isomap = isomap.transform(data_dim_red)
df_dnam_dim_red['IsoMap 1'] = data_isomap[:, 0]
df_dnam_dim_red['IsoMap 2'] = data_isomap[:, 1]

print(f"MiniBatchDictionaryLearning")
miniBatchDictLearning = MiniBatchDictionaryLearning(n_components=2, batch_size=200, alpha=1, n_iter=25, random_state=dim_red_seed)
# fit_transform() already fits the dictionary; a separate fit() call before
# it would only train the model a second time.
# NOTE(review): the output keeps the historical name `data_batch`.
data_batch = miniBatchDictLearning.fit_transform(data_dim_red)
df_dnam_dim_red['MBDL 1'] = data_batch[:, 0]
df_dnam_dim_red['MBDL 2'] = data_batch[:, 1]

print(f"ICA")
fastICA = FastICA(n_components=2, algorithm='parallel', whiten=True, tol=1e-3, max_iter=1000, random_state=dim_red_seed)
data_ica = fastICA.fit_transform(data_dim_red)
df_dnam_dim_red['IC 1'] = data_ica[:, 0]
df_dnam_dim_red['IC 2'] = data_ica[:, 1]

print(f"t-SNE")
tsne = TSNE(n_components=2, learning_rate=300, perplexity=30, early_exaggeration=12, init='random', random_state=dim_red_seed)
data_tsne = tsne.fit_transform(data_dim_red)
df_dnam_dim_red['tSNE 1'] = data_tsne[:, 0]
df_dnam_dim_red['tSNE 2'] = data_tsne[:, 1]

print(f"LLE")
lle = LocallyLinearEmbedding(n_components=2, n_neighbors=10, method='modified', random_state=dim_red_seed)
lle.fit(data_dim_red)
data_lle = lle.transform(data_dim_red)
df_dnam_dim_red['LLE 1'] = data_lle[:, 0]
df_dnam_dim_red['LLE 2'] = data_lle[:, 1]

In [None]:
# Method name (used in file names) -> the two embedding columns it produced.
dim_red_methods_dict = {
    'PCA': ['PC 1', 'PC 2'],
    'IncrementalPCA': ['Incremental PC 1', 'Incremental PC 2'],
    'KernelPCA': ['Kernel PC 1', 'Kernel PC 2'],
    'SingularValueDecomposition': ['SVD 1', 'SVD 2'],
    'GaussianRandomProjection': ['Gaussian Random Projection 1', 'Gaussian Random Projection 2'],
    'SparseRandomProjection': ['Sparse Random Projection 1', 'Sparse Random Projection 2'],
    'MultiDimensionalScaling': ['Multi Dimensional Scale 1', 'Multi Dimensional Scale 2'],
    'Isomap': ['IsoMap 1', 'IsoMap 2'],
    'MiniBatchDictionaryLearning': ['MBDL 1', 'MBDL 2'],
    'ICA': ['IC 1', 'IC 2'],
    'T-SNE': ['tSNE 1', 'tSNE 2'],
    'LocallyLinearEmbedding': ['LLE 1', 'LLE 2']
}

# Export every embedding coordinate in a single table.
embedding_columns = list(chain.from_iterable(dim_red_methods_dict.values()))
df_dnam_dim_red.loc[:, embedding_columns].to_excel(f"{path_save}/{path_local}/dim_red/table.xlsx", index=True)


def _dim_red_marker(size, opacity, **extra):
    """Return a marker style with a 1px black outline; `extra` adds e.g. color/symbol."""
    return dict(size=size, opacity=opacity, line=dict(color="black", width=1), **extra)


def _add_dim_red_trace(fig, samples, x_col, y_col, name, line_color, marker, showlegend=True):
    """Add a markers-only scatter of the `samples` rows of df_dnam_dim_red to `fig`."""
    fig.add_trace(
        go.Scatter(
            x=df_dnam_dim_red.loc[samples, x_col].values,
            y=df_dnam_dim_red.loc[samples, y_col].values,
            showlegend=showlegend,
            name=name,
            mode="markers",
            line_color=line_color,
            marker=marker,
        )
    )


def _save_dim_red_figure(fig, x_col, y_col, fname):
    """Apply the layout shared by all embedding figures and save `fig` under `fname`."""
    add_layout(fig, x_col, y_col, f"")
    fig.update_layout(legend_font_size=20)
    fig.update_layout(legend={'itemsizing': 'constant'})
    fig.update_layout(
        width=850,
        height=800,
        margin=go.layout.Margin(l=100, r=50, b=100, t=50, pad=0),
    )
    save_figure(fig, fname)


for method in dim_red_methods_dict:
    x_col, y_col = dim_red_methods_dict[method]

    # Samples split by the extended (residence) groups.
    fig = go.Figure()
    for group in problem["ColorXTD"]:
        group_color = problem["ColorXTD"][group]
        _add_dim_red_trace(
            fig, problem["FilterXTD"][group], x_col, y_col, group, group_color,
            marker=_dim_red_marker(8, 0.8, color=group_color, symbol="circle"),
        )
    _save_dim_red_figure(fig, x_col, y_col, f"{path_save}/{path_local}/dim_red/{method}_residence")

    # Regular plot: one trace per main group, long-livers not singled out.
    fig = go.Figure()
    for group in problem["Color"]:
        group_color = problem["Color"][group]
        _add_dim_red_trace(
            fig, problem["Filter"][group], x_col, y_col, group, group_color,
            marker=_dim_red_marker(8, 0.8, color=group_color, symbol="circle"),
        )
    _save_dim_red_figure(fig, x_col, y_col, f"{path_save}/{path_local}/dim_red/{method}")

    # Long-livers drawn separately: circles for the "NoLL" subset, larger
    # "x-dot" symbols (hidden from the legend) for the "LL" subset.
    fig = go.Figure()
    for group in problem["Color"]:
        group_color = problem["Color"][group]
        _add_dim_red_trace(
            fig, problem["FilterNoLL"][group], x_col, y_col, group, group_color,
            marker=_dim_red_marker(8, 0.75, color=group_color, symbol="circle"),
        )
        _add_dim_red_trace(
            fig, problem["FilterLL"][group], x_col, y_col, group, problem["ColorLL"][group],
            marker=_dim_red_marker(12, 0.75, symbol="x-dot"),
            showlegend=False,
        )
    _save_dim_red_figure(fig, x_col, y_col, f"{path_save}/{path_local}/dim_red/{method}_ll")

### Obtain entrez genes lists with possible synonyms

In [None]:
pathlib.Path(f"{path_save}/{path_local}/genes_synonyms").mkdir(parents=True, exist_ok=True)

# Query MyGene for every selected gene and collect the returned hit tables.
mg = mygene.MyGeneInfo()
print(f"genes_dmps_selected: {len(genes_dmps_selected)}")
df_queries_all = []
genes_missed = []
number_of_synonyms = 0
for gene in tqdm(genes_dmps_selected):
    df_query = mg.query(gene, scopes='entrezgene', species='human', as_dataframe=True)
    if df_query.empty:
        # No hit at all: remember the gene so it can be reviewed manually.
        genes_missed.append(gene)
        continue
    df_queries_all.append(df_query)
    # Guard against a hit table that has no "symbol" column, which would
    # otherwise abort the whole loop with a KeyError.
    symbols_found = list(df_query["symbol"].values) if "symbol" in df_query.columns else []
    if gene not in set(symbols_found):
        # The queried name is not among the returned symbols: count it as a synonym.
        number_of_synonyms += 1
        print(f"{gene} not in {symbols_found}")
print(f"Total number of synonyms: {number_of_synonyms}")

# pd.concat raises on an empty list, which happens when no gene gets a hit
# (e.g. nothing passed the DMP thresholds); fall back to an empty table.
if df_queries_all:
    my_gene_all = pd.concat(df_queries_all)
else:
    my_gene_all = pd.DataFrame(columns=["symbol"])
my_gene_all.to_excel(f"{path_save}/{path_local}/genes_synonyms/my_gene_all.xlsx", index=True)

genes_dmps_missed_df = pd.DataFrame({'gene': genes_missed})
genes_dmps_missed_df.to_excel(f"{path_save}/{path_local}/genes_synonyms/genes_mygene_missed.xlsx", index=False)

# Unique returned symbols in order of first appearance; hits without a symbol
# (NaN) are dropped so they do not end up in the gene list.
if "symbol" in my_gene_all.columns:
    genes_dmps_selected_all = list(my_gene_all["symbol"].dropna().drop_duplicates())
else:
    genes_dmps_selected_all = []
genes_dmps_selected_all_df = pd.DataFrame({'gene': genes_dmps_selected_all})
genes_dmps_selected_all_df.to_excel(f"{path_save}/{path_local}/genes_synonyms/genes_mygene_all.xlsx", index=False)

### Perform GSEA for selected gene libraries

In [None]:
pathlib.Path(f"{path_save}/{path_local}/GSEA").mkdir(parents=True, exist_ok=True)
# The list of libraries is written to the shared GSEA_libs folder; make sure
# it exists so the export below does not fail on a fresh dataset folder.
pathlib.Path(f"{path}/{platform}/{dataset}/special/034_central_vs_yakutia/GSEA_libs").mkdir(parents=True, exist_ok=True)

# Names of all Enrichr libraries available for human.
libraries = gp.get_library_name("Human")
df_libraries = pd.DataFrame(index=libraries)
df_libraries.to_excel(f"{path}/{platform}/{dataset}/special/034_central_vs_yakutia/GSEA_libs/libraries.xlsx", index=True)

# Run Enrichr against every library; cutoff=1.00 is passed so the per-library
# results are not filtered here. Per-library tables are gathered in a separate
# list so `dfs_enrichr` is only ever a DataFrame.
enrichr_results = []
for genes_list in libraries:
    pathlib.Path(f"{path_save}/{path_local}/GSEA/{genes_list}").mkdir(parents=True, exist_ok=True)
    df_enrichr = gp.enrichr(
        gene_list=genes_dmps_selected,
        gene_sets=genes_list,
        organism='Human',
        outdir=f"{path_save}/{path_local}/GSEA/{genes_list}",
        cutoff=1.00,
        verbose=True,
        no_plot=True
    )
    enrichr_results.append(df_enrichr.results)
dfs_enrichr = pd.concat(enrichr_results)
dfs_enrichr.to_excel(f"{path_save}/{path_local}/GSEA/results.xlsx", index=True)
dfs_enrichr.to_pickle(f"{path_save}/{path_local}/GSEA/results.pkl")

### Checking GSEA elements

In [None]:
# "response to cold (GO:0009409)" is a GO biological-process term, so it is
# looked up in the Biological Process library (the Molecular Function library
# would not contain this key). The cell displays the size of the gene set.
library_dict = gp.parser.get_library('GO_Biological_Process_2021', organism='Human')
len(library_dict["response to cold (GO:0009409)"])

In [None]:
# Number of selected DMP genes that belong to the BioCarta telomere term.
library_dict = gp.parser.get_library('BioCarta_2015', organism='Human')
telomere_term_genes = set(library_dict["telomeres telomerase cellular aging and immortality"])
len(telomere_term_genes & set(genes_dmps_selected))

### Plot significant GSEA terms

In [None]:
# Each file lists (column "library") the Enrichr libraries that are shown
# together on one bar plot.
libraries_file = [
    "libraries_target_GO_Biological_Process",
    "libraries_target_GO_Cellular_Component",
    "libraries_target_GO_Molecular_Function",
    "libraries_target_nonGO",
]

gsea_cols = ["Gene_set", "Term", "Overlap", "P-value", "Adjusted P-value", "Odds Ratio", "Combined Score"]
gsea_log_pval_col = r'$ -\log_{10}(\mathrm{p-value})$'
gsea_palette = list(px.colors.qualitative.Alphabet) + list(px.colors.qualitative.Dark24) + list(px.colors.qualitative.Light24)

# The full Enrichr table does not depend on the library file: read it once.
dfs_enrichr_all = pd.read_pickle(f"{path_save}/{path_local}/GSEA/results.pkl")

for library_file in libraries_file:
    libraries_target = pd.read_excel(f"{path}/{platform}/{dataset}/special/034_central_vs_yakutia/GSEA_libs/{library_file}.xlsx")["library"].values

    # Significant terms (adjusted p-value < 0.05) from the target libraries.
    # reset_index returns a fresh frame, so the column added below is not
    # written into a slice of the full table.
    is_target_term = (dfs_enrichr_all["Adjusted P-value"] < 0.05) & (dfs_enrichr_all["Gene_set"].isin(libraries_target))
    dfs_enrichr = dfs_enrichr_all.loc[is_target_term, gsea_cols].reset_index(drop=True)

    if dfs_enrichr.empty:
        # Nothing significant for this set of libraries: no table, no figure.
        continue

    dfs_enrichr[gsea_log_pval_col] = -np.log10(dfs_enrichr.loc[:, 'Adjusted P-value'].values)
    dfs_enrichr = dfs_enrichr.rename(columns={'Gene_set': 'Gene Library'})
    dfs_enrichr.to_excel(f"{path_save}/{path_local}/GSEA/terms_{library_file}.xlsx")

    # One horizontal bar per term; figure height grows with the number of terms.
    plt.figure(figsize=(10, 0.5 * dfs_enrichr.shape[0]))
    sns.set_theme(style='whitegrid', font_scale=2)
    bar = sns.barplot(
        data=dfs_enrichr,
        hue="Gene Library",
        y=dfs_enrichr.index,
        x=gsea_log_pval_col,
        palette=gsea_palette,
        edgecolor='black',
        orient="h",
        dodge=False
    )
    # Bars are positioned by row number; show the term names instead.
    bar.set_yticklabels(dfs_enrichr["Term"])
    sns.move_legend(bar, "upper left", bbox_to_anchor=(1, 1))
    plt.savefig(f"{path_save}/{path_local}/GSEA/terms_{library_file}.png", bbox_inches='tight')
    plt.savefig(f"{path_save}/{path_local}/GSEA/terms_{library_file}.pdf", bbox_inches='tight')
    plt.close()

### Plot upset plots for target terms

In [None]:
genes = "origin"

# Table of target terms; the columns read below are "keyword", "library",
# "term" and "code".
df_upset_terms = pd.read_excel(f"{path}/{platform}/{dataset}/special/034_central_vs_yakutia/GSEA_libs/gene_lists/enrichr.xlsx")

# Topic name (used in output file names) -> keywords selecting its terms.
target_terms_dict = {
    "aging": ["aging", "longevity"],
    "nutrition": ["dietary", "food", "starvation"],
    "cold_adaptation": ["cold", "temperature", "thermogenesis"]
}

# Each Enrichr library is fetched once and reused for every row and topic
# that refers to it, instead of being downloaded again per row.
enrichr_library_cache = {}

for terms_name, target_terms in target_terms_dict.items():

    df_upset_terms_target = df_upset_terms.loc[df_upset_terms["keyword"].isin(target_terms), :]
    dict_upset_gene_lists = {"Central VS Yakutia": genes_dmps_selected}
    for ind, row in df_upset_terms_target.iterrows():
        print(f"{row['library']} {row['term']}")
        if row['library'] not in enrichr_library_cache:
            enrichr_library_cache[row['library']] = gp.parser.get_library(row['library'], organism='Human')
        library_dict = enrichr_library_cache[row['library']]
        # Terms sharing the same code are merged into one gene set.
        term_code = f"{row['code']}"
        term_genes = library_dict[row['term']]
        if term_code not in dict_upset_gene_lists:
            dict_upset_gene_lists[term_code] = term_genes
        else:
            dict_upset_gene_lists[term_code] = list(set(dict_upset_gene_lists[term_code]).union(set(term_genes)))

    # Boolean membership table (gene x gene set), then moved into the index
    # before being handed to UpSet.
    upset_genes_all = list(set().union(*list(dict_upset_gene_lists.values())))
    df_upset = pd.DataFrame(index=upset_genes_all)
    for k, v in dict_upset_gene_lists.items():
        df_upset[k] = df_upset.index.isin(v)
    df_upset = df_upset.set_index(list(dict_upset_gene_lists.keys()))
    upset_canvas = plt.figure(figsize=(85, 15))
    upset_fig = upset.UpSet(df_upset, subset_size='count', show_counts=True, min_degree=1, element_size=None, totals_plot_elements=5).plot(upset_canvas)
    plt.savefig(f"{path_save}/{path_local}/GSEA/upset_{terms_name}.png", bbox_inches='tight')
    plt.savefig(f"{path_save}/{path_local}/GSEA/upset_{terms_name}.pdf", bbox_inches='tight')
    plt.close()

### Plot region enrichment

In [None]:
pathlib.Path(f"{path_save}/{path_local}/region_enrichment").mkdir(parents=True, exist_ok=True)

# Category values tested for each annotation variable, in display order.
# NOTE(review): 'CHR' covers the labels '1'..'23' only - confirm this matches
# how sex chromosomes are labelled in the manifest.
orders = {
    'CHR': list(map(str, range(1, 24))),
    'RELATION_TO_UCSC_CPG_ISLAND': ['S_Shelf', 'S_Shore', 'Island', 'N_Shore', 'N_Shelf', 'OpenSea'],
    'UCSC_REFGENE_GROUP': ['TSS1500', 'TSS200', "5'UTR", '1stExon', 'Body', "3'UTR"],
}
# Annotation variable -> name of the manifest column holding it.
col_names = {
    'CHR': "CHR",
    'RELATION_TO_UCSC_CPG_ISLAND': "Relation_to_Island",
    'UCSC_REFGENE_GROUP': "UCSC_RefGene_Group",
}
# Per-variable figure size and colours (presumably consumed by the plotting
# code further down the cell).
fig_sizes = {
    'CHR': (17, 10),
    'RELATION_TO_UCSC_CPG_ISLAND': (5, 10),
    'UCSC_REFGENE_GROUP': (5, 10),
}
colors = {
    'CHR': px.colors.qualitative.Dark24,
    'RELATION_TO_UCSC_CPG_ISLAND': px.colors.qualitative.Light24[17:23],
    'UCSC_REFGENE_GROUP': px.colors.qualitative.Light24[11:17],
}

# Manifest rows for the selected DMPs ("target"), for every tested CpG
# ("global"), and for the tested CpGs that were not selected ("padding").
selected_cpgs = df_dmps_selected.index.values
df_dmps_fisher_target = manifest.loc[selected_cpgs, :]
df_dmps_fisher_global = manifest.loc[df_dmps.index.values, :]
is_selected_cpg = df_dmps_fisher_global.index.isin(selected_cpgs)
df_dmps_fisher_padding = df_dmps_fisher_global.loc[~is_selected_cpg, :]
for var in orders:
    columns=["11", "12", "21", "22", "sum", "pval", "odds_ratio"]
    df_var = pd.DataFrame(index=orders[var], columns=columns, data=np.zeros((len(orders[var]), len(columns))))
    df_var.index.name = col_names[var].replace("_", " ")
    for var_val in orders[var]:
        contingency_table = pd.DataFrame(index=["specific", "non-specific"], columns=["in_val", "not_in_val"])
        contingency_table.at["specific", "in_val"] = df_dmps_fisher_target.loc[df_dmps_fisher_target[col_names[var]] == var_val, :].shape[0]
        contingency_table.at["specific", "not_in_val"] = df_dmps_fisher_target.loc[df_dmps_fisher_target[col_names[var]] != var_val, :].shape[0]
        contingency_table.at["non-specific", "in_val"] = df_dmps_fisher_padding.loc[df_dmps_fisher_padding[col_names[var]] == var_val, :].shape[0]
        contingency_table.at["non-specific", "not_in_val"] = df_dmps_fisher_padding.loc[df_dmps_fisher_padding[col_names[var]] != var_val, :].shape[0]
        df_var.at[var_val, "11"] = contingency_table.at["specific", "in_val"]
        df_var.at[var_val, "12"] = contingency_table.at["specific", "not_in_val"]
        df_var.at[var_val, "21"] = contingency_table.at["non-specific", "in_val"]
        df_var.at[var_val, "22"] = contingency_table.at["non-specific", "not_in_val"]
        df_var.at[var_val, "sum"] = contingency_table.values.sum()
        odds_ratio, pval = stats.fisher_exact(contingency_table.to_numpy(), alternative='two-sided')
        if np.isnan(odds_ratio):
            odds_ratio = 1.0
        df_var.at[var_val, "odds_ratio"], df_var.at[var_val, "pval"] = odds_ratio, pval
    _, df_var['pval_fdr_bh'], _, _ = multipletests(df_var['pval'].values, 0.05, method='fdr_bh')
    df_var[r'$ \log_{10}(\mathrm{Odds\ ratio})$'] = np.log10(df_var.loc[:, 'odds_ratio'].values)
    df_var[r'$ -\log_{10}(\mathrm{p-value})$'] = -np.log10(df_var.loc[:, 'pval_fdr_bh'].values)
    df_var.to_excel(f"{path_save}/{path_local}/region_enrichment/fisher_{var}.xlsx")

    # Draw the log10 odds ratios of the current annotation twice:
    #   "color" - bars shaded by -log10(FDR-adjusted p-value), with a colorbar;
    #   "cross" - bars coloured from colors[var], hatched when FDR-adjusted p < 0.05.
    for pval_show_type in ["color", "cross"]:
        plt.figure(figsize=fig_sizes[var])
        plt.xticks(rotation=90)
        sns.set_theme(style='whitegrid', font_scale=2)
        if pval_show_type == "color":
            # The scatter is only a mappable for the colorbar: the figure is cleared
            # immediately after it is drawn and the colorbar is built from its handle.
            plot = plt.scatter(df_var.index, df_var.loc[:, r'$ \log_{10}(\mathrm{Odds\ ratio})$'].values, c=df_var.loc[:, r'$ -\log_{10}(\mathrm{p-value})$'].values, cmap='Reds')
            plt.clf()
            cbar = plt.colorbar(plot)
            plt.xticks(rotation=90)
            cbar.set_label(r"$-\log_{10}(\mathrm{p-value})$", horizontalalignment='center')
            ax = sns.barplot(data=df_var, x=df_var.index, y=r'$ \log_{10}(\mathrm{Odds\ ratio})$', hue=r'$ -\log_{10}(\mathrm{p-value})$', palette='Reds', dodge=False, edgecolor='black')
            # The hue legend is dropped; the colorbar above is the only colour key.
            ax.legend_.remove()

        else:
            bar = sns.barplot(data=df_var, x=df_var.index, y=r'$ \log_{10}(\mathrm{Odds\ ratio})$', palette=colors[var], edgecolor='black')
            # Patches are matched to df_var rows by position.
            # NOTE(review): assumes bar.patches follows the df_var row order - confirm.
            for bar_index, this_bar in enumerate(bar.patches):
                if df_var.at[df_var.index[bar_index], "pval_fdr_bh"] < 0.05:
                    this_bar.set_hatch('x')
                this_bar.set_edgecolor('skyblue')
        plt.savefig(f"{path_save}/{path_local}/region_enrichment/fisher_{var}_{pval_show_type}.png", bbox_inches='tight')
        plt.savefig(f"{path_save}/{path_local}/region_enrichment/fisher_{var}_{pval_show_type}.pdf", bbox_inches='tight')
        plt.close()

### Plot examples

In [None]:
# Number of CpGs (smallest adjusted p-value first) that get example figures.
n_top = 10
dist_num_bins = 25  # replaced by 15 at the start of every loop iteration below
examples_dir = pathlib.Path(f"{path_save}/{path_local}/examples")
examples_dir.mkdir(parents=True, exist_ok=True)
df_dmps_top = df_dmps.sort_values(['adj.P.Val'], ascending=[True]).head(n_top)

# One entry per violin of the residence figure:
# (key into problem["FilterXTD"] / problem["ColorXTD"], also used as trace name;
#  x-axis category; outline colour; extra Violin arguments for the split halves).
residence_violins = (
    ('Central', 'Central', 'black', {}),
    (
        "Yakutia (Village)", "Yakutia", 'deepskyblue',
        dict(legendgroup="Yakutia (Village)", scalegroup="Yakutia", side='negative', scalemode="width", pointpos=-1.5),
    ),
    (
        "Yakutia (City)", "Yakutia", 'black',
        dict(legendgroup="Yakutia (City)", scalegroup="Yakutia", scalemode="width", side='positive', pointpos=1.5),
    ),
)

for cpg_id, (cpg, row) in enumerate(df_dmps_top.iterrows()):
    pval, log_fc = row['adj.P.Val'], row['logFC']
    gene = manifest.at[cpg, 'Gene']

    # Plot with residence ==========================================================
    # Central as a full violin; the two Yakutia residences share one x category
    # and are drawn as the negative/positive halves of a split violin.
    dist_num_bins = 15
    fig = go.Figure()
    for residence, x_category, outline_color, split_kwargs in residence_violins:
        vals = df_dnam.loc[problem["FilterXTD"][residence], cpg].values
        residence_color = problem["ColorXTD"][residence]
        fig.add_trace(
            go.Violin(
                x=[x_category] * len(vals),
                y=vals,
                name=residence,
                box_visible=True,
                meanline_visible=True,
                showlegend=True,
                line_color=outline_color,
                fillcolor=residence_color,
                marker=dict(color=residence_color, line=dict(color=outline_color, width=0.3), opacity=0.8),
                points='all',
                # KDE bandwidth: value range divided by dist_num_bins.
                bandwidth=np.ptp(vals) / dist_num_bins,
                opacity=0.8,
                **split_kwargs
            )
        )
    add_layout(fig, "", "Methylation", f"{cpg} ({gene})<br>p-value: {pval:0.2e}<br>log(Fold Change): {log_fc:0.2e}")
    fig.update_layout(title_xref='paper', title={'y': 0.87})
    fig.update_layout(legend_font_size=25)
    fig.update_layout(legend={'itemsizing': 'constant'})
    fig.update_xaxes(tickfont_size=25)
    fig.update_layout(
        violingap=0.35,
        violingroupgap=0.35,
        width=850,
        height=650,
        margin=go.layout.Margin(l=150, r=50, b=55, t=150, pad=0)
    )
    fig.update_layout(legend_y=1.25)
    fig.update_layout(yaxis=dict(tickfont=dict(size=23)))
    save_figure(fig, f"{path_save}/{path_local}/examples/{cpg_id}_{cpg}_residence")

    # Plot without longevity ===================================================
    # One full violin per group of problem["Filter"], for the current CpG.
    fig = go.Figure()
    for group, group_filter in problem["Filter"].items():
        vals = df_dnam.loc[group_filter, cpg].values
        group_color = problem["Color"][group]
        violin_kwargs = dict(
            y=vals,
            name=group,
            box_visible=True,
            meanline_visible=True,
            showlegend=False,
            line_color='black',
            fillcolor=group_color,
            marker=dict(color=group_color, line=dict(color='black', width=0.3), opacity=0.8),
            points='all',
            # KDE bandwidth: value range divided by dist_num_bins.
            bandwidth=np.ptp(vals) / dist_num_bins,
            opacity=0.8,
        )
        fig.add_trace(go.Violin(**violin_kwargs))
    add_layout(fig, "", "Methylation", f"{cpg} ({gene})<br>p-value: {pval:0.2e}<br>log(Fold Change): {log_fc:0.2e}")
    fig.update_layout(title_xref='paper', title={'y': 0.95})
    fig.update_layout(legend_font_size=25)
    fig.update_layout(legend={'itemsizing': 'constant'})
    fig.update_xaxes(tickfont_size=25)
    fig.update_layout(
        violingap=0.35,
        violingroupgap=0.35,
        width=850,
        height=615,
        margin=go.layout.Margin(l=150, r=50, b=75, t=115, pad=0)
    )
    save_figure(fig, f"{path_save}/{path_local}/examples/{cpg_id}_{cpg}")

    # Plot with longevity ======================================================
    # Split violin per group: the negative half shows every sample of the group
    # (problem["Filter"]), the positive half only the problem["FilterLL"] subset.
    fig = go.Figure()
    for group in problem["Filter"]:
        vals = df_dnam.loc[problem["Filter"][group], cpg].values
        lls = df_dnam.loc[problem["FilterLL"][group], cpg].values
        # (values, outline colour, fill colour, violin side, point offset)
        violin_halves = (
            (vals, 'black', problem["Color"][group], 'negative', -1.5),
            (lls, 'orange', problem["ColorLL"][group], 'positive', 1.5),
        )
        for half_values, half_outline, half_fill, half_side, half_pointpos in violin_halves:
            fig.add_trace(
                go.Violin(
                    y=half_values,
                    name=group,
                    box_visible=True,
                    meanline_visible=True,
                    showlegend=False,
                    line_color=half_outline,
                    fillcolor=half_fill,
                    marker=dict(color=half_fill, line=dict(color='black', width=0.3), opacity=0.8),
                    points='all',
                    bandwidth=np.ptp(half_values) / dist_num_bins,
                    opacity=0.8,
                    legendgroup=group,
                    scalegroup=group,
                    side=half_side,
                    scalemode="width",
                    pointpos=half_pointpos
                )
            )

    add_layout(fig, "", "Methylation", f"{cpg} ({gene})<br>p-value: {pval:0.2e}<br>log(Fold Change): {log_fc:0.2e}")
    fig.update_layout(title_xref='paper', title={'y': 0.95})
    fig.update_layout(legend_font_size=25)
    fig.update_layout(legend={'itemsizing': 'constant'})
    fig.update_xaxes(tickfont_size=25)
    fig.update_layout(
        violingap=0.35,
        violingroupgap=0.35,
        width=850,
        height=615,
        margin=go.layout.Margin(l=150, r=50, b=75, t=115, pad=0)
    )
    save_figure(fig, f"{path_save}/{path_local}/examples/{cpg_id}_{cpg}_ll")

## 4. ChAMP Residence DMPs

In [None]:
# Selection thresholds: adjusted p-value below pval_lim and |logFC| above fc_lim.
pval_lim = 0.05
fc_lim = 0.001
path_local = f"dnam/05_DMPs_yakutia_residence/pval({pval_lim:0.2e})_fc({fc_lim:0.2e})"
pathlib.Path(f"{path_save}/{path_local}").mkdir(parents=True, exist_ok=True)

# DMP table read from the CSV in data_for_R, indexed by CpG id.
df_dmps = pd.read_csv(f"{path_save}/dnam/data_for_R/DMP_yakutia_residence.csv", index_col="CpG")
df_dmps["CpG"] = df_dmps.index.values
df_dmps.sort_values(["adj.P.Val"], ascending=[True], inplace=True)
# Display label "<CpG> (<gene>)" and -log10 of the adjusted p-value.
df_dmps['print'] = df_dmps.apply(lambda row: f"{row['CpG']} ({row['gene']})", axis=1)
df_dmps['log_pval'] = -np.log10(df_dmps["adj.P.Val"])

# sort_values() without inplace returns a new frame, so the selection is an
# independent object rather than a slice of df_dmps that is then mutated
# in place (which triggers pandas' SettingWithCopyWarning).
df_dmps_selected = df_dmps.loc[
    (df_dmps["adj.P.Val"] < pval_lim) & (df_dmps["logFC"].abs() > fc_lim), :
].sort_values(["adj.P.Val"], ascending=[True])
top_to_hightlight = df_dmps_selected["print"].values[0:2]
df_dmps_selected.to_excel(f"{path_save}/{path_local}/selected.xlsx")

# Unique gene symbols annotated to the selected CpGs. The manifest 'Gene' field
# is split on ';'; non-string entries (e.g. NaN) are skipped.
genes_dmps_selected = set()
for cpg in df_dmps_selected.index.values:
    genes_raw = manifest.at[cpg, 'Gene']
    if isinstance(genes_raw, str):
        genes_dmps_selected.update(genes_raw.split(';'))
# Placeholder entries are not genes; discard() is a no-op when they are absent.
genes_dmps_selected.discard('non-genic')
genes_dmps_selected.discard(' ')
# Sorted so that the list (and genes.xlsx) is identical between runs instead of
# following the arbitrary iteration order of a set of strings.
genes_dmps_selected = sorted(genes_dmps_selected)
genes_dmps_df = pd.DataFrame({'gene': genes_dmps_selected})
genes_dmps_df.to_excel(f"{path_save}/{path_local}/genes.xlsx", index=False)
print(f"Number of CpGs: {df_dmps_selected.shape[0]}")
print(f"Number of genes: {genes_dmps_df.shape[0]}")

# Immunology data

## Setup

In [None]:
# Age cut-off: participants older than ll_thld form the "LL" subgroup.
ll_thld = 85
# Plot colours and boolean row masks over df_immuno, keyed by region.
problem = {
    # Fill colour of each region.
    "Color": {
        "Central": "limegreen",
        "Yakutia": "royalblue",
    },
    # Fill colour of each region's LL subgroup.
    "ColorLL": {
        "Central": "darkgreen",
        "Yakutia": "darkblue",
    },
    # All participants of a region.
    "Filter": {
        "Central": df_immuno["Region"] == "Central",
        "Yakutia": df_immuno["Region"] == "Yakutia",
    },
    # Participants of a region with Age strictly above ll_thld.
    "FilterLL": {
        "Central": (df_immuno["Region"] == "Central") & (df_immuno["Age"] > ll_thld),
        "Yakutia": (df_immuno["Region"] == "Yakutia") & (df_immuno["Age"] > ll_thld),
    },
    # Remaining participants of a region. `<=` makes this the exact complement of
    # "FilterLL"; with a strict `<`, anyone aged exactly ll_thld belonged to
    # neither subset and disappeared from plots that draw NoLL + LL.
    "FilterNoLL": {
        "Central": (df_immuno["Region"] == "Central") & (df_immuno["Age"] <= ll_thld),
        "Yakutia": (df_immuno["Region"] == "Yakutia") & (df_immuno["Age"] <= ll_thld),
    },
    # Reference region: row mask and its name.
    "BaseFilter": (df_immuno["Region"] == "Central"),
    "BasePart": "Central"
}

## 1. Data description

### Participants figure

In [None]:
# Output sub-folder for the participants description; the immunology table
# is written there unchanged.
path_local = "immuno/01_data_description/participants"
participants_dir = pathlib.Path(f"{path_save}/{path_local}")
participants_dir.mkdir(parents=True, exist_ok=True)
df_immuno.to_excel(f"{path_save}/{path_local}/df_immuno.xlsx")

#### Immuno age distribution in regions

In [None]:
# Common binning for all three histograms: 20 bins spanning the observed age range.
age_values = df_immuno.loc[:, "Age"]
hist_min = age_values.min()
hist_max = age_values.max()
hist_width = hist_max - hist_min
hist_n_bins = 20
hist_bin_width = hist_width / hist_n_bins

# Arguments shared by every histplot call in this cell.
hist_kwargs = dict(
    bins=hist_n_bins,
    binrange=(hist_min, hist_max),
    binwidth=hist_bin_width,
    discrete=False,
    edgecolor='k',
    linewidth=1,
    x="Age",
)

# Age distribution of the whole cohort, coloured by region.
fig = plt.figure()
sns.set_theme(style='whitegrid')
sns.histplot(data=df_immuno, hue="Region", palette=problem["Color"], **hist_kwargs)
plt.savefig(f"{path_save}/{path_local}/hist_region.png", bbox_inches='tight', dpi=400)
plt.savefig(f"{path_save}/{path_local}/hist_region.pdf", bbox_inches='tight')
plt.close(fig)

# Age distribution by sex, one figure per region; legend entries carry the
# number of participants of each sex in that region.
for region in ["Central", "Yakutia"]:
    in_region = df_immuno["Region"] == region
    n_female = df_immuno.loc[(df_immuno['Sex'] == 'F') & in_region].shape[0]
    n_male = df_immuno.loc[(df_immuno['Sex'] == 'M') & in_region].shape[0]

    fig = plt.figure()
    sns.set_theme(style='whitegrid')
    sns.histplot(
        data=df_immuno.loc[in_region, :],
        hue="Sex",
        hue_order=["M", "F"],
        palette={"M": "blue", "F": "red"},
        **hist_kwargs
    )
    plt.legend(title='Sex', loc='upper left', labels=[f"F ({n_female})", f"M ({n_male})"])
    plt.title(f"Sex distribution in {region} region")
    plt.savefig(f"{path_save}/{path_local}/hist_sex_{region.lower()}.png", bbox_inches='tight', dpi=400)
    plt.savefig(f"{path_save}/{path_local}/hist_sex_{region.lower()}.pdf", bbox_inches='tight')
    plt.close(fig)

### Features

In [None]:
path_local = "immuno/01_data_description/feats"
pathlib.Path(f"{path_save}/{path_local}").mkdir(parents=True, exist_ok=True)

# Pairwise Pearson correlation between Age and every immunology feature.
# A single square matrix stores both results of each pair:
#   upper triangle (row before column) - correlation coefficient,
#   lower triangle (row after column)  - p-value,
#   diagonal                           - NaN.
feats_plot = ["Age"] + list(feats_immuno)
df_immuno_corr_mtx = pd.DataFrame(data=np.zeros(shape=(len(feats_plot), len(feats_plot))), index=feats_plot, columns=feats_plot)
for f_id_1 in range(len(feats_plot)):
    for f_id_2 in range(f_id_1, len(feats_plot)):
        f_1 = feats_plot[f_id_1]
        f_2 = feats_plot[f_id_2]
        if f_id_1 != f_id_2:
            vals_1 = df_immuno.loc[:, f_1].values
            vals_2 = df_immuno.loc[:, f_2].values
            corr, pval = stats.pearsonr(vals_1, vals_2)
            df_immuno_corr_mtx.at[f_2, f_1] = pval
            df_immuno_corr_mtx.at[f_1, f_2] = corr
        else:
            df_immuno_corr_mtx.at[f_2, f_1] = np.nan
# Boolean mask of the strictly lower triangle (the p-values). The builtin `bool`
# is used because the `np.bool` alias was removed in NumPy 1.24 and raises
# AttributeError there.
selection = np.tri(df_immuno_corr_mtx.shape[0], df_immuno_corr_mtx.shape[1], -1, dtype=bool)
# Long table of the lower-triangle p-values (row label, column label, p-value).
df_fdr = df_immuno_corr_mtx.where(selection).stack().reset_index()
df_fdr.columns = ['row', 'col', 'pval']
# Benjamini-Hochberg correction over all feature pairs.
_, df_fdr['pval_fdr_bh'], _, _ = multipletests(df_fdr.loc[:, 'pval'].values, 0.05, method='fdr_bh')
# Copy of the matrix whose lower triangle holds -log10(FDR-adjusted p-value)
# instead of the raw p-value; the upper triangle keeps the correlations.
df_immuno_corr_mtx_fdr = df_immuno_corr_mtx.copy()
for line_id in range(df_fdr.shape[0]):
    df_immuno_corr_mtx_fdr.loc[df_fdr.at[line_id, 'row'], df_fdr.at[line_id, 'col']] = -np.log10(
        df_fdr.at[line_id, 'pval_fdr_bh'])

# Heatmap of df_immuno_corr_mtx_fdr: the upper triangle (correlation coefficients)
# and the lower triangle (-log10 of FDR-adjusted p-values) are drawn as two
# images on the same axes, each with its own colormap and colorbar.
sns.set_theme(style='whitegrid')
df_to_plot = df_immuno_corr_mtx_fdr.copy()
mtx_to_plot = df_to_plot.to_numpy()

# Strictly upper triangle; every other cell becomes 0 and is masked out
# (an exactly-zero value inside the triangle is masked as well).
mtx_triu = np.triu(mtx_to_plot, +1)
max_corr = np.max(mtx_triu)
min_corr = np.min(mtx_triu)
mtx_triu_mask = np.ma.masked_array(mtx_triu, mtx_triu == 0)
cmap_triu = plt.get_cmap("bwr").copy()

# Strictly lower triangle, masked the same way. Values below the colour scale
# minimum (set to -log10(0.05) in imshow below) are painted black.
mtx_tril = np.tril(mtx_to_plot, -1)
mtx_tril_mask = np.ma.masked_array(mtx_tril, mtx_tril == 0)
cmap_tril = plt.get_cmap("viridis").copy()
cmap_tril.set_under('black')

fig, ax = plt.subplots()

# Upper triangle: colour scale fixed to [-1, 1].
im_triu = ax.imshow(mtx_triu_mask, cmap=cmap_triu, vmin=-1, vmax=1)
cbar_triu = ax.figure.colorbar(im_triu, ax=ax, location='right')
cbar_triu.set_label(r"$\mathrm{Correlation\:coefficient}$", horizontalalignment='center', fontsize=10)

# Lower triangle: colour scale starts at -log10(0.05).
im_tril = ax.imshow(mtx_tril_mask, cmap=cmap_tril, vmin=-np.log10(0.05))
cbar_tril = ax.figure.colorbar(im_tril, ax=ax, location='right')
cbar_tril.set_label(r"$-\log_{10}(\mathrm{p-value})$", horizontalalignment='center', fontsize=10)

ax.grid(None)
ax.set_aspect("equal")
ax.set_xticks(np.arange(df_to_plot.shape[1]))
ax.set_yticks(np.arange(df_to_plot.shape[0]))
ax.set_xticklabels(df_to_plot.columns.values)
ax.set_yticklabels(df_to_plot.index.values)
plt.setp(ax.get_xticklabels(), rotation=90)
# Half of the value range of the lower triangle; used below to choose the
# colour of the text written into lower-triangle cells.
threshold = np.ptp(mtx_tril.flatten()) * 0.5
ax.tick_params(axis='both', which='major', labelsize=5)
ax.tick_params(axis='both', which='minor', labelsize=5)
textcolors = ("black", "white")
# Write each cell's value (2 decimals) into the cell; NaN/inf cells get empty text.
for i in range(df_to_plot.shape[0]):
    for j in range(df_to_plot.shape[1]):
        color = "black"
        if i > j:
            # Lower triangle: white text when the value is below the threshold, black otherwise.
            color = textcolors[int(mtx_tril[i, j] < threshold)]
        if np.isinf(mtx_to_plot[i, j]) or np.isnan(mtx_to_plot[i, j]):
            text = ax.text(j, i, f"", ha="center", va="center", color=color, fontsize=1.3)
        else:
            text = ax.text(j, i, f"{mtx_to_plot[i, j]:0.2f}", ha="center", va="center", color=color, fontsize=1.3)
fig.tight_layout()
plt.savefig(f"{path_save}/{path_local}/corr_mtx_fdr.png", bbox_inches='tight', dpi=400)
plt.savefig(f"{path_save}/{path_local}/corr_mtx_fdr.pdf", bbox_inches='tight', dpi=400)
# NOTE(review): clf() clears the figure but does not close it; plt.close(fig)
# would release it.
plt.clf()
# The plotted matrix is also saved as a table.
df_save = df_immuno_corr_mtx_fdr
df_save.to_excel(f"{path_save}/{path_local}/corr_mtx_fdr.xlsx", index=True)

## 2. SImAge Results

In [None]:
# Attach the SImAge estimates to df_immuno, aligned on the sample index.
# NOTE(review): absolute, machine-specific path - consider deriving it from `path`.
simage_df = pd.read_excel(
    "D:/YandexDisk/Work/pydnameth/draft/07_central_vs_yakutia/SImAge/df.xlsx",
    index_col="index",
)
immuno_sample_ids = df_immuno.index.values
df_immuno.loc[immuno_sample_ids, "SImAge"] = simage_df.loc[immuno_sample_ids, "Estimation"]

In [None]:
path_local = "immuno/02_age_accelerations"
pathlib.Path(f"{path_save}/{path_local}").mkdir(parents=True, exist_ok=True)

# Linear model SImAge ~ Age fitted on the reference region only
# (problem["BasePart"]) and then applied to every participant. The age
# acceleration is the residual SImAge - predicted SImAge.
# The former `y_pred = model.predict(pheno)` line was removed: its result was
# never used and `pheno` is not defined anywhere in the immunology section.
formula = f"SImAge ~ Age"
model = smf.ols(formula=formula, data=df_immuno.loc[df_immuno["Region"] == problem["BasePart"]]).fit()
df_immuno[f"SImAge_linear_pred"] = model.predict(df_immuno)
df_immuno[f"SImAgeAcc"] = df_immuno["SImAge"] - df_immuno[f"SImAge_linear_pred"]

# Summary table: row "SImAgeAcc" covers all participants of each region, row
# "SImAgeAccLL" only the problem["FilterLL"] subset. Columns: mean, median,
# quartiles and IQR per region, plus a Mann-Whitney p-value between regions.
df_aa = pd.DataFrame()
vals = {}  # region -> acceleration values, all participants
lls = {}   # region -> acceleration values, LL subset
for group in problem["Filter"]:

    vals[group] = df_immuno.loc[problem["Filter"][group], f"SImAgeAcc"].values
    df_aa.at[f"SImAgeAcc", f"mean_{group}"] = np.mean(vals[group])
    df_aa.at[f"SImAgeAcc", f"median_{group}"] = np.median(vals[group])
    df_aa.at[f"SImAgeAcc", f"q75_{group}"], df_aa.at[f"SImAgeAcc", f"q25_{group}"] = np.percentile(vals[group], [75 ,25])
    df_aa.at[f"SImAgeAcc", f"iqr_{group}"] = df_aa.at[f"SImAgeAcc", f"q75_{group}"] - df_aa.at[f"SImAgeAcc", f"q25_{group}"]
    print(f"{group}: {len(vals[group])}")

    lls[group] = df_immuno.loc[problem["FilterLL"][group], f"SImAgeAcc"].values
    df_aa.at[f"SImAgeAccLL", f"mean_{group}"] = np.mean(lls[group])
    df_aa.at[f"SImAgeAccLL", f"median_{group}"] = np.median(lls[group])
    df_aa.at[f"SImAgeAccLL", f"q75_{group}"], df_aa.at[f"SImAgeAccLL", f"q25_{group}"] = np.percentile(lls[group], [75 ,25])
    df_aa.at[f"SImAgeAccLL", f"iqr_{group}"] = df_aa.at[f"SImAgeAccLL", f"q75_{group}"] - df_aa.at[f"SImAgeAccLL", f"q25_{group}"]
    print(f"{group}: {len(lls[group])}")

# Two-sided Mann-Whitney U test between the regions (the dict values are
# unpacked as the two samples), for all participants and for the LL subset.
_, pval = mannwhitneyu(*vals.values(), alternative='two-sided')
df_aa.at[f"SImAgeAcc", "pval"] = pval

_, pval = mannwhitneyu(*lls.values(), alternative='two-sided')
df_aa.at[f"SImAgeAccLL", "pval"] = pval

df_aa.to_excel(f"{path_save}/{path_local}/aa.xlsx", index=True)

# Plot with longevity ==========================================================
# Split violin per region: the negative half shows all participants (vals),
# the positive half only the LL subset (lls).
dist_num_bins = 20
fig = go.Figure()
for group in problem["Filter"]:
    # (values, outline colour, fill colour, violin side, point offset)
    violin_halves = (
        (vals[group], 'black', problem["Color"][group], 'negative', -1.5),
        (lls[group], 'orange', problem["ColorLL"][group], 'positive', 1.5),
    )
    for half_values, half_outline, half_fill, half_side, half_pointpos in violin_halves:
        fig.add_trace(
            go.Violin(
                y=half_values,
                name=group,
                box_visible=True,
                meanline_visible=True,
                showlegend=False,
                line_color=half_outline,
                fillcolor=half_fill,
                marker=dict(color=half_fill, line=dict(color='black', width=0.3), opacity=0.8),
                points='all',
                bandwidth=np.ptp(half_values) / dist_num_bins,
                opacity=0.8,
                legendgroup=group,
                scalegroup=group,
                side=half_side,
                scalemode="width",
                pointpos=half_pointpos
            )
        )
# Both Mann-Whitney p-values go into the title.
pval_all = df_aa.at['SImAgeAcc', 'pval']
pval_ll = df_aa.at['SImAgeAccLL', 'pval']
add_layout(fig, "", "SImAge Acceleration", f"p-value (all): {pval_all:0.2e}<br>p-value ({ll_thld}+): {pval_ll:0.2e}")
fig.update_layout(title_xref='paper')
fig.update_layout(legend_font_size=20)
fig.update_layout(legend={'itemsizing': 'constant'})
fig.update_layout(
    violingap=0.35,
    violingroupgap=0.35,
    width=850,
    height=600,
    margin=go.layout.Margin(l=150, r=50, b=75, t=100, pad=0)
)
fig.update_layout(legend_y=1.01)
save_figure(fig, f"{path_save}/{path_local}/violin_SImAgeAcc_ll")

# Plot without longevity =======================================================
# One full violin per region, built from the values of all its participants.
dist_num_bins = 20
fig = go.Figure()
for group in problem["Filter"]:
    group_values = vals[group]
    group_color = problem["Color"][group]
    violin_kwargs = dict(
        y=group_values,
        name=group,
        box_visible=True,
        meanline_visible=True,
        showlegend=False,
        line_color='black',
        fillcolor=group_color,
        marker=dict(color=group_color, line=dict(color='black', width=0.3), opacity=0.8),
        points='all',
        bandwidth=np.ptp(group_values) / dist_num_bins,
        opacity=0.8,
    )
    fig.add_trace(go.Violin(**violin_kwargs))
pval_all = df_aa.at['SImAgeAcc', 'pval']
add_layout(fig, "", "SImAge Acceleration", f"p-value (all): {pval_all:0.2e}")
fig.update_layout(title_xref='paper')
fig.update_layout(legend_font_size=20)
fig.update_layout(legend={'itemsizing': 'constant'})
fig.update_layout(
    violingap=0.35,
    violingroupgap=0.35,
    width=850,
    height=600,
    margin=go.layout.Margin(l=150, r=50, b=75, t=100, pad=0)
)
fig.update_layout(legend_y=1.01)
save_figure(fig, f"{path_save}/{path_local}/violin_SImAgeAcc")

# Common axis limits: the joint range of Age and SImAge, widened by 5 % of
# that range on both ends.
age_and_simage = df_immuno[["Age", "SImAge"]]
min_val = age_and_simage.min().min()
max_val = age_and_simage.max().max()
shift_val = max_val - min_val
min_val -= 0.05 * shift_val
max_val += 0.05 * shift_val

# Plot without longevity =======================================================
base_part = problem["BasePart"]
base_rows = df_immuno["Region"] == base_part

fig = go.Figure()
# Identity line (SImAge == Age) across the whole plotting range.
fig.add_trace(
    go.Scatter(
        x=[min_val, max_val],
        y=[min_val, max_val],
        showlegend=False,
        name="",
        mode="lines",
        marker_color="black",
        marker=dict(size=8, opacity=0.75, line=dict(color="black", width=0.5))
    )
)
# Linear fit SImAge ~ Age, drawn over the reference-region participants.
fig.add_trace(
    go.Scatter(
        x=df_immuno.loc[base_rows, "Age"].values,
        y=df_immuno.loc[base_rows, "SImAge_linear_pred"].values,
        showlegend=False,
        name="",
        mode="lines",
        marker_color=problem["Color"][base_part],
        line=dict(width=5),
        marker=dict(size=8, opacity=0.75, line=dict(color="black", width=0.5))
    )
)
# One marker cloud per region.
for group in problem["Filter"]:
    group_rows = problem["Filter"][group]
    fig.add_trace(
        go.Scatter(
            x=df_immuno.loc[group_rows, "Age"].values,
            y=df_immuno.loc[group_rows, "SImAge"].values,
            showlegend=True,
            name=group,
            mode="markers",
            line_color=problem["Color"][group],
            marker=dict(size=8, opacity=0.75, line=dict(color="black", width=0.5))
        )
    )
add_layout(fig, "Age", "SImAge", "")
fig.update_layout(legend_font_size=20)
fig.update_layout(legend={'itemsizing': 'constant'})
# Same fixed range on both axes.
fig.update_xaxes(autorange=False)
fig.update_yaxes(autorange=False)
fig.update_layout(title_xref='paper')
fig.update_layout(xaxis_range=[min_val, max_val])
fig.update_layout(yaxis_range=[min_val, max_val])
fig.update_layout(
    width=650,
    height=600,
    margin=go.layout.Margin(l=100, r=50, b=100, t=50, pad=0)
)
save_figure(fig, f"{path_save}/{path_local}/scatter_SImAge")

# Plot with longevity ==========================================================
base_part = problem["BasePart"]
base_rows = df_immuno["Region"] == base_part

fig = go.Figure()
# Identity line (SImAge == Age) across the whole plotting range.
fig.add_trace(
    go.Scatter(
        x=[min_val, max_val],
        y=[min_val, max_val],
        showlegend=False,
        name="",
        mode="lines",
        marker_color="black",
        marker=dict(size=8, opacity=0.75, line=dict(color="black", width=0.5))
    )
)
# Linear fit SImAge ~ Age, drawn over the reference-region participants.
fig.add_trace(
    go.Scatter(
        x=df_immuno.loc[base_rows, "Age"].values,
        y=df_immuno.loc[base_rows, "SImAge_linear_pred"].values,
        showlegend=False,
        name="",
        mode="markers" if False else "lines",
        marker_color=problem["Color"][base_part],
        line=dict(width=5),
        marker=dict(size=8, opacity=0.75, line=dict(color="black", width=0.5))
    )
)
# Two marker layers per region: the "FilterNoLL" subset as regular markers
# (shown in the legend) and the "FilterLL" subset as larger, orange-outlined
# "x-dot" markers (hidden from the legend).
for group in problem["Filter"]:
    no_ll_rows = problem["FilterNoLL"][group]
    ll_rows = problem["FilterLL"][group]
    fig.add_trace(
        go.Scatter(
            x=df_immuno.loc[no_ll_rows, "Age"].values,
            y=df_immuno.loc[no_ll_rows, "SImAge"].values,
            showlegend=True,
            name=group,
            mode="markers",
            line_color=problem["Color"][group],
            marker=dict(size=8, opacity=0.75, line=dict(color="black", width=0.5))
        )
    )
    fig.add_trace(
        go.Scatter(
            x=df_immuno.loc[ll_rows, "Age"].values,
            y=df_immuno.loc[ll_rows, "SImAge"].values,
            showlegend=False,
            name=group,
            mode="markers",
            line_color=problem["ColorLL"][group],
            marker_symbol="x-dot",
            marker=dict(size=12, opacity=0.75, line=dict(color="orange", width=1.0))
        )
    )
add_layout(fig, "Age", "SImAge", "")
fig.update_layout(legend_font_size=20)
fig.update_layout(legend={'itemsizing': 'constant'})
# Same fixed range on both axes.
fig.update_xaxes(autorange=False)
fig.update_yaxes(autorange=False)
fig.update_layout(title_xref='paper')
fig.update_layout(xaxis_range=[min_val, max_val])
fig.update_layout(yaxis_range=[min_val, max_val])
fig.update_layout(
    width=650,
    height=600,
    margin=go.layout.Margin(l=100, r=50, b=100, t=50, pad=0)
)
save_figure(fig, f"{path_save}/{path_local}/scatter_SImAge_ll")

## 3. Region specific: Mann-Whitney and median test and fold change

In [None]:
path_local = "immuno/03_region_specific"
pathlib.Path(f"{path_save}/{path_local}").mkdir(parents=True, exist_ok=True)

# Two per-feature tables with the same layout: df_immuno_stat is computed on all
# participants of each region, df_immuno_stat_ll on the problem["FilterLL"] subset.
df_immuno_stat = pd.DataFrame()
df_immuno_stat.index.name = "feat"
df_immuno_stat_ll = pd.DataFrame()
df_immuno_stat_ll.index.name = "feat"
for feat in tqdm(feats_immuno):
    vals = {}  # region -> feature values, all participants
    lls = {}   # region -> feature values, LL subset
    for group in problem["Filter"]:
        # Descriptive statistics of the feature per region, for both tables.
        for samples, filter_key, table in (
            (vals, "Filter", df_immuno_stat),
            (lls, "FilterLL", df_immuno_stat_ll),
        ):
            samples[group] = df_immuno.loc[problem[filter_key][group], feat].values
            table.at[feat, f"mean_{group}"] = np.mean(samples[group])
            table.at[feat, f"median_{group}"] = np.median(samples[group])
            q75, q25 = np.percentile(samples[group], [75, 25])
            table.at[feat, f"q75_{group}"] = q75
            table.at[feat, f"q25_{group}"] = q25
            table.at[feat, f"iqr_{group}"] = table.at[feat, f"q75_{group}"] - table.at[feat, f"q25_{group}"]

    # Between-region comparison: Mann-Whitney U test, log2 ratio of the means
    # (Yakutia over Central) and Mood's median test.
    for samples, table in ((vals, df_immuno_stat), (lls, df_immuno_stat_ll)):
        _, pval = mannwhitneyu(*samples.values(), alternative='two-sided')
        table.at[feat, "log_fold_change"] = np.log2(np.mean(samples["Yakutia"])) - np.log2(np.mean(samples["Central"]))
        table.at[feat, "mw_pval"] = pval
        _, pval, _, _ = median_test(*samples.values())
        table.at[feat, "mood_pval"] = pval

# Benjamini-Hochberg correction across features for both tests, then export.
for table, out_name in ((df_immuno_stat, "table.xlsx"), (df_immuno_stat_ll, "table_ll.xlsx")):
    for test_col in ("mw_pval", "mood_pval"):
        table[f"{test_col}_fdr_bh"] = multipletests(table[test_col], 0.05, method='fdr_bh')[1]
    table.to_excel(f"{path_save}/{path_local}/{out_name}", index=True)

In [None]:
def _save_fdr_barplot(stat_table, pval_col, file_stem):
    """Draw a horizontal bar chart of -log10(adjusted p-value) per biomarker and save it.

    The table is modified in place, exactly as the inline code did: it is sorted
    ascending by `pval_col`, the -log10 column is (re)written, and a 'Color'
    column is set to 'red' where `pval_col` < 0.05 and 'pink' otherwise.

    Parameters
    ----------
    stat_table : pandas.DataFrame
        Statistics table indexed by biomarker; must contain `pval_col`.
    pval_col : str
        Name of the adjusted p-value column to plot.
    file_stem : str
        File name without extension; PNG and PDF files are written to
        f"{path_save}/{path_local}".
    """
    score_col = r'$ -\log_{10}(\mathrm{p-value})$'

    stat_table.sort_values([pval_col], ascending=[True], inplace=True)
    # Cast to float for every p-value column, so np.log10 always gets a numeric array.
    stat_table[score_col] = -np.log10(stat_table.loc[:, pval_col].values.astype(float))
    stat_table['Color'] = 'pink'
    stat_table.loc[stat_table[pval_col] < 0.05, 'Color'] = 'red'

    # Figure first, theme second, axes (created by barplot) last: same order as before.
    plt.figure(figsize=(10, 20))
    sns.set_theme(style='whitegrid', font_scale=2)
    sns.barplot(
        data=stat_table,
        y=stat_table.index,
        x=score_col,
        edgecolor='black',
        palette=stat_table['Color'].values,
        orient="h",
        dodge=False
    )
    for ext in ("png", "pdf"):
        plt.savefig(f"{path_save}/{path_local}/{file_stem}.{ext}", bbox_inches='tight')
    plt.close()


df_immuno_stat.index.name = "Biomarker"
_save_fdr_barplot(df_immuno_stat, "mw_pval_fdr_bh", "bar_mw_pval_fdr_bh")
_save_fdr_barplot(df_immuno_stat, "mood_pval_fdr_bh", "bar_mood_pval_fdr_bh")

df_immuno_stat_ll.index.name = "Biomarker"
_save_fdr_barplot(df_immuno_stat_ll, "mw_pval_fdr_bh", "bar_mw_pval_fdr_bh_ll")
_save_fdr_barplot(df_immuno_stat_ll, "mood_pval_fdr_bh", "bar_mood_pval_fdr_bh_ll")

In [None]:
# Thresholds for the volcano plot: absolute log fold change and adjusted p-value.
fc_lim = 0.5
pval_lim = 1e-5

# Order the table by adjusted Mann-Whitney p-value and expose the index as a
# regular column so it can be used as the label column of the plot.
df_immuno_stat.sort_values(by="mw_pval_fdr_bh", ascending=True, inplace=True)
df_immuno_stat['print'] = df_immuno_stat.index.values

# Features passing both thresholds are the ones highlighted on the plot.
is_up = df_immuno_stat["log_fold_change"] > fc_lim
is_down = df_immuno_stat["log_fold_change"] < -fc_lim
is_significant = df_immuno_stat["mw_pval_fdr_bh"] < pval_lim
df_immuno_mw_top = df_immuno_stat.loc[(is_up | is_down) & is_significant, :]
top_to_hightlight = df_immuno_mw_top["print"].values

sns.set_theme(style='whitegrid')
volcano_kwargs = dict(
    df=df_immuno_stat,
    lfc='log_fold_change',
    pv='mw_pval_fdr_bh',
    pv_thr=(pval_lim, pval_lim),
    lfc_thr=(fc_lim, fc_lim),
    path=f"{path_save}/{path_local}",
    genenames=tuple(top_to_hightlight),
    geneid='print',
    gstyle=2,
    dotsize=10,
    sign_line=True,
    figname="volcano_mw_pval_fdr_bh",
    color=("limegreen", "grey", "royalblue"),
)
volcano(**volcano_kwargs)

### Plot examples

In [None]:
pval_col = "mw_pval_fdr_bh"
dist_num_bins = 50
pathlib.Path(f"{path_save}/{path_local}/examples").mkdir(parents=True, exist_ok=True)

# Fixed y-axis ranges for selected features; other features keep the automatic range.
top_features_ranges = {
    'IL2': [-2, 18],
    'IL25': [-100, 2000],
    'CD40LG': [-150, 10000],
    'IL10': [-3, 50],
    'IL17F': [-20, 800],
    'IL3': [-2, 10],
    'IL17A': [-5, 50],
    'FGF2': [-10, 500],
}
# Per-group violin bandwidths for selected features; features not listed here
# fall back to (value range / dist_num_bins).
top_features_bandwidth={
    'IL2': {'Central': 0.5, 'Yakutia': 0.5},
    'IL25': {'Central': 20, 'Yakutia': 20},
    'CD40LG': {'Central': 100, 'Yakutia': 100},
    'IL10': {'Central': 1, 'Yakutia': 1},
    'IL17F': {'Central': 10, 'Yakutia': 10},
    'IL3': {'Central': 0.2, 'Yakutia': 0.2},
    'IL17A': {'Central': 2, 'Yakutia': 2},
}


def _group_violin(samples, feat, group, fill_color, line_color, **trace_kwargs):
    """Build one violin trace for the values of `feat` in `group`.

    Parameters
    ----------
    samples : array-like
        Values to draw.
    feat : str
        Feature name; used to look up a bandwidth in `top_features_bandwidth`.
    group : str
        Group name; used as the trace name and as the bandwidth key.
    fill_color : str
        Fill colour of the violin and of its point markers.
    line_color : str
        Outline colour of the violin.
    **trace_kwargs
        Extra `go.Violin` arguments (e.g. `side`, `pointpos` for half violins).

    Returns
    -------
    plotly.graph_objects.Violin
    """
    if feat in top_features_bandwidth:
        bandwidth = top_features_bandwidth[feat][group]
    else:
        bandwidth = np.ptp(samples) / dist_num_bins
    return go.Violin(
        y=samples,
        name=group,
        box_visible=True,
        meanline_visible=True,
        showlegend=False,
        line_color=line_color,
        fillcolor=fill_color,
        marker=dict(color=fill_color, line=dict(color='black', width=0.3), opacity=0.8),
        points='all',
        bandwidth=bandwidth,
        opacity=0.8,
        **trace_kwargs
    )


def _style_and_save(fig, feat, pval, log_fc, file_path):
    """Apply the shared layout to an example figure and save it.

    Parameters
    ----------
    fig : plotly.graph_objects.Figure
        Figure that already contains its traces.
    feat : str
        Feature name; passed to `add_layout` and used to look up a fixed
        y-range in `top_features_ranges`.
    pval : float
        P-value shown in the title.
    log_fc : float
        Log fold change shown in the title.
    file_path : str
        Path passed to `save_figure`.
    """
    add_layout(fig, "", feat, f"p-value: {pval:0.2e}<br>log(Fold Change): {log_fc:0.2e}")
    fig.update_layout(title_xref='paper')
    if feat in top_features_ranges:
        fig.update_yaxes(autorange=False)
        fig.update_layout(yaxis_range=top_features_ranges[feat])
    fig.update_layout(legend_font_size=25)
    fig.update_layout(legend={'itemsizing': 'constant'})
    fig.update_xaxes(tickfont_size=25)
    fig.update_layout(
        violingap=0.35,
        violingroupgap=0.35,
        width=850,
        height=600,
        margin=go.layout.Margin(
            l=150,
            r=50,
            b=75,
            t=100,
            pad=0,
        )
    )
    fig.update_layout(
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.25,
            xanchor="center",
            x=0.5
        )
    )
    save_figure(fig, file_path)


feats_to_plot = list(df_immuno_mw_top.index)
for feat in feats_to_plot:
    pval = df_immuno_stat.at[feat, pval_col]
    log_fc = df_immuno_stat.at[feat, "log_fold_change"]

    # Plot without longevity ===================================================
    fig = go.Figure()
    for group in problem["Filter"]:
        vals = df_immuno.loc[problem["Filter"][group], feat].values
        fig.add_trace(_group_violin(vals, feat, group, problem["Color"][group], 'black'))
    _style_and_save(fig, feat, pval, log_fc, f"{path_save}/{path_local}/examples/{feat}")

    # Plot with longevity ======================================================
    # Split violins: problem["Filter"] samples on the negative side,
    # problem["FilterLL"] samples on the positive side of the same group.
    fig = go.Figure()
    for group in problem["Filter"]:
        vals = df_immuno.loc[problem["Filter"][group], feat].values
        fig.add_trace(
            _group_violin(
                vals, feat, group, problem["Color"][group], 'black',
                legendgroup=group,
                scalegroup=group,
                side='negative',
                scalemode="width",
                pointpos=-1.5
            )
        )

        lls = df_immuno.loc[problem["FilterLL"][group], feat].values
        fig.add_trace(
            _group_violin(
                lls, feat, group, problem["ColorLL"][group], 'orange',
                legendgroup=group,
                scalegroup=group,
                side='positive',
                scalemode="width",
                pointpos=1.5
            )
        )
    _style_and_save(fig, feat, pval, log_fc, f"{path_save}/{path_local}/examples/{feat}_ll")