In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import plotly.express as px
import statsmodels.formula.api as smf
import plotly.graph_objects as go
from scripts.python.routines.manifest import get_manifest
from scripts.python.routines.plot.save import save_figure
from scripts.python.routines.plot.layout import add_layout
from statsmodels.stats.multitest import multipletests
import plotly.io as pio
pio.kaleido.scope.mathjax = None
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=False)
from scipy.stats import mannwhitneyu, median_test
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.patheffects as path_effects
import random
import pathlib
from tqdm import tqdm
from src.utils.plot.bioinfokit import mhat, volcano
import gseapy as gp
import mygene
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA, TruncatedSVD
from sklearn.decomposition import MiniBatchDictionaryLearning, FastICA
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.manifold import MDS, Isomap, TSNE, LocallyLinearEmbedding
import upsetplot
from matplotlib_venn import venn2, venn2_circles
from itertools import chain
from sklearn.metrics import mean_absolute_error
from scripts.python.routines.plot.colorscales import get_continuous_color
import plotly
from scripts.python.routines.plot.p_value import add_p_value_annotation
from scripts.python.routines.sections import get_sections
import functools


def conjunction(conditions):
    """Fold an iterable of masks with an element-wise logical AND.

    Args:
        conditions: Non-empty iterable of operands accepted by
            ``np.logical_and`` (e.g. boolean arrays).

    Returns:
        The single operand itself when only one is given, otherwise the
        left-to-right reduction of ``np.logical_and`` over all operands.

    Raises:
        TypeError: If ``conditions`` is empty (raised by ``functools.reduce``).
    """
    def _both(left, right):
        return np.logical_and(left, right)

    return functools.reduce(_both, conditions)


def disjunction(conditions):
    """Fold an iterable of masks with an element-wise logical OR.

    Args:
        conditions: Non-empty iterable of operands accepted by
            ``np.logical_or`` (e.g. boolean arrays).

    Returns:
        The single operand itself when only one is given, otherwise the
        left-to-right reduction of ``np.logical_or`` over all operands.

    Raises:
        TypeError: If ``conditions`` is empty (raised by ``functools.reduce``).
    """
    def _either(left, right):
        return np.logical_or(left, right)

    return functools.reduce(_either, conditions)

# Init dnam and fill it from immunology data

In [None]:
# --- Dataset / platform configuration ---------------------------------------
dataset = "GSEUNN"
path = "D:/YandexDisk/Work/pydnameth/datasets"
datasets_info = pd.read_excel(f"{path}/datasets.xlsx", index_col='dataset')
platform = datasets_info.loc[dataset, 'platform']
manifest = get_manifest(platform, path=path)
# 'CHR' is 'chr' with its first three characters removed
# (presumably a "chr" prefix -- verify against the manifest).
manifest['CHR'] = manifest['chr'].str[3:]

# Suffix of the DNAm-derived column names used throughout the notebook.
dnam_suffix = "_harm"

# Parameters that together identify the immunology table file.
immuno_samples = "all_1052_121222" # "ctrl_415_from_all_1052_121222"
immuno_proc = "raw"
immuno_imp = "fast_knn"
immuno_replace = "quarter"

# Sample-selection modes applied after the tables are loaded.
select_dnam = 'chronology_0' # "common_with_immuno" "chronology_0"
select_immuno = "260_ml_draft"

# Output directory for everything this notebook writes.
path_save = f"{path}/{platform}/{dataset}/special/054_yakutia_EWAS_revision"
pathlib.Path(path_save).mkdir(parents=True, exist_ok=True)

# --- Load immunology and phenotype tables -----------------------------------
immuno_file = (
    f"df_samples({immuno_samples})_proc({immuno_proc})"
    f"_imp({immuno_imp})_replace({immuno_replace}).xlsx"
)
df_immuno = pd.read_excel(f"{path}/{platform}/{dataset}/data/immuno/{immuno_file}", index_col="index")

pheno = pd.read_excel(f"{path}/{platform}/{dataset}/pheno.xlsx", index_col="index")
pheno.index.name = "index"
pheno.drop(index=["I64_old", "I1_duplicate"], inplace=True)

# --- Samples present only in the DNAm phenotype table -----------------------
index_dnam_only = pheno.index.difference(df_immuno.index)
print(f"DNAm only indexes {len(index_dnam_only)}: {list(index_dnam_only)}")

# --- Compare shared phenotype columns between the two tables ----------------
indexes_common_glob = pheno.index.intersection(df_immuno.index)
pheno_common = pheno.loc[indexes_common_glob]
immuno_common = df_immuno.loc[indexes_common_glob]
is_region_equal_glob = pheno_common['Region'].equals(immuno_common['Region'])
is_sex_equal_glob = pheno_common['Sex'].equals(immuno_common['Sex'])
is_status_equal_glob = pheno_common['Status'].equals(immuno_common['Status'])
age_diff_glob = np.abs(pheno_common['Age'].values - immuno_common['Age'].values)
age_diff_max_glob = np.max(age_diff_glob)
for check_name, check_value in (
    ("is_region_equal_glob", is_region_equal_glob),
    ("is_sex_equal_glob", is_sex_equal_glob),
    ("is_status_equal_glob", is_status_equal_glob),
    ("age_diff_max_glob", age_diff_max_glob),
):
    print(f"{check_name}: {check_value}")

# Flag immunology samples that also have DNAm data, then keep controls only.
df_immuno['is_dnam'] = False
df_immuno.loc[pheno.index.intersection(df_immuno.index), 'is_dnam'] = True
# Explicit copy: the column write below must target an independent frame,
# not a selection of the original table.
df_immuno = df_immuno.loc[(df_immuno["Status"] == "Control"), :].copy()
# Reassign the column instead of calling replace(inplace=True) on the selected
# Series: the chained in-place form does not propagate to the frame under
# pandas Copy-on-Write, which would leave "Yakutiya" rows out of every
# `== "Yakutia"` filter below.
df_immuno["Region"] = df_immuno["Region"].replace({"Yakutiya": "Yakutia"})
feats_immuno = pd.read_excel(f"{path}/{platform}/{dataset}/data/immuno/immuno_markers_genes.xlsx").loc[:, 'gene'].values.tolist()
# Replace Age in DNAm with the immunology value for samples present in both tables.
index_age_sync = pheno.index.intersection(df_immuno.index)
pheno.loc[index_age_sync, 'Age'] = df_immuno.loc[index_age_sync, 'Age']
# Sanity check of the assignment above (expected to print 0).
age_diff = np.abs(pheno.loc[index_age_sync, 'Age'].values - df_immuno.loc[index_age_sync, 'Age'].values)
age_diff_max = np.max(age_diff)
print(f"age_diff_max: {age_diff_max}")
# Immuno selection
if select_immuno == "260_ml_draft":
    df_immuno = df_immuno.loc[(df_immuno["260ai"] == True) | (df_immuno["Region"] == "Yakutia"), :]

# Load methylation values and join them to the phenotype table on the sample index.
betas = pd.read_pickle(f"{path}/{platform}/{dataset}/betas{dnam_suffix}.pkl")
feats_dnam = betas.columns.values
df_dnam = pd.merge(pheno, betas, left_index=True, right_index=True)
df_dnam = df_dnam.loc[(df_dnam["Status"] == "Control"), :]
# Three-level grouping: everything not matched below stays "Central".
df_dnam["Region and residence"] = "Central"
df_dnam.loc[(df_dnam["Region"] == "Yakutia") & (df_dnam["Residence"] == "City"), "Region and residence"] = "Yakutia (City)"
df_dnam.loc[(df_dnam["Region"] == "Yakutia") & (df_dnam["Residence"] == "Village"), "Region and residence"] = "Yakutia (Village)"
# DNAm selection
if select_dnam == "common_with_immuno":
    df_dnam = df_dnam.loc[df_dnam.index.intersection(df_immuno.index).values, :]
elif select_dnam == 'chronology_0':
    df_dnam = df_dnam.loc[df_dnam["Sample_Chronology"] == 0, :]

# Samples shared by both tables, and samples that only have DNAm data.
index_common = df_dnam.index.intersection(df_immuno.index).values

index_dnam_only = df_dnam.index.difference(df_immuno.index)
print(f"DNAm only indexes {len(index_dnam_only)}: {list(index_dnam_only)}")

# Per-region sample counts for both tables.
n_samples_immuno_central = len(df_immuno.index[df_immuno["Region"] == "Central"].values)
n_samples_immuno_yakutia = len(df_immuno.index[df_immuno["Region"] == "Yakutia"].values)
n_samples_dnam_central = len(df_dnam.index[df_dnam["Region"] == "Central"].values)
n_samples_dnam_yakutia = len(df_dnam.index[df_dnam["Region"] == "Yakutia"].values)
print(f"n_samples_immuno_central: {n_samples_immuno_central}")
print(f"n_samples_immuno_yakutia: {n_samples_immuno_yakutia}")
print(f"n_samples_dnam_central: {n_samples_dnam_central}")
print(f"n_samples_dnam_yakutia: {n_samples_dnam_yakutia}")

### Delete non-Sakha samples

In [None]:
# Sample ids listed in samples_to_delete.xlsx are dropped from both tables;
# ids absent from a table are skipped (errors='ignore').
ids_non_sakha = pd.read_excel(f"{path_save}/samples_to_delete.xlsx", index_col=0).index.values
for table in (df_dnam, df_immuno):
    table.drop(labels=ids_non_sakha, inplace=True, errors='ignore')

# Recount samples per region after the removal.
n_samples_immuno_central = int((df_immuno["Region"] == "Central").sum())
n_samples_immuno_yakutia = int((df_immuno["Region"] == "Yakutia").sum())
n_samples_dnam_central = int((df_dnam["Region"] == "Central").sum())
n_samples_dnam_yakutia = int((df_dnam["Region"] == "Yakutia").sum())
for count_name, count_value in (
    ("n_samples_immuno_central", n_samples_immuno_central),
    ("n_samples_immuno_yakutia", n_samples_immuno_yakutia),
    ("n_samples_dnam_central", n_samples_dnam_central),
    ("n_samples_dnam_yakutia", n_samples_dnam_yakutia),
):
    print(f"{count_name}: {count_value}")

# Refresh the shared / DNAm-only sample indexes.
index_common = df_dnam.index.intersection(df_immuno.index).values

index_dnam_only = df_dnam.index.difference(df_immuno.index)
print(f"DNAm only indexes {len(index_dnam_only)}: {list(index_dnam_only)}")

# Save pheno table

In [None]:
# Clock column in df_dnam -> name used in the exported table.
# Key order defines the column order of the export.
age_types = {
    f"DNAmAgeHannum{dnam_suffix}": "Hannum",
    f"DNAmAge{dnam_suffix}": "Horvath",
    f"DNAmPhenoAge{dnam_suffix}": "PhenoAge",
    f"DNAmGrimAge{dnam_suffix}": "GrimAge",
    **{pc_clock: pc_clock for pc_clock in ["PCHorvath1", "PCHorvath2", "PCHannum", "PCPhenoAge", "PCGrimAge"]},
    'mPACE': 'DunedinPACE',
}

# Cell-count column in df_dnam -> name without the suffix.
cells = {}
for cell_name in ["CD8T", "CD4T", "NK", "Bcell", "Mono", "Gran"]:
    cells[f"{cell_name}{dnam_suffix}"] = cell_name

pheno_columns = ["Sentrix_ID", "Sentrix_Position", "Age", "Sex", "Region", "Status"]
df_ph = df_dnam.loc[:, pheno_columns + list(age_types) + list(cells)].copy()
df_ph.rename(columns=age_types, inplace=True)
df_ph.rename(columns=cells, inplace=True)
# idat file id = "<Sentrix_ID>_<Sentrix_Position>"; it is the key of the GSM table.
df_ph['idat_ID'] = df_ph['Sentrix_ID'].astype(str) + '_' + df_ph['Sentrix_Position'].astype(str)
df_gsm = pd.read_excel('D:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN/special/026_data_for_GEO/GSM.xlsx', index_col='idat_ID')
gsm_values = df_gsm.loc[df_ph.loc[df_ph.index.values, 'idat_ID'].values, 'GSM'].values
df_ph.loc[df_ph.index.values, 'GSM'] = gsm_values

# Identification / phenotype columns go first, everything else keeps its order.
first_columns = ['GSM', 'Sentrix_ID', 'Sentrix_Position', 'idat_ID', 'Age', 'Sex', 'Region', 'Status']
remaining_columns = [col for col in df_ph.columns if col not in first_columns]
df_ph = df_ph[first_columns + remaining_columns]
df_ph.to_excel(f"{path_save}/samples.xlsx", index=True)

# DNAm

## Setup

In [None]:
# Definition of every two-group comparison ("problem") analysed below.
# Per-problem keys:
#   target          - df_dnam column that defines the two groups
#   path            - sub-directory of path_save for this problem's outputs
#   color           - fill colour per group; color_line - outline colour
#   filter          - boolean mask over df_dnam per group
#   base_filter     - mask of the reference group (named by base_part)
#   all_filter      - mask of all samples that take part in the problem
#   dmp_* / dmr_*   - DMP/DMR selection settings
#                     (consumed outside this cell -- semantics to be confirmed there)
#   xlim            - three numbers, presumably [min, max, n_ticks] -- TODO confirm
dict_problems_all = {
    "all_region": {
        "target": "Region",
        "path": "00_all_region",
        "color": {
            "Central": "gold",
            "Yakutia": "lightslategray"
        },
        "color_line": "black",
        "filter": {
            "Central": (df_dnam["Region"] == "Central"),
            "Yakutia": (df_dnam["Region"] == "Yakutia")
        },
        "base_filter": (df_dnam["Region"] == "Central"),
        "base_part": "Central",
        "all_filter": (df_dnam["Region"].isin(["Central", "Yakutia"])),
        "dmp_filter_type": "top_n", # pvalfc
        "dmp_pval": 0.05,
        "dmp_fc": 0.0,
        "dmp_top_n": 1000,
        'dmr_pval': 0.05,
        'xlim': [-2.0, 2.0, 9]
    },
    "central_sex": {
        "target": "Sex",
        "path": "01_central_sex",
        "color": {
            "F": "hotpink",
            "M": "skyblue"
        },
        "color_line": "black",
        "filter": {
            "F": (df_dnam["Region"] == "Central") & (df_dnam["Sex"] == "F"),
            "M": (df_dnam["Region"] == "Central") & (df_dnam["Sex"] == "M")
        },
        "base_filter": (df_dnam["Region"] == "Central") & (df_dnam["Sex"] == "F"),
        "base_part": "F",
        "all_filter": (df_dnam["Region"] == "Central"),
        "dmp_filter_type": "top_n", # pvalfc
        "dmp_pval": 0.05,
        "dmp_top_n": 1000,
        "dmp_fc": 0.0,
        'xlim': [-2.5, 2.5, 11]
    },
    "yakutia_sex": {
        "target": "Sex",
        "path": "02_yakutia_sex",
        "color": {
            "F": "firebrick",
            "M": "royalblue"
        },
        "color_line": 'black', #"lightsteelblue",
        "filter": {
            "F": (df_dnam["Region"] == "Yakutia") & (df_dnam["Sex"] == "F"),
            "M": (df_dnam["Region"] == "Yakutia") & (df_dnam["Sex"] == "M")
        },
        "base_filter": (df_dnam["Region"] == "Yakutia") & (df_dnam["Sex"] == "F"),
        "base_part": "F",
        "all_filter": (df_dnam["Region"] == "Yakutia"),
        "dmp_filter_type": "top_n", # pvalfc
        "dmp_pval": 0.05,
        # Was missing although dmp_filter_type is "top_n"; same value as the
        # other two problems so a lookup of this key cannot fail here.
        "dmp_top_n": 1000,
        "dmp_fc": 0.0,
        'xlim': [-2.5, 2.5, 11]
    },
}

# Keep only the selected problems, in the order they are defined above.
problems_selected = ['all_region', 'central_sex', 'yakutia_sex']
dict_problems = {
    name: settings
    for name, settings in dict_problems_all.items()
    if name in problems_selected
}

# One output directory per selected problem.
for settings in dict_problems.values():
    pathlib.Path(f"{path_save}/{settings['path']}").mkdir(parents=True, exist_ok=True)

## Ages

In [None]:
# Clock column in df_dnam -> label used in figures and tables.
age_types = {
    f"DNAmAgeHannum{dnam_suffix}": "Hannum",
    f"DNAmAge{dnam_suffix}": "Horvath",
    f"DNAmPhenoAge{dnam_suffix}": "PhenoAge",
    f"DNAmGrimAge{dnam_suffix}": "GrimAge",
}
age_types.update({x: x for x in ["PCHorvath1", "PCHorvath2", "PCHannum", "PCPhenoAge", "PCGrimAge"]})

# Violin bandwidth is the value range divided by this number.
dist_num_bins = 15

for problem, dict_problem in dict_problems.items():
    dir_ages = f"{path_save}/{dict_problem['path']}/03_ages"
    pathlib.Path(dir_ages).mkdir(parents=True, exist_ok=True)
    group_masks = dict_problem["filter"]
    base_mask = dict_problem["base_filter"]

    # --- Age acceleration = residual of the clock ~ Age fit on the base group
    # NOTE: the *_linear_pred / *Acc columns of df_dnam are overwritten for
    # every problem, so after this loop they belong to the last problem.
    df_stat = pd.DataFrame(index=[f"{x}Acc" for x in age_types], columns=["pval", "pval_fdr_bh"])
    for age_type in tqdm(age_types):
        acc_col = f"{age_type}Acc"
        pred_col = f"{age_type}_linear_pred"
        model = smf.ols(formula=f"{age_type} ~ Age", data=df_dnam.loc[base_mask, :]).fit()
        df_dnam[pred_col] = model.predict(df_dnam)
        df_dnam[acc_col] = df_dnam[age_type] - df_dnam[pred_col]

        # Descriptive statistics per group and a two-sided Mann-Whitney U test.
        vals = {}
        for group, group_mask in group_masks.items():
            group_vals = df_dnam.loc[group_mask, acc_col].values
            vals[group] = group_vals
            q75, q25 = np.percentile(group_vals, [75, 25])
            df_stat.at[acc_col, f"mean_{group}"] = np.mean(group_vals)
            df_stat.at[acc_col, f"median_{group}"] = np.median(group_vals)
            df_stat.at[acc_col, f"q75_{group}"] = q75
            df_stat.at[acc_col, f"q25_{group}"] = q25
            df_stat.at[acc_col, f"iqr_{group}"] = q75 - q25

        df_stat.at[acc_col, "pval"] = mannwhitneyu(*vals.values(), alternative='two-sided')[1]

    # Benjamini-Hochberg correction across all clocks of this problem.
    df_stat["pval_fdr_bh"] = multipletests(df_stat["pval"], 0.05, method='fdr_bh')[1]
    df_stat.to_excel(f"{dir_ages}/stat.xlsx", index=True)

    # Data behind the figures.
    fig_columns = ["Sex", "Region", "Age"] + list(age_types.keys()) + [f"{x}Acc" for x in age_types]
    df_fig = df_dnam.loc[dict_problem["all_filter"], fig_columns]
    df_fig.rename(columns=age_types, inplace=True)
    df_fig.to_excel(f"{dir_ages}/fig.xlsx")

    for age_type in tqdm(age_types):
        acc_col = f"{age_type}Acc"
        vals = {group: df_dnam.loc[group_mask, acc_col].values for group, group_mask in group_masks.items()}

        # --- Violin plot of the acceleration per group ----------------------
        fig = go.Figure()
        for group_id, group in enumerate(group_masks):
            group_color = dict_problem["color"][group]
            fig.add_trace(
                go.Violin(
                    y=vals[group],
                    name=group,
                    box_visible=True,
                    meanline_visible=True,
                    showlegend=False,
                    line_color=dict_problem["color_line"],
                    fillcolor=group_color,
                    marker=dict(color=group_color, line=dict(color=dict_problem["color_line"], width=0.3), opacity=0.8),
                    points='all',
                    # first group: points at +1.5, every other group at -1.5
                    pointpos=1.5 if group_id == 0 else -1.5,
                    bandwidth=np.ptp(vals[group]) / dist_num_bins,
                    opacity=0.8,
                )
            )
        add_layout(fig, "", f"{age_types[age_type]}Acc", f"p-value: {df_stat.at[acc_col, 'pval_fdr_bh']:0.2e}")
        fig.update_xaxes(autorange=False, range=[-0.3, len(group_masks) - 0.7])
        fig.update_layout(
            title_xref='paper',
            legend=dict(font=dict(size=20), itemsizing='constant', y=1.01),
            violingap=0.35,
            violingroupgap=0.35,
            width=500,
            height=600,
            margin=go.layout.Margin(l=100, r=50, b=50, t=50, pad=0),
        )
        save_figure(fig, f"{dir_ages}/violin_{age_type}Acc")

        # Common axis range for the scatter: data range padded by 5% per side.
        lowest = df_dnam[["Age", age_type]].min().min()
        highest = df_dnam[["Age", age_type]].max().max()
        padding = 0.05 * (highest - lowest)
        min_val = lowest - padding
        max_val = highest + padding

        # --- Scatter plot: clock vs chronological age -----------------------
        black_outline = dict(size=8, opacity=0.75, line=dict(color="black", width=0.5))
        fig = go.Figure()
        # Identity line.
        fig.add_trace(
            go.Scatter(
                x=[min_val, max_val],
                y=[min_val, max_val],
                showlegend=False,
                name="",
                mode="lines",
                marker_color="black",
                marker=black_outline,
            )
        )
        # Regression line of the base group.
        fig.add_trace(
            go.Scatter(
                x=df_dnam.loc[base_mask, "Age"].values,
                y=df_dnam.loc[base_mask, f"{age_type}_linear_pred"].values,
                showlegend=False,
                name="",
                mode="lines",
                line=dict(width=5),
                marker_color=dict_problem["color"][dict_problem["base_part"]],
                marker=black_outline,
            )
        )
        # Samples of every group.
        for group, group_mask in group_masks.items():
            fig.add_trace(
                go.Scatter(
                    x=df_dnam.loc[group_mask, "Age"].values,
                    y=df_dnam.loc[group_mask, f"{age_type}"].values,
                    showlegend=True,
                    name=group,
                    mode="markers",
                    line_color=dict_problem["color"][group],
                    marker=dict(size=8, opacity=0.75, line=dict(color=dict_problem["color_line"], width=0.5)),
                )
            )
        add_layout(fig, "Age", f"{age_types[age_type]}", "")
        fig.update_xaxes(autorange=False, range=[min_val, max_val])
        fig.update_yaxes(autorange=False, range=[min_val, max_val])
        fig.update_layout(
            title_xref='paper',
            legend=dict(font=dict(size=20), itemsizing='constant'),
            width=850,
            height=800,
            margin=go.layout.Margin(l=100, r=50, b=100, t=50, pad=0),
        )
        save_figure(fig, f"{dir_ages}/scatter_{age_type}")

## Ages without regression (raw values from Horvath's calculator)

In [None]:
# Clock column in df_dnam -> label used in figures and tables.
age_types = {
    f"DNAmAgeHannum{dnam_suffix}": "Hannum",
    f"DNAmAge{dnam_suffix}": "Horvath",
    f"DNAmPhenoAge{dnam_suffix}": "PhenoAge",
    f"DNAmGrimAge{dnam_suffix}": "GrimAge",
}
age_types.update({x: x for x in ["PCHorvath1", "PCHorvath2", "PCHannum", "PCPhenoAge", "PCGrimAge"]})

# Violin bandwidth is the value range divided by this number.
dist_num_bins = 15

for problem, dict_problem in dict_problems.items():
    dir_raw = f"{path_save}/{dict_problem['path']}/03_ages_raw"
    pathlib.Path(dir_raw).mkdir(parents=True, exist_ok=True)
    group_masks = dict_problem["filter"]

    # --- Raw acceleration = clock value minus chronological age -------------
    df_stat = pd.DataFrame(index=[f"{x}AccRaw" for x in age_types], columns=["pval", "pval_fdr_bh"])
    for age_type in tqdm(age_types):
        raw_col = f"{age_type}AccRaw"
        df_dnam[raw_col] = df_dnam[age_type] - df_dnam["Age"]

        # Descriptive statistics per group and a two-sided Mann-Whitney U test.
        vals = {}
        for group, group_mask in group_masks.items():
            group_vals = df_dnam.loc[group_mask, raw_col].values
            vals[group] = group_vals
            q75, q25 = np.percentile(group_vals, [75, 25])
            df_stat.at[raw_col, f"mean_{group}"] = np.mean(group_vals)
            df_stat.at[raw_col, f"median_{group}"] = np.median(group_vals)
            df_stat.at[raw_col, f"q75_{group}"] = q75
            df_stat.at[raw_col, f"q25_{group}"] = q25
            df_stat.at[raw_col, f"iqr_{group}"] = q75 - q25

        df_stat.at[raw_col, "pval"] = mannwhitneyu(*vals.values(), alternative='two-sided')[1]

    # Benjamini-Hochberg correction across all clocks of this problem.
    df_stat["pval_fdr_bh"] = multipletests(df_stat["pval"], 0.05, method='fdr_bh')[1]
    df_stat.to_excel(f"{dir_raw}/stat.xlsx", index=True)

    # Data behind the figures.
    fig_columns = ["Sex", "Region", "Age"] + list(age_types.keys()) + [f"{x}AccRaw" for x in age_types]
    df_fig = df_dnam.loc[dict_problem["all_filter"], fig_columns]
    df_fig.rename(columns=age_types, inplace=True)
    df_fig.to_excel(f"{dir_raw}/fig.xlsx")

    for age_type in tqdm(age_types):
        raw_col = f"{age_type}AccRaw"
        vals = {group: df_dnam.loc[group_mask, raw_col].values for group, group_mask in group_masks.items()}

        # --- Violin plot of the raw acceleration per group ------------------
        fig = go.Figure()
        for group_id, group in enumerate(group_masks):
            group_color = dict_problem["color"][group]
            fig.add_trace(
                go.Violin(
                    y=vals[group],
                    name=group,
                    box_visible=True,
                    meanline_visible=True,
                    showlegend=False,
                    line_color=dict_problem["color_line"],
                    fillcolor=group_color,
                    marker=dict(color=group_color, line=dict(color=dict_problem["color_line"], width=0.3), opacity=0.8),
                    points='all',
                    # first group: points at +1.5, every other group at -1.5
                    pointpos=1.5 if group_id == 0 else -1.5,
                    bandwidth=np.ptp(vals[group]) / dist_num_bins,
                    opacity=0.8,
                )
            )
        add_layout(fig, "", f"{age_types[age_type]}AccRaw", f"p-value: {df_stat.at[raw_col, 'pval_fdr_bh']:0.2e}")
        fig.update_xaxes(autorange=False, range=[-0.3, len(group_masks) - 0.7])
        fig.update_layout(
            title_xref='paper',
            legend=dict(font=dict(size=20), itemsizing='constant', y=1.01),
            violingap=0.35,
            violingroupgap=0.35,
            width=500,
            height=600,
            margin=go.layout.Margin(l=100, r=50, b=50, t=50, pad=0),
        )
        # The file name ends in "Acc" (not "AccRaw"); the directory tells the two apart.
        save_figure(fig, f"{dir_raw}/violin_{age_type}Acc")

        # Common axis range for the scatter: data range padded by 5% per side.
        lowest = df_dnam[["Age", age_type]].min().min()
        highest = df_dnam[["Age", age_type]].max().max()
        padding = 0.05 * (highest - lowest)
        min_val = lowest - padding
        max_val = highest + padding

        # --- Scatter plot: clock vs chronological age -----------------------
        fig = go.Figure()
        # Identity line.
        fig.add_trace(
            go.Scatter(
                x=[min_val, max_val],
                y=[min_val, max_val],
                showlegend=False,
                name="",
                mode="lines",
                marker_color="black",
                marker=dict(size=8, opacity=0.75, line=dict(color="black", width=0.5)),
            )
        )
        # Samples of every group.
        for group, group_mask in group_masks.items():
            fig.add_trace(
                go.Scatter(
                    x=df_dnam.loc[group_mask, "Age"].values,
                    y=df_dnam.loc[group_mask, f"{age_type}"].values,
                    showlegend=True,
                    name=group,
                    mode="markers",
                    line_color=dict_problem["color"][group],
                    marker=dict(size=8, opacity=0.75, line=dict(color=dict_problem["color_line"], width=0.5)),
                )
            )
        add_layout(fig, "Age", f"{age_types[age_type]}", "")
        fig.update_xaxes(autorange=False, range=[min_val, max_val])
        fig.update_yaxes(autorange=False, range=[min_val, max_val])
        fig.update_layout(
            title_xref='paper',
            legend=dict(font=dict(size=20), itemsizing='constant'),
            width=850,
            height=800,
            margin=go.layout.Margin(l=100, r=50, b=100, t=50, pad=0),
        )
        save_figure(fig, f"{dir_raw}/scatter_{age_type}")

## Special age plot

In [None]:
def color_tick(color, text):
    """Return `text` wrapped in an HTML span styled with the CSS colour `color`.

    Both arguments are converted with ``str()``; the text is padded with one
    space on each side inside the span.
    """
    template = "<span style='color:{}'> {} </span>"
    return template.format(str(color), str(text))

# The special plots are drawn for the region comparison only.
dict_problem = dict_problems['all_region']
pathlib.Path(f"{path_save}/{dict_problem['path']}/age_special_plot").mkdir(parents=True, exist_ok=True)

# Clock column in df_dnam -> label used in figures.
age_types = {
    f"DNAmAgeHannum{dnam_suffix}": "Hannum",
    f"DNAmAge{dnam_suffix}": "Horvath",
    f"DNAmPhenoAge{dnam_suffix}": "PhenoAge",
    f"DNAmGrimAge{dnam_suffix}": "GrimAge",
}
for x in ["PCHorvath1", "PCHorvath2", "PCHannum", "PCPhenoAge", "PCGrimAge"]:
    age_types[x] = x

# Outline / tick-label colour per clock column.
age_colors = {
    f"DNAmAgeHannum{dnam_suffix}": 'blue',
    f"DNAmAge{dnam_suffix}": 'crimson',
    f"DNAmPhenoAge{dnam_suffix}": 'seagreen',
    f"DNAmGrimAge{dnam_suffix}": 'darkviolet',
    "PCHorvath1": 'darkred',
    "PCHorvath2": 'firebrick',
    "PCHannum": 'darkblue',
    "PCPhenoAge": 'darkgreen',
    "PCGrimAge": 'purple',
    "mPACE": 'black'
}

# The "Ages" cell overwrites the *_linear_pred / *Acc columns of df_dnam for
# every problem it loops over, so they would otherwise hold the residuals of
# whichever problem ran last. Re-fit on this problem's base group (same
# formula as in the "Ages" cell) so the plotted accelerations match the
# p-values loaded from this problem's stat.xlsx below.
for age_type in age_types:
    model = smf.ols(formula=f"{age_type} ~ Age", data=df_dnam.loc[dict_problem["base_filter"], :]).fit()
    df_dnam[f"{age_type}_linear_pred"] = model.predict(df_dnam)
    df_dnam[f"{age_type}Acc"] = df_dnam[age_type] - df_dnam[f"{age_type}_linear_pred"]

# Statistics written by the "Ages" cell for this problem.
df_stat_ages = pd.read_excel(f"{path_save}/{dict_problem['path']}/03_ages/stat.xlsx", index_col=0)

def _add_half_violin(fig, row, values, label, fill_color, line_color, side):
    """Append one horizontal half-violin with all points shown to `fig`.

    Args:
        fig: Plotly figure that receives the trace.
        row: Numeric y position of the violin.
        values: 1-D array of values drawn along x.
        label: Trace name, also used as legend group and scale group.
        fill_color: Fill and marker colour.
        line_color: Outline colour of the violin and of the markers.
        side: 'negative' or 'positive'; points are shifted to the same side.
    """
    fig.add_trace(
        go.Violin(
            y=[row] * len(values),
            x=values,
            name=label,
            box_visible=True,
            meanline_visible=True,
            showlegend=False,
            line_color=line_color,
            fillcolor=fill_color,
            marker=dict(color=fill_color, line=dict(color=line_color, width=0.35), opacity=0.8, size=8),
            points='all',
            bandwidth=np.ptp(values) / dist_num_bins,
            opacity=0.8,
            legendgroup=label,
            scalegroup=label,
            side=side,
            orientation='h',
            scalemode="width",
            pointpos=-1.5 if side == 'negative' else 1.5,
        )
    )


def _style_split_violin_figure(fig, title, tickvals, ticktext, height, y_range):
    """Apply the layout shared by the split-violin figures of this cell.

    Args:
        fig: Plotly figure to style.
        title: Title passed to `add_layout`.
        tickvals: y positions of the rows.
        ticktext: HTML tick labels, one per entry of `tickvals`.
        height: Figure height in pixels.
        y_range: Fixed [min, max] range of the y axis.
    """
    add_layout(fig, title, f"", f"")
    fig.update_layout(
        title=dict(xref='paper', x=1.0),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.01,
            xanchor="left",
            x=0.0001,
            itemsizing='constant',
            font_size=22
        ),
        yaxis=dict(
            tickmode='array',
            tickvals=tickvals,
            ticktext=ticktext,
            tickfont=dict(size=25)
        ),
        xaxis=dict(
            tickfont=dict(size=26),
            # `title.font` replaces the deprecated `titlefont` attribute.
            title=dict(font=dict(size=26))
        )
    )
    fig.update_layout(
        violingap=0.39,
        violingroupgap=0.39,
        height=height,
        width=1000,
        margin=go.layout.Margin(
            l=260,
            r=30,
            b=110,
            t=30,
            pad=0,
        )
    )
    fig.update_xaxes(showgrid=False)
    fig.update_yaxes(showgrid=False)
    fig.update_yaxes(autorange=False, range=y_range)
    fig.update_xaxes(autorange=True)


# (group name, violin side): Central is drawn on the negative side of each
# row, Yakutia on the positive side.
region_sides = (("Central", 'negative'), ("Yakutia", 'positive'))

# --- Figure 1: one split violin per epigenetic clock -------------------------
fig = go.Figure()
# Reversed so that the first clock ends up in the top row.
age_order = list(age_types.keys())[::-1]
age_labels = {}
for age_id, age_type in tqdm(enumerate(age_order)):
    pval = df_stat_ages.at[f'{age_type}Acc', 'pval_fdr_bh']
    age_label = f"{age_types[age_type]}<br>p-value: {pval:0.2e}"
    age_labels[age_type] = age_label
    for region, side in region_sides:
        _add_half_violin(
            fig,
            row=age_id,
            values=df_dnam.loc[dict_problem["filter"][region], f"{age_type}Acc"].values,
            label=age_label,
            fill_color=dict_problem['color'][region],
            line_color=age_colors[age_type],
            side=side,
        )
_style_split_violin_figure(
    fig,
    title="Region-specific age acceleration",
    tickvals=list(range(len(age_types))),
    ticktext=[color_tick(age_colors[x], age_labels[x]) for x in age_order],
    height=140 * len(age_types),
    y_range=[-0.5, len(age_types) - 0.5],
)
save_figure(fig, f"{path_save}/{dict_problem['path']}/age_special_plot/ages_violins")

# --- Figure 2: single split violin for DunedinPACE ---------------------------
fig = go.Figure()

vals_0 = df_dnam.loc[dict_problem["filter"]["Central"], "mPACE"].values
color_0 = dict_problem['color']['Central']
vals_1 = df_dnam.loc[dict_problem["filter"]["Yakutia"], "mPACE"].values
color_1 = dict_problem['color']['Yakutia']
# Single test, so the uncorrected two-sided Mann-Whitney p-value is shown.
_, pval = mannwhitneyu(vals_0, vals_1, alternative='two-sided')
label = f"DunedinPACE<br>p-value: {pval:0.2e}"

_add_half_violin(fig, 0, vals_0, label, color_0, age_colors["mPACE"], 'negative')
_add_half_violin(fig, 0, vals_1, label, color_1, age_colors["mPACE"], 'positive')
_style_split_violin_figure(
    fig,
    title="DunedinPACE",
    tickvals=[0],
    ticktext=[color_tick(age_colors["mPACE"], label)],
    height=300,
    y_range=[-0.5, 0.5],
)
save_figure(fig, f"{path_save}/{dict_problem['path']}/age_special_plot/mpace_violins")

In [None]:
# Region comparison (Central vs Yakutia) of every clock within age ranges.
# NOTE(review): the `<clock>Acc` columns are read from df_dnam as they
# currently are; they are overwritten whenever a regression is refit, so make
# sure they were last fitted for this problem before running this cell.
dict_problem = dict_problems['all_region']

# Clock label -> bar colour.
palette = {age_types[x]: age_colors[x] for x in age_types}
palette['DunedinPACE'] = age_colors['mPACE']

# Title of each panel -> sample ids that fall into the age range.
age_ranges = {
    'Participants under 40 years old': df_dnam.index[df_dnam['Age'] < 40].values,
    'Participants from 40 to 60 years old': df_dnam.index[(df_dnam['Age'] < 60) & (df_dnam['Age'] >= 40)].values,
    'Participants from 60 to 80 years old': df_dnam.index[(df_dnam['Age'] < 80) & (df_dnam['Age'] >= 60)].values,
    'Participants over 80 years old': df_dnam.index[df_dnam['Age'] >= 80].values,
    'All Participants': df_dnam.index[df_dnam['Age'] >= 0].values
}

for age_id, (age_range_name, age_range_ids) in enumerate(age_ranges.items()):
    df_tmp = df_dnam.loc[age_range_ids, [f"{x}Acc" for x in age_types] + ['Region', 'Age', 'mPACE']].copy()

    # Two-sided Mann-Whitney U test per clock, FDR-corrected across clocks.
    df_stat = pd.DataFrame(index=list(age_types.values()))
    for age_type in age_types:
        _, pval = mannwhitneyu(
            df_tmp.loc[df_tmp['Region'] == "Central", f"{age_type}Acc"].values,
            df_tmp.loc[df_tmp['Region'] == "Yakutia", f"{age_type}Acc"].values,
            alternative='two-sided'
        )
        df_stat.at[f"{age_types[age_type]}", "pval"] = pval
    _, df_stat['pval_fdr_bh'], _, _ = multipletests(df_stat["pval"], 0.05, method='fdr_bh')
    df_stat[r'$-\log_{10}(\mathrm{p-value})$'] = -np.log10(df_stat['pval_fdr_bh'].astype(float))
    # DunedinPACE is tested on its own: it is added after the correction, so
    # its bar shows the uncorrected p-value and its 'pval_fdr_bh' stays empty.
    _, mPACE_pval = mannwhitneyu(
        df_tmp.loc[df_tmp['Region'] == "Central", "mPACE"].values,
        df_tmp.loc[df_tmp['Region'] == "Yakutia", "mPACE"].values,
        alternative='two-sided'
    )
    df_stat.at['DunedinPACE', 'pval'] = mPACE_pval
    df_stat.at['DunedinPACE', r'$-\log_{10}(\mathrm{p-value})$'] = -np.log10(mPACE_pval)
    df_stat['Clocks'] = df_stat.index.values
    df_stat.to_excel(f"{path_save}/{dict_problem['path']}/age_special_plot/age_range_{age_id}.xlsx")
    # Most significant clock first (sorted after the table has been saved).
    df_stat.sort_values(['pval'], ascending=[True], inplace=True)

    fig = plt.figure(figsize=(4, 4))
    sns.set_theme(style='whitegrid')
    barplot = sns.barplot(
        data=df_stat,
        y='Clocks',
        hue='Clocks',
        x=r'$-\log_{10}(\mathrm{p-value})$',
        edgecolor='black',
        palette=palette,
        dodge=False,
    )
    # `hue` duplicates the y variable, so seaborn may not draw a legend at
    # all; only remove it when it exists instead of failing on None.
    legend = barplot.get_legend()
    if legend is not None:
        legend.remove()
    for container in barplot.containers:
        barplot.bar_label(container, fmt='%.2f')
    barplot.set_title(f"{age_range_name}")
    # Significance threshold p = 0.05 on the -log10 scale.
    plt.axvline(x=-np.log10(0.05), color='red', linestyle=':', linewidth=1)
    plt.savefig(f"{path_save}/{dict_problem['path']}/age_special_plot/age_range_{age_id}.png", bbox_inches='tight', dpi=200)
    plt.savefig(f"{path_save}/{dict_problem['path']}/age_special_plot/age_range_{age_id}.pdf", bbox_inches='tight')
    plt.close()

In [None]:
# Scatter plot of every epigenetic age estimate against chronological age,
# with the identity line and a linear fit estimated on the base group.
dict_problem = dict_problems['all_region']

for age_type in tqdm(age_types):

    # Common square axis window: joint range of Age and the estimate over all of
    # df_dnam, padded by 5% on both sides.
    min_val = df_dnam[["Age", age_type]].min().min()
    max_val = df_dnam[["Age", age_type]].max().max()
    shift_val = max_val - min_val
    min_val -= 0.05 * shift_val
    max_val += 0.05 * shift_val

    # OLS fit on the samples selected by dict_problem["base_filter"]; the line is
    # evaluated far outside the visible window so that it always spans the plot.
    formula = f"{age_type} ~ Age"
    model = smf.ols(formula=formula, data=df_dnam.loc[dict_problem["base_filter"], :]).fit()
    fit_x0 = -100
    fit_x1 = 200
    fit_y0 = model.predict(pd.DataFrame(data=[fit_x0], columns=['Age']))[0]
    fit_y1 = model.predict(pd.DataFrame(data=[fit_x1], columns=['Age']))[0]

    # Regular plot =======================================================
    fig = go.Figure()
    # Identity line (estimate == chronological age), thin dotted black.
    fig.add_trace(
        go.Scatter(
            x=[min_val, max_val],
            y=[min_val, max_val],
            showlegend=False,
            name="",
            mode="lines",
            line=dict(width=1, dash='dot'),
            marker_color="black",
            marker=dict(
                size=8,
                opacity=0.75,
                line=dict(
                    color="black",
                    width=0.5
                )
            )
        )
    )
    # One marker cloud per group; marker outlines use the colour of the clock.
    for group in dict_problem["filter"]:
        fig.add_trace(
            go.Scatter(
                x=df_dnam.loc[dict_problem["filter"][group], f"Age"].values,
                y=df_dnam.loc[dict_problem["filter"][group], f"{age_type}"].values,
                showlegend=True,
                name=group,
                mode="markers",
                line_color=dict_problem["color"][group],
                marker=dict(
                    size=8,
                    opacity=0.75,
                    line=dict(
                        color=age_colors[age_type],
                        width=0.5
                    )
                )
            )
        )

    # The fit is drawn twice: a width-7 line in the clock colour underneath a
    # width-6 line in the base-group colour, which leaves a thin coloured outline.
    fig.add_trace(
        go.Scatter(
            x=[fit_x0, fit_x1],
            y=[fit_y0, fit_y1],
            showlegend=False,
            name="",
            mode="lines",
            line=dict(width=7),
            marker_color=age_colors[age_type],
            marker=dict(
                size=8,
                opacity=0.75,
                line=dict(
                    color="black",
                    width=0.5
                )
            )
        )
    )
    fig.add_trace(
        go.Scatter(
            x=[fit_x0, fit_x1],
            y=[fit_y0, fit_y1],
            showlegend=False,
            name="",
            mode="lines",
            line=dict(width=6),
            marker_color=dict_problem["color"][dict_problem["base_part"]],
            marker=dict(
                size=11,
                opacity=0.75,
                line=dict(
                    color=age_colors[age_type],
                    width=0.5
                )
            )
        )
    )
    add_layout(fig, f"Age", f"<span style='color:{str(age_colors[age_type])}'> {str(age_types[age_type])} </span>", f"")
    fig.update_layout(legend_font_size=20)
    fig.update_layout(legend= {'itemsizing': 'constant'})
    # Fixed, identical ranges on both axes.
    fig.update_xaxes(autorange=False)
    fig.update_yaxes(autorange=False)
    fig.update_layout(title_xref='paper')
    fig.update_layout(xaxis_range=[min_val, max_val])
    fig.update_layout(yaxis_range=[min_val, max_val])
    fig.update_xaxes(showgrid=False)
    fig.update_yaxes(showgrid=False)
    fig.update_layout(
        width=400,
        height=450,
        margin=go.layout.Margin(
            l=100,
            r=50,
            b=100,
            t=50,
            pad=0,
        )
    )
    # save_figure is given an extension-less base name everywhere else in this
    # notebook; the stray ".pdf" suffix that used to be part of this path is dropped.
    save_figure(fig, f"{path_save}/{dict_problem['path']}/age_special_plot/scatter_{age_type}")

## DMPs

In [None]:
# Settings of the DMP (differentially methylated positions) analysis.

# Folder that holds the DMP.csv / GSEA tables read by the cells below.
path_with_limma = f"{path}/{platform}/{dataset}/special/043_yakutia_EWAS/245"

dmp_top_n = 1000     # number of top CpGs (by raw p-value) kept as the selected set
n_highlights = 2     # number of top CpGs collected for labelling
n_examples = 10      # number of CpGs drawn as violin examples per table
dist_num_bins = 15   # violin KDE bandwidth = value range / dist_num_bins

# The four dictionaries below share their keys (one per annotation variable)
# and drive the regional enrichment analysis.

# Categories tested for each annotation, in plotting order.
# NOTE(review): chromosomes are compared as the strings '1'..'23' - confirm that
# this matches the dtype/labels of the manifest CHR column.
reg_enr_orders = dict(
    CHR=list(map(str, range(1, 24))),
    RELATION_TO_UCSC_CPG_ISLAND=["S_Shelf", "S_Shore", "Island", "N_Shore", "N_Shelf", "OpenSea"],
    UCSC_REFGENE_GROUP=["TSS1500", "TSS200", "5'UTR", "1stExon", "Body", "3'UTR"],
)
# Manifest column that stores each annotation.
reg_enr_col_names = dict(
    CHR="CHR",
    RELATION_TO_UCSC_CPG_ISLAND="Relation_to_Island",
    UCSC_REFGENE_GROUP="UCSC_RefGene_Group",
)
# Matplotlib figure size (width, height) per annotation.
reg_enr_fig_sizes = dict(
    CHR=(17, 10),
    RELATION_TO_UCSC_CPG_ISLAND=(5, 10),
    UCSC_REFGENE_GROUP=(5, 10),
)
# Qualitative Plotly colour lists per annotation.
reg_enr_colors = dict(
    CHR=px.colors.qualitative.Dark24,
    RELATION_TO_UCSC_CPG_ISLAND=px.colors.qualitative.Light24[17:23],
    UCSC_REFGENE_GROUP=px.colors.qualitative.Light24[11:17],
)

# Per problem: load the limma DMP table, draw Manhattan and volcano plots,
# and export the top CpGs together with the genes they are annotated to.
for problem, dict_problem in dict_problems.items():
    path_curr = f"{path_save}/{dict_problem['path']}/06_DMP"
    # cpgs.xlsx and genes.xlsx below are written straight into this folder,
    # so make sure it exists before anything is saved.
    pathlib.Path(path_curr).mkdir(parents=True, exist_ok=True)

    df_dmps = pd.read_csv(f"{path_with_limma}/{dict_problem['path']}/data_from_R/DMP.csv", index_col=0)
    df_dmps["CpG"] = df_dmps.index.values
    df_dmps.sort_values(["adj.P.Val"], ascending=[True], inplace=True)
    # "<CpG> (<gene>)" labels, built column-wise instead of with a row-wise apply.
    df_dmps['print'] = df_dmps["CpG"].astype(str) + " (" + df_dmps["gene"].astype(str) + ")"
    df_dmps['log_pval'] = -np.log10(df_dmps["adj.P.Val"])
    # log2 fold change of the mean methylation: second group relative to the first.
    phenos = list(dict_problem['filter'].keys())
    mean_1 = df_dnam.loc[dict_problem['filter'][phenos[0]], df_dmps.index.values].mean(axis=0)
    mean_2 = df_dnam.loc[dict_problem['filter'][phenos[1]], df_dmps.index.values].mean(axis=0)
    df_dmps["log2FC"] = (np.log2(mean_2) - np.log2(mean_1))
    # Export of the full table is disabled here; note that a later cell of this
    # notebook reads 06_DMP/stat.xlsx.
    # df_dmps.to_excel(f"{path_curr}/stat.xlsx")

    # Labels of the most significant CpGs (only used by the disabled marker options).
    top_to_hightlight = df_dmps["print"].values[0:n_highlights]
    sns.set_theme(style='whitegrid')
    df_dmps.sort_values(["MAPINFO"], ascending=[True], inplace=True)
    mhat(
        df=df_dmps,
        chr='CHR',
        pv='adj.P.Val',
        path=f"{path_curr}",
        valpha=1,
        axtickfontsize=10,
        axlabelfontsize=12,
        # gfont=12,
        # markernames=tuple(top_to_hightlight),
        # markeridcol='print',
        # gstyle=2,
        dim=(8, 4),
    )
    sns.set_theme(style='whitegrid')
    volcano(
        df=df_dmps,
        lfc='log2FC',
        pv='adj.P.Val',
        pv_thr=(1, 1),
        lfc_thr=(0.0, 0.0),
        path=f"{path_curr}",
        #genenames=tuple(top_to_hightlight),
        geneid='print',
        axtickfontsize=12,
        axlabelfontsize=12,
        gfont=18,
        gstyle=2,
        sign_line=False,
        ar=0,
        color=(list(dict_problem["color"].values())[1], "lavender", list(dict_problem["color"].values())[0]),
        dim=(4, 4),
        xlm=dict_problem["xlim"]
    )

    # Selected set: the dmp_top_n CpGs with the smallest raw p-values.
    df_dmps.sort_values(["P.Value"], ascending=[True], inplace=True)
    df_dmps_selected = df_dmps.head(dmp_top_n)
    df_dmps_selected.to_excel(f"{path_curr}/cpgs.xlsx")
    print(f"Number of CpGs: {df_dmps_selected.shape[0]}")

    # Unique genes of the selected CpGs; manifest entries are ';'-separated and
    # non-string entries (e.g. missing annotation) are skipped.
    dmps_genes = set()
    for cpg in df_dmps_selected.index.values:
        genes_raw = manifest.at[cpg, 'Gene']
        if isinstance(genes_raw, str):
            dmps_genes.update(genes_raw.split(';'))
    # Drop the placeholders that do not name a gene.
    dmps_genes -= {'non-genic', ' '}
    # Sorted so that genes.xlsx does not depend on set iteration order.
    dmps_genes = sorted(dmps_genes)
    df_dmps_genes = pd.DataFrame({'gene': dmps_genes})
    df_dmps_genes.to_excel(f"{path_curr}/genes.xlsx", index=False)
    print(f"Number of genes: {df_dmps_genes.shape[0]}")
    
    # Violin plots of individual CpGs for two tables that share one rendering:
    #   examples   - the n_examples CpGs with the smallest raw p-values;
    #   suspicious - the same ranking restricted to CpGs with |log2FC| < 0.05.
    is_small_effect = (df_dmps['log2FC'] < 0.05) & (df_dmps['log2FC'] > -0.05)
    dmp_violin_sets = {
        "examples": df_dmps.sort_values(['P.Value'], ascending=[True]).head(n_examples),
        "suspicious": df_dmps.loc[is_small_effect, :].sort_values(['P.Value'], ascending=[True]).head(n_examples),
    }
    for violin_dir, df_dmps_examples in dmp_violin_sets.items():
        pathlib.Path(f"{path_curr}/{violin_dir}").mkdir(parents=True, exist_ok=True)
        df_dmps_examples.to_excel(f"{path_curr}/{violin_dir}/df.xlsx")
        for cpg_id, (cpg, row) in enumerate(df_dmps_examples.iterrows()):
            # The title reports the adjusted p-value and the log2 fold change.
            pval = row['adj.P.Val']
            log_fc = row['log2FC']
            gene = manifest.at[cpg, 'Gene']

            fig = go.Figure()
            for group_id, group in enumerate(dict_problem["filter"]):
                # Raw points go to the right of the first violin and to the left
                # of every other one.
                pointpos = 1.5 if group_id == 0 else -1.5
                vals = df_dnam.loc[dict_problem["filter"][group], cpg].values
                fig.add_trace(
                    go.Violin(
                        y=vals,
                        name=group,
                        box_visible=True,
                        meanline_visible=True,
                        showlegend=False,
                        line_color='black',
                        fillcolor=dict_problem["color"][group],
                        marker=dict(color=dict_problem["color"][group], line=dict(color='black', width=0.3), opacity=0.8),
                        points='all',
                        pointpos=pointpos,
                        # KDE bandwidth tied to the value range of the group.
                        bandwidth=np.ptp(vals) / dist_num_bins,
                        opacity=0.8
                    )
                )
            add_layout(fig, "", "Methylation", f"{cpg} ({gene})<br>p-value: {pval:0.2e}<br>log(Fold Change): {log_fc:0.2e}", font_size=30)
            fig.update_layout(title_xref='paper', title={'y': 0.95})
            fig.update_layout(legend_font_size=30)
            fig.update_xaxes(autorange=False, range=[-0.3, len(dict_problem["filter"]) - 0.7])
            fig.update_layout(legend={'itemsizing': 'constant'})
            fig.update_layout(
                violingap=0.35,
                violingroupgap=0.35,
                width=850,
                height=615,
                margin=go.layout.Margin(
                    l=120,
                    r=50,
                    b=90,
                    t=120,
                    pad=0,
                )
            )
            save_figure(fig, f"{path_curr}/{violin_dir}/{cpg_id}_{cpg}")
    
    # Regional enrichment: for every annotation variable test, category by category,
    # whether the selected CpGs are over- or under-represented relative to the
    # remaining CpGs of the DMP table (Fisher exact test on a 2x2 table).
    pathlib.Path(f"{path_curr}/reg_enr").mkdir(parents=True, exist_ok=True)
    # target  - manifest rows of the selected CpGs;
    # global  - manifest rows of every CpG in the DMP table;
    # padding - global rows that are not among the selected CpGs.
    df_dmps_fisher_target = manifest.loc[df_dmps_selected.index.values, :]
    df_dmps_fisher_global = manifest.loc[df_dmps.index.values, :]
    df_dmps_fisher_padding = df_dmps_fisher_global.loc[~df_dmps_fisher_global.index.isin(df_dmps_selected.index.values), :]
    for var in reg_enr_orders:
        # One row per category; "11".."22" hold the cells of its contingency table.
        columns=["11", "12", "21", "22", "sum", "pval", "odds_ratio"]
        df_var = pd.DataFrame(index=reg_enr_orders[var], columns=columns, data=np.zeros((len(reg_enr_orders[var]), len(columns))))
        df_var.index.name = reg_enr_col_names[var].replace("_", " ")
        for var_val in reg_enr_orders[var]:
            # Rows: selected vs. remaining CpGs; columns: annotation equals var_val or not.
            contingency_table = pd.DataFrame(index=["specific", "non-specific"], columns=["in_val", "not_in_val"])
            contingency_table.at["specific", "in_val"] = df_dmps_fisher_target.loc[df_dmps_fisher_target[reg_enr_col_names[var]] == var_val, :].shape[0]
            contingency_table.at["specific", "not_in_val"] = df_dmps_fisher_target.loc[df_dmps_fisher_target[reg_enr_col_names[var]] != var_val, :].shape[0]
            contingency_table.at["non-specific", "in_val"] = df_dmps_fisher_padding.loc[df_dmps_fisher_padding[reg_enr_col_names[var]] == var_val, :].shape[0]
            contingency_table.at["non-specific", "not_in_val"] = df_dmps_fisher_padding.loc[df_dmps_fisher_padding[reg_enr_col_names[var]] != var_val, :].shape[0]
            df_var.at[var_val, "11"] = contingency_table.at["specific", "in_val"]
            df_var.at[var_val, "12"] = contingency_table.at["specific", "not_in_val"]
            df_var.at[var_val, "21"] = contingency_table.at["non-specific", "in_val"]
            df_var.at[var_val, "22"] = contingency_table.at["non-specific", "not_in_val"]
            df_var.at[var_val, "sum"] = contingency_table.values.sum()
            odds_ratio, pval = stats.fisher_exact(contingency_table.to_numpy(), alternative='two-sided')
            # An undefined odds ratio is treated as "no effect" (log10 becomes 0).
            if np.isnan(odds_ratio):
                odds_ratio = 1.0
            df_var.at[var_val, "odds_ratio"], df_var.at[var_val, "pval"] = odds_ratio, pval
        # Benjamini-Hochberg correction across the categories of this annotation.
        _, df_var['pval_fdr_bh'], _, _ = multipletests(df_var['pval'].values, 0.05, method='fdr_bh')
        df_var[r'$ \log_{10}(\mathrm{Odds\ ratio})$'] = np.log10(df_var.loc[:, 'odds_ratio'].values)
        df_var[r'$ -\log_{10}(\mathrm{p-value})$'] = -np.log10(df_var.loc[:, 'pval_fdr_bh'].values)

        plt.figure(figsize=reg_enr_fig_sizes[var])
        plt.xticks(rotation=90)
        sns.set_theme(style='whitegrid', font_scale=2)
        # Values below the colour scale minimum (-log10(0.05), see vmin) are shown in black.
        cmap = plt.get_cmap("viridis").copy()
        cmap.set_under('black')

        # The scatter only serves as a mappable for the colorbar: the figure is
        # cleared right after it is created and the colorbar is then drawn from it.
        # The order of these statements matters.
        plot = plt.scatter(
            df_var.index,
            df_var.loc[:, r'$ -\log_{10}(\mathrm{p-value})$'].values,
            c=df_var.loc[:, r'$ -\log_{10}(\mathrm{p-value})$'].values,
            cmap=cmap,
            vmin=-np.log10(0.05)
        )
        plt.clf()
        cbar = plt.colorbar(plot, extend='min')

        # Bar colours: black unless FDR < 0.05, otherwise the colormap colour at the
        # position of -log10(FDR) within the colorbar range.
        df_var['bar_color'] = 'black'
        for df_var_index in df_var.index.values:
            if df_var.at[df_var_index, "pval_fdr_bh"] < 0.05:
                value_tmp = df_var.at[df_var_index, r'$ -\log_{10}(\mathrm{p-value})$']
                value_color = (value_tmp-cbar.vmin)/(cbar.vmax-cbar.vmin)
                df_var.at[df_var_index, 'bar_color'] = matplotlib.colors.rgb2hex(cbar.cmap(value_color))
        df_var.to_excel(f"{path_curr}/reg_enr/fisher_{var}.xlsx")

        # Bars show log10(odds ratio) per category, coloured by significance.
        plt.xticks(rotation=90)
        cbar.set_label(r"$-\log_{10}(\mathrm{p-value})$", horizontalalignment='center')
        ax = sns.barplot(
            data=df_var,
            x=df_var.index,
            y=r'$ \log_{10}(\mathrm{Odds\ ratio})$',
            palette=df_var.loc[:, 'bar_color'],
            dodge=False,
            edgecolor='black',
        )
        plt.savefig(f"{path_curr}/reg_enr/fisher_{var}.png", bbox_inches='tight')
        plt.savefig(f"{path_curr}/reg_enr/fisher_{var}.pdf", bbox_inches='tight')
        plt.close()

## Region-specific genes intersection

In [None]:
# Overlap between the genes of the region-specific DMPs and the reference gene
# list stored in Cardona2014.xlsx: Venn diagram, per-section gene lists and a
# Yes/No table.
path_local = f"{path_save}/{dict_problems['all_region']['path']}/06_DMP"
genes_our = pd.read_excel(f"{path_local}/genes.xlsx", index_col='gene').index.values
# `path` is the datasets root defined at the top of the notebook.
genes_ref = pd.read_excel(f"{path}/lists/genes/region_specific/Cardona2014.xlsx", index_col='gene').index.values
genes_our_set = set(genes_our)
genes_ref_set = set(genes_ref)

pathlib.Path(f"{path_local}/genes_intersection").mkdir(parents=True, exist_ok=True)

fig, ax = plt.subplots()
venn = venn2(
    subsets=(genes_our_set, genes_ref_set),
    set_labels=('DMPs', 'Cardona2014'),
    set_colors=('r', 'g'),
    alpha=0.5
)
venn2_circles(subsets=(genes_our_set, genes_ref_set))
# matplotlib-venn returns None instead of a label for regions it does not draw
# (e.g. an empty intersection), so skip those entries.
for text in venn.set_labels:
    if text is not None:
        text.set_fontsize(16)
for text in venn.subset_labels:
    if text is not None:
        text.set_fontsize(25)
plt.savefig(f"{path_local}/genes_intersection/venn.png", bbox_inches='tight', dpi=400)
plt.savefig(f"{path_local}/genes_intersection/venn.pdf", bbox_inches='tight')
# Close (not just clear) the figure so it is released once it has been saved.
plt.close(fig)

# One Excel file per section returned by get_sections for the two gene sets.
sections = get_sections([genes_our_set, genes_ref_set])
for sec in sections:
    df_sec = pd.DataFrame(index=list(sections[sec]))
    df_sec.to_excel(f"{path_local}/genes_intersection/{sec}.xlsx", index_label='gene')

# Yes/No flag per DMP gene. A boolean mask replaces the former `.loc[set(...)]`
# assignment: pandas 2.x no longer accepts a set as an indexer.
df_genes = pd.DataFrame(index=genes_our)
df_genes["Cardona et. al. 2014"] = np.where(df_genes.index.isin(genes_ref), "Yes", "No")
df_genes.to_excel(f"{path_local}/genes_intersection/genes.xlsx", index_label='gene')

## Region-specific CpGs processing

In [None]:
# Build the publication-ready table of the selected region-specific CpGs.
path_local = f"{path_save}/{dict_problems['all_region']['path']}/06_DMP"

# Display names of the statistic columns. "log2FC" takes over the name "logFC",
# so the column originally called "logFC" is moved out of the way; it is not part
# of the final column selection below.
dict_col = {
    "log2FC": "logFC",
    "logFC": "WTF",
    "adj.P.Val": "Adj. p-value",
    "Central_AVG": "Central avg",
    "Yakutia_AVG": "Yakutia avg",
    "deltaBeta": "Delta"
}
df_cpgs = pd.read_excel(f"{path_local}/cpgs.xlsx", index_col=0)
df_cpgs = df_cpgs.rename(columns=dict_col)

# Annotation columns taken from the manifest (display name -> manifest column).
manifest_cols = {
    "Gene": 'Gene',
    "Relation to Island": 'Relation_to_Island',
    "UCSC RefGene Group": 'UCSC_RefGene_Group',
}
for col_out, col_manifest in manifest_cols.items():
    df_cpgs.loc[:, col_out] = manifest.loc[df_cpgs.index.values, col_manifest]

# Keep only the columns of the final table, in their final order.
cols_final = [
    "CHR",
    "MAPINFO",
    "Gene",
    "Relation to Island",
    "UCSC RefGene Group",
    "logFC",
    "Central avg",
    "Yakutia avg",
    "Delta",
]
df_cpgs = df_cpgs.loc[:, cols_final]
df_cpgs.to_excel(f"{path_local}/cpgs_processed.xlsx", index_label='CpG')

## Sex-specific CpGs intersection

In [None]:
# Compare the sex-specific CpGs found in the Central and Yakutia groups with each
# other and with three published CpG lists: combined table, per-section lists and
# an UpSet plot.
path_local = f"{path_save}/sex_specificity_in_regions"


def _sex_dmp_columns(region):
    """Return the mapping from raw DMP column names to `region`-labelled names."""
    return {
        "log2FC": f"logFC in {region}",
        "adj.P.Val": f"Adj. p-value in {region}",
        "F_AVG": f"F avg in {region}",
        "M_AVG": f"M avg in {region}",
        "deltaBeta": f"Delta in {region}"
    }


def _load_sex_dmps(problem_key, region, col_names):
    """Load stat.xlsx and cpgs.xlsx of one sex comparison.

    Returns the statistics table, with a Yes/No "Significant in <region>" column
    marking the CpGs listed in cpgs.xlsx and with columns renamed via `col_names`,
    together with the array of CpG ids read from cpgs.xlsx.
    """
    path_dmp = f"{path_save}/{dict_problems[problem_key]['path']}/06_DMP"
    df_stat = pd.read_excel(f"{path_dmp}/stat.xlsx", index_col=0)
    cpgs_selected = pd.read_excel(f"{path_dmp}/cpgs.xlsx", index_col=0).index.values
    df_stat[f"Significant in {region}"] = "No"
    df_stat.loc[cpgs_selected, f"Significant in {region}"] = "Yes"
    df_stat.rename(columns=col_names, inplace=True)
    return df_stat, cpgs_selected


dict_col_ctl = _sex_dmp_columns("Central")
df_cpgs_ctl, cpgs_ctl = _load_sex_dmps('central_sex', "Central", dict_col_ctl)

dict_col_ykt = _sex_dmp_columns("Yakutia")
df_cpgs_ykt, cpgs_ykt = _load_sex_dmps('yakutia_sex', "Yakutia", dict_col_ykt)

# Published sex-specific CpG lists; `path` is the datasets root defined at the
# top of the notebook.
path_refs = f"{path}/lists/cpgs/sex_specific"
cpgs_grant2022 = pd.read_excel(f"{path_refs}/Grant2022.xlsx", index_col='CpG').index.values
cpgs_inoshita2015 = pd.read_excel(f"{path_refs}/Inoshita2015.xlsx", index_col='CpG').index.values
cpgs_mccarthy2014 = pd.read_excel(f"{path_refs}/McCarthy2014.xlsx", index_col='CpG').index.values

# Common table, indexed by the CpGs of the Central statistics. The explicit copy
# keeps the column assignments below from acting on a slice of df_cpgs_ctl.
df_cpgs_cmn = df_cpgs_ctl.loc[:, ["CHR", "MAPINFO"]].copy()
cpgs_cmn = df_cpgs_cmn.index.values
for col_out, col_manifest in (
    ("Gene", 'Gene'),
    ("Relation to Island", 'Relation_to_Island'),
    ("UCSC RefGene Group", 'UCSC_RefGene_Group'),
):
    df_cpgs_cmn.loc[cpgs_cmn, col_out] = manifest.loc[cpgs_cmn, col_manifest]

# Region-labelled statistics, Central columns first, then Yakutia.
for region, df_region, col_names in (
    ("Central", df_cpgs_ctl, dict_col_ctl),
    ("Yakutia", df_cpgs_ykt, dict_col_ykt),
):
    for col in list(col_names.values()) + [f"Significant in {region}"]:
        df_cpgs_cmn.loc[cpgs_cmn, col] = df_region.loc[cpgs_cmn, col]

# Yes/No membership in each published list. Boolean masks replace the former
# `.loc[set(...)]` assignments: pandas 2.x no longer accepts a set as an indexer.
for col, cpgs_published in (
    ("Grant et. al. 2022", cpgs_grant2022),
    ("Inoshita et. al. 2015", cpgs_inoshita2015),
    ("McCarthy et. al. 2014", cpgs_mccarthy2014),
):
    df_cpgs_cmn[col] = "No"
    df_cpgs_cmn.loc[df_cpgs_cmn.index.isin(cpgs_published), col] = "Yes"

pathlib.Path(f"{path_local}/cpgs").mkdir(parents=True, exist_ok=True)
conds_cols = [
    "Significant in Central",
    "Significant in Yakutia",
    "Grant et. al. 2022",
    "Inoshita et. al. 2015",
    "McCarthy et. al. 2014"
]
df_intxn_order = pd.DataFrame(index=conds_cols)
df_intxn_order.to_excel(f"{path_local}/cpgs/conds_cols.xlsx", index_label='Set')
# The exported table keeps the CpGs flagged in at least one of the two regions.
conditions = [df_cpgs_cmn[metric] == "Yes" for metric in ["Significant in Central", "Significant in Yakutia"]]
df_cpgs_cmn = df_cpgs_cmn[disjunction(conditions)]
df_cpgs_cmn.to_excel(f"{path_local}/cpgs/table.xlsx", index_label='CpG')

# One Excel file per section returned by get_sections for the five CpG sets.
# NOTE(review): the index column of these CpG lists is labelled 'gene'; left as is
# because downstream readers may rely on that header - confirm and rename.
sections = get_sections([set(cpgs_ctl), set(cpgs_ykt), set(cpgs_grant2022), set(cpgs_inoshita2015), set(cpgs_mccarthy2014)])
for sec in sections:
    df_sec = pd.DataFrame(index=list(sections[sec]))
    df_sec.to_excel(f"{path_local}/cpgs/{sec}.xlsx", index_label='gene')

# UpSet plot: one boolean membership column per set, turned into the index of
# the frame passed to upsetplot.UpSet.
dict_upset_lists = {
    "McCarthy et. al. 2014": cpgs_mccarthy2014,
    "Inoshita et. al. 2015": cpgs_inoshita2015,
    "Grant et. al. 2022": cpgs_grant2022,
    'Sex-specific in Yakutia': cpgs_ykt,
    'Sex-specific in Central': cpgs_ctl,
}
upset_all = list(set().union(*list(dict_upset_lists.values())))
df_upset = pd.DataFrame(index=upset_all)
for k, v in dict_upset_lists.items():
    df_upset[k] = df_upset.index.isin(v)
df_upset = df_upset.set_index(list(dict_upset_lists.keys()))
tmp = plt.figure(figsize=(32, 12))
upset_fig = upsetplot.UpSet(
    df_upset,
    sort_categories_by='input',
    subset_size='count',
    show_counts=True,
    min_degree=0,
    element_size=None,
    totals_plot_elements=3,
    include_empty_subsets=False
)
# Highlight the intersections that involve our two sets and the Grant 2022 list.
upset_fig.style_subsets(present=["Sex-specific in Central", "Sex-specific in Yakutia"], edgecolor="red", linewidth=2)
upset_fig.style_subsets(present=["Sex-specific in Central", "Grant et. al. 2022"], absent=["Sex-specific in Yakutia"], facecolor="blue")
upset_fig.style_subsets(present=["Sex-specific in Yakutia", "Grant et. al. 2022"], absent=["Sex-specific in Central"], facecolor="green")
upset_fig.style_subsets(present=["Sex-specific in Yakutia", "Sex-specific in Central", "Grant et. al. 2022"], facecolor="yellow")
upset_fig.plot(tmp)
plt.savefig(f"{path_local}/cpgs/upset.png", bbox_inches='tight')
plt.savefig(f"{path_local}/cpgs/upset.pdf", bbox_inches='tight')
plt.close()

## Sex-specific GSEA intersection

In [None]:
# Compare the GO terms that are significant (padj < 0.05) in the sex-specific GSEA
# of the Central and Yakutia groups: combined table, Venn diagram, per-section lists.
path_local = f"{path_save}/sex_specificity_in_regions"

df_gsea_ctl = pd.read_csv(f"{path_with_limma}/{dict_problems['central_sex']['path']}/data_from_R/GSEA(methylglm)_GO.csv", index_col="ID")
df_gsea_ctl["Significant in Central"] = "No"
df_gsea_ctl.loc[df_gsea_ctl['padj'] < 0.05, "Significant in Central"] = "Yes"
df_gsea_ctl.rename(columns={"padj": "Adj. p-value in Central"}, inplace=True)
terms_ctl = df_gsea_ctl.index[df_gsea_ctl["Significant in Central"] == "Yes"].values

df_gsea_ykt = pd.read_csv(f"{path_with_limma}/{dict_problems['yakutia_sex']['path']}/data_from_R/GSEA(methylglm)_GO.csv", index_col="ID")
df_gsea_ykt["Significant in Yakutia"] = "No"
df_gsea_ykt.loc[df_gsea_ykt['padj'] < 0.05, "Significant in Yakutia"] = "Yes"
df_gsea_ykt.rename(columns={"padj": "Adj. p-value in Yakutia"}, inplace=True)
terms_ykt = df_gsea_ykt.index[df_gsea_ykt["Significant in Yakutia"] == "Yes"].values

pathlib.Path(f"{path_local}/gsea").mkdir(parents=True, exist_ok=True)
# Attach the Yakutia results to the Central table. Rows are first added for GO IDs
# that occur only in the Yakutia results, and the columns are then assigned by
# index alignment, so differing ID sets do not break the transfer.
terms_ykt_only = df_gsea_ykt.index.difference(df_gsea_ctl.index)
if len(terms_ykt_only) > 0:
    df_gsea_ctl = df_gsea_ctl.reindex(df_gsea_ctl.index.append(terms_ykt_only))
for col in ("Adj. p-value in Yakutia", "Significant in Yakutia"):
    df_gsea_ctl[col] = df_gsea_ykt[col]
# The exported table keeps the terms that are significant in at least one region.
df_gsea = df_gsea_ctl.loc[(df_gsea_ctl["Significant in Yakutia"] == "Yes") | (df_gsea_ctl["Significant in Central"] == "Yes"), :]
df_gsea.to_excel(f"{path_local}/gsea/table.xlsx", index_label='ID')

terms_ctl_set = set(terms_ctl)
terms_ykt_set = set(terms_ykt)

fig, ax = plt.subplots()
venn = venn2(
    subsets=(terms_ctl_set, terms_ykt_set),
    set_labels=('Central', 'Yakutia'),
    set_colors=('r', 'g'),
    alpha=0.5
)
venn2_circles(subsets=(terms_ctl_set, terms_ykt_set))
# matplotlib-venn returns None instead of a label for regions it does not draw
# (e.g. an empty intersection), so skip those entries.
for text in venn.set_labels:
    if text is not None:
        text.set_fontsize(16)
for text in venn.subset_labels:
    if text is not None:
        text.set_fontsize(25)
plt.savefig(f"{path_local}/gsea/venn.png", bbox_inches='tight', dpi=400)
plt.savefig(f"{path_local}/gsea/venn.pdf", bbox_inches='tight')
# Close (not just clear) the figure so it is released once it has been saved.
plt.close(fig)

# One Excel file per section returned by get_sections for the two term sets.
# NOTE(review): the index column of these GO term lists is labelled 'gene'; left
# as is because downstream readers may rely on that header - confirm and rename.
sections = get_sections([terms_ctl_set, terms_ykt_set])
for sec in sections:
    df_sec = pd.DataFrame(index=list(sections[sec]))
    df_sec.to_excel(f"{path_local}/gsea/{sec}.xlsx", index_label='gene')