In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import plotly.express as px
import statsmodels.formula.api as smf
import plotly.graph_objects as go
from scripts.python.routines.manifest import get_manifest
from scripts.python.routines.plot.save import save_figure
from scripts.python.routines.plot.layout import add_layout
from statsmodels.stats.multitest import multipletests
import plotly.io as pio
pio.kaleido.scope.mathjax = None
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=False)
from scipy.stats import mannwhitneyu, median_test
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.patheffects as path_effects
import random
import pathlib
from tqdm import tqdm
from src.utils.plot.bioinfokit import mhat, volcano
import gseapy as gp
import mygene
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA, TruncatedSVD
from sklearn.decomposition import MiniBatchDictionaryLearning, FastICA
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.manifold import MDS, Isomap, TSNE, LocallyLinearEmbedding
import upsetplot
from matplotlib_venn import venn2, venn2_circles
from itertools import chain
from sklearn.metrics import mean_absolute_error
from scripts.python.routines.plot.colorscales import get_continuous_color
import plotly
from scripts.python.routines.plot.p_value import add_p_value_annotation
from scripts.python.routines.sections import get_sections
import functools


def conjunction(conditions):
    """Combine several boolean conditions with an element-wise logical AND.

    Args:
        conditions: Iterable of boolean masks (arrays, Series) or scalars.

    Returns:
        The left-to-right ``np.logical_and`` fold over ``conditions``. A single
        condition is returned as-is, without passing through NumPy.

    Raises:
        TypeError: If ``conditions`` is empty.
    """
    remaining = iter(conditions)
    try:
        combined = next(remaining)
    except StopIteration:
        raise TypeError("reduce() of empty iterable with no initial value") from None
    for condition in remaining:
        combined = np.logical_and(combined, condition)
    return combined


def disjunction(conditions):
    """Combine several boolean conditions with an element-wise logical OR.

    Args:
        conditions: Iterable of boolean masks (arrays, Series) or scalars.

    Returns:
        The left-to-right ``np.logical_or`` fold over ``conditions``. A single
        condition is returned as-is, without passing through NumPy.

    Raises:
        TypeError: If ``conditions`` is empty.
    """
    remaining = iter(conditions)
    try:
        combined = next(remaining)
    except StopIteration:
        raise TypeError("reduce() of empty iterable with no initial value") from None
    for condition in remaining:
        combined = np.logical_or(combined, condition)
    return combined

# Init dnam and fill it from immunology data

In [None]:
# --- Configuration -----------------------------------------------------------
dataset = "GSEUNN"
path = "D:/YandexDisk/Work/pydnameth/datasets"
datasets_info = pd.read_excel(f"{path}/datasets.xlsx", index_col='dataset')
platform = datasets_info.loc[dataset, 'platform']
manifest = get_manifest(platform, path=path)
# Drop the first three characters of 'chr' (presumably the "chr" prefix — verify against the manifest).
manifest['CHR'] = manifest['chr'].str[3::]

dnam_suffix = "_harm"

immuno_samples = "all_1052_121222" # "ctrl_415_from_all_1052_121222"
immuno_proc = "raw"
immuno_imp = "fast_knn"
immuno_replace = "quarter"

select_dnam = 'chronology_0' # "common_with_immuno" "chronology_0"
select_immuno = "260_ml_draft"

path_save = f"{path}/{platform}/{dataset}/special/043_yakutia_EWAS/analysis_without_non_sakha"
pathlib.Path(f"{path_save}").mkdir(parents=True, exist_ok=True)

# --- Load immunology and phenotype tables ------------------------------------
df_immuno = pd.read_excel(f"{path}/{platform}/{dataset}/data/immuno/df_samples({immuno_samples})_proc({immuno_proc})_imp({immuno_imp})_replace({immuno_replace}).xlsx", index_col="index")

pheno = pd.read_excel(f"{path}/{platform}/{dataset}/pheno.xlsx", index_col="index")
pheno.index.name = "index"
pheno.drop(["I64_old", "I1_duplicate"], inplace=True)

# Check DNAm only index
index_dnam_only = pheno.index.difference(df_immuno.index)
print(f"DNAm only indexes {len(index_dnam_only)}: {list(index_dnam_only)}")

# Check phenotype differences in Immunology and DNAm data
indexes_common_glob = pheno.index.intersection(df_immuno.index)
is_region_equal_glob = pheno.loc[indexes_common_glob, 'Region'].equals(df_immuno.loc[indexes_common_glob, 'Region'])
is_sex_equal_glob = pheno.loc[indexes_common_glob, 'Sex'].equals(df_immuno.loc[indexes_common_glob, 'Sex'])
is_status_equal_glob = pheno.loc[indexes_common_glob, 'Status'].equals(df_immuno.loc[indexes_common_glob, 'Status'])
age_diff_glob = np.abs(pheno.loc[indexes_common_glob, 'Age'].values - df_immuno.loc[indexes_common_glob, 'Age'].values)
age_diff_max_glob = np.max(age_diff_glob)
print(f"is_region_equal_glob: {is_region_equal_glob}")
print(f"is_sex_equal_glob: {is_sex_equal_glob}")
print(f"is_status_equal_glob: {is_status_equal_glob}")
print(f"age_diff_max_glob: {age_diff_max_glob}")

# --- Prepare immunology data -------------------------------------------------
# Flag immunology samples that also have DNAm data (same intersection as above).
df_immuno['is_dnam'] = False
df_immuno.loc[indexes_common_glob, 'is_dnam'] = True
# Explicit copy: the filtered frame is modified below and must not be a view/slice.
df_immuno = df_immuno.loc[(df_immuno["Status"] == "Control"), :].copy()
# Assign the result back instead of a chained in-place replace: the chained
# form is a no-op under pandas Copy-on-Write and would leave "Yakutiya" unfixed.
df_immuno["Region"] = df_immuno["Region"].replace({"Yakutiya": "Yakutia"})
feats_immuno = pd.read_excel(f"{path}/{platform}/{dataset}/data/immuno/immuno_markers_genes.xlsx").loc[:, 'gene'].values.tolist()
# Replace Age in DNAm (only for control samples, since df_immuno is already filtered):
index_age_sync = pheno.index.intersection(df_immuno.index)
pheno.loc[index_age_sync, 'Age'] = df_immuno.loc[index_age_sync, 'Age']
# Sanity check after the replacement: the printed maximum is expected to be 0.
age_diff = np.abs(pheno.loc[index_age_sync, 'Age'].values - df_immuno.loc[index_age_sync, 'Age'].values)
age_diff_max = np.max(age_diff)
print(f"age_diff_max: {age_diff_max}")
# Immuno selection
if select_immuno == "260_ml_draft":
    df_immuno = df_immuno.loc[(df_immuno["260ai"] == True) | (df_immuno["Region"] == "Yakutia"), :]

# --- Prepare DNAm data -------------------------------------------------------
betas = pd.read_pickle(f"{path}/{platform}/{dataset}/betas{dnam_suffix}.pkl")
feats_dnam = betas.columns.values
df_dnam = pd.merge(pheno, betas, left_index=True, right_index=True)
df_dnam = df_dnam.loc[(df_dnam["Status"] == "Control"), :]
# Default label is "Central"; Yakutia samples are relabelled by residence type.
df_dnam["Region and residence"] = "Central"
df_dnam.loc[(df_dnam["Region"] == "Yakutia") & (df_dnam["Residence"] == "City"), "Region and residence"] = "Yakutia (City)"
df_dnam.loc[(df_dnam["Region"] == "Yakutia") & (df_dnam["Residence"] == "Village"), "Region and residence"] = "Yakutia (Village)"
# DNAm selection
if select_dnam == "common_with_immuno":
    df_dnam = df_dnam.loc[df_dnam.index.intersection(df_immuno.index).values, :]
elif select_dnam == 'chronology_0':
    df_dnam = df_dnam.loc[df_dnam["Sample_Chronology"] == 0, :]

index_common = df_dnam.index.intersection(df_immuno.index).values

index_dnam_only = df_dnam.index.difference(df_immuno.index)
print(f"DNAm only indexes {len(index_dnam_only)}: {list(index_dnam_only)}")

# --- Sample counts per region ------------------------------------------------
n_samples_immuno_central = len(df_immuno.index[df_immuno["Region"] == "Central"].values)
n_samples_immuno_yakutia = len(df_immuno.index[df_immuno["Region"] == "Yakutia"].values)
n_samples_dnam_central = len(df_dnam.index[df_dnam["Region"] == "Central"].values)
n_samples_dnam_yakutia = len(df_dnam.index[df_dnam["Region"] == "Yakutia"].values)
print(f"n_samples_immuno_central: {n_samples_immuno_central}")
print(f"n_samples_immuno_yakutia: {n_samples_immuno_yakutia}")
print(f"n_samples_dnam_central: {n_samples_dnam_central}")
print(f"n_samples_dnam_yakutia: {n_samples_dnam_yakutia}")

### Delete non-Sakha samples

In [None]:
# IDs listed in samples_to_delete.xlsx are removed from both tables;
# IDs absent from a table are skipped silently (errors='ignore').
ids_non_sakha = pd.read_excel(f"{path_save}/samples_to_delete.xlsx", index_col=0).index.values
df_dnam = df_dnam.drop(labels=ids_non_sakha, errors='ignore')
df_immuno = df_immuno.drop(labels=ids_non_sakha, errors='ignore')

# Recount samples per data type and region after the removal.
n_samples_immuno_central = int((df_immuno["Region"] == "Central").sum())
n_samples_immuno_yakutia = int((df_immuno["Region"] == "Yakutia").sum())
n_samples_dnam_central = int((df_dnam["Region"] == "Central").sum())
n_samples_dnam_yakutia = int((df_dnam["Region"] == "Yakutia").sum())
for label, count in (
    ("n_samples_immuno_central", n_samples_immuno_central),
    ("n_samples_immuno_yakutia", n_samples_immuno_yakutia),
    ("n_samples_dnam_central", n_samples_dnam_central),
    ("n_samples_dnam_yakutia", n_samples_dnam_yakutia),
):
    print(f"{label}: {count}")

# Refresh the overlap between the DNAm and immunology sample sets.
index_common = df_dnam.index.intersection(df_immuno.index).values

index_dnam_only = df_dnam.index.difference(df_immuno.index)
print(f"DNAm only indexes {len(index_dnam_only)}: {list(index_dnam_only)}")

# Save pheno table

In [None]:
# Epigenetic-age columns to export: column name in df_dnam -> name in the saved table.
age_types = {
    f"DNAmAgeHannum{dnam_suffix}": "Hannum",
    f"DNAmAge{dnam_suffix}": "Horvath",
    f"DNAmPhenoAge{dnam_suffix}": "PhenoAge",
    f"DNAmGrimAge{dnam_suffix}": "GrimAge",
}
# PC clocks keep their own names.
age_types.update({
    pc_clock: pc_clock
    for pc_clock in ["PCHorvath1", "PCHorvath2", "PCHannum", "PCPhenoAge", "PCGrimAge"]
})
age_types['mPACE'] = 'DunedinPACE'

# Cell-count columns: suffixed column name in df_dnam -> plain cell-type name.
cells = {
    f"{cell_type}{dnam_suffix}": cell_type
    for cell_type in ["CD8T", "CD4T", "NK", "Bcell", "Mono", "Gran"]
}

base_columns = ["Sentrix_ID", "Sentrix_Position", "Age", "Sex", "Region", "Status"]
df_ph = df_dnam.loc[:, base_columns + list(age_types.keys()) + list(cells.keys())].copy()
df_ph = df_ph.rename(columns={**age_types, **cells})

# idat_ID is "<Sentrix_ID>_<Sentrix_Position>" and is the lookup key into the GSM table.
df_ph['idat_ID'] = df_ph['Sentrix_ID'].astype(str) + '_' + df_ph['Sentrix_Position'].astype(str)
df_gsm = pd.read_excel('D:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN/special/026_data_for_GEO/GSM.xlsx', index_col='idat_ID')
sample_ids = df_ph.index.values
idat_ids = df_ph.loc[sample_ids, 'idat_ID'].values
df_ph.loc[sample_ids, 'GSM'] = df_gsm.loc[idat_ids, 'GSM'].values

# Put the identifying columns first, followed by everything else in existing order.
first_columns = [
    'GSM',
    'Sentrix_ID',
    'Sentrix_Position',
    'idat_ID',
    'Age',
    'Sex',
    'Region',
    'Status',
]
remaining_columns = [col for col in df_ph.columns if col not in first_columns]
df_ph = df_ph[first_columns + remaining_columns]
df_ph.to_excel(f"{path_save}/samples.xlsx", index=True)

## DNAm and Immuno comparison

### Venn diagrams and Histograms

In [None]:
path_local = "dnam_and_immuno_comparison"
pathlib.Path(f"{path_save}/{path_local}").mkdir(parents=True, exist_ok=True)

# Venn diagrams: overlap of sample IDs between immunology and DNAm data, one per region.
for region in ["Central", "Yakutia"]:
    ids_immuno = set(df_immuno.index[df_immuno["Region"] == region].values)
    ids_dnam = set(df_dnam.index[df_dnam["Region"] == region].values)

    fig, ax = plt.subplots()
    venn = venn2(
        subsets=(ids_immuno, ids_dnam),
        set_labels=('Immuno', 'DNAm'),
        set_colors=('r', 'g'),
        alpha=0.5
    )
    venn2_circles(subsets=(ids_immuno, ids_dnam))
    # Labels of empty regions are None in matplotlib-venn, so skip them.
    for text in venn.set_labels:
        if text is not None:
            text.set_fontsize(16)
    for text in venn.subset_labels:
        if text is not None:
            text.set_fontsize(25)
    plt.savefig(f"{path_save}/{path_local}/venn_{region}.png", bbox_inches='tight', dpi=400)
    plt.savefig(f"{path_save}/{path_local}/venn_{region}.pdf", bbox_inches='tight')
    # Close (not just clear) the figure so it is released, as done for the histograms below.
    plt.close(fig)

# Age histograms of DNAm participants, split by whether immunology data is also available.
# Explicit copy: a new column is added to this selection below.
df_participants = df_dnam.loc[:, ["Age", "Sex", "Region", "Status"]].copy()
df_participants["Data"] = "DNAm only"
df_participants.loc[index_common, "Data"] = "DNAm and Immuno"

hist_bins = np.linspace(5, 115, 23)
hue_order = ['DNAm only', 'DNAm and Immuno']

# Region -> colors of the two stacked histogram categories.
region_palettes = {
    "Central": {
        "DNAm and Immuno": "forestgreen",
        "DNAm only": "lawngreen",
    },
    "Yakutia": {
        "DNAm and Immuno": "royalblue",
        "DNAm only": "deepskyblue",
    },
}
for region, palette in region_palettes.items():
    df_region = df_participants.loc[df_participants["Region"] == region, :]
    fig = plt.figure()
    sns.set_theme(style='whitegrid')
    sns.histplot(
        data=df_region,
        hue_order=hue_order,
        bins=hist_bins,
        x="Age",
        hue="Data",
        palette=palette,
        multiple="stack"
    )
    plt.savefig(f"{path_save}/{path_local}/hist_{region}.png", bbox_inches='tight', dpi=400)
    plt.savefig(f"{path_save}/{path_local}/hist_{region}.pdf", bbox_inches='tight')
    plt.close(fig)
    # Number of samples in this region that have both DNAm and immunology data.
    # The label uses the actual region (the Yakutia count used to be printed as "Central").
    print(f"{region} DNAm: {df_region.loc[df_region['Data'] == 'DNAm and Immuno', :].shape[0]}")

# DNAm

## Setup

In [None]:
# Boolean row masks over df_dnam that the problem definitions below share.
is_central = (df_dnam["Region"] == "Central")
is_yakutia = (df_dnam["Region"] == "Yakutia")
is_female = (df_dnam["Sex"] == "F")
is_male = (df_dnam["Sex"] == "M")
central_females = is_central & is_female
central_males = is_central & is_male
yakutia_females = is_yakutia & is_female
yakutia_males = is_yakutia & is_male

# Each problem is a two-group comparison described by:
#   target       - df_dnam column holding the group label
#   path         - output sub-directory under path_save
#   color        - fill color per group; color_line - outline color
#   filter       - boolean row mask per group
#   base_filter  - mask of the reference group; base_part - that group's label
#   all_filter   - mask of every sample taking part in the problem
#   dmp_* / dmr_* - thresholds that are not read in the cells visible here
#                   (presumably consumed by later DMP/DMR steps — confirm).
dict_problems_all = {
    "all_region": {
        "target": "Region",
        "path": "00_all_region",
        "color": {"Central": "gold", "Yakutia": "lightslategray"},
        "color_line": "black",
        "filter": {"Central": is_central, "Yakutia": is_yakutia},
        "base_filter": is_central,
        "base_part": "Central",
        "all_filter": (df_dnam["Region"].isin(["Central", "Yakutia"])),
        "dmp_filter_type": "top_n", # pvalfc
        "dmp_pval": 1e-20,
        "dmp_fc": 0.0,
        "dmp_top_n": 1000,
        'dmr_pval': 0.05,
    },
    "central_sex": {
        "target": "Sex",
        "path": "01_central_sex",
        "color": {"F": "hotpink", "M": "skyblue"},
        "color_line": "black",
        "filter": {"F": central_females, "M": central_males},
        "base_filter": central_females,
        "base_part": "F",
        "all_filter": is_central,
        "dmp_filter_type": "top_n", # pvalfc
        "dmp_pval": 0.05,
        "dmp_top_n": 1000,
        "dmp_fc": 0.0,
        'dmr_pval': 0.05,
    },
    "yakutia_sex": {
        "target": "Sex",
        "path": "02_yakutia_sex",
        "color": {"F": "firebrick", "M": "royalblue"},
        "color_line": 'black', #"lightsteelblue",
        "filter": {"F": yakutia_females, "M": yakutia_males},
        "base_filter": yakutia_females,
        "base_part": "F",
        "all_filter": is_yakutia,
        "dmp_filter_type": "top_n", # pvalfc
        "dmp_pval": 0.05,
        "dmp_fc": 0.0,
        "dmp_top_n": 1000,
        'dmr_pval': 0.05,
    },
    "females_region": {
        "target": "Region",
        "path": "03_females_region",
        "color": {"Central": "hotpink", "Yakutia": "firebrick"},
        "color_line": "black",
        "filter": {"Central": central_females, "Yakutia": yakutia_females},
        "base_filter": central_females,
        "base_part": "Central",
        "all_filter": is_female,
        "dmp_filter_type": "top_n", # pvalfc
        "dmp_pval": 0.05,
        "dmp_fc": 0.0,
        "dmp_top_n": 1000,
        'dmr_pval': 0.05,
    },
    "males_region": {
        "target": "Region",
        "path": "04_males_region",
        "color": {"Central": "skyblue", "Yakutia": "royalblue"},
        "color_line": "black",
        "filter": {"Central": central_males, "Yakutia": yakutia_males},
        "base_filter": central_males,
        "base_part": "Central",
        "all_filter": is_male,
        "dmp_filter_type": "top_n", # pvalfc
        "dmp_pval": 0.05,
        "dmp_fc": 0.0,
        "dmp_top_n": 1000,
        'dmr_pval': 0.05,
    },
}

# Keep only the selected problems, in the order they are defined above.
problems_selected = ['all_region', 'central_sex', 'yakutia_sex']
dict_problems = {
    name: settings
    for name, settings in dict_problems_all.items()
    if name in problems_selected
}

# One output directory per selected problem.
for settings in dict_problems.values():
    pathlib.Path(f"{path_save}/{settings['path']}").mkdir(parents=True, exist_ok=True)

## Create data for R

In [None]:
# Export, per problem, the beta matrix (CpGs as rows) and the phenotype table as pickles.
for problem, dict_problem in dict_problems.items():
    problem_dir = f"{path_save}/{dict_problem['path']}"
    for subdir in ("data_for_R", "data_from_R"):
        pathlib.Path(f"{problem_dir}/{subdir}").mkdir(parents=True, exist_ok=True)

    rows = dict_problem["all_filter"]

    # Transposed so that rows are CpGs and columns are samples.
    df_betas_problem = df_dnam.loc[rows, feats_dnam].copy().T
    df_betas_problem.index.name = "CpG"
    df_betas_problem.to_pickle(f"{problem_dir}/data_for_R/betas_R_{problem}.pkl")

    df_pheno_problem = df_dnam.loc[rows, ["Age", "Sex", "Region"]]
    df_pheno_problem.to_pickle(f"{problem_dir}/data_for_R/pheno_R_{problem}.pkl")

## 1. Samples histogram

In [None]:
# Age histogram per problem, colored by group; legend entries carry the group sizes.
hist_bins = np.linspace(5, 115, 23)

for problem, dict_problem in dict_problems.items():
    pathlib.Path(f"{path_save}/{dict_problem['path']}/01_samples_hist").mkdir(parents=True, exist_ok=True)
    df_fig = df_dnam.loc[dict_problem["all_filter"], ['Age', 'Sex', 'Region']].copy()
    # The figure data is saved with the original group labels, before relabelling.
    df_fig.to_excel(f"{path_save}/{dict_problem['path']}/01_samples_hist/fig.xlsx")
    # Group label -> "label: n". Count rows from the boolean mask directly
    # instead of materializing the full (very wide) filtered frame.
    dict_keys = {key: f"{key}: {int(mask.sum())}" for key, mask in dict_problem['filter'].items()}
    colors = {dict_keys[key]: val for key, val in dict_problem['color'].items()}
    # Assign the result back: a chained in-place replace on a column selection
    # is a no-op under pandas Copy-on-Write, which would leave the hue values
    # out of sync with the palette keys.
    df_fig[dict_problem['target']] = df_fig[dict_problem['target']].replace(dict_keys)
    fig = plt.figure()
    sns.set_theme(style='whitegrid')
    hist = sns.histplot(
        data=df_fig,
        bins=hist_bins,
        edgecolor='k',
        linewidth=1,
        x="Age",
        hue=dict_problem['target'],
        palette=colors
    )
    hist.set(xlim=(0, 120))
    plt.savefig(f"{path_save}/{dict_problem['path']}/01_samples_hist/hist.png", bbox_inches='tight', dpi=400)
    plt.savefig(f"{path_save}/{dict_problem['path']}/01_samples_hist/hist.pdf", bbox_inches='tight')
    plt.close(fig)

## 2. Cells

In [None]:
# Cell-count columns: suffixed column name in df_dnam -> plain cell-type name.
cells = {
    f"{cell_type}{dnam_suffix}": cell_type
    for cell_type in ["CD8T", "CD4T", "NK", "Bcell", "Mono", "Gran"]
}
dist_num_bins = 15

for problem, dict_problem in dict_problems.items():
    out_dir = f"{path_save}/{dict_problem['path']}/02_cells"
    pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)
    groups = dict_problem["filter"]

    # Figure data: cell columns (renamed to plain names) plus basic phenotype.
    df_fig = df_dnam.loc[dict_problem["all_filter"], list(cells.keys()) + ["Sex", "Region", "Age"]].rename(columns=cells)
    df_fig.to_excel(f"{out_dir}/fig.xlsx")

    # Per-cell descriptive statistics for every group and a two-sided Mann-Whitney U test.
    df_stat = pd.DataFrame()
    vals_by_cell = {}
    for cell in tqdm(cells):
        vals = {group: df_dnam.loc[mask, cell].values for group, mask in groups.items()}
        vals_by_cell[cell] = vals
        for group, group_vals in vals.items():
            df_stat.at[cell, f"mean_{group}"] = np.mean(group_vals)
            df_stat.at[cell, f"median_{group}"] = np.median(group_vals)
            q75, q25 = np.percentile(group_vals, [75, 25])
            df_stat.at[cell, f"q75_{group}"] = q75
            df_stat.at[cell, f"q25_{group}"] = q25
            df_stat.at[cell, f"iqr_{group}"] = df_stat.at[cell, f"q75_{group}"] - df_stat.at[cell, f"q25_{group}"]
        # The groups are unpacked as positional samples of the test.
        _, pval = mannwhitneyu(*vals.values(), alternative='two-sided')
        df_stat.at[cell, "pval"] = pval

    # Benjamini-Hochberg correction across the cell types of this problem.
    _, df_stat["pval_fdr_bh"], _, _ = multipletests(df_stat["pval"], 0.05, method='fdr_bh')
    df_stat.to_excel(f"{out_dir}/stat.xlsx", index=True)

    # One violin figure per cell type; the title shows the FDR-adjusted p-value.
    for cell in tqdm(cells):
        vals = vals_by_cell[cell]

        fig = go.Figure()
        for group_id, group in enumerate(groups):
            # Raw points go to the right of the first violin and to the left of the others.
            pointpos = 1.5 if group_id == 0 else -1.5
            group_color = dict_problem["color"][group]
            outline_color = dict_problem["color_line"]
            fig.add_trace(
                go.Violin(
                    y=vals[group],
                    name=group,
                    box_visible=True,
                    meanline_visible=True,
                    showlegend=False,
                    line_color=outline_color,
                    fillcolor=group_color,
                    marker=dict(color=group_color, line=dict(color=outline_color, width=0.3), opacity=0.8),
                    points='all',
                    pointpos=pointpos,
                    bandwidth=np.ptp(vals[group]) / dist_num_bins,
                    opacity=0.8
                )
            )
        add_layout(fig, "", f"{cells[cell]}", f"p-value: {df_stat.at[cell, 'pval_fdr_bh']:0.2e}")
        fig.update_layout(title_xref='paper')
        fig.update_layout(legend_font_size=20)
        fig.update_xaxes(autorange=False, range=[-0.3, len(groups) - 0.7])
        fig.update_layout(legend={'itemsizing': 'constant'})
        fig.update_layout(
            violingap=0.35,
            violingroupgap=0.35,
            width=800,
            height=600,
            margin=go.layout.Margin(l=120, r=50, b=70, t=50, pad=0)
        )
        fig.update_layout(legend_y=1.01)
        save_figure(fig, f"{out_dir}/{cell}")

## 3. Ages

In [None]:
# Epigenetic age acceleration analysis.
# For every problem and every clock: fit "clock ~ Age" on the reference group,
# take the residuals as age acceleration, compare the groups and plot the results.

# Clock column in df_dnam -> display name used in figures and tables.
age_types = {
    f"DNAmAgeHannum{dnam_suffix}": "Hannum",
    f"DNAmAge{dnam_suffix}": "Horvath",
    f"DNAmPhenoAge{dnam_suffix}": "PhenoAge",
    f"DNAmGrimAge{dnam_suffix}": "GrimAge",
}
# PC clocks keep their own names.
for x in ["PCHorvath1", "PCHorvath2", "PCHannum", "PCPhenoAge", "PCGrimAge"]:
    age_types[x] = x

# Violin kernel bandwidth is set to (value range / dist_num_bins).
dist_num_bins = 15

for problem, dict_problem in dict_problems.items():
    pathlib.Path(f"{path_save}/{dict_problem['path']}/03_ages").mkdir(parents=True, exist_ok=True)

    # One row per "<clock>Acc"; per-group statistic columns are added on the fly below.
    df_stat = pd.DataFrame(index=[f"{x}Acc" for x in age_types], columns=["pval", "pval_fdr_bh"])
    for age_type in tqdm(age_types):
        # Linear model is fitted on the reference (base) group only ...
        formula = f"{age_type} ~ Age"
        model = smf.ols(formula=formula, data=df_dnam.loc[dict_problem["base_filter"], :]).fit()
        # ... and then applied to every sample. These columns live in the shared
        # df_dnam and are overwritten for each problem, so they always reflect
        # the problem processed last.
        df_dnam[f"{age_type}_linear_pred"] = model.predict(df_dnam)
        # Age acceleration = observed clock value minus the value predicted from Age.
        df_dnam[f"{age_type}Acc"] = df_dnam[age_type] - df_dnam[f"{age_type}_linear_pred"]

        vals = {}
        for group in dict_problem["filter"]:
            vals[group] = df_dnam.loc[dict_problem["filter"][group], f"{age_type}Acc"].values
            df_stat.at[f"{age_type}Acc", f"mean_{group}"] = np.mean(vals[group])
            df_stat.at[f"{age_type}Acc", f"median_{group}"] = np.median(vals[group])
            df_stat.at[f"{age_type}Acc", f"q75_{group}"], df_stat.at[f"{age_type}Acc", f"q25_{group}"] = np.percentile(vals[group], [75 , 25])
            df_stat.at[f"{age_type}Acc", f"iqr_{group}"] = df_stat.at[f"{age_type}Acc", f"q75_{group}"] - df_stat.at[f"{age_type}Acc", f"q25_{group}"]

        # The groups are unpacked as the positional samples of the test
        # (every problem defined in this notebook has exactly two groups).
        _, pval = mannwhitneyu(*vals.values(), alternative='two-sided')
        df_stat.at[f"{age_type}Acc", "pval"] = pval

    # Benjamini-Hochberg correction across all clocks of this problem.
    _, df_stat["pval_fdr_bh"], _, _ = multipletests(df_stat["pval"], 0.05, method='fdr_bh')
    df_stat.to_excel(f"{path_save}/{dict_problem['path']}/03_ages/stat.xlsx", index=True)

    # Figure data: raw clock columns are renamed to display names; the "...Acc"
    # columns keep their original (suffixed) names because they are not keys of age_types.
    df_fig = df_dnam.loc[dict_problem["all_filter"], ["Sex", "Region", "Age"] + list(age_types.keys()) + [f"{x}Acc" for x in age_types]]
    df_fig.rename(columns=age_types, inplace=True)
    df_fig.to_excel(f"{path_save}/{dict_problem['path']}/03_ages/fig.xlsx")

    for age_type in tqdm(age_types):

        vals = {}
        for group in dict_problem["filter"]:
            vals[group] = df_dnam.loc[dict_problem["filter"][group], f"{age_type}Acc"].values

        # Violin plot of age acceleration per group.
        fig = go.Figure()
        for group_id, group in enumerate(dict_problem["filter"]):
            # Raw points go to the right of the first violin and to the left of the others.
            if group_id == 0:
                pointpos = 1.5
            else:
                pointpos = -1.5

            fig.add_trace(
                go.Violin(
                    y=vals[group],
                    name=group,
                    box_visible=True,
                    meanline_visible=True,
                    showlegend=False,
                    line_color=dict_problem["color_line"],
                    fillcolor=dict_problem["color"][group],
                    marker=dict(color=dict_problem["color"][group], line=dict(color=dict_problem["color_line"],width=0.3), opacity=0.8),
                    points='all',
                    pointpos=pointpos,
                    bandwidth=np.ptp(vals[group]) / dist_num_bins,
                    opacity=0.8,
                )
            )
        # The title shows the FDR-adjusted p-value (pval_fdr_bh), not the raw one.
        add_layout(fig, "", f"{age_types[age_type]}Acc", f"p-value: {df_stat.at[f'{age_type}Acc', 'pval_fdr_bh']:0.2e}")
        fig.update_layout(title_xref='paper')
        fig.update_layout(legend_font_size=20)
        fig.update_xaxes(autorange=False, range=[-0.3, len(dict_problem["filter"]) - 0.7])
        fig.update_layout(legend= {'itemsizing': 'constant'})
        fig.update_layout(
            violingap=0.35,
            violingroupgap=0.35,
            width=500,
            height=600,
            margin=go.layout.Margin(
                l=100,
                r=50,
                b=50,
                t=50,
                pad=0,
            )
        )
        fig.update_layout(legend_y=1.01)
        save_figure(fig, f"{path_save}/{dict_problem['path']}/03_ages/violin_{age_type}Acc")

        # Common axis limits with a 5% padding. They are computed over all rows
        # of df_dnam, not only over the samples of the current problem.
        min_val = df_dnam[["Age", age_type]].min().min()
        max_val = df_dnam[["Age", age_type]].max().max()
        shift_val = max_val - min_val
        min_val -= 0.05 * shift_val
        max_val += 0.05 * shift_val

        # Regular plot =======================================================
        fig = go.Figure()
        # Identity line (clock value == chronological age).
        fig.add_trace(
            go.Scatter(
                x=[min_val, max_val],
                y=[min_val, max_val],
                showlegend=False,
                name="",
                mode="lines",
                marker_color="black",
                marker=dict(
                    size=8,
                    opacity=0.75,
                    line=dict(
                        color="black",
                        width=0.5
                    )
                )
            )
        )
        # Linear fit of the reference group, drawn in that group's color.
        fig.add_trace(
            go.Scatter(
                x=df_dnam.loc[dict_problem["base_filter"], f"Age"].values,
                y=df_dnam.loc[dict_problem["base_filter"], f"{age_type}_linear_pred"].values,
                showlegend=False,
                name="",
                mode="lines",
                line=dict(width=5),
                marker_color=dict_problem["color"][dict_problem["base_part"]],
                marker=dict(
                    size=8,
                    opacity=0.75,
                    line=dict(
                        color="black",
                        width=0.5
                    )
                )
            )
        )
        # Samples of every group as markers.
        # NOTE(review): only line_color is set on these markers-mode traces; the
        # marker fill presumably inherits it — confirm in the rendered figure.
        for group in dict_problem["filter"]:
            fig.add_trace(
                go.Scatter(
                    x=df_dnam.loc[dict_problem["filter"][group], f"Age"].values,
                    y=df_dnam.loc[dict_problem["filter"][group], f"{age_type}"].values,
                    showlegend=True,
                    name=group,
                    mode="markers",
                    line_color=dict_problem["color"][group],
                    marker=dict(
                        size=8,
                        opacity=0.75,
                        line=dict(
                            color=dict_problem["color_line"],
                            width=0.5
                        )
                    )
                )
            )
        add_layout(fig, f"Age", f"{age_types[age_type]}", f"")
        fig.update_layout(legend_font_size=20)
        fig.update_layout(legend= {'itemsizing': 'constant'})
        # Fixed, identical ranges on both axes.
        fig.update_xaxes(autorange=False)
        fig.update_yaxes(autorange=False)
        fig.update_layout(title_xref='paper')
        fig.update_layout(xaxis_range=[min_val, max_val])
        fig.update_layout(yaxis_range=[min_val, max_val])
        fig.update_layout(
            width=850,
            height=800,
            margin=go.layout.Margin(
                l=100,
                r=50,
                b=100,
                t=50,
                pad=0,
            )
        )
        save_figure(fig, f"{path_save}/{dict_problem['path']}/03_ages/scatter_{age_type}")

### 3.1. Ages without regression (raw values from Horvath's calculator)

In [None]:
age_types = {
    f"DNAmAgeHannum{dnam_suffix}": "Hannum",
    f"DNAmAge{dnam_suffix}": "Horvath",
    f"DNAmPhenoAge{dnam_suffix}": "PhenoAge",
    f"DNAmGrimAge{dnam_suffix}": "GrimAge",
}
for x in ["PCHorvath1", "PCHorvath2", "PCHannum", "PCPhenoAge", "PCGrimAge"]:
    age_types[x] = x

dist_num_bins = 15

for problem, dict_problem in dict_problems.items():
    pathlib.Path(f"{path_save}/{dict_problem['path']}/03_ages_raw").mkdir(parents=True, exist_ok=True)

    df_stat = pd.DataFrame(index=[f"{x}AccRaw" for x in age_types], columns=["pval", "pval_fdr_bh"])
    for age_type in tqdm(age_types):
        df_dnam[f"{age_type}AccRaw"] = df_dnam[age_type] - df_dnam[f"Age"]

        vals = {}
        for group in dict_problem["filter"]:
            vals[group] = df_dnam.loc[dict_problem["filter"][group], f"{age_type}AccRaw"].values
            df_stat.at[f"{age_type}AccRaw", f"mean_{group}"] = np.mean(vals[group])
            df_stat.at[f"{age_type}AccRaw", f"median_{group}"] = np.median(vals[group])
            df_stat.at[f"{age_type}AccRaw", f"q75_{group}"], df_stat.at[f"{age_type}AccRaw", f"q25_{group}"] = np.percentile(vals[group], [75 , 25])
            df_stat.at[f"{age_type}AccRaw", f"iqr_{group}"] = df_stat.at[f"{age_type}AccRaw", f"q75_{group}"] - df_stat.at[f"{age_type}AccRaw", f"q25_{group}"]

        _, pval = mannwhitneyu(*vals.values(), alternative='two-sided')
        df_stat.at[f"{age_type}AccRaw", "pval"] = pval

    _, df_stat["pval_fdr_bh"], _, _ = multipletests(df_stat["pval"], 0.05, method='fdr_bh')
    df_stat.to_excel(f"{path_save}/{dict_problem['path']}/03_ages_raw/stat.xlsx", index=True)

    df_fig = df_dnam.loc[dict_problem["all_filter"], ["Sex", "Region", "Age"] + list(age_types.keys()) + [f"{x}AccRaw" for x in age_types]]
    df_fig.rename(columns=age_types, inplace=True)
    df_fig.to_excel(f"{path_save}/{dict_problem['path']}/03_ages_raw/fig.xlsx")

    for age_type in tqdm(age_types):

        vals = {}
        for group in dict_problem["filter"]:
            vals[group] = df_dnam.loc[dict_problem["filter"][group], f"{age_type}AccRaw"].values

        fig = go.Figure()
        for group_id, group in enumerate(dict_problem["filter"]):
            if group_id == 0:
                pointpos = 1.5
            else:
                pointpos = -1.5

            fig.add_trace(
                go.Violin(
                    y=vals[group],
                    name=group,
                    box_visible=True,
                    meanline_visible=True,
                    showlegend=False,
                    line_color=dict_problem["color_line"],
                    fillcolor=dict_problem["color"][group],
                    marker=dict(color=dict_problem["color"][group], line=dict(color=dict_problem["color_line"],width=0.3), opacity=0.8),
                    points='all',
                    pointpos=pointpos,
                    bandwidth=np.ptp(vals[group]) / dist_num_bins,
                    opacity=0.8,
                )
            )
        add_layout(fig, "", f"{age_types[age_type]}AccRaw", f"p-value: {df_stat.at[f'{age_type}AccRaw', 'pval_fdr_bh']:0.2e}")
        fig.update_layout(title_xref='paper')
        fig.update_layout(legend_font_size=20)
        fig.update_xaxes(autorange=False, range=[-0.3, len(dict_problem["filter"]) - 0.7])
        fig.update_layout(legend= {'itemsizing': 'constant'})
        fig.update_layout(
            violingap=0.35,
            violingroupgap=0.35,
            width=500,
            height=600,
            margin=go.layout.Margin(
                l=100,
                r=50,
                b=50,
                t=50,
                pad=0,
            )
        )
        fig.update_layout(legend_y=1.01)
        save_figure(fig, f"{path_save}/{dict_problem['path']}/03_ages_raw/violin_{age_type}Acc")

        min_val = df_dnam[["Age", age_type]].min().min()
        max_val = df_dnam[["Age", age_type]].max().max()
        shift_val = max_val - min_val
        min_val -= 0.05 * shift_val
        max_val += 0.05 * shift_val

        # Regular plot =======================================================
        fig = go.Figure()
        fig.add_trace(
            go.Scatter(
                x=[min_val, max_val],
                y=[min_val, max_val],
                showlegend=False,
                name="",
                mode="lines",
                marker_color="black",
                marker=dict(
                    size=8,
                    opacity=0.75,
                    line=dict(
                        color="black",
                        width=0.5
                    )
                )
            )
        )
        for group in dict_problem["filter"]:
            fig.add_trace(
                go.Scatter(
                    x=df_dnam.loc[dict_problem["filter"][group], f"Age"].values,
                    y=df_dnam.loc[dict_problem["filter"][group], f"{age_type}"].values,
                    showlegend=True,
                    name=group,
                    mode="markers",
                    line_color=dict_problem["color"][group],
                    marker=dict(
                        size=8,
                        opacity=0.75,
                        line=dict(
                            color=dict_problem["color_line"],
                            width=0.5
                        )
                    )
                )
            )
        add_layout(fig, f"Age", f"{age_types[age_type]}", f"")
        fig.update_layout(legend_font_size=20)
        fig.update_layout(legend= {'itemsizing': 'constant'})
        fig.update_xaxes(autorange=False)
        fig.update_yaxes(autorange=False)
        fig.update_layout(title_xref='paper')
        fig.update_layout(xaxis_range=[min_val, max_val])
        fig.update_layout(yaxis_range=[min_val, max_val])
        fig.update_layout(
            width=850,
            height=800,
            margin=go.layout.Margin(
                l=100,
                r=50,
                b=100,
                t=50,
                pad=0,
            )
        )
        save_figure(fig, f"{path_save}/{dict_problem['path']}/03_ages_raw/scatter_{age_type}")

## 4. mPACE

In [None]:
# Number of bins the data range is divided into to obtain the violin KDE bandwidth.
dist_num_bins = 15

for problem, dict_problem in dict_problems.items():
    # Every output of this section goes to the per-problem "04_mpace" folder.
    path_mpace = f"{path_save}/{dict_problem['path']}/04_mpace"
    pathlib.Path(path_mpace).mkdir(parents=True, exist_ok=True)

    # Data behind the figure; the mPACE column is exported under the name 'DunedinPACE'.
    df_fig = df_dnam.loc[dict_problem["all_filter"], ["Sex", "Region", "Age", "mPACE"]].rename(
        columns={'mPACE': 'DunedinPACE'}
    )
    df_fig.to_excel(f"{path_mpace}/fig.xlsx")

    # Per-group descriptive statistics (mean, median, quartiles, IQR).
    df_stat = pd.DataFrame()
    vals = {}
    for group in dict_problem["filter"]:
        group_vals = df_dnam.loc[dict_problem["filter"][group], "mPACE"].values
        vals[group] = group_vals
        q75, q25 = np.percentile(group_vals, [75, 25])
        df_stat.at["mPACE", f"mean_{group}"] = np.mean(group_vals)
        df_stat.at["mPACE", f"median_{group}"] = np.median(group_vals)
        df_stat.at["mPACE", f"q75_{group}"] = q75
        df_stat.at["mPACE", f"q25_{group}"] = q25
        df_stat.at["mPACE", f"iqr_{group}"] = q75 - q25
    # Two-sided Mann-Whitney U test; unpacking the groups positionally means
    # exactly two groups are expected in dict_problem["filter"].
    _, pval = mannwhitneyu(*vals.values(), alternative='two-sided')
    df_stat.at["mPACE", "pval"] = pval
    df_stat.to_excel(f"{path_mpace}/stat.xlsx", index=True)

    # One violin per group with all raw points drawn next to it.
    fig = go.Figure()
    for group_id, group in enumerate(dict_problem["filter"]):
        # Points are placed on the right of the first violin and on the left of the others.
        pointpos = 1.5 if group_id == 0 else -1.5
        group_color = dict_problem["color"][group]
        border_color = dict_problem["color_line"]
        violin = go.Violin(
            y=vals[group],
            name=group,
            box_visible=True,
            meanline_visible=True,
            showlegend=False,
            line_color=border_color,
            fillcolor=group_color,
            marker=dict(color=group_color, line=dict(color=border_color, width=0.3), opacity=0.8),
            points='all',
            pointpos=pointpos,
            bandwidth=np.ptp(vals[group]) / dist_num_bins,
            opacity=0.8
        )
        fig.add_trace(violin)
    add_layout(fig, "", "DunedinPACE", f"p-value: {df_stat.at['mPACE', 'pval']:0.2e}")
    fig.update_xaxes(autorange=False, range=[-0.3, len(dict_problem["filter"]) - 0.7])
    fig.update_layout(
        title_xref='paper',
        legend_font_size=20,
        legend_itemsizing='constant',
        legend_y=1.01,
        violingap=0.35,
        violingroupgap=0.35,
        width=500,
        height=600,
        margin=dict(l=100, r=50, b=50, t=50, pad=0),
    )
    save_figure(fig, f"{path_mpace}/violin")

## 5. Age correlations

In [None]:
def plot_unity(x, y, **kwargs):
    """Draw a dashed black reference line on the current matplotlib axes.

    The end points of the line are chosen per axis from the data range:
    when the maximum of the data does not exceed 2 the axis is treated as
    mPACE-like and spans 0.5..2, otherwise it is treated as age-like and
    spans 10..110.

    Parameters
    ----------
    x, y : array-like
        Data of the horizontal and vertical axes; only their maxima are used.
    **kwargs
        Accepted so the function can be used as a grid-mapping callback; ignored.
    """
    def _axis_span(values):
        # Two end points of the reference line for one axis.
        low, high = (0.5, 2) if np.max(values) <= 2 else (10, 110)
        return np.linspace(low, high, 2)

    plt.gca().plot(_axis_span(x), _axis_span(y), color='k', marker=None, linestyle='--', linewidth=1.0)


def plot_regression(x, y, **kwargs):
    """Overlay an ordinary-least-squares line ``y ~ x`` for the base group only.

    Nothing is drawn unless the index of ``x`` equals ``base_indexes``. For the
    base group the fitted line is evaluated at the data range of ``x`` extended
    by 10% on both sides and drawn twice on the current axes: a wide stroke in
    ``bkg_color`` underneath a narrower stroke in ``base_color``.

    Parameters
    ----------
    x, y : pandas.Series
        Data of the horizontal and vertical axes.
    **kwargs
        Must contain ``base_indexes`` (index identifying the base group),
        ``base_color`` (line colour) and ``bkg_color`` (outline colour).
    """
    # All three options are read up front so a missing key fails for every group.
    base_indexes, base_color, bkg_color = (
        kwargs[key] for key in ('base_indexes', 'base_color', 'bkg_color')
    )
    if not base_indexes.equals(x.index):
        return

    x_vals = x.values
    margin = 0.1 * np.ptp(x_vals)
    fitted = smf.ols(formula="y ~ x", data=pd.DataFrame({"x": x_vals, "y": y.values})).fit()
    df_line = pd.DataFrame({"x": [np.min(x_vals) - margin, np.max(x_vals) + margin]})
    df_line["y"] = fitted.predict(df_line)

    ax = plt.gca()
    # Wide background stroke first, then the coloured line on top of it.
    for stroke_color, stroke_width in ((bkg_color, 4.0), (base_color, 2.0)):
        ax.plot(df_line['x'].values, df_line['y'].values, color=stroke_color, marker=None, linestyle='-', linewidth=stroke_width)


def annotate_corr(x, y, **kwargs):
    """Write the Pearson correlation and the MAE between ``x`` and ``y`` on the current axes.

    The base group (index of ``x`` equal to ``base_indexes``) is annotated in
    ``colors[0]`` in the upper part of the panel; any other group is annotated
    in ``colors[1]`` in the lower part. Each label gets a ``bkg_color`` outline.

    Parameters
    ----------
    x, y : pandas.Series
        The two variables being compared.
    **kwargs
        Must contain ``base_indexes`` (index identifying the base group),
        ``colors`` (sequence with the base colour first, the other colour second)
        and ``bkg_color`` (outline colour of the text).
    """
    base_indexes = kwargs['base_indexes']
    colors = kwargs['colors']
    bkg_color = kwargs['bkg_color']
    corr, _ = stats.pearsonr(x, y)
    mae = mean_absolute_error(x, y)
    ax = plt.gca()

    # Colour and vertical positions (axes fraction) depend on the group being drawn.
    if base_indexes.equals(x.index):
        color, corr_y, mae_y = colors[0], 0.72, 0.55
    else:
        color, corr_y, mae_y = colors[1], 0.32, 0.15

    annotations = (
        (r'$\rho$ = ' + f"{corr:0.2f}", corr_y),
        (f"MAE = {mae:0.2f}", mae_y),
    )
    for label, y_pos in annotations:
        text = ax.annotate(label, xy=(0.5, y_pos), size=23, xycoords=ax.transAxes, ha='center', color=color, alpha=0.75)
        text.set_path_effects([path_effects.Stroke(linewidth=2, foreground=bkg_color), path_effects.Normal()])


# Column name in df_dnam -> label shown on the scatter matrix.
age_types = {
    f"DNAmAge{dnam_suffix}": "Horvath",
    f"DNAmAgeHannum{dnam_suffix}": "Hannum",
    f"DNAmPhenoAge{dnam_suffix}": "PhenoAge",
    f"DNAmGrimAge{dnam_suffix}": "GrimAge",
}
# The PC clocks keep their column names as labels.
age_types.update({pc_clock: pc_clock for pc_clock in ["PCHorvath1", "PCHorvath2", "PCHannum", "PCPhenoAge", "PCGrimAge"]})

# The seaborn theme does not depend on the problem, so it is set once.
sns.set_theme(style="whitegrid", font_scale=1.8)

for problem, dict_problem in dict_problems.items():
    path_ages_corr = f"{path_save}/{dict_problem['path']}/05_ages_corr"
    pathlib.Path(path_ages_corr).mkdir(parents=True, exist_ok=True)

    # Data behind the figure, with clock columns renamed to their display labels.
    df_fig = df_dnam.loc[dict_problem["all_filter"], ["Age", "Sex", "Region"] + list(age_types.keys())]
    df_fig.rename(columns=age_types, inplace=True)
    df_fig.to_excel(f"{path_ages_corr}/fig.xlsx")
    df_fig_feats = ["Age"] + list(age_types.values())

    pair_grid = sns.PairGrid(
        data=df_fig,
        vars=df_fig_feats,
        hue=dict_problem["target"],
        hue_order=list(dict_problem["color"].keys()),
        palette=dict_problem["color"]
    )
    # Lower triangle: unity line, scatter and the base-group regression line.
    pair_grid.map_lower(plot_unity)
    pair_grid.map_lower(sns.scatterplot, s=35, alpha=0.75, linewidth=0)
    # Diagonal: histograms on a common 5..115 grid with 22 equal bins.
    pair_grid.map_diag(sns.histplot, bins=np.linspace(5, 115, 23))
    pair_grid.map_lower(
        plot_regression,
        base_indexes=df_dnam.index[dict_problem["base_filter"]],
        base_color=dict_problem["color"][dict_problem["base_part"]],
        bkg_color=dict_problem["color_line"]
    )
    # Upper triangle: textual correlation / MAE annotations per group.
    pair_grid.map_upper(
        annotate_corr,
        base_indexes=df_dnam.index[dict_problem["base_filter"]],
        colors=list(dict_problem["color"].values()),
        bkg_color=dict_problem["color_line"]
    )
    n_rows, n_cols = pair_grid.axes.shape
    for row_id in range(n_rows):
        for col_id in range(n_cols):
            panel = pair_grid.axes[row_id, col_id]
            # Draw the full frame around every panel.
            panel.spines[['right', 'top']].set_visible(True)
            # Diagonal and upper-triangle panels are shown without grid lines.
            if row_id <= col_id:
                panel.grid(False)

    plt.savefig(f"{path_ages_corr}/scatter_mtx.png", bbox_inches='tight', dpi=200)
    plt.savefig(f"{path_ages_corr}/scatter_mtx.pdf", bbox_inches='tight')
    # Close (not just clear) the PairGrid figure: every problem creates a new
    # large figure and plt.clf() would keep all of them alive in memory.
    plt.close()

## 6. DMPs

In [None]:
# Number of top CpGs labelled on the Manhattan/volcano plots, number of example
# CpGs plotted as violins, and number of bins used for the violin bandwidth.
n_highlights = 0
n_examples = 10
dist_num_bins = 15

# Dimensionality-reduction method -> names of its two output columns.
dim_red_methods_dict = {
    method: [f"{axis_prefix} 1", f"{axis_prefix} 2"]
    for method, axis_prefix in (
        ('PCA', 'PC'),
        ('SingularValueDecomposition', 'SVD'),
        ('MultiDimensionalScaling', 'Multi Dimensional Scale'),
        ('T-SNE', 'tSNE'),
    )
}

# Annotation variables used in the region-enrichment analysis. The three
# dictionaries below share these keys.
reg_enr_vars = ('CHR', 'RELATION_TO_UCSC_CPG_ISLAND', 'UCSC_REFGENE_GROUP')

# Categories of every annotation variable, in plotting order.
reg_enr_orders = dict(zip(reg_enr_vars, (
    list(map(str, range(1, 24))),
    ['S_Shelf', 'S_Shore', 'Island', 'N_Shore', 'N_Shelf', 'OpenSea'],
    ['TSS1500', 'TSS200', "5'UTR", '1stExon', 'Body', "3'UTR"],
)))
# Manifest column holding every annotation variable.
reg_enr_col_names = dict(zip(reg_enr_vars, ("CHR", "Relation_to_Island", "UCSC_RefGene_Group")))
# Figure size (inches) of the enrichment bar plot per variable.
reg_enr_fig_sizes = dict(zip(reg_enr_vars, ((17, 10), (5, 10), (5, 10))))
# Qualitative colour lists per variable.
reg_enr_colors = dict(zip(reg_enr_vars, (
    px.colors.qualitative.Dark24,
    px.colors.qualitative.Light24[17:23],
    px.colors.qualitative.Light24[11:17],
)))

# Fixed seed for the stochastic embeddings below (randomized SVD, MDS, t-SNE),
# so that a fresh "Restart & Run All" reproduces the same coordinates.
dim_red_random_state = 42

for problem, dict_problem in dict_problems.items():
    # The output folder name encodes the rule used to select DMPs for this problem.
    dmr_filtering_type = dict_problem['dmp_filter_type']
    if dmr_filtering_type == 'pvalfc':
        dmr_path_suffix = f"pval({dict_problem['dmp_pval']:0.2e})_fc({dict_problem['dmp_fc']:0.2e})"
    else:
        dmr_path_suffix = f"top_n({dict_problem['dmp_top_n']})"
    path_dmp = f"{path_save}/{dict_problem['path']}/06_DMP/{dmr_path_suffix}"
    for subdir in ("dim_red", "examples", "reg_enr", "GSEA"):
        pathlib.Path(f"{path_dmp}/{subdir}").mkdir(parents=True, exist_ok=True)

    # ------------------------------------------------------------------
    # Genome-wide overview: Manhattan and volcano plots of all tested CpGs
    # ------------------------------------------------------------------
    df_dmps = pd.read_csv(f"{path_save}/{dict_problem['path']}/data_from_R/DMP.csv", index_col=0)
    df_dmps["CpG"] = df_dmps.index.values
    df_dmps.sort_values(["adj.P.Val"], ascending=[True], inplace=True)
    # Label of the form "<CpG> (<gene>)" used for highlighted markers.
    df_dmps['print'] = df_dmps.apply(lambda row: f"{row['CpG']} ({row['gene']})", axis=1)
    top_to_hightlight = df_dmps["print"].values[0:n_highlights]
    df_dmps['log_pval'] = -np.log10(df_dmps["adj.P.Val"])
    sns.set_theme(style='whitegrid')
    df_dmps.sort_values(["MAPINFO"], ascending=[True], inplace=True)
    mhat(
        df=df_dmps,
        chr='CHR',
        pv='adj.P.Val',
        path=f"{path_dmp}",
        valpha=1,
        markernames=tuple(top_to_hightlight),
        markeridcol='print',
        gstyle=2,
        dim=(8, 4),
        axtickfontsize=8
    )
    sns.set_theme(style='whitegrid')
    volcano(
        df=df_dmps,
        lfc='logFC',
        pv='adj.P.Val',
        pv_thr=(1, 1),
        lfc_thr=(0.0, 0.0),
        path=f"{path_dmp}",
        genenames=tuple(top_to_hightlight),
        geneid='print',
        gstyle=2,
        sign_line=False,
        color=(list(dict_problem["color"].values())[1], "lavender", list(dict_problem["color"].values())[0]),
        dim=(4, 4)
    )

    # ------------------------------------------------------------------
    # DMP selection: adjusted p-value + |logFC| thresholds, or top-N by raw p-value
    # ------------------------------------------------------------------
    df_dmps.sort_values(["P.Value"], ascending=[True], inplace=True)
    if dmr_filtering_type == 'pvalfc':
        df_dmps_selected = df_dmps.loc[(df_dmps["adj.P.Val"] < dict_problem["dmp_pval"]) & ((df_dmps["logFC"] < -dict_problem["dmp_fc"]) | (df_dmps["logFC"] > dict_problem["dmp_fc"])), :]
    else:
        df_dmps_selected = df_dmps.head(dict_problem['dmp_top_n'])
    df_dmps_selected.to_excel(f"{path_dmp}/cpgs.xlsx")
    print(f"Number of CpGs: {df_dmps_selected.shape[0]}")

    # Unique genes annotated to the selected CpGs (manifest entries are ';'-separated).
    dmps_genes = set()
    for cpg in df_dmps_selected.index.values:
        genes_raw = manifest.at[cpg, 'Gene']
        if isinstance(genes_raw, str):
            dmps_genes.update(genes_raw.split(';'))
    # Drop the placeholders that do not name a gene.
    dmps_genes -= {'non-genic', ' '}
    # Sorted so that the exported list does not depend on set iteration order.
    dmps_genes = sorted(dmps_genes)
    df_dmps_genes = pd.DataFrame({'gene': dmps_genes})
    df_dmps_genes.to_excel(f"{path_dmp}/genes.xlsx", index=False)
    print(f"Number of genes: {df_dmps_genes.shape[0]}")

    # ------------------------------------------------------------------
    # Dimensionality reduction on the selected CpGs
    # ------------------------------------------------------------------
    feats_dim_red = df_dmps_selected["CpG"].values
    df_dnam_dim_red = df_dnam.loc[dict_problem["all_filter"], list(feats_dim_red) + ["Age", "Sex", "Region"]].copy()
    data_dim_red = df_dnam_dim_red.loc[:, feats_dim_red].values

    pca = PCA(n_components=2, whiten=False, random_state=dim_red_random_state)
    data_pca = pca.fit_transform(data_dim_red)
    df_dnam_dim_red['PC 1'] = data_pca[:, 0]
    df_dnam_dim_red['PC 2'] = data_pca[:, 1]
    tsvd = TruncatedSVD(n_components=2, algorithm='randomized', n_iter=5, random_state=dim_red_random_state)
    tsvd.fit(data_dim_red)
    data_svd = tsvd.transform(data_dim_red)
    df_dnam_dim_red['SVD 1'] = data_svd[:, 0]
    df_dnam_dim_red['SVD 2'] = data_svd[:, 1]
    mds = MDS(n_components=2, metric=True, random_state=dim_red_random_state)
    data_mds = mds.fit_transform(data_dim_red)
    df_dnam_dim_red['Multi Dimensional Scale 1'] = data_mds[:, 0]
    df_dnam_dim_red['Multi Dimensional Scale 2'] = data_mds[:, 1]
    tsne = TSNE(n_components=2, learning_rate=300, perplexity=30, early_exaggeration=12, init='random', random_state=dim_red_random_state)
    data_tsne = tsne.fit_transform(data_dim_red)
    df_dnam_dim_red['tSNE 1'] = data_tsne[:, 0]
    df_dnam_dim_red['tSNE 2'] = data_tsne[:, 1]
    df_dnam_dim_red.loc[:, list(chain(*dim_red_methods_dict.values()))].to_excel(f"{path_dmp}/dim_red/table.xlsx", index=True)

    # One 2D scatter per method, coloured by group.
    for method in dim_red_methods_dict:
        x_col = dim_red_methods_dict[method][0]
        y_col = dim_red_methods_dict[method][1]
        fig = go.Figure()
        for group in dict_problem["filter"]:
            fig.add_trace(
                go.Scatter(
                    x=df_dnam_dim_red.loc[dict_problem["filter"][group], x_col].values,
                    y=df_dnam_dim_red.loc[dict_problem["filter"][group], y_col].values,
                    showlegend=True,
                    name=group,
                    mode="markers",
                    line_color=dict_problem["color"][group],
                    marker=dict(
                        size=8,
                        opacity=0.8,
                        color=dict_problem["color"][group],
                        symbol="circle",
                        line=dict(
                            color="black",
                            width=1
                        )
                    )
                )
            )
        add_layout(fig, x_col, y_col, f"")
        fig.update_layout(legend_font_size=20)
        fig.update_layout(legend= {'itemsizing': 'constant'})
        fig.update_layout(
            width=850,
            height=800,
            margin=go.layout.Margin(
                l=100,
                r=50,
                b=100,
                t=50,
                pad=0,
            )
        )
        save_figure(fig, f"{path_dmp}/dim_red/{method}")

    # ------------------------------------------------------------------
    # Violin plots of the most significant selected CpGs
    # ------------------------------------------------------------------
    df_dmps_examples = df_dmps_selected.sort_values(['adj.P.Val'], ascending=[True]).head(n_examples)
    for cpg_id, (cpg, row) in enumerate(df_dmps_examples.iterrows()):
        pval = row['adj.P.Val']
        log_fc = row['logFC']
        gene = manifest.at[cpg, 'Gene']

        fig = go.Figure()
        for group_id, group in enumerate(dict_problem["filter"]):
            # Points go to the right of the first violin and to the left of the others.
            if group_id == 0:
                pointpos = 1.5
            else:
                pointpos = -1.5
            vals = df_dnam.loc[dict_problem["filter"][group], cpg].values
            fig.add_trace(
                go.Violin(
                    y=vals,
                    name=group,
                    box_visible=True,
                    meanline_visible=True,
                    showlegend=False,
                    line_color='black',
                    fillcolor=dict_problem["color"][group],
                    marker = dict(color=dict_problem["color"][group], line=dict(color='black',width=0.3), opacity=0.8),
                    points='all',
                    pointpos=pointpos,
                    bandwidth = np.ptp(vals) / dist_num_bins,
                    opacity=0.8
                )
            )
        # The p-value shown in the title is the adjusted one ('adj.P.Val').
        add_layout(fig, "", "Methylation", f"{cpg} ({gene})<br>p-value: {pval:0.2e}<br>log(Fold Change): {log_fc:0.2e}")
        fig.update_layout(title_xref='paper', title={'y': 0.95})
        fig.update_layout(legend_font_size=20)
        fig.update_xaxes(autorange=False, range=[-0.3, len(dict_problem["filter"]) - 0.7])
        fig.update_layout(legend={'itemsizing': 'constant'})
        fig.update_layout(
            violingap=0.35,
            violingroupgap=0.35,
            width=850,
            height=615,
            margin=go.layout.Margin(
                l=120,
                r=50,
                b=90,
                t=120,
                pad=0,
            )
        )
        save_figure(fig, f"{path_dmp}/examples/{cpg_id}_{cpg}")
    # Data behind the example figures, restricted to the samples of this problem
    # (same row filter as the other exported fig.xlsx tables).
    df_fig = df_dnam.loc[dict_problem["all_filter"], ["Age", "Sex", "Region"] + list(df_dmps_examples.index.values)]
    df_fig.to_excel(f"{path_dmp}/examples/fig.xlsx")

    # ------------------------------------------------------------------
    # Region enrichment: Fisher's exact test of selected CpGs vs. all other tested CpGs
    # ------------------------------------------------------------------
    df_dmps_fisher_target = manifest.loc[df_dmps_selected.index.values, :]
    df_dmps_fisher_global = manifest.loc[df_dmps.index.values, :]
    df_dmps_fisher_padding = df_dmps_fisher_global.loc[~df_dmps_fisher_global.index.isin(df_dmps_selected.index.values), :]
    for var in reg_enr_orders:
        columns=["11", "12", "21", "22", "sum", "pval", "odds_ratio"]
        df_var = pd.DataFrame(index=reg_enr_orders[var], columns=columns, data=np.zeros((len(reg_enr_orders[var]), len(columns))))
        df_var.index.name = reg_enr_col_names[var].replace("_", " ")
        for var_val in reg_enr_orders[var]:
            # 2x2 table: rows = selected / not selected CpGs, columns = in / not in the category.
            contingency_table = pd.DataFrame(index=["specific", "non-specific"], columns=["in_val", "not_in_val"])
            contingency_table.at["specific", "in_val"] = df_dmps_fisher_target.loc[df_dmps_fisher_target[reg_enr_col_names[var]] == var_val, :].shape[0]
            contingency_table.at["specific", "not_in_val"] = df_dmps_fisher_target.loc[df_dmps_fisher_target[reg_enr_col_names[var]] != var_val, :].shape[0]
            contingency_table.at["non-specific", "in_val"] = df_dmps_fisher_padding.loc[df_dmps_fisher_padding[reg_enr_col_names[var]] == var_val, :].shape[0]
            contingency_table.at["non-specific", "not_in_val"] = df_dmps_fisher_padding.loc[df_dmps_fisher_padding[reg_enr_col_names[var]] != var_val, :].shape[0]
            df_var.at[var_val, "11"] = contingency_table.at["specific", "in_val"]
            df_var.at[var_val, "12"] = contingency_table.at["specific", "not_in_val"]
            df_var.at[var_val, "21"] = contingency_table.at["non-specific", "in_val"]
            df_var.at[var_val, "22"] = contingency_table.at["non-specific", "not_in_val"]
            df_var.at[var_val, "sum"] = contingency_table.values.sum()
            odds_ratio, pval = stats.fisher_exact(contingency_table.to_numpy(), alternative='two-sided')
            # An undefined odds ratio is reported as "no enrichment".
            if np.isnan(odds_ratio):
                odds_ratio = 1.0
            df_var.at[var_val, "odds_ratio"], df_var.at[var_val, "pval"] = odds_ratio, pval
        # Benjamini-Hochberg correction across the categories of this variable.
        _, df_var['pval_fdr_bh'], _, _ = multipletests(df_var['pval'].values, 0.05, method='fdr_bh')
        df_var[r'$ \log_{10}(\mathrm{Odds\ ratio})$'] = np.log10(df_var.loc[:, 'odds_ratio'].values)
        df_var[r'$ -\log_{10}(\mathrm{p-value})$'] = -np.log10(df_var.loc[:, 'pval_fdr_bh'].values)

        plt.figure(figsize=reg_enr_fig_sizes[var])
        plt.xticks(rotation=90)
        sns.set_theme(style='whitegrid', font_scale=2)
        cmap = plt.get_cmap("viridis").copy()
        cmap.set_under('black')

        # The scatter is drawn only to obtain a mappable for the colour bar;
        # it is cleared immediately and replaced by the bar plot below.
        plot = plt.scatter(
            df_var.index,
            df_var.loc[:, r'$ -\log_{10}(\mathrm{p-value})$'].values,
            c=df_var.loc[:, r'$ -\log_{10}(\mathrm{p-value})$'].values,
            cmap=cmap,
            vmin=-np.log10(0.05)
        )
        plt.clf()
        cbar = plt.colorbar(plot, extend='min')

        # Bars are black unless the category is significant after FDR correction,
        # in which case the colour is taken from the colour bar scale.
        df_var['bar_color'] = 'black'
        for df_var_index in df_var.index.values:
            if df_var.at[df_var_index, "pval_fdr_bh"] < 0.05:
                value_tmp = df_var.at[df_var_index, r'$ -\log_{10}(\mathrm{p-value})$']
                value_color = (value_tmp-cbar.vmin)/(cbar.vmax-cbar.vmin)
                df_var.at[df_var_index, 'bar_color'] = matplotlib.colors.rgb2hex(cbar.cmap(value_color))
        df_var.to_excel(f"{path_dmp}/reg_enr/fisher_{var}.xlsx")

        plt.xticks(rotation=90)
        cbar.set_label(r"$-\log_{10}(\mathrm{p-value})$", horizontalalignment='center')
        ax = sns.barplot(
            data=df_var,
            x=df_var.index,
            y=r'$ \log_{10}(\mathrm{Odds\ ratio})$',
            palette=df_var.loc[:, 'bar_color'],
            dodge=False,
            edgecolor='black',
        )
        plt.savefig(f"{path_dmp}/reg_enr/fisher_{var}.png", bbox_inches='tight')
        plt.savefig(f"{path_dmp}/reg_enr/fisher_{var}.pdf", bbox_inches='tight')
        plt.close()

## Region-specific genes intersection

In [None]:
# Compare the genes of the region-specific DMPs (top-1000 selection) with the
# reference list stored in Cardona2014.xlsx.
path_local = f"{path_save}/{dict_problems['all_region']['path']}/06_DMP/top_n(1000)"
genes_our = pd.read_excel(f"{path_local}/genes.xlsx", index_col='gene').index.values
genes_ref = pd.read_excel(f"D:/YandexDisk/Work/pydnameth/datasets/lists/genes/region_specific/Cardona2014.xlsx", index_col='gene').index.values

pathlib.Path(f"{path_local}/genes_intersection").mkdir(parents=True, exist_ok=True)

# Venn diagram of the two gene sets.
fig, ax = plt.subplots()
venn = venn2(
    subsets=(set(genes_our), set(genes_ref)),
    set_labels = ('DMPs', 'Cardona2014'),
    set_colors=('r', 'g'),
    alpha = 0.5
)
venn2_circles(subsets=(set(genes_our), set(genes_ref)))
# Labels of empty regions are None and must be skipped.
for text in venn.set_labels:
    if text is not None:
        text.set_fontsize(16)
for text in venn.subset_labels:
    if text is not None:
        text.set_fontsize(25)
plt.savefig(f"{path_local}/genes_intersection/venn.png", bbox_inches='tight', dpi=400)
plt.savefig(f"{path_local}/genes_intersection/venn.pdf", bbox_inches='tight')
# Close the figure instead of only clearing it, so it is neither kept in
# memory nor rendered as an empty canvas.
plt.close(fig)

# One spreadsheet per section returned by get_sections for the two sets.
sections = get_sections([set(genes_our), set(genes_ref)])
for sec in sections:
    df_sec = pd.DataFrame(index=list(sections[sec]))
    df_sec.to_excel(f"{path_local}/genes_intersection/{sec}.xlsx", index_label='gene')

# Table of our genes flagged by presence in the reference list.
df_genes = pd.DataFrame(index=genes_our)
df_genes["Cardona et. al. 2014"] = "No"
# A boolean mask is used because pandas >= 2.0 rejects a set as a .loc indexer.
df_genes.loc[df_genes.index.isin(genes_ref), "Cardona et. al. 2014"] = "Yes"
df_genes.to_excel(f"{path_local}/genes_intersection/genes.xlsx", index_label='gene')

## Region-specific CpGs processing

In [None]:
# Build a publication-style table of the region-specific CpGs (top-1000 selection).
path_local = f"{path_save}/{dict_problems['all_region']['path']}/06_DMP/top_n(1000)"

# Column in cpgs.xlsx -> column name used in the processed table.
dict_col = {
    "logFC": "logFC",
    "adj.P.Val": "Adj. p-value",
    "Central_AVG": "Central avg",
    "Yakutia_AVG": "Yakutia avg",
    "deltaBeta": "Delta"
}
df_cpgs = pd.read_excel(f"{path_local}/cpgs.xlsx", index_col=0).rename(columns=dict_col)

# Annotation columns taken from the manifest for the same CpGs.
cpg_ids = df_cpgs.index.values
for col_out, col_manifest in (
    ("Gene", 'Gene'),
    ("Relation to Island", 'Relation_to_Island'),
    ("UCSC RefGene Group", 'UCSC_RefGene_Group'),
):
    df_cpgs.loc[:, col_out] = manifest.loc[cpg_ids, col_manifest]

# NOTE(review): "Adj. p-value" is renamed above but is not part of the exported
# columns below - confirm that leaving it out is intended.
cols_export = ["CHR", "MAPINFO", "Gene", "Relation to Island", "UCSC RefGene Group", "logFC", "Central avg", "Yakutia avg", "Delta"]
df_cpgs = df_cpgs.loc[:, cols_export]
df_cpgs.to_excel(f"{path_local}/cpgs_processed.xlsx", index_label='CpG')

## Sex-specific CpGs intersection

In [None]:
# Compare sex-specific CpGs found in the Central and Yakutia cohorts with each
# other and with three published CpG lists.
path_local = f"{path_save}/sex_specificity_in_regions"

# Central cohort: all tested CpGs, flagged by membership in the top-1000 selection.
df_cpgs_ctl = pd.read_csv(f"{path_save}/{dict_problems['central_sex']['path']}/data_from_R/DMP.csv", index_col=0)
cpgs_ctl = pd.read_excel(f"{path_save}/{dict_problems['central_sex']['path']}/06_DMP/top_n(1000)/cpgs.xlsx", index_col=0).index.values
df_cpgs_ctl["Significant in Central"] = "No"
df_cpgs_ctl.loc[cpgs_ctl, "Significant in Central"] = "Yes"
dict_col_ctl = {
    "logFC": "logFC in Central",
    "adj.P.Val": "Adj. p-value in Central",
    "F_AVG": "F avg in Central",
    "M_AVG": "M avg in Central",
    "deltaBeta": "Delta in Central"
}
df_cpgs_ctl.rename(columns=dict_col_ctl, inplace=True)

# Yakutia cohort: same processing.
df_cpgs_ykt = pd.read_csv(f"{path_save}/{dict_problems['yakutia_sex']['path']}/data_from_R/DMP.csv", index_col=0)
cpgs_ykt = pd.read_excel(f"{path_save}/{dict_problems['yakutia_sex']['path']}/06_DMP/top_n(1000)/cpgs.xlsx", index_col=0).index.values
df_cpgs_ykt["Significant in Yakutia"] = "No"
df_cpgs_ykt.loc[cpgs_ykt, "Significant in Yakutia"] = "Yes"
dict_col_ykt = {
    "logFC": "logFC in Yakutia",
    "adj.P.Val": "Adj. p-value in Yakutia",
    "F_AVG": "F avg in Yakutia",
    "M_AVG": "M avg in Yakutia",
    "deltaBeta": "Delta in Yakutia"
}
df_cpgs_ykt.rename(columns=dict_col_ykt, inplace=True)

# Published sex-specific CpG lists.
cpgs_grant2022 = pd.read_excel(f"D:/YandexDisk/Work/pydnameth/datasets/lists/cpgs/sex_specific/Grant2022.xlsx", index_col='CpG').index.values
cpgs_inoshita2015 = pd.read_excel(f"D:/YandexDisk/Work/pydnameth/datasets/lists/cpgs/sex_specific/Inoshita2015.xlsx", index_col='CpG').index.values
cpgs_mccarthy2014 = pd.read_excel(f"D:/YandexDisk/Work/pydnameth/datasets/lists/cpgs/sex_specific/McCarthy2014.xlsx", index_col='CpG').index.values

# Common table, indexed by the CpGs of the Central results. An explicit copy is
# taken so the column assignments below do not write into a slice of df_cpgs_ctl.
df_cpgs_cmn = df_cpgs_ctl.loc[:, ["CHR", "MAPINFO"]].copy()
cpgs_cmn = df_cpgs_cmn.index.values

# Annotation columns from the manifest.
for col_out, col_manifest in (
    ("Gene", 'Gene'),
    ("Relation to Island", 'Relation_to_Island'),
    ("UCSC RefGene Group", 'UCSC_RefGene_Group'),
):
    df_cpgs_cmn.loc[cpgs_cmn, col_out] = manifest.loc[cpgs_cmn, col_manifest]

# Per-region statistics: all Central columns first, then all Yakutia columns.
for region, df_cpgs_region in (("Central", df_cpgs_ctl), ("Yakutia", df_cpgs_ykt)):
    for col_prefix in ("logFC", "Adj. p-value", "F avg", "M avg", "Delta", "Significant"):
        col = f"{col_prefix} in {region}"
        df_cpgs_cmn.loc[cpgs_cmn, col] = df_cpgs_region.loc[cpgs_cmn, col]

# Presence flags for the published lists. A boolean mask is used because
# pandas >= 2.0 rejects a set as a .loc indexer.
for ref_name, ref_cpgs in (
    ("Grant et. al. 2022", cpgs_grant2022),
    ("Inoshita et. al. 2015", cpgs_inoshita2015),
    ("McCarthy et. al. 2014", cpgs_mccarthy2014),
):
    df_cpgs_cmn[ref_name] = "No"
    df_cpgs_cmn.loc[df_cpgs_cmn.index.isin(ref_cpgs), ref_name] = "Yes"

pathlib.Path(f"{path_local}/cpgs").mkdir(parents=True, exist_ok=True)
conds_cols = [
    "Significant in Central",
    "Significant in Yakutia",
    "Grant et. al. 2022",
    "Inoshita et. al. 2015",
    "McCarthy et. al. 2014"
]
df_intxn_order = pd.DataFrame(index=conds_cols)
df_intxn_order.to_excel(f"{path_local}/cpgs/conds_cols.xlsx", index_label='Set')
# Keep only CpGs that are significant in at least one of the two cohorts.
conditions = [df_cpgs_cmn[metric] == "Yes"  for metric in ["Significant in Central", "Significant in Yakutia"]]
df_cpgs_cmn = df_cpgs_cmn[disjunction(conditions)]
df_cpgs_cmn.to_excel(f"{path_local}/cpgs/table.xlsx", index_label='CpG')

# One spreadsheet per section returned by get_sections; the entries are CpG
# identifiers, so the index column is labelled accordingly.
sections = get_sections([set(cpgs_ctl), set(cpgs_ykt), set(cpgs_grant2022), set(cpgs_inoshita2015), set(cpgs_mccarthy2014)])
for sec in sections:
    df_sec = pd.DataFrame(index=list(sections[sec]))
    df_sec.to_excel(f"{path_local}/cpgs/{sec}.xlsx", index_label='CpG')

# UpSet plot of the five CpG lists.
dict_upset_lists = {
    "McCarthy et. al. 2014": cpgs_mccarthy2014,
    "Inoshita et. al. 2015": cpgs_inoshita2015,
    "Grant et. al. 2022": cpgs_grant2022,
    'Sex-specific in Yakutia': cpgs_ykt,
    'Sex-specific in Central': cpgs_ctl,
}
upset_all = list(set().union(*dict_upset_lists.values()))
df_upset = pd.DataFrame(index=upset_all)
for k, v in dict_upset_lists.items():
    df_upset[k] = df_upset.index.isin(v)
# upsetplot expects the membership indicators as a boolean MultiIndex.
df_upset = df_upset.set_index(list(dict_upset_lists.keys()))
fig_upset = plt.figure(figsize=(32, 12))
upset_fig = upsetplot.UpSet(
    df_upset,
    sort_categories_by='input',
    subset_size='count',
    show_counts=True,
    min_degree=0,
    element_size=None,
    totals_plot_elements=3,
    include_empty_subsets=False
)
upset_fig.style_subsets(present=["Sex-specific in Central", "Sex-specific in Yakutia"], edgecolor="red", linewidth=2)
upset_fig.style_subsets(present=["Sex-specific in Central", "Grant et. al. 2022"], absent=["Sex-specific in Yakutia"], facecolor="blue")
upset_fig.style_subsets(present=["Sex-specific in Yakutia", "Grant et. al. 2022"], absent=["Sex-specific in Central"], facecolor="green")
upset_fig.style_subsets(present=["Sex-specific in Yakutia", "Sex-specific in Central", "Grant et. al. 2022"], facecolor="yellow")
upset_fig.plot(fig_upset)
plt.savefig(f"{path_local}/cpgs/upset.png", bbox_inches='tight')
plt.savefig(f"{path_local}/cpgs/upset.pdf", bbox_inches='tight')
plt.close(fig_upset)

## Sex-specific GSEA intersection

In [None]:
# Compare GO terms enriched for sex-specific methylation in the Central and
# Yakutia cohorts (terms with padj < 0.05 are called significant).
path_local = f"{path_save}/sex_specificity_in_regions"

df_gsea_ctl = pd.read_csv(f"{path_save}/{dict_problems['central_sex']['path']}/data_from_R/GSEA(methylglm)_GO.csv", index_col="ID")
df_gsea_ctl["Significant in Central"] = "No"
df_gsea_ctl.loc[df_gsea_ctl['padj'] < 0.05, "Significant in Central"] = "Yes"
df_gsea_ctl.rename(columns={"padj": "Adj. p-value in Central"}, inplace=True)
terms_ctl = df_gsea_ctl.index[df_gsea_ctl["Significant in Central"] == "Yes"].values

df_gsea_ykt = pd.read_csv(f"{path_save}/{dict_problems['yakutia_sex']['path']}/data_from_R/GSEA(methylglm)_GO.csv", index_col="ID")
df_gsea_ykt["Significant in Yakutia"] = "No"
df_gsea_ykt.loc[df_gsea_ykt['padj'] < 0.05, "Significant in Yakutia"] = "Yes"
df_gsea_ykt.rename(columns={"padj": "Adj. p-value in Yakutia"}, inplace=True)
terms_ykt = df_gsea_ykt.index[df_gsea_ykt["Significant in Yakutia"] == "Yes"].values

pathlib.Path(f"{path_local}/gsea").mkdir(parents=True, exist_ok=True)
# Attach the Yakutia results to the Central table (rows are matched by term ID)
# and keep the terms significant in at least one cohort.
df_gsea_ctl.loc[df_gsea_ykt.index, "Adj. p-value in Yakutia"] = df_gsea_ykt.loc[df_gsea_ykt.index, "Adj. p-value in Yakutia"]
df_gsea_ctl.loc[df_gsea_ykt.index, "Significant in Yakutia"] = df_gsea_ykt.loc[df_gsea_ykt.index, "Significant in Yakutia"]
df_gsea = df_gsea_ctl.loc[(df_gsea_ctl["Significant in Yakutia"] == "Yes") | (df_gsea_ctl["Significant in Central"] == "Yes"), :]
df_gsea.to_excel(f"{path_local}/gsea/table.xlsx", index_label='ID')

# Venn diagram of the significant terms of the two cohorts.
fig, ax = plt.subplots()
venn = venn2(
    subsets=(set(terms_ctl), set(terms_ykt)),
    set_labels = ('Central', 'Yakutia'),
    set_colors=('r', 'g'),
    alpha = 0.5
)
venn2_circles(subsets=(set(terms_ctl), set(terms_ykt)))
# Labels of empty regions are None and must be skipped.
for text in venn.set_labels:
    if text is not None:
        text.set_fontsize(16)
for text in venn.subset_labels:
    if text is not None:
        text.set_fontsize(25)
plt.savefig(f"{path_local}/gsea/venn.png", bbox_inches='tight', dpi=400)
plt.savefig(f"{path_local}/gsea/venn.pdf", bbox_inches='tight')
# Close the figure instead of only clearing it, so it is neither kept in
# memory nor rendered as an empty canvas.
plt.close(fig)

# One spreadsheet per section returned by get_sections; the entries are term
# IDs, so the index column uses the same label as table.xlsx.
sections = get_sections([set(terms_ctl), set(terms_ykt)])
for sec in sections:
    df_sec = pd.DataFrame(index=list(sections[sec]))
    df_sec.to_excel(f"{path_local}/gsea/{sec}.xlsx", index_label='ID')

## #. DMRs

In [None]:
# Only DMRs whose row number in DMR.csv is below n_examples get an example figure;
# dist_num_bins divides the value range to obtain the violin bandwidth.
n_examples = 10
dist_num_bins = 15

# Track colour for every 'Relation_to_Island' value (Set1 palette slots).
colors_island = {
    relation: px.colors.qualitative.Set1[slot]
    for relation, slot in (
        ('Island', 0),
        ('N_Shore', 1),
        ('S_Shore', 2),
        ('N_Shelf', 3),
        ('S_Shelf', 4),
        ('OpenSea', 8),
    )
}

# Track colour for every 'UCSC_RefGene_Group' value (Vivid palette slots).
# The insertion order matters: it is the order in which the tracks are stacked.
colors_refgene = {
    group: px.colors.qualitative.Vivid[slot]
    for group, slot in (
        ('TSS1500', 0),
        ('TSS200', 1),
        ('5\'UTR', 2),
        ('1stExon', 3),
        ('Body', 4),
        ('3\'UTR', 5),
        ('non-genic', 8),
    )
}

def _split_annotation(raw):
    """Split a ';'-separated manifest annotation into its unique values, in order.

    Non-string input (e.g. NaN for a missing annotation) yields an empty list,
    so callers never have to guard the value themselves.
    """
    if not isinstance(raw, str):
        return []
    return list(dict.fromkeys(raw.split(';')))


def _add_half_violin(fig, cpg_id, cpg, vals, color, side, pointpos, num_bins):
    """Add one half (side='negative' or 'positive') of a split violin for a CpG."""
    fig.add_trace(
        go.Violin(
            x=[cpg_id] * len(vals),
            y=vals,
            name=cpg,
            box_visible=True,
            meanline_visible=True,
            showlegend=False,
            line_color='black',
            fillcolor=color,
            marker=dict(color=color, line=dict(color='black', width=0.3), opacity=0.8),
            points='all',
            bandwidth=np.ptp(vals) / num_bins,
            opacity=0.8,
            legendgroup=cpg,
            scalegroup=cpg,
            side=side,
            scalemode="width",
            pointpos=pointpos
        )
    )


def _add_track_segment(fig, cpg_id, y, group_title, name, color, seen):
    """Draw a horizontal annotation bar under one CpG at height y.

    `seen` counts how often each name was drawn; only the first occurrence of a
    name gets a legend entry.
    """
    first_time = name not in seen
    seen[name] = seen.get(name, 0) + 1
    fig.add_trace(
        go.Scatter(
            x=[cpg_id - 0.505, cpg_id + 0.505],
            y=[y, y],
            showlegend=first_time,
            legendgroup=group_title,
            legendgrouptitle=dict(text=group_title, font=dict(size=25)),
            name=name,
            mode='lines',
            line=dict(color=color, width=10)
        )
    )


def _add_mean_line(fig, y_means, color, width):
    """Connect the per-CpG group means with a spline (line plus markers)."""
    fig.add_trace(
        go.Scatter(
            x=list(range(len(y_means))),
            y=y_means,
            showlegend=False,
            mode='lines+markers',
            line=dict(color=color, width=width, shape='spline'),
            marker=dict(color=color, line=dict(color='black', width=0.3), opacity=0.8),
        )
    )


# For every problem: map each DMR from DMR.csv onto the manifest CpGs it spans,
# draw the first n_examples DMRs as split violins with annotation tracks, and
# export the DMR -> CpG table.
df_manifest = manifest.loc[feats_dnam, :]
for problem, dict_problem in dict_problems.items():

    path_dmp = f"{path_save}/{dict_problem['path']}/06_DMR/pval({dict_problem['dmr_pval']:0.2e})"
    pathlib.Path(f"{path_dmp}/examples").mkdir(parents=True, exist_ok=True)

    df_dmrs = pd.read_csv(f"{path_save}/{dict_problem['path']}/data_from_R/DMR.csv", index_col=0)
    dict_dmrs_cpgs = {}
    for dmr_id, (dmr, dmr_row) in tqdm(enumerate(df_dmrs.iterrows()), total=df_dmrs.shape[0]):
        # `chrom` instead of `chr`, which would shadow the builtin for the whole notebook.
        chrom = dmr_row['seqnames']
        start_pos = dmr_row['start']
        end_pos = dmr_row['end']
        num_cpgs_in_dmr = dmr_row['L']
        pval = dmr_row['p.value']

        in_dmr = (df_manifest['chr'] == chrom) & (df_manifest['Position'] >= start_pos) & (df_manifest['Position'] <= end_pos)
        dmr_cpgs = df_manifest.loc[in_dmr, :].copy().sort_values(["Position"], ascending=[True])
        if num_cpgs_in_dmr != dmr_cpgs.shape[0]:
            raise ValueError(f"Wrong number of CpGs in {dmr} ({dmr_id}): expected={num_cpgs_in_dmr}, in manifest={dmr_cpgs.shape[0]}")
        dmr_cpgs['dmr'] = dmr
        dmr_cpgs['p_value'] = pval
        dmr_cpgs['num_cpgs_in_dmr'] = num_cpgs_in_dmr
        dict_dmrs_cpgs[dmr] = dmr_cpgs

        island_presence = {}
        refs_presence = {}
        genes_presence = {}
        if dmr_id >= n_examples:
            # Only the first n_examples DMRs get a figure.
            continue

        fig = go.Figure()

        y_neg_means = []
        y_pos_means = []
        # First group -> left ("negative") violin half, second group -> right half.
        group_colors = list(dict_problem["color"].values())
        color_neg, color_pos = group_colors[0], group_colors[1]
        group_filters = list(dict_problem["filter"].values())
        filter_neg, filter_pos = group_filters[0], group_filters[1]

        # Value range over all CpGs of the DMR; the annotation tracks are placed
        # below zero in units of this range.
        ptp = np.ptp(df_dnam.loc[dict_problem["all_filter"], dmr_cpgs.index.values].values.flatten())

        # Parse the annotations once per CpG, so that the track layout and the
        # drawing loop below work on exactly the same values.
        refs_by_cpg = {}
        genes_by_cpg = {}
        dmr_refs = set()
        dmr_genes = set()
        for cpg in dmr_cpgs.index:
            refs_by_cpg[cpg] = _split_annotation(df_manifest.at[cpg, 'UCSC_RefGene_Group'])
            genes_by_cpg[cpg] = _split_annotation(df_manifest.at[cpg, 'Gene'])
            dmr_refs.update(refs_by_cpg[cpg])
            dmr_genes.update(genes_by_cpg[cpg])

        # Known RefGene groups keep the order of colors_refgene; groups that have
        # no configured colour are stacked after them and drawn in grey instead
        # of raising KeyError.
        color_unknown_ref = 'gray'
        refs_pos = {}
        for ref in list(colors_refgene) + sorted(dmr_refs.difference(colors_refgene)):
            if ref in dmr_refs:
                refs_pos[ref] = len(refs_pos)

        # Sorted genes and palette cycling make track order and colours
        # reproducible (set order and random.choice are not) and keep colours
        # distinct for up to len(palette) genes.
        gene_palette = px.colors.qualitative.Alphabet
        genes_pos = {}
        colors_gene = {}
        for gene_id, gene in enumerate(sorted(dmr_genes)):
            genes_pos[gene] = gene_id
            if gene == 'non-genic':
                colors_gene[gene] = 'black'
            else:
                colors_gene[gene] = gene_palette[gene_id % len(gene_palette)]

        for cpg_id, cpg in enumerate(dmr_cpgs.index):

            vals_neg = df_dnam.loc[filter_neg, cpg].values
            y_neg_means.append(np.mean(vals_neg))
            _add_half_violin(fig, cpg_id, cpg, vals_neg, color_neg, 'negative', -1.5, dist_num_bins)

            vals_pos = df_dnam.loc[filter_pos, cpg].values
            y_pos_means.append(np.mean(vals_pos))
            _add_half_violin(fig, cpg_id, cpg, vals_pos, color_pos, 'positive', 1.5, dist_num_bins)

            island_pos = df_manifest.at[cpg, 'Relation_to_Island']
            _add_track_segment(
                fig, cpg_id, -0.15 * ptp,
                f"Relation to Island    ", island_pos, colors_island[island_pos], island_presence
            )

            for gene_ref in refs_by_cpg[cpg]:
                _add_track_segment(
                    fig, cpg_id, (-0.25 - 0.03 * refs_pos[gene_ref]) * ptp,
                    f"UCSC RefGene Group    ", gene_ref,
                    colors_refgene.get(gene_ref, color_unknown_ref), refs_presence
                )

            for gene in genes_by_cpg[cpg]:
                _add_track_segment(
                    fig, cpg_id, (-0.32 - 0.03 * len(refs_pos) - 0.03 * genes_pos[gene]) * ptp,
                    f"Gene    ", gene, colors_gene[gene], genes_presence
                )

        # Group means: a slightly wider black line underneath acts as an outline
        # for the coloured line drawn on top of it.
        _add_mean_line(fig, y_neg_means, 'black', 6)
        _add_mean_line(fig, y_pos_means, 'black', 6)
        _add_mean_line(fig, y_neg_means, color_neg, 5)
        _add_mean_line(fig, y_pos_means, color_pos, 5)

        title = f"{chrom}<br>p-value: {pval:0.2e}<br>Number of CpGs in DMR: {num_cpgs_in_dmr}"
        add_layout(fig, "", f"Methylation level", title)
        fig.update_layout(
            title=dict(xref='paper', x=1.0),
            legend=dict(
                orientation="h",
                yanchor="bottom",
                y=1.01,
                xanchor="left",
                x=0.0001,
                itemsizing='constant',
                font_size=22
            ),
            xaxis=dict(
                tickmode='array',
                tickvals=list(range(dmr_cpgs.shape[0])),
                ticktext=dmr_cpgs.index.values,
                tickfont=dict(size=22)
            ),
            yaxis=dict(
                tickmode='array',
                tickvals=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
                ticktext=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
            )
        )
        fig.update_layout(
            violingap=0.39,
            violingroupgap=0.39,
            width=100 * num_cpgs_in_dmr,
            height=1100,
            margin=go.layout.Margin(
                l=100,
                r=50,
                b=180,
                t=150,
                pad=0,
            )
        )
        fig.update_xaxes(autorange=False, range=[-0.5, num_cpgs_in_dmr - 0.5])
        fig.update_yaxes(autorange=True)
        fig.update_xaxes(tickangle=270)
        save_figure(fig, f"{path_dmp}/examples/{dmr}")

    if not dict_dmrs_cpgs:
        # pd.concat raises on an empty list; a problem without DMRs should not
        # abort the remaining problems.
        print(f"No DMRs found for {problem}: table is not written")
        continue
    df_dmrs_cpgs = pd.concat(list(dict_dmrs_cpgs.values()))
    df_dmrs_cpgs.to_excel(f"{path_dmp}/table.xlsx")

# Plot the two study regions on a globe

In [None]:
# Basemap is only needed by this cell; numpy and pyplot come from the notebook's
# import cell.
from mpl_toolkits.basemap import Basemap

# Use a dedicated variable for the map folder: the notebook-wide `path_save` is
# the root used by the analysis cells above, and overwriting it here would
# silently redirect their output when they are re-run afterwards.
path_map = 'D:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN/special/043_yakutia_EWAS/map'
pathlib.Path(path_map).mkdir(parents=True, exist_ok=True)

fig = plt.figure(figsize=(10, 6))

# Centre of the orthographic view (degrees).
lat_viewing_angle = 40
lon_viewing_angle = 95

# Globe seen from above the point (lat_viewing_angle, lon_viewing_angle).
m = Basemap(
    projection='ortho',
    lat_0=lat_viewing_angle,
    lon_0=lon_viewing_angle
)

# Coastlines, map boundary with water fill, white continents with water-coloured lakes.
m.drawcoastlines()
m.drawmapboundary(fill_color='powderblue', linewidth=0.8)
m.fillcontinents(color='white', lake_color='powderblue')

# Graticule: parallels every 10 degrees, meridians every 20 degrees.
lat_lines = np.linspace(-90, 90, 19)
lon_lines = np.linspace(-180, 180, 19)

m.drawparallels(lat_lines, linewidth=0.3)
m.drawmeridians(lon_lines, linewidth=0.3)

# Region markers. With latlon=True Basemap reads x as longitude and y as
# latitude, so the variables are named accordingly (the plotted values are
# unchanged; previously the lat/lon names were swapped).
central_lon = 44
central_lat = 56
m.scatter(
    latlon=True,
    x=central_lon,
    y=central_lat,
    marker='o',
    color='gold',
    s=400,
    zorder=10,
    alpha=0.9,
    edgecolor='black',
    linewidth=0.5
)

yakutia_lon = 130
yakutia_lat = 62
m.scatter(
    latlon=True,
    x=yakutia_lon,
    y=yakutia_lat,
    marker='o',
    color='lightslategray',
    s=400,
    zorder=10,
    alpha=0.9,
    edgecolor='black',
    linewidth=0.5
)

# TODO: optionally label the markers with plt.annotate (needs the projected
# coordinates, e.g. `x, y = m(central_lon, central_lat)`).

fig.savefig(f"{path_map}/map.png", bbox_inches='tight', dpi=400, transparent=True)
fig.savefig(f"{path_map}/map.pdf", bbox_inches='tight', transparent=True)
plt.close(fig)