In [None]:
import pandas as pd
import numpy as np
import scipy
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scripts.python.routines.betas import betas_drop_na
from plotly.subplots import make_subplots
from scipy import stats
from functools import reduce
import seaborn as sns
import pickle
import random
import plotly.express as px
import copy
import statsmodels.formula.api as smf
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scripts.python.pheno.datasets.filter import filter_pheno
from scripts.python.pheno.datasets.features import get_column_name, get_status_dict, get_sex_dict
from scripts.python.routines.plot.scatter import add_scatter_trace
import plotly.graph_objects as go
import pathlib
from scripts.python.routines.manifest import get_manifest
from scripts.python.routines.plot.save import save_figure
from scripts.python.routines.plot.layout import add_layout, get_axis
from scripts.python.routines.plot.p_value import add_p_value_annotation
from statsmodels.stats.multitest import multipletests
from sklearn.metrics import mean_absolute_error
from impyute.imputation.cs import fast_knn, mean, median, random, mice, mode, em
import plotly.io as pio
pio.kaleido.scope.mathjax = None
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=False)
from scipy.stats import mannwhitneyu, median_test
from scripts.python.preprocessing.serialization.routines.save import save_pheno_betas_to_pkl
import matplotlib.pyplot as plt
import seaborn as sns
import pathlib
from tqdm import tqdm
from src.utils.plot.bioinfokit import mhat, volcano
import gseapy as gp
import mygene
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA, SparsePCA, TruncatedSVD
from sklearn.decomposition import MiniBatchDictionaryLearning, FastICA
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.manifold import MDS, Isomap, TSNE, LocallyLinearEmbedding
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Initialize DNAm and immuno data

In [None]:
# Dataset selection and input/output locations.
# NOTE(review): `path` is an absolute, machine-specific location; adjust it
# before running the notebook on another machine.
dataset = "GSEUNN"
path = "E:/YandexDisk/Work/pydnameth/datasets"
datasets_info = pd.read_excel(f"{path}/datasets.xlsx", index_col='dataset')
platform = datasets_info.loc[dataset, 'platform']
manifest = get_manifest(platform, path=path)
# 'CHR' is the 'chr' label with its first three characters removed
# (presumably a "chr" prefix — confirm against the manifest).
manifest['CHR'] = manifest['chr'].str[3::]

path_save = f"{path}/{platform}/{dataset}/special/034_central_vs_yakutia"
pathlib.Path(f"{path_save}").mkdir(parents=True, exist_ok=True)

# Immunology table: controls only, with a unified spelling of the Yakutia region.
immuno_num_samples = 1052
immuno_preprocessing = "raw" # "minmax_left(0.05)_right(0.95)_combat" # "raw"
immuno_df = pd.read_excel(f"{path}/{platform}/{dataset}/data/immuno/df_type({immuno_preprocessing})_all({immuno_num_samples})_imp(fast_knn)_replace(quarter).xlsx", index_col="index")
# Explicit copy: the filtered frame is modified right below, and writing into a
# slice of another frame triggers SettingWithCopyWarning.
immuno_df = immuno_df.loc[(immuno_df["Status"] == "Control"), :].copy()
# Reassign the column instead of calling `.replace(..., inplace=True)` on
# `immuno_df["Region"]`: the chained in-place form works on an intermediate
# Series and is not guaranteed to update the frame (it does nothing under
# pandas Copy-on-Write).
immuno_df["Region"] = immuno_df["Region"].replace({"Yakutiya": "Yakutia"})
immuno_feats = pd.read_excel(f"{path}/{platform}/{dataset}/data/immuno/immuno_markers_genes.xlsx").loc[:, 'gene'].values.tolist()
if immuno_preprocessing == "raw":
    # Keep subjects flagged in the "260ai" column plus every Yakutia subject.
    immuno_df = immuno_df.loc[(immuno_df["260ai"] == True) | (immuno_df["Region"] == "Yakutia"), :]
else:
    immuno_df = immuno_df.drop(index=["D-91"]) # Was error in region for this subject

# Subjects of the immuno table that are flagged as also having DNAm data.
indexes_common = immuno_df.index[immuno_df["is_dnam"] == True].values

# DNAm table: phenotype joined with beta values, restricted to the common controls.
pheno = pd.read_excel(f"{path}/{platform}/{dataset}/pheno.xlsx", index_col="index")
pheno.index.name = "index"
betas = pd.read_pickle(f"{path}/{platform}/{dataset}/betas.pkl")
dnam_feats = betas.columns.values
dnam_df = pd.merge(pheno, betas, left_index=True, right_index=True)
dnam_df = dnam_df.loc[(dnam_df["Status"] == "Control"), :]
dnam_df = dnam_df.loc[indexes_common, :]

# Sanity checks: both tables must agree on the shared subjects.
is_region_equal = dnam_df.loc[indexes_common, 'Region'].equals(immuno_df.loc[indexes_common, 'Region'])
is_sex_equal = dnam_df.loc[indexes_common, 'Sex'].equals(immuno_df.loc[indexes_common, 'Sex'])
is_status_equal = dnam_df.loc[indexes_common, 'Status'].equals(immuno_df.loc[indexes_common, 'Status'])
age_diff_max = np.max(np.abs(dnam_df.loc[indexes_common, 'Age'].values - immuno_df.loc[indexes_common, 'Age'].values))
print(f"is_region_equal: {is_region_equal}")
print(f"is_sex_equal: {is_sex_equal}")
print(f"is_status_equal: {is_status_equal}")
print(f"age_diff_max: {age_diff_max}")

# Group sizes per region in each table.
n_samples_immuno_central = len(immuno_df.index[immuno_df["Region"] == "Central"].values)
n_samples_immuno_yakutia = len(immuno_df.index[immuno_df["Region"] == "Yakutia"].values)
n_samples_dnam_central = len(dnam_df.index[dnam_df["Region"] == "Central"].values)
n_samples_dnam_yakutia = len(dnam_df.index[dnam_df["Region"] == "Yakutia"].values)
print(f"n_samples_immuno_central: {n_samples_immuno_central}")
print(f"n_samples_immuno_yakutia: {n_samples_immuno_yakutia}")
print(f"n_samples_dnam_central: {n_samples_dnam_central}")
print(f"n_samples_dnam_yakutia: {n_samples_dnam_yakutia}")

# Create data for R

In [None]:
# Export the DNAm subset in the layout consumed by the R scripts:
# a CpG-by-sample beta matrix and a per-sample phenotype table.
pathlib.Path(path_save, "data_for_R").mkdir(parents=True, exist_ok=True)

# Transposed so that rows are CpGs and columns are samples.
betas_R = dnam_df.loc[:, dnam_feats].T
betas_R.index.name = "CpG"
betas_R.to_pickle(f"{path_save}/data_for_R/betas.pkl")

pheno_R_columns = ["Age", "Sex", "Region", "Sentrix_ID", "Sentrix_Position"]
pheno_R = dnam_df.loc[:, pheno_R_columns]
pheno_R.to_pickle(f"{path_save}/data_for_R/pheno.pkl")

# Data description

## Participants figure

In [None]:
path_local = "data_description/participants"
pathlib.Path(f"{path_save}/{path_local}").mkdir(parents=True, exist_ok=True)

# Data for figure: every immuno participant, labelled by whether DNAm data
# is available for the same subject.
df_participants = immuno_df.loc[:, ["Age", "Sex", "Region", "Status"]]
df_participants["Data"] = "Immuno only"
df_participants.loc[indexes_common, "Data"] = "Immuno and DNAm"

# Params for figure (shared by both regions so the histograms are comparable)
binrange = [0, 105]
bins = 15
hue_order = ['Immuno only', 'Immuno and DNAm']

# One stacked age histogram per region, each with its own colour pair.
region_palettes = {
    "Central": {
        "Immuno and DNAm": "forestgreen",
        "Immuno only": "lawngreen",
    },
    "Yakutia": {
        "Immuno and DNAm": "royalblue",
        "Immuno only": "deepskyblue",
    },
}
for region_name, palette in region_palettes.items():
    sns.set_theme(style='whitegrid')
    sns.histplot(
        data=df_participants.loc[df_participants["Region"] == region_name, :],
        x="Age",
        hue="Data",
        hue_order=hue_order,
        palette=palette,
        multiple="stack",
        bins=bins,
        binrange=binrange,
    )
    for extension in ("png", "pdf"):
        plt.savefig(f"{path_save}/{path_local}/hist_{region_name}.{extension}", bbox_inches='tight')
    # Clear the current figure so the next region starts from an empty canvas.
    plt.clf()

## Immuno features

### Create matrix

In [None]:
# Pairwise Pearson correlation between Age and every immunology feature.
# Layout of the square matrix:
#   upper triangle -> correlation coefficient
#   lower triangle -> p-value (replaced by -log10(BH-adjusted p-value) in the *_fdr copy)
#   diagonal       -> NaN
feats_plot = ["Age"] + list(immuno_feats)
df_immuno_corr_mtx = pd.DataFrame(data=np.zeros(shape=(len(feats_plot), len(feats_plot))), index=feats_plot, columns=feats_plot)
for f_id_1 in range(len(feats_plot)):
    for f_id_2 in range(f_id_1, len(feats_plot)):
        f_1 = feats_plot[f_id_1]
        f_2 = feats_plot[f_id_2]
        if f_id_1 != f_id_2:
            vals_1 = immuno_df.loc[:, f_1].values
            vals_2 = immuno_df.loc[:, f_2].values
            corr, pval = stats.pearsonr(vals_1, vals_2)
            # f_id_2 > f_id_1, so [f_2, f_1] is below the diagonal and [f_1, f_2] above it.
            df_immuno_corr_mtx.at[f_2, f_1] = pval
            df_immuno_corr_mtx.at[f_1, f_2] = corr
        else:
            df_immuno_corr_mtx.at[f_2, f_1] = np.nan
# Strictly-lower-triangle mask. The builtin `bool` is used because the
# `np.bool` alias was removed from NumPy (AttributeError on NumPy >= 1.24).
selection = np.tri(df_immuno_corr_mtx.shape[0], df_immuno_corr_mtx.shape[1], -1, dtype=bool)
# Long-format table of the lower-triangle p-values for multiple-testing correction.
df_fdr = df_immuno_corr_mtx.where(selection).stack().reset_index()
df_fdr.columns = ['row', 'col', 'pval']
_, df_fdr['pval_fdr_bh'], _, _ = multipletests(df_fdr.loc[:, 'pval'].values, 0.05, method='fdr_bh')
# Copy of the matrix whose lower triangle holds -log10 of the adjusted p-values.
df_immuno_corr_mtx_fdr = df_immuno_corr_mtx.copy()
for line_id in range(df_fdr.shape[0]):
    df_immuno_corr_mtx_fdr.loc[df_fdr.at[line_id, 'row'], df_fdr.at[line_id, 'col']] = -np.log10(df_fdr.at[line_id, 'pval_fdr_bh'])

### Plot Matrix

In [None]:
# Draw the combined correlation / significance matrix as a single heatmap:
# the upper triangle is coloured by correlation coefficient, the lower triangle
# by -log10(adjusted p-value), each with its own colorbar. The figure is saved
# as PNG and PDF and the underlying matrix as XLSX.
path_local = "data_description/feats_immuno"
pathlib.Path(f"{path_save}/{path_local}").mkdir(parents=True, exist_ok=True)

sns.set_theme(style='whitegrid')

df_to_plot = df_immuno_corr_mtx_fdr.copy()
mtx_to_plot = df_to_plot.to_numpy()

# Upper triangle (strictly above the diagonal); everything else becomes 0 and is masked out.
mtx_triu = np.triu(mtx_to_plot, +1)
# NOTE(review): max_corr / min_corr are not used anywhere in this cell.
max_corr = np.max(mtx_triu)
min_corr = np.min(mtx_triu)
mtx_triu_mask = np.ma.masked_array(mtx_triu, mtx_triu==0)
cmap_triu = plt.get_cmap("bwr").copy()

# Lower triangle (strictly below the diagonal), masked the same way.
mtx_tril = np.tril(mtx_to_plot, -1)
mtx_tril_mask = np.ma.masked_array(mtx_tril, mtx_tril==0)
cmap_tril = plt.get_cmap("viridis").copy()
# Values below the colour scale minimum (set via vmin below) are drawn in black.
cmap_tril.set_under('black')

fig, ax = plt.subplots()

# Both images are drawn on the same axes; the masks keep them from overlapping.
im_triu = ax.imshow(mtx_triu_mask, cmap=cmap_triu, vmin=-1, vmax=1)
cbar_triu = ax.figure.colorbar(im_triu, ax=ax, location='right')
cbar_triu.set_label(r"$\mathrm{Correlation\:coefficient}$", horizontalalignment='center', fontsize=10)

# Colour scale of the lower triangle starts at -log10(0.05).
im_tril = ax.imshow(mtx_tril_mask, cmap=cmap_tril, vmin=-np.log10(0.05))
cbar_tril = ax.figure.colorbar(im_tril, ax=ax, location='right')
cbar_tril.set_label(r"$-\log_{10}(\mathrm{p-value})$", horizontalalignment='center', fontsize=10)

ax.grid(None)
ax.set_aspect("equal")
ax.set_xticks(np.arange(df_to_plot.shape[1]))
ax.set_yticks(np.arange(df_to_plot.shape[0]))
ax.set_xticklabels(df_to_plot.columns.values)
ax.set_yticklabels(df_to_plot.index.values)
plt.setp(ax.get_xticklabels(), rotation=90)
# Text-colour switch point: half of the peak-to-peak range of the lower triangle.
# NOTE(review): if an adjusted p-value is 0 the lower triangle contains inf and
# this threshold becomes inf as well — confirm this cannot happen for the data.
threshold = np.ptp(mtx_tril.flatten()) * 0.5
ax.tick_params(axis='both', which='major', labelsize=5)
ax.tick_params(axis='both', which='minor', labelsize=5)
textcolors = ("black", "white")
# Write the numeric value into every cell; inf/NaN cells (e.g. the diagonal) get empty text.
for i in range(df_to_plot.shape[0]):
    for j in range(df_to_plot.shape[1]):
        color = "black"
        if i > j:
            # Lower triangle only: white text when the value is below the threshold.
            color = textcolors[int(mtx_tril[i, j] < threshold)]
        if np.isinf(mtx_to_plot[i, j]) or np.isnan(mtx_to_plot[i, j]):
            text = ax.text(j, i, f"", ha="center", va="center", color=color, fontsize=1.3)
        else:
            text = ax.text(j, i, f"{mtx_to_plot[i, j]:0.2f}", ha="center", va="center", color=color, fontsize=1.3)
fig.tight_layout()
plt.savefig(f"{path_save}/{path_local}/corr_mtx_fdr.png", bbox_inches='tight', dpi=400)
plt.savefig(f"{path_save}/{path_local}/corr_mtx_fdr.pdf", bbox_inches='tight', dpi=400)
# NOTE(review): plt.clf() clears the figure but leaves it open; plt.close(fig) would release it.
plt.clf()
df_save = df_immuno_corr_mtx_fdr
df_save.to_excel(f"{path_save}/{path_local}/corr_mtx_fdr.xlsx", index=True)

## DNAm features

In [None]:
# Output folder for the DNAm feature description.
path_local = "data_description/feats_dnam"
pathlib.Path(path_save, path_local).mkdir(parents=True, exist_ok=True)

In [None]:
# Age-associated DMP table produced outside this notebook (read from data_for_R/DMP_age.csv).
df_dnam_age = pd.read_csv(f"{path_save}/data_for_R/DMP_age.csv", index_col="CpG")
df_dnam_age["CpG"] = df_dnam_age.index.values
# Label of the form "<CpG> (<gene>)" used to annotate highlighted points.
df_dnam_age['print'] = [
    f"{cpg_name} ({gene_name})"
    for cpg_name, gene_name in zip(df_dnam_age["CpG"], df_dnam_age["gene"])
]
# The first five rows in file order are the ones highlighted on the plot.
top_to_hightlight = df_dnam_age["print"].values[:5]
df_dnam_age['log_pval'] = np.negative(np.log10(df_dnam_age["adj.P.Val"]))

In [None]:
# Manhattan plot of the age-associated DMPs, written into the feats_dnam folder.
sns.set_theme(style='whitegrid')
# Rows are ordered by the MAPINFO column before plotting.
df_dnam_age.sort_values(by=["MAPINFO"], ascending=[True], inplace=True)
mhat_age_kwargs = dict(
    df=df_dnam_age,
    chr='CHR',
    pv='adj.P.Val',
    path=f"{path_save}/{path_local}",
    valpha=1,
    markernames=tuple(top_to_hightlight),
    markeridcol='print',
    gstyle=2,
    dim=(12,4),
    axtickfontsize=8,
)
mhat(**mhat_age_kwargs)

# DNAm analysis
## Aux data

In [None]:
# Description of the two-group comparison reused by all DNAm analyses below:
# plot colour and boolean row filter per region, a filter covering both
# regions, and the name of the reference ("base") region.
region_colors = {
    "Central": "limegreen",
    "Yakutia": "royalblue",
}
problem = {
    "Color": dict(region_colors),
    "Filter": {
        region_name: dnam_df["Region"] == region_name
        for region_name in region_colors
    },
    "BaseFilter": dnam_df["Region"].isin(list(region_colors)),
    "BasePart": "Central"
}

## Cells

In [None]:
# Compare the cell-count columns between the two regions:
# descriptive statistics + Mann-Whitney U test (BH-adjusted), then one violin plot per cell type.
path_local = "dnam_cells"
pathlib.Path(f"{path_save}/{path_local}").mkdir(parents=True, exist_ok=True)
df_cells = pd.DataFrame(index=["CD8T", "CD4T", "NK", "Bcell", "Mono", "Gran"], columns=["pval", "pval_fdr_bh"])
for cell in tqdm(df_cells.index.values):
    # Values of this cell type, split by region.
    vals = {
        group: dnam_df.loc[group_filter, cell].values
        for group, group_filter in problem["Filter"].items()
    }
    for group, group_vals in vals.items():
        q75, q25 = np.percentile(group_vals, [75 ,25])
        df_cells.at[cell, f"mean_{group}"] = np.mean(group_vals)
        df_cells.at[cell, f"median_{group}"] = np.median(group_vals)
        df_cells.at[cell, f"q75_{group}"] = q75
        df_cells.at[cell, f"q25_{group}"] = q25
        df_cells.at[cell, f"iqr_{group}"] = q75 - q25
    stat, pval = mannwhitneyu(*vals.values(), alternative='two-sided')
    df_cells.at[cell, "pval"] = pval
_, df_cells["pval_fdr_bh"], _, _ = multipletests(df_cells["pval"], 0.05, method='fdr_bh')
df_cells.to_excel(f"{path_save}/{path_local}/cells.xlsx", index=True)

dist_num_bins = 15
for cell in tqdm(df_cells.index.values):

    vals = {}
    for group, group_filter in problem["Filter"].items():
        vals[group] = dnam_df.loc[group_filter, cell].values
        print(f"{group}: {len(vals[group])}")

    fig = go.Figure()
    for group, group_vals in vals.items():
        group_color = problem["Color"][group]
        fig.add_trace(
            go.Violin(
                y=group_vals,
                name=group,
                box_visible=True,
                meanline_visible=True,
                showlegend=False,
                line_color='black',
                fillcolor=group_color,
                marker = dict(color=group_color, line=dict(color='black',width=0.3), opacity=0.8),
                points='all',
                # Kernel bandwidth scaled to the value range of the group.
                bandwidth = np.ptp(group_vals) / dist_num_bins,
                opacity=0.8
            )
        )
    # The adjusted p-value is shown as the figure title.
    add_layout(fig, "", f"{cell}", f"p-value: {df_cells.at[cell, 'pval_fdr_bh']:0.2e}")
    fig.update_layout(
        title_xref='paper',
        legend_font_size=20,
        legend={'itemsizing': 'constant'},
        margin=go.layout.Margin(l=110, r=20, b=50, t=40, pad=0),
    )
    fig.update_layout(legend_y=1.01)
    save_figure(fig, f"{path_save}/{path_local}/{cell}")

## Age Accelerations

In [None]:
# Epigenetic age acceleration: for every clock, regress the clock on
# chronological age in the reference region only, take the residual for all
# samples as the acceleration, compare the regions with a Mann-Whitney U test
# (BH-adjusted) and draw a violin plot and a clock-vs-age scatter plot.
path_local = "dnam_age_accelerations"
pathlib.Path(f"{path_save}/{path_local}").mkdir(parents=True, exist_ok=True)
age_types = ['DNAmAgeHannum', 'DNAmAge', 'DNAmPhenoAge', 'DNAmGrimAge']
df_aas = pd.DataFrame(index=[f"{x}Acc" for x in age_types], columns=["pval", "pval_fdr_bh"])
for age_type in tqdm(age_types):
    # Linear model fitted on the reference region, then applied to every sample.
    formula = f"{age_type} ~ Age"
    model = smf.ols(formula=formula, data=dnam_df.loc[dnam_df["Region"] == problem["BasePart"]]).fit()
    dnam_df[f"{age_type}_linear_pred"] = model.predict(dnam_df)
    # (The former `y_pred = model.predict(pheno)` was never used and has been dropped.)
    dnam_df[f"{age_type}Acc"] = dnam_df[age_type] - dnam_df[f"{age_type}_linear_pred"]
    vals = {}
    for group in problem["Filter"]:
        vals[group] = dnam_df.loc[problem["Filter"][group], f"{age_type}Acc"].values
        df_aas.at[f"{age_type}Acc", f"mean_{group}"] = np.mean(vals[group])
        df_aas.at[f"{age_type}Acc", f"median_{group}"] = np.median(vals[group])
        df_aas.at[f"{age_type}Acc", f"q75_{group}"], df_aas.at[f"{age_type}Acc", f"q25_{group}"] = np.percentile(vals[group], [75 ,25])
        df_aas.at[f"{age_type}Acc", f"iqr_{group}"] = df_aas.at[f"{age_type}Acc", f"q75_{group}"] - df_aas.at[f"{age_type}Acc", f"q25_{group}"]
        print(f"{group}: {len(vals[group])}")
    stat, pval = mannwhitneyu(*vals.values(), alternative='two-sided')
    df_aas.at[f"{age_type}Acc", "pval"] = pval
_, df_aas["pval_fdr_bh"], _, _ = multipletests(df_aas["pval"], 0.05, method='fdr_bh')
df_aas.to_excel(f"{path_save}/{path_local}/aas.xlsx", index=True)

dist_num_bins = 15
for age_type in tqdm(age_types):

    vals = {}
    for group in problem["Filter"]:
        vals[group] = dnam_df.loc[problem["Filter"][group], f"{age_type}Acc"].values

    # Violin plot of the acceleration per region; adjusted p-value in the title.
    fig = go.Figure()
    for group in problem["Filter"]:
        fig.add_trace(
            go.Violin(
                y=vals[group],
                name=group,
                box_visible=True,
                meanline_visible=True,
                showlegend=False,
                line_color='black',
                fillcolor=problem["Color"][group],
                marker = dict(color=problem["Color"][group], line=dict(color='black',width=0.3), opacity=0.8),
                points='all',
                bandwidth = np.ptp(vals[group]) / dist_num_bins,
                opacity=0.8
            )
        )
    add_layout(fig, "", f"{age_type}Acc", f"p-value: {df_aas.at[f'{age_type}Acc', 'pval_fdr_bh']:0.2e}")
    fig.update_layout(title_xref='paper')
    fig.update_layout(legend_font_size=20)
    fig.update_layout(legend= {'itemsizing': 'constant'})
    fig.update_layout(
        margin=go.layout.Margin(
            l=110,
            r=20,
            b=50,
            t=40,
            pad=0
        )
    )
    fig.update_layout(legend_y=1.01)
    save_figure(fig, f"{path_save}/{path_local}/violin_{age_type}Acc")

    # Scatter of the clock against chronological age with the reference-region fit line.
    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=dnam_df.loc[dnam_df["Region"] == problem["BasePart"], f"Age"].values,
            y=dnam_df.loc[dnam_df["Region"] == problem["BasePart"], f"{age_type}_linear_pred"].values,
            showlegend=False,
            name="",
            mode="lines",
            marker_color=problem["Color"][problem["BasePart"]],
            marker=dict(
                size=8,
                opacity=0.75,
                line=dict(
                    color="black",
                    width=0.5
                )
            )
        )
    )
    for group in problem["Filter"]:
        fig.add_trace(
            go.Scatter(
                x=dnam_df.loc[problem["Filter"][group], f"Age"].values,
                y=dnam_df.loc[problem["Filter"][group], f"{age_type}"].values,
                showlegend=True,
                name=group,
                mode="markers",
                line_color=problem["Color"][group],
                marker=dict(
                    size=8,
                    opacity=0.75,
                    line=dict(
                        color="black",
                        width=0.5
                    )
                )
            )
        )
    add_layout(fig, f"Age", f"{age_type}", f"")
    fig.update_layout(legend_font_size=20)
    fig.update_layout(legend= {'itemsizing': 'constant'})
    fig.update_layout(
        margin=go.layout.Margin(
            l=110,
            r=20,
            b=80,
            t=40,
            pad=0,
        )
    )
    save_figure(fig, f"{path_save}/{path_local}/scatter_{age_type}")

## Mann-Whitney U test

In [None]:
# Output folder for the per-CpG Mann-Whitney analysis.
path_local = "dnam_mann_whitney"
pathlib.Path(path_save, path_local).mkdir(parents=True, exist_ok=True)

In [None]:
# Two-sided Mann-Whitney U test between the two regions for every CpG,
# annotated from the manifest and corrected with Benjamini-Hochberg.
cpgs = betas.columns.values
annotation_columns = ['chr', 'Position', 'Relation_to_Island', 'UCSC_RefGene_Group', 'Gene']
df_mw = pd.DataFrame(index=cpgs, columns=annotation_columns + ['stat', 'pval', 'pval_fdr_bh'])
df_mw.index.name = 'CpG'
# The annotation does not depend on the test, so it is copied once per column
# instead of with five scalar `.at` writes per CpG inside the hot loop.
for annotation_column in annotation_columns:
    df_mw[annotation_column] = manifest.loc[cpgs, annotation_column].values
# Results are collected in float arrays and assigned in one go, which keeps the
# 'stat' / 'pval' columns numeric instead of object-typed.
stats_mw = np.empty(len(cpgs), dtype=float)
pvals_mw = np.empty(len(cpgs), dtype=float)
for cpg_id, cpg in tqdm(enumerate(cpgs), desc='Mann-Whitney U test', total=len(cpgs)):
    vals = {}
    for group in problem["Filter"]:
        vals[group] = dnam_df.loc[problem["Filter"][group], cpg].values
    stat, pval = mannwhitneyu(*vals.values(), alternative='two-sided')
    stats_mw[cpg_id] = stat
    pvals_mw[cpg_id] = pval
df_mw['stat'] = stats_mw
df_mw['pval'] = pvals_mw
_, df_mw['pval_fdr_bh'], _, _ = multipletests(df_mw['pval'].values, 0.05, method='fdr_bh')
df_mw.to_excel(f"{path_save}/{path_local}/table.xlsx")

In [None]:
# Reload the saved Mann-Whitney table so the cells below can run without repeating the test.
mw_table_path = f"{path_save}/{path_local}/table.xlsx"
df_mw = pd.read_excel(mw_table_path, index_col="CpG")

In [None]:
# Columns needed by the Manhattan plot: numeric chromosome, CpG id and a printable label.
df_mw['CHR'] = pd.to_numeric(df_mw['chr'].str.slice(start=3))
df_mw["CpG"] = df_mw.index.values
# Order by adjusted p-value first so that the five strongest hits can be picked for highlighting...
df_mw.sort_values(by=["pval_fdr_bh"], ascending=[True], inplace=True)
df_mw['print'] = [
    f"{cpg_name} ({gene_name})"
    for cpg_name, gene_name in zip(df_mw['CpG'], df_mw['Gene'])
]
top_to_hightlight = df_mw["print"].values[:5]
# ...then restore the ordering by the Position column used for plotting.
df_mw.sort_values(by=["Position"], ascending=[True], inplace=True)

In [None]:
# CpGs passing the adjusted p-value threshold, and the genes they map to.
pval_lim = 1e-10
# Filter and sort in one chained expression: sorting a `.loc` slice with
# `inplace=True` raises SettingWithCopyWarning.
df_mw_selected = (
    df_mw.loc[df_mw["pval_fdr_bh"] < pval_lim, :]
    .sort_values(["pval_fdr_bh"], ascending=[True])
)
df_mw_selected.to_excel(f"{path_save}/{path_local}/selected.xlsx")
genes_mw_selected = set()
for cpg in df_mw_selected.index.values:
    genes_raw = manifest.at[cpg, 'Gene']
    # Non-string entries (e.g. NaN) carry no gene names; string entries are ';'-separated.
    if isinstance(genes_raw, str):
        genes_mw_selected.update(genes_raw.split(';'))
# Drop placeholder values that are not gene names.
genes_mw_selected.discard('non-genic')
genes_mw_selected.discard(' ')
# Sorted so that genes.xlsx has the same row order on every run (set order is arbitrary).
genes_mw_selected = sorted(genes_mw_selected)
genes_mw_df = pd.DataFrame({'gene':genes_mw_selected})
genes_mw_df.to_excel(f"{path_save}/{path_local}/genes.xlsx", index=False)

In [None]:
# Enrichment of the selected CpGs in genomic annotation categories.
# For each annotation variable (chromosome, relation to CpG island, gene region)
# and each of its values, a 2x2 contingency table (selected vs. not selected,
# value vs. any other value) is tested with Fisher's exact test; p-values are
# BH-adjusted per variable and log10(odds ratio) is drawn as a bar plot.
# "color" encodes significance via bar colour; any other value uses hatching.
pval_show_type = "color" # "cross"
# Category values (and their plotting order) per annotation variable.
orders = {
    'CHR': [str(x) for x in range(1, 24)],
    'RELATION_TO_UCSC_CPG_ISLAND': ['S_Shelf', 'S_Shore', 'Island', 'N_Shore', 'N_Shelf', 'OpenSea'],
    'UCSC_REFGENE_GROUP': ['TSS1500', 'TSS200', '5\'UTR', '1stExon', 'Body', '3\'UTR']
}
# Manifest column that holds each annotation variable.
col_names = {
    'CHR': "CHR",
    'RELATION_TO_UCSC_CPG_ISLAND': "Relation_to_Island",
    'UCSC_REFGENE_GROUP': "UCSC_RefGene_Group"
}
fig_sizes = {
    'CHR': (17, 10),
    'RELATION_TO_UCSC_CPG_ISLAND': (5, 10),
    'UCSC_REFGENE_GROUP': (5, 10)
}
# Bar colours used only in the hatching ("cross") mode.
colors = {
    'CHR': px.colors.qualitative.Dark24,
    'RELATION_TO_UCSC_CPG_ISLAND': px.colors.qualitative.Light24[17:23],
    'UCSC_REFGENE_GROUP': px.colors.qualitative.Light24[11:17]
}
# Manifest rows of: the selected CpGs, all tested CpGs, and the tested-but-not-selected CpGs.
df_mw_fisher_target = manifest.loc[df_mw_selected.index.values, :]
df_mw_fisher_global = manifest.loc[df_mw.index.values, :]
df_mw_fisher_padding = df_mw_fisher_global.loc[~df_mw_fisher_global.index.isin(df_mw_selected.index.values), :]
for var in orders:
    # "11".."22" are the four contingency-table cells; "sum" is their total.
    columns=["11", "12", "21", "22", "sum", "pval", "odds_ratio"]
    df_var = pd.DataFrame(index=orders[var], columns=columns, data=np.zeros((len(orders[var]), len(columns))))
    df_var.index.name = col_names[var].replace("_", " ")
    for var_val in orders[var]:
        # Rows: selected ("specific") vs. remaining CpGs; columns: annotation equals var_val or not.
        contingency_table = pd.DataFrame(index=["specific", "non-specific"], columns=["in_val", "not_in_val"])
        contingency_table.at["specific", "in_val"] = df_mw_fisher_target.loc[df_mw_fisher_target[col_names[var]] == var_val, :].shape[0]
        contingency_table.at["specific", "not_in_val"] = df_mw_fisher_target.loc[df_mw_fisher_target[col_names[var]] != var_val, :].shape[0]
        contingency_table.at["non-specific", "in_val"] = df_mw_fisher_padding.loc[df_mw_fisher_padding[col_names[var]] == var_val, :].shape[0]
        contingency_table.at["non-specific", "not_in_val"] = df_mw_fisher_padding.loc[df_mw_fisher_padding[col_names[var]] != var_val, :].shape[0]
        df_var.at[var_val, "11"] = contingency_table.at["specific", "in_val"]
        df_var.at[var_val, "12"] = contingency_table.at["specific", "not_in_val"]
        df_var.at[var_val, "21"] = contingency_table.at["non-specific", "in_val"]
        df_var.at[var_val, "22"] = contingency_table.at["non-specific", "not_in_val"]
        df_var.at[var_val, "sum"] = contingency_table.values.sum()
        odds_ratio, pval = stats.fisher_exact(contingency_table.to_numpy(), alternative='two-sided')
        # An undefined odds ratio is treated as "no enrichment" (log10 -> 0).
        if np.isnan(odds_ratio):
            odds_ratio = 1.0
        df_var.at[var_val, "odds_ratio"], df_var.at[var_val, "pval"] = odds_ratio, pval
    _, df_var['pval_fdr_bh'], _, _ = multipletests(df_var['pval'].values, 0.05, method='fdr_bh')
    # NOTE(review): an odds ratio of 0 gives -inf here — confirm the plot handles it as intended.
    df_var[r'$ \log_{10}(\mathrm{Odds\ ratio})$'] = np.log10(df_var.loc[:, 'odds_ratio'].values)
    df_var[r'$ -\log_{10}(\mathrm{p-value})$'] = -np.log10(df_var.loc[:, 'pval_fdr_bh'].values)
    df_var.to_excel(f"{path_save}/{path_local}/fisher_{var}.xlsx")

    plt.figure(figsize=fig_sizes[var])
    plt.xticks(rotation=90)
    sns.set_theme(style='whitegrid', font_scale=2)
    if pval_show_type == "color":
        # A throw-away scatter is drawn only to obtain a mappable for the colorbar,
        # then the figure is cleared before the bar plot is drawn.
        # NOTE(review): plt.clf() also discards the tick rotation set above, and
        # plt.colorbar() on a mappable whose axes were cleared may fail on newer
        # matplotlib releases — verify against the installed version.
        plot = plt.scatter(df_var.index, df_var.loc[:, r'$ \log_{10}(\mathrm{Odds\ ratio})$'].values, c=df_var.loc[:, r'$ -\log_{10}(\mathrm{p-value})$'].values, cmap='Reds')
        plt.clf()
        cbar = plt.colorbar(plot)
        cbar.set_label(r"$-\log_{10}(\mathrm{p-value})$", horizontalalignment='center')
        ax = sns.barplot(data=df_var, x=df_var.index, y=r'$ \log_{10}(\mathrm{Odds\ ratio})$', hue=r'$ -\log_{10}(\mathrm{p-value})$', palette='Reds', dodge=False)
        ax.legend_.remove()
    else:
        bar = sns.barplot(data=df_var, x=df_var.index, y=r'$ \log_{10}(\mathrm{Odds\ ratio})$', palette=colors[var], edgecolor='black')
        # Hatch the bars whose adjusted p-value is below 0.05.
        for bar_index, this_bar in enumerate(bar.patches):
            if df_var.at[df_var.index[bar_index], "pval_fdr_bh"] < 0.05:
                this_bar.set_hatch('x')
            this_bar.set_edgecolor('skyblue')
    plt.savefig(f"{path_save}/{path_local}/fisher_{var}.png", bbox_inches='tight')
    plt.savefig(f"{path_save}/{path_local}/fisher_{var}.pdf", bbox_inches='tight')
    plt.close()

In [None]:
# Manhattan plot of the BH-adjusted Mann-Whitney p-values with a significance
# line at `pval_lim` and the top hits labelled.
sns.set_theme(style='whitegrid')
mhat_mw_kwargs = dict(
    df=df_mw,
    chr='CHR',
    pv='pval_fdr_bh',
    path=f"{path_save}/{path_local}",
    valpha=1,
    markernames=tuple(top_to_hightlight),
    markeridcol='print',
    gwas_sign_line=True,
    gwasp=pval_lim,
    gstyle=2,
    dim=(12,4),
    axtickfontsize=8,
)
mhat(**mhat_mw_kwargs)

In [None]:
# Violin plots of the methylation levels per region for the n_top CpGs with
# the smallest adjusted Mann-Whitney p-value.
n_top = 10
dist_num_bins = 25
pathlib.Path(f"{path_save}/{path_local}/examples").mkdir(parents=True, exist_ok=True)
df_mw_top = df_mw.sort_values(by=['pval_fdr_bh'], ascending=[True]).iloc[:n_top]
for cpg_id, (cpg, row) in enumerate(df_mw_top.iterrows()):
    pval = row['pval_fdr_bh']
    gene = manifest.at[cpg, 'Gene']

    fig = go.Figure()
    for group, group_filter in problem["Filter"].items():
        vals = dnam_df.loc[group_filter, cpg].values
        group_color = problem["Color"][group]
        violin = go.Violin(
            y=vals,
            name=group,
            box_visible=True,
            meanline_visible=True,
            showlegend=False,
            line_color='black',
            fillcolor=group_color,
            marker = dict(color=group_color, line=dict(color='black',width=0.3), opacity=0.8),
            points='all',
            # Kernel bandwidth scaled to the value range of the group.
            bandwidth = np.ptp(vals) / dist_num_bins,
            opacity=0.8
        )
        fig.add_trace(violin)
    # Title: CpG id, gene annotation and adjusted p-value.
    add_layout(fig, "", "Methylation", f"{cpg} ({gene})<br>p-value: {pval:0.2e}")
    fig.update_layout(
        title_xref='paper',
        legend_font_size=25,
        legend={'itemsizing': 'constant'},
    )
    fig.update_xaxes(tickfont_size=25)
    fig.update_layout(margin=go.layout.Margin(l=110, r=20, b=50, t=80, pad=0))
    # Horizontal legend centred above the plot area.
    fig.update_layout(
        legend=dict(orientation="h", yanchor="bottom", y=1.25, xanchor="center", x=0.5)
    )
    # The file name keeps the rank of the CpG so the files sort by significance.
    save_figure(fig, f"{path_save}/{path_local}/examples/{cpg_id}_{cpg}")

## ChAMP DMPs

### Setup

In [None]:
# Output folder for the region DMP analysis.
path_local = "dnam_DMPs_region"
pathlib.Path(path_save, path_local).mkdir(parents=True, exist_ok=True)

### Read ChAMP results

In [None]:
# Region DMP table produced outside this notebook (read from data_for_R/DMP_region.csv),
# ordered by adjusted p-value and extended with a printable label and -log10(p).
df_dmps = pd.read_csv(f"{path_save}/data_for_R/DMP_region.csv", index_col="CpG")
df_dmps["CpG"] = df_dmps.index.values
df_dmps.sort_values(by=["adj.P.Val"], ascending=[True], inplace=True)
df_dmps['print'] = [
    f"{cpg_name} ({gene_name})"
    for cpg_name, gene_name in zip(df_dmps['CpG'], df_dmps['gene'])
]
df_dmps['log_pval'] = np.negative(np.log10(df_dmps["adj.P.Val"]))

### Obtain gene list

In [None]:
# DMPs passing both the adjusted p-value and the effect-size threshold,
# and the genes they map to.
pval_lim = 1e-10
fc_lim = 0.1
is_significant = df_dmps["adj.P.Val"] < pval_lim
is_large_effect = (df_dmps["logFC"] < -fc_lim) | (df_dmps["logFC"] > fc_lim)
# Filter and sort in one chained expression: sorting a `.loc` slice with
# `inplace=True` raises SettingWithCopyWarning.
df_dmps_selected = (
    df_dmps.loc[is_significant & is_large_effect, :]
    .sort_values(["adj.P.Val"], ascending=[True])
)
top_to_hightlight = df_dmps_selected["print"].values[0:2]
df_dmps_selected.to_excel(f"{path_save}/{path_local}/selected.xlsx")
genes_dmps_selected = set()
for cpg in df_dmps_selected.index.values:
    genes_raw = manifest.at[cpg, 'Gene']
    # Non-string entries (e.g. NaN) carry no gene names; string entries are ';'-separated.
    if isinstance(genes_raw, str):
        genes_dmps_selected.update(genes_raw.split(';'))
# Drop placeholder values that are not gene names.
genes_dmps_selected.discard('non-genic')
genes_dmps_selected.discard(' ')
# Sorted so that genes.xlsx has the same row order on every run (set order is arbitrary).
genes_dmps_selected = sorted(genes_dmps_selected)
genes_dmps_df = pd.DataFrame({'gene':genes_dmps_selected})
genes_dmps_df.to_excel(f"{path_save}/{path_local}/genes.xlsx", index=False)

### Perform dimensionality reduction

In [None]:
# Inputs for dimensionality reduction: beta values of the selected DMPs
# (samples x CpGs) plus the phenotype columns used for colouring.
feats_dim_red = df_dmps_selected["CpG"].values
dim_red_pheno_cols = ["Age", "Sex", "Region"]
df_dnam_dim_red = dnam_df.loc[:, [*feats_dim_red, *dim_red_pheno_cols]]
data_dim_red = df_dnam_dim_red.loc[:, feats_dim_red].to_numpy()
classes_dim_red = df_dnam_dim_red.loc[:, 'Region'].to_numpy()

In [None]:
# Project the selected-DMP beta matrix onto two components with a range of
# linear and non-linear methods; every pair of components is stored as two new
# columns of df_dnam_dim_red for plotting.
# Fixed seed for the randomised methods so that the embeddings (and the saved
# figures) are the same on every run.
dim_red_seed = 42

print(f"PCA")
pca = PCA(n_components=2, whiten=False)
data_pca = pca.fit_transform(data_dim_red)
df_dnam_dim_red['PC 1'] = data_pca[:, 0]
df_dnam_dim_red['PC 2'] = data_pca[:, 1]

print(f"Incremental PCA")
# The model is fitted batch by batch and then applied to the full matrix.
n_batches = 32
ipca = IncrementalPCA(n_components=2)
for data_batch in np.array_split(data_dim_red, n_batches):
    ipca.partial_fit(data_batch)
data_ipca = ipca.transform(data_dim_red)
df_dnam_dim_red['Incremental PC 1'] = data_ipca[:, 0]
df_dnam_dim_red['Incremental PC 2'] = data_ipca[:, 1]

print(f"Kernel PCA")
kpca = KernelPCA(kernel='rbf', fit_inverse_transform=True, gamma=None, n_components=2)
data_kpca = kpca.fit_transform(data_dim_red)
df_dnam_dim_red['Kernel PC 1'] = data_kpca[:, 0]
df_dnam_dim_red['Kernel PC 2'] = data_kpca[:, 1]

print(f"SVD")
tsvd = TruncatedSVD(n_components=2, algorithm='randomized', n_iter=5, random_state=dim_red_seed)
tsvd.fit(data_dim_red)
data_svd = tsvd.transform(data_dim_red)
df_dnam_dim_red['SVD 1'] = data_svd[:, 0]
df_dnam_dim_red['SVD 2'] = data_svd[:, 1]

print(f"GRP")
GRP = GaussianRandomProjection(n_components=2, eps=0.5, random_state=dim_red_seed)
GRP.fit(data_dim_red)
data_grp = GRP.transform(data_dim_red)
df_dnam_dim_red['Gaussian Random Projection 1'] = data_grp[:, 0]
df_dnam_dim_red['Gaussian Random Projection 2'] = data_grp[:, 1]

print(f"SRP")
SRP = SparseRandomProjection(n_components=2, density='auto', eps=0.5, dense_output=False, random_state=dim_red_seed)
SRP.fit(data_dim_red)
data_srp = SRP.transform(data_dim_red)
df_dnam_dim_red['Sparse Random Projection 1'] = data_srp[:, 0]
df_dnam_dim_red['Sparse Random Projection 2'] = data_srp[:, 1]

print(f"MDS")
mds = MDS(n_components=2, metric=True, random_state=dim_red_seed)
data_mds = mds.fit_transform(data_dim_red)
df_dnam_dim_red['Multi Dimensional Scale 1'] = data_mds[:, 0]
df_dnam_dim_red['Multi Dimensional Scale 2'] = data_mds[:, 1]

print(f"ISOMAP")
isomap = Isomap(n_components=2, n_neighbors=5)
isomap.fit(data_dim_red)
data_isomap = isomap.transform(data_dim_red)
df_dnam_dim_red['IsoMap 1'] = data_isomap[:, 0]
df_dnam_dim_red['IsoMap 2'] = data_isomap[:, 1]

print(f"MiniBatchDictionaryLearning")
# NOTE(review): `n_iter` has been deprecated in recent scikit-learn releases in
# favour of `max_iter` — confirm against the installed version.
miniBatchDictLearning = MiniBatchDictionaryLearning(n_components=2, batch_size=200, alpha=1, n_iter=25, random_state=dim_red_seed)
# A single fit_transform: the separate fit() that used to precede it trained
# the model a first time only for fit_transform() to refit it from scratch.
data_batch = miniBatchDictLearning.fit_transform(data_dim_red)
df_dnam_dim_red['MBDL 1'] = data_batch[:, 0]
df_dnam_dim_red['MBDL 2'] = data_batch[:, 1]

print(f"ICA")
fastICA = FastICA(n_components=2, algorithm='parallel', whiten=True, tol=1e-3, max_iter=1000, random_state=dim_red_seed)
data_ica = fastICA.fit_transform(data_dim_red)
df_dnam_dim_red['IC 1'] = data_ica[:, 0]
df_dnam_dim_red['IC 2'] = data_ica[:, 1]

print(f"t-SNE")
tsne = TSNE(n_components=2, learning_rate=300, perplexity=30, early_exaggeration=12, init='random', random_state=dim_red_seed)
data_tsne = tsne.fit_transform(data_dim_red)
df_dnam_dim_red['tSNE 1'] = data_tsne[:, 0]
df_dnam_dim_red['tSNE 2'] = data_tsne[:, 1]

print(f"LLE")
lle = LocallyLinearEmbedding(n_components=2, n_neighbors=10, method='modified')
lle.fit(data_dim_red)
data_lle = lle.transform(data_dim_red)
df_dnam_dim_red['LLE 1'] = data_lle[:, 0]
df_dnam_dim_red['LLE 2'] = data_lle[:, 1]

In [None]:
# One scatter figure per dimensionality-reduction method, coloured by region.
dim_red_dir = f"{path_save}/{path_local}/dim_red"
pathlib.Path(dim_red_dir).mkdir(parents=True, exist_ok=True)

# Method label (also used as the output file name) -> its two embedding columns.
dim_red_methods_dict = {
    'PCA': ['PC 1', 'PC 2'],
    'IncrementalPCA': ['Incremental PC 1', 'Incremental PC 2'],
    'KernelPCA': ['Kernel PC 1', 'Kernel PC 2'],
    'SingularValueDecomposition': ['SVD 1', 'SVD 2'],
    'GaussianRandomProjection': ['Gaussian Random Projection 1', 'Gaussian Random Projection 2'],
    'SparseRandomProjection': ['Sparse Random Projection 1', 'Sparse Random Projection 2'],
    'MultiDimensionalScaling': ['Multi Dimensional Scale 1', 'Multi Dimensional Scale 2'],
    'Isomap': ['IsoMap 1', 'IsoMap 2'],
    'MiniBatchDictionaryLearning': ['MBDL 1', 'MBDL 2'],
    'ICA': ['IC 1', 'IC 2'],
    'T-SNE': ['tSNE 1', 'tSNE 2'],
    'LocallyLinearEmbedding': ['LLE 1', 'LLE 2'],
}

for method, (x_col, y_col) in dim_red_methods_dict.items():
    fig = go.Figure()
    for group, group_color in problem["Color"].items():
        # Rows of the embedding table that belong to the current region.
        in_group = df_dnam_dim_red["Region"] == group
        group_trace = go.Scatter(
            x=df_dnam_dim_red.loc[in_group, x_col].values,
            y=df_dnam_dim_red.loc[in_group, y_col].values,
            showlegend=True,
            name=group,
            mode="markers",
            line_color=group_color,
            marker=dict(
                size=12,
                opacity=0.9,
                color=group_color,
                symbol="x",
                line=dict(color="black", width=0.6),
            ),
        )
        fig.add_trace(group_trace)
    add_layout(fig, x_col, y_col, "")
    # Fixed 600x600 canvas with constant-size legend markers.
    fig.update_layout(
        legend=dict(font=dict(size=20), itemsizing='constant'),
        autosize=False,
        width=600,
        height=600,
        margin=go.layout.Margin(l=110, r=20, b=80, t=40, pad=0),
    )
    save_figure(fig, f"{dim_red_dir}/{method}")

### Obtain Entrez gene lists with possible synonyms

In [None]:
# Look every selected DMP gene up in MyGene.info, collecting the hits, the
# genes without any hit, and the genes whose own name is absent from the
# returned symbols (counted as synonyms).
mg = mygene.MyGeneInfo()
print(f"genes_dmps_selected: {len(genes_dmps_selected)}")
df_queries_all, genes_missed = [], []
number_of_synonyms = 0
for gene in tqdm(genes_dmps_selected):
    # NOTE(review): `scopes` is documented for `querymany`; confirm that
    # `query` honours it rather than silently ignoring it.
    df_query = mg.query(gene, scopes='entrezgene', species='human', as_dataframe=True)
    if df_query.empty:
        genes_missed.append(gene)
        continue
    df_queries_all.append(df_query)
    symbols_found = df_query.loc[:, "symbol"].values
    if gene in set(symbols_found):
        continue
    number_of_synonyms += 1
    print(f"{gene} not in {list(symbols_found)}")
print(f"Total number of synonyms: {number_of_synonyms}")

mygene_out_dir = f"{path_save}/{path_local}"

# All query hits stacked into one table.
df_query_all = pd.concat(df_queries_all)
df_query_all.to_excel(f"{mygene_out_dir}/df_query_all.xlsx", index=True)

# Genes for which the query returned nothing.
genes_dmps_missed_df = pd.DataFrame({'gene': genes_missed})
genes_dmps_missed_df.to_excel(f"{mygene_out_dir}/genes_dmps_mygene_missed.xlsx", index=False)

# Unique symbols over all hits (original genes plus whatever else was returned).
genes_dmps_selected_all = list(set(df_query_all["symbol"].values))
genes_dmps_selected_all_df = pd.DataFrame({'gene': genes_dmps_selected_all})
genes_dmps_selected_all_df.to_excel(f"{mygene_out_dir}/genes_dmps_selected_mygene_all.xlsx", index=False)

### Perform GSEA for selected gene libraries

In [None]:
# Names of every Enrichr library available for human, saved for reference.
genes_lists = gp.get_library_name("Human")
df_genes_lists = pd.DataFrame(index=genes_lists)
df_genes_lists.to_excel(f"{path_save}/{path_local}/genes_lists.xlsx", index=True)

# Two query sets: the original DMP genes and the MyGene-expanded symbol list.
genes_dict_of_lists = {
    "origin": genes_dmps_selected,
    "mygene_all": genes_dmps_selected_all,
}

for genes, gene_symbols in genes_dict_of_lists.items():
    gsea_dir = f"{path_save}/{path_local}/GSEA/{genes}"
    enrichr_tables = []
    for genes_list in genes_lists:
        # Each library gets its own output directory.
        library_dir = f"{gsea_dir}/{genes_list}"
        pathlib.Path(library_dir).mkdir(parents=True, exist_ok=True)
        df_enrichr = gp.enrichr(
            gene_list=gene_symbols,
            gene_sets=genes_list,
            organism='Human',
            outdir=library_dir,
            cutoff=1.00,
            verbose=True,
            no_plot=True,
        )
        enrichr_tables.append(df_enrichr.results)
    # Results of all libraries stacked into one table, stored as xlsx and pickle.
    dfs_enrichr = pd.concat(enrichr_tables)
    dfs_enrichr.to_excel(f"{gsea_dir}/results.xlsx", index=True)
    dfs_enrichr.to_pickle(f"{gsea_dir}/results.pkl")

In [None]:
# Number of genes annotated with "response to cold".
# GO:0009409 is a Biological Process term, so it has to be looked up in the
# Biological Process library; the Molecular Function library does not carry it
# and the key lookup would fail.
library_dict = gp.parser.get_library('GO_Biological_Process_2021', organism='Human')
len(library_dict["response to cold (GO:0009409)"])

In [None]:
# Number of selected DMP genes that belong to the BioCarta telomere/aging term.
library_dict = gp.parser.get_library('BioCarta_2015', organism='Human')
telomere_term_genes = set(library_dict["telomeres telomerase cellular aging and immortality"])
len(telomere_term_genes & set(genes_dmps_selected))

### Plot significant GSEA terms

In [None]:
# Libraries whose significant terms are reported and plotted.
target_genes_lists = [
    "GO_Biological_Process_2021",
    "GO_Molecular_Function_2021",
    "GO_Cellular_Component_2021",
    "Reactome_2016",
    "KEGG_2021_Human",
    "WikiPathways_2019_Human",
]

gsea_cols = ["Gene_set", "Term", "Overlap", "P-value", "Adjusted P-value", "Odds Ratio", "Combined Score"]
# Column/axis label; the plotted quantity is -log10 of the *adjusted* p-value.
neg_log_p_col = r'$ -\log_{10}(\mathrm{p-value})$'
for genes in genes_dict_of_lists:
    gsea_dir = f"{path_save}/{path_local}/GSEA/{genes}"

    dfs_enrichr = pd.read_pickle(f"{gsea_dir}/results.pkl")
    is_significant_target = (dfs_enrichr["Adjusted P-value"] < 0.05) & (dfs_enrichr["Gene_set"].isin(target_genes_lists))
    # Explicit copy: new columns are added below, and writing into a slice of
    # the unpickled frame would trigger pandas' chained-assignment warning.
    dfs_enrichr = dfs_enrichr.loc[is_significant_target, gsea_cols].copy()

    # Nothing significant for this gene set: no table and no figure.
    if dfs_enrichr.empty:
        continue

    dfs_enrichr[neg_log_p_col] = -np.log10(dfs_enrichr.loc[:, 'Adjusted P-value'].values)
    dfs_enrichr = dfs_enrichr.rename(columns={'Gene_set': 'Gene set'})
    dfs_enrichr.to_excel(f"{gsea_dir}/terms.xlsx")

    # Set the theme before creating the figure so that the first figure is
    # styled the same way as every following one.
    sns.set_theme(style='whitegrid', font_scale=2)
    # Figure height grows with the number of terms (0.5 inch per bar).
    plt.figure(figsize=(10, 0.5 * dfs_enrichr.shape[0]))
    bar = sns.barplot(
        data=dfs_enrichr,
        hue="Gene set",
        y=dfs_enrichr["Term"].values,
        x=neg_log_p_col,
        palette=px.colors.qualitative.Light24,
        edgecolor='black',
        orient="h",
        dodge=False
    )
    # Legend goes outside the axes so it does not cover the bars.
    sns.move_legend(bar, "upper left", bbox_to_anchor=(1, 1))
    plt.savefig(f"{gsea_dir}/terms.png", bbox_inches='tight')
    plt.savefig(f"{gsea_dir}/terms.pdf", bbox_inches='tight')
    plt.close()

In [None]:
# Enrichment of the selected DMPs across genomic annotation categories.
# For every annotation variable (chromosome, relation to CpG island, gene
# region) and every category of it, a 2x2 Fisher exact test compares the
# selected DMPs against the remaining CpGs of `df_dmps`; p-values are
# FDR-corrected per variable, and the table plus a bar plot of
# log10(odds ratio) are written to disk.

# "color": bars are coloured by -log10(FDR p-value) and a colorbar is drawn;
# any other value: bars use the per-variable palette and significant ones are hatched.
pval_show_type = "color" # "cross"
# Categories tested for each variable, in plotting order.
# NOTE(review): CHR categories are the strings '1'..'23' -- confirm how sex
# chromosomes are encoded in the manifest, otherwise they are never matched.
orders = {
    'CHR': [str(x) for x in range(1, 24)],
    'RELATION_TO_UCSC_CPG_ISLAND': ['S_Shelf', 'S_Shore', 'Island', 'N_Shore', 'N_Shelf', 'OpenSea'],
    'UCSC_REFGENE_GROUP': ['TSS1500', 'TSS200', '5\'UTR', '1stExon', 'Body', '3\'UTR']
}
# Variable key -> manifest column holding the annotation.
col_names = {
    'CHR': "CHR",
    'RELATION_TO_UCSC_CPG_ISLAND': "Relation_to_Island",
    'UCSC_REFGENE_GROUP': "UCSC_RefGene_Group"
}
# Figure size per variable.
fig_sizes = {
    'CHR': (17, 10),
    'RELATION_TO_UCSC_CPG_ISLAND': (5, 10),
    'UCSC_REFGENE_GROUP': (5, 10)
}
# Bar palettes, used only when pval_show_type is not "color".
colors = {
    'CHR': px.colors.qualitative.Dark24,
    'RELATION_TO_UCSC_CPG_ISLAND': px.colors.qualitative.Light24[17:23],
    'UCSC_REFGENE_GROUP': px.colors.qualitative.Light24[11:17]
}
# Manifest rows of the selected DMPs, of all tested CpGs, and of the tested
# CpGs that were NOT selected (the comparison background).
df_dmps_fisher_target = manifest.loc[df_dmps_selected.index.values, :]
df_dmps_fisher_global = manifest.loc[df_dmps.index.values, :]
df_dmps_fisher_padding = df_dmps_fisher_global.loc[~df_dmps_fisher_global.index.isin(df_dmps_selected.index.values), :]
for var in orders:
    # One row per category: the four contingency cells, their total, p-value and odds ratio.
    columns=["11", "12", "21", "22", "sum", "pval", "odds_ratio"]
    df_var = pd.DataFrame(index=orders[var], columns=columns, data=np.zeros((len(orders[var]), len(columns))))
    df_var.index.name = col_names[var].replace("_", " ")
    for var_val in orders[var]:
        # Rows: selected ("specific") vs. background CpGs; columns: annotation equals
        # the category vs. anything else (exact equality on the manifest column).
        contingency_table = pd.DataFrame(index=["specific", "non-specific"], columns=["in_val", "not_in_val"])
        contingency_table.at["specific", "in_val"] = df_dmps_fisher_target.loc[df_dmps_fisher_target[col_names[var]] == var_val, :].shape[0]
        contingency_table.at["specific", "not_in_val"] = df_dmps_fisher_target.loc[df_dmps_fisher_target[col_names[var]] != var_val, :].shape[0]
        contingency_table.at["non-specific", "in_val"] = df_dmps_fisher_padding.loc[df_dmps_fisher_padding[col_names[var]] == var_val, :].shape[0]
        contingency_table.at["non-specific", "not_in_val"] = df_dmps_fisher_padding.loc[df_dmps_fisher_padding[col_names[var]] != var_val, :].shape[0]
        df_var.at[var_val, "11"] = contingency_table.at["specific", "in_val"]
        df_var.at[var_val, "12"] = contingency_table.at["specific", "not_in_val"]
        df_var.at[var_val, "21"] = contingency_table.at["non-specific", "in_val"]
        df_var.at[var_val, "22"] = contingency_table.at["non-specific", "not_in_val"]
        df_var.at[var_val, "sum"] = contingency_table.values.sum()
        odds_ratio, pval = stats.fisher_exact(contingency_table.to_numpy(), alternative='two-sided')
        # An undefined odds ratio is stored as 1, i.e. log10 = 0 in the plot.
        if np.isnan(odds_ratio):
            odds_ratio = 1.0
        df_var.at[var_val, "odds_ratio"], df_var.at[var_val, "pval"] = odds_ratio, pval
    # Benjamini-Hochberg correction across the categories of this variable.
    _, df_var['pval_fdr_bh'], _, _ = multipletests(df_var['pval'].values, 0.05, method='fdr_bh')
    # NOTE(review): an odds ratio of 0 or inf becomes -inf/+inf here -- confirm
    # this cannot occur for the data at hand.
    df_var[r'$ \log_{10}(\mathrm{Odds\ ratio})$'] = np.log10(df_var.loc[:, 'odds_ratio'].values)
    df_var[r'$ -\log_{10}(\mathrm{p-value})$'] = -np.log10(df_var.loc[:, 'pval_fdr_bh'].values)
    df_var.to_excel(f"{path_save}/{path_local}/fisher_{var}.xlsx")

    plt.figure(figsize=fig_sizes[var])
    plt.xticks(rotation=90)
    sns.set_theme(style='whitegrid', font_scale=2)
    if pval_show_type == "color":
        # The scatter is created only to obtain a colour-mapped artist for the
        # colorbar; the figure is cleared right after and the bars are drawn instead.
        plot = plt.scatter(df_var.index, df_var.loc[:, r'$ \log_{10}(\mathrm{Odds\ ratio})$'].values, c=df_var.loc[:, r'$ -\log_{10}(\mathrm{p-value})$'].values, cmap='Reds')
        plt.clf()
        cbar = plt.colorbar(plot)
        plt.xticks(rotation=90)
        cbar.set_label(r"$-\log_{10}(\mathrm{p-value})$", horizontalalignment='center')
        ax = sns.barplot(data=df_var, x=df_var.index, y=r'$ \log_{10}(\mathrm{Odds\ ratio})$', hue=r'$ -\log_{10}(\mathrm{p-value})$', palette='Reds', dodge=False, edgecolor='black')
        # The hue legend is dropped; the colorbar carries that information.
        ax.legend_.remove()
    else:
        bar = sns.barplot(data=df_var, x=df_var.index, y=r'$ \log_{10}(\mathrm{Odds\ ratio})$', palette=colors[var], edgecolor='black')
        # Hatch the bars with FDR p-value < 0.05; every bar gets a skyblue edge.
        for bar_index, this_bar in enumerate(bar.patches):
            if df_var.at[df_var.index[bar_index], "pval_fdr_bh"] < 0.05:
                this_bar.set_hatch('x')
            this_bar.set_edgecolor('skyblue')
    plt.savefig(f"{path_save}/{path_local}/fisher_{var}.png", bbox_inches='tight')
    plt.savefig(f"{path_save}/{path_local}/fisher_{var}.pdf", bbox_inches='tight')
    plt.close()

In [None]:
# Manhattan plot of the adjusted p-values of all DMPs.
sns.set_theme(style='whitegrid')
# df_dmps is reordered in place by genomic coordinate before plotting.
df_dmps.sort_values(by="MAPINFO", ascending=True, inplace=True)
manhattan_options = dict(
    chr='CHR',
    pv='adj.P.Val',
    path=f"{path_save}/{path_local}",
    valpha=1,
    markernames=tuple(top_to_hightlight),
    markeridcol='print',
    gstyle=2,
    dim=(12, 4),
    axtickfontsize=8,
)
mhat(df=df_dmps, **manhattan_options)

In [None]:
# Volcano plot of logFC against adjusted p-value for all DMPs; the same
# threshold is passed for both sides of each axis.
sns.set_theme(style='whitegrid')
volcano_options = dict(
    lfc='logFC',
    pv='adj.P.Val',
    pv_thr=(pval_lim, pval_lim),
    lfc_thr=(fc_lim, fc_lim),
    path=f"{path_save}/{path_local}",
    genenames=tuple(top_to_hightlight),
    geneid='print',
    gstyle=2,
    sign_line=True,
)
volcano(df=df_dmps, **volcano_options)

In [None]:
# Violin plots of methylation per group for the n_top CpGs with the smallest
# adjusted p-value.
n_top = 10
dist_num_bins = 25
examples_dir = f"{path_save}/{path_local}/examples"
pathlib.Path(examples_dir).mkdir(parents=True, exist_ok=True)
df_dmps_top = df_dmps.sort_values(['adj.P.Val'], ascending=[True]).head(n_top)
for cpg_id, (cpg, row) in enumerate(df_dmps_top.iterrows()):
    pval = row['adj.P.Val']
    gene = manifest.at[cpg, 'Gene']

    fig = go.Figure()
    for group, group_filter in problem["Filter"].items():
        vals = dnam_df.loc[group_filter, cpg].values
        group_color = problem["Color"][group]
        violin = go.Violin(
            y=vals,
            name=group,
            box_visible=True,
            meanline_visible=True,
            showlegend=False,
            line_color='black',
            fillcolor=group_color,
            marker=dict(color=group_color, line=dict(color='black', width=0.3), opacity=0.8),
            points='all',
            # Kernel bandwidth: the group's value range split into dist_num_bins parts.
            bandwidth=np.ptp(vals) / dist_num_bins,
            opacity=0.8,
        )
        fig.add_trace(violin)
    add_layout(fig, "", "Methylation", f"{cpg} ({gene})<br>p-value: {pval:0.2e}")
    fig.update_xaxes(tickfont_size=25)
    fig.update_layout(
        title_xref='paper',
        legend=dict(
            font=dict(size=25),
            itemsizing='constant',
            orientation="h",
            yanchor="bottom",
            y=1.25,
            xanchor="center",
            x=0.5,
        ),
        margin=go.layout.Margin(l=110, r=20, b=50, t=80, pad=0),
    )
    # File name carries the rank of the CpG and its identifier.
    save_figure(fig, f"{examples_dir}/{cpg_id}_{cpg}")

# Immunology data analysis

## Aux data

In [None]:
# Comparison setup for the immunology data: plot colour and boolean row
# filter per region, the union of both filters, and the reference region.
region_colors = {
    "Central": "limegreen",
    "Yakutia": "royalblue",
}
region_filters = {region: immuno_df["Region"] == region for region in region_colors}
problem = {
    "Color": region_colors,
    "Filter": region_filters,
    "BaseFilter": region_filters["Central"] | region_filters["Yakutia"],
    "BasePart": "Central",
}

## SImAge Results

In [None]:
# Attach the precomputed SImAge estimates to the immunology table.
# NOTE(review): absolute, machine-specific path -- consider moving it to a
# configurable data directory.
simage_path = "E:/YandexDisk/Work/pydnameth/draft/07_central_vs_yakutia/raw/fig_immuno_clock/df.xlsx"
simage_df = pd.read_excel(simage_path, index_col="index")
sample_ids = immuno_df.index.values
immuno_df.loc[sample_ids, "SImAge"] = simage_df.loc[sample_ids, "Estimation"]

In [None]:
# SImAge acceleration: residual of SImAge with respect to a linear SImAge ~ Age
# fit on the reference region, compared between regions with a two-sided
# Mann-Whitney U test.
path_local = "immuno_age_accelerations"
pathlib.Path(f"{path_save}/{path_local}").mkdir(parents=True, exist_ok=True)

# The regression is fitted on the reference region only (problem["BasePart"])
# and then applied to every sample.
formula = f"SImAge ~ Age"
model = smf.ols(formula=formula, data=immuno_df.loc[immuno_df["Region"] == problem["BasePart"]]).fit()
immuno_df[f"SImAge_linear_pred"] = model.predict(immuno_df)
# NOTE(review): `y_pred` is not used in this cell and `pheno` is defined
# elsewhere in the notebook; this looks like a leftover -- confirm and remove.
y_pred = model.predict(pheno)
immuno_df[f"SImAgeAcc"] = immuno_df["SImAge"] - immuno_df[f"SImAge_linear_pred"]

# The summary row must be named "SImAgeAcc": every value below is written to
# that label, so indexing the frame by "SImAge" left a spurious all-NaN row in
# the exported table.
df_aa = pd.DataFrame(index=["SImAgeAcc"], columns=["pval", "pval_fdr_bh"])
vals = {}
for group in problem["Filter"]:
    vals[group] = immuno_df.loc[problem["Filter"][group], f"SImAgeAcc"].values
    df_aa.at[f"SImAgeAcc", f"mean_{group}"] = np.mean(vals[group])
    df_aa.at[f"SImAgeAcc", f"median_{group}"] = np.median(vals[group])
    df_aa.at[f"SImAgeAcc", f"q75_{group}"], df_aa.at[f"SImAgeAcc", f"q25_{group}"] = np.percentile(vals[group], [75 ,25])
    df_aa.at[f"SImAgeAcc", f"iqr_{group}"] = df_aa.at[f"SImAgeAcc", f"q75_{group}"] - df_aa.at[f"SImAgeAcc", f"q25_{group}"]
    print(f"{group}: {len(vals[group])}")
# The groups are passed positionally, so exactly two groups are expected in problem["Filter"].
stat, pval = mannwhitneyu(*vals.values(), alternative='two-sided')
df_aa.at[f"SImAgeAcc", "pval"] = pval
df_aa.to_excel(f"{path_save}/{path_local}/aa.xlsx", index=True)

# Violin plot of SImAge acceleration per region; the Mann-Whitney p-value is the title.
dist_num_bins = 25
fig = go.Figure()
for group in problem["Filter"]:
    group_color = problem["Color"][group]
    violin = go.Violin(
        y=vals[group],
        name=group,
        box_visible=True,
        meanline_visible=True,
        showlegend=False,
        line_color='black',
        fillcolor=group_color,
        marker=dict(color=group_color, line=dict(color='black', width=0.3), opacity=0.8),
        points='all',
        # Kernel bandwidth: the group's value range split into dist_num_bins parts.
        bandwidth=np.ptp(vals[group]) / dist_num_bins,
        opacity=0.8,
    )
    fig.add_trace(violin)
aa_pval = df_aa.at["SImAgeAcc", "pval"]
add_layout(fig, "", "SImAgeAcc", f"p-value: {aa_pval:0.2e}")
fig.update_layout(
    title_xref='paper',
    legend=dict(font=dict(size=20), itemsizing='constant', y=1.01),
    margin=go.layout.Margin(l=110, r=20, b=50, t=40, pad=0),
)
save_figure(fig, f"{path_save}/{path_local}/violin_SImAgeAcc")

# SImAge against chronological age: the linear fit of the reference region as
# a line, plus one marker trace per region.
fig = go.Figure()
base_part = problem["BasePart"]
is_base_part = immuno_df["Region"] == base_part
fit_line = go.Scatter(
    x=immuno_df.loc[is_base_part, "Age"].values,
    y=immuno_df.loc[is_base_part, "SImAge_linear_pred"].values,
    showlegend=False,
    name="",
    mode="lines",
    marker_color=problem["Color"][base_part],
    marker=dict(size=8, opacity=0.75, line=dict(color="black", width=0.5)),
)
fig.add_trace(fit_line)
for group, group_filter in problem["Filter"].items():
    group_points = go.Scatter(
        x=immuno_df.loc[group_filter, "Age"].values,
        y=immuno_df.loc[group_filter, "SImAge"].values,
        showlegend=True,
        name=group,
        mode="markers",
        line_color=problem["Color"][group],
        marker=dict(size=8, opacity=0.75, line=dict(color="black", width=0.5)),
    )
    fig.add_trace(group_points)
add_layout(fig, "Age", "SImAge", "")
fig.update_layout(
    legend=dict(font=dict(size=20), itemsizing='constant'),
    margin=go.layout.Margin(l=110, r=20, b=80, t=40, pad=0),
)
save_figure(fig, f"{path_save}/{path_local}/scatter_SImAge")

## Mann-Whitney test and fold change

In [None]:
# Per-feature comparison of the two regions: descriptive statistics,
# Mann-Whitney U test, Mood's median test and log2 fold change of the means,
# with Benjamini-Hochberg correction across features.
path_local = "immuno_mw"
pathlib.Path(f"{path_save}/{path_local}").mkdir(parents=True, exist_ok=True)
df_immuno_stat = pd.DataFrame(index=immuno_feats, columns=["mw_pval", "mw_pval_fdr_bh"])
df_immuno_stat.index.name = "feat"
for feat in tqdm(immuno_feats):
    vals = {}
    for group, group_filter in problem["Filter"].items():
        group_vals = immuno_df.loc[group_filter, feat].values
        vals[group] = group_vals
        q75, q25 = np.percentile(group_vals, [75, 25])
        df_immuno_stat.at[feat, f"mean_{group}"] = np.mean(group_vals)
        df_immuno_stat.at[feat, f"median_{group}"] = np.median(group_vals)
        df_immuno_stat.at[feat, f"q75_{group}"] = q75
        df_immuno_stat.at[feat, f"q25_{group}"] = q25
        df_immuno_stat.at[feat, f"iqr_{group}"] = df_immuno_stat.at[feat, f"q75_{group}"] - df_immuno_stat.at[feat, f"q25_{group}"]
    stat, pval = mannwhitneyu(*vals.values(), alternative='two-sided')
    # Fold change is always Yakutia relative to Central, on the log2 scale of the means.
    mean_yakutia = np.mean(immuno_df.loc[immuno_df["Region"] == "Yakutia", feat].values)
    mean_central = np.mean(immuno_df.loc[immuno_df["Region"] == "Central", feat].values)
    df_immuno_stat.at[feat, "log_fold_change"] = np.log2(mean_yakutia) - np.log2(mean_central)
    df_immuno_stat.at[feat, "mw_pval"] = pval
    stat, pval, med, tbl = median_test(*vals.values())
    df_immuno_stat.at[feat, "mood_pval"] = pval
# FDR correction, done separately for each of the two tests.
_, df_immuno_stat["mw_pval_fdr_bh"], _, _ = multipletests(df_immuno_stat["mw_pval"], 0.05, method='fdr_bh')
_, df_immuno_stat["mood_pval_fdr_bh"], _, _ = multipletests(df_immuno_stat["mood_pval"], 0.05, method='fdr_bh')
df_immuno_stat.to_excel(f"{path_save}/{path_local}/table.xlsx", index=True)

In [None]:
# Volcano plot of the immunology features (log2 fold change against the
# FDR-corrected Mann-Whitney p-value); the ten most significant are labelled.
fc_lim = 0.5
pval_lim = 0.05
# The table is reordered in place by corrected p-value; labels are the feature names.
df_immuno_stat.sort_values(by="mw_pval_fdr_bh", ascending=True, inplace=True)
df_immuno_stat['print'] = df_immuno_stat.index.values
top_to_hightlight = df_immuno_stat["print"].values[:10]
sns.set_theme(style='whitegrid')
immuno_volcano_options = dict(
    lfc='log_fold_change',
    pv='mw_pval_fdr_bh',
    pv_thr=(pval_lim, pval_lim),
    lfc_thr=(fc_lim, fc_lim),
    path=f"{path_save}/{path_local}",
    genenames=tuple(top_to_hightlight),
    geneid='print',
    gstyle=2,
    dotsize=10,
    sign_line=True,
)
volcano(df=df_immuno_stat, **immuno_volcano_options)

In [None]:
# One violin figure per immunology feature, ordered by the FDR-corrected
# Mood's median test p-value.
pval_col = "mood_pval_fdr_bh"
dist_num_bins = 50
examples_dir = f"{path_save}/{path_local}/examples"
pathlib.Path(examples_dir).mkdir(parents=True, exist_ok=True)
df_immuno_mw_top = df_immuno_stat.sort_values([pval_col], ascending=[True])

# Fixed y-axis ranges for selected features (all others keep the automatic range).
top_features_ranges = {
    'IL2': [-2, 20],
    'IL25': [-100, 2000],
    'CD40LG': [-150, 10000],
    'IL10': [-3, 50],
    'IL17F': [-20, 800],
    'IL3': [-2, 10],
    'IL17A': [-5, 50],
}
# Fixed kernel bandwidths for the same features; both regions share one value.
top_features_bandwidth = {
    feature: {'Central': width, 'Yakutia': width}
    for feature, width in [
        ('IL2', 0.5),
        ('IL25', 20),
        ('CD40LG', 100),
        ('IL10', 1),
        ('IL17F', 10),
        ('IL3', 0.2),
        ('IL17A', 2),
    ]
}

for feat_id, (feat, row) in enumerate(df_immuno_mw_top.iterrows()):
    pval = row[pval_col]
    fc = row['log_fold_change']
    fig = go.Figure()

    for group in problem["Filter"]:
        vals = immuno_df.loc[problem["Filter"][group], feat].values
        # Use the hand-picked bandwidth when there is one, otherwise the
        # group's value range split into dist_num_bins parts.
        if feat in top_features_bandwidth:
            bandwidth = top_features_bandwidth[feat][group]
        else:
            bandwidth = np.ptp(vals) / dist_num_bins
        group_color = problem["Color"][group]
        violin = go.Violin(
            y=vals,
            name=group,
            box_visible=True,
            meanline_visible=True,
            showlegend=False,
            line_color='black',
            fillcolor=group_color,
            marker=dict(color=group_color, line=dict(color='black', width=0.3), opacity=0.8),
            points='all',
            bandwidth=bandwidth,
            opacity=0.8,
        )
        fig.add_trace(violin)
    add_layout(fig, "", feat, f"p-value: {pval:0.2e}")
    if feat in top_features_ranges:
        fig.update_yaxes(autorange=False)
        fig.update_layout(yaxis_range=top_features_ranges[feat])
    fig.update_xaxes(tickfont_size=25)
    fig.update_layout(
        title_xref='paper',
        legend=dict(
            font=dict(size=25),
            itemsizing='constant',
            orientation="h",
            yanchor="bottom",
            y=1.25,
            xanchor="center",
            x=0.5,
        ),
        margin=go.layout.Margin(l=140, r=20, b=50, t=50, pad=0),
    )
    # File name carries the rank of the feature and its name.
    save_figure(fig, f"{examples_dir}/{feat_id}_{feat}")