In [None]:
import pandas as pd
import numpy as np
import scipy
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scripts.python.routines.betas import betas_drop_na
from plotly.subplots import make_subplots
from scipy import stats
from functools import reduce
import seaborn as sns
import pickle
import random
import plotly.express as px
import copy
import statsmodels.formula.api as smf
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scripts.python.pheno.datasets.filter import filter_pheno
from scripts.python.pheno.datasets.features import get_column_name, get_status_dict, get_sex_dict
from scripts.python.routines.plot.scatter import add_scatter_trace
import plotly.graph_objects as go
import pathlib
from scripts.python.routines.manifest import get_manifest
from scripts.python.routines.plot.save import save_figure
from scripts.python.routines.plot.layout import add_layout, get_axis
from scripts.python.routines.plot.p_value import add_p_value_annotation
from statsmodels.stats.multitest import multipletests
from sklearn.metrics import mean_absolute_error
from impyute.imputation.cs import fast_knn, mean, median, random, mice, mode, em
import plotly.io as pio
pio.kaleido.scope.mathjax = None
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=False)
from scipy.stats import mannwhitneyu
from scripts.python.preprocessing.serialization.routines.save import save_pheno_betas_to_pkl
import matplotlib.pyplot as plt
import seaborn as sns
import pathlib
from tqdm import tqdm
from random import Random
from bioinfokit import analys, visuz
import sys

# Initialize DNAm and immunology data

In [None]:
# --- Dataset configuration ---------------------------------------------------
dataset = "GSEUNN"
path = "E:/YandexDisk/Work/pydnameth/datasets"
datasets_info = pd.read_excel(f"{path}/datasets.xlsx", index_col='dataset')
platform = datasets_info.loc[dataset, 'platform']
manifest = get_manifest(platform, path=path)

# Root folder for everything this notebook writes.
path_save = f"{path}/{platform}/{dataset}/special/034_central_vs_yakutia"
pathlib.Path(path_save).mkdir(parents=True, exist_ok=True)

# --- Immunology table, restricted to controls --------------------------------
immun_num_samples = 1052
immun_preprocessing = "minmax_left(0.05)_right(0.95)_combat"
immun_dir = f"{path}/{platform}/{dataset}/data/immuno"
immun_file = f"df_type({immun_preprocessing})_all({immun_num_samples})_imp(fast_knn)_replace(quarter).xlsx"
immun_df = pd.read_excel(f"{immun_dir}/{immun_file}", index_col="index")
immun_df = immun_df[immun_df["Status"] == "Control"]
immun_feats = pd.read_excel(f"{immun_dir}/immuno_markers_genes.xlsx")["gene"].values.tolist()

# Samples flagged in the immunology table as also having DNAm data.
indexes_common = immun_df.index[immun_df["is_dnam"].eq(True)].values

# --- Phenotype + methylation, restricted to the common control samples -------
pheno = pd.read_excel(f"{path}/{platform}/{dataset}/pheno.xlsx", index_col="index")
pheno.index.name = "index"
# NOTE(review): read_pickle executes arbitrary code from the file; only load trusted pickles.
betas = pd.read_pickle(f"{path}/{platform}/{dataset}/betas.pkl")
dnam_feats = betas.columns.values
dnam_df = pheno.merge(betas, left_index=True, right_index=True)
dnam_df = dnam_df[dnam_df["Status"] == "Control"].loc[indexes_common, :]

# Per-region sample counts in the immunology table.
n_samples_central = int((immun_df["Region"] == "Central").sum())
n_samples_yakutiya = int((immun_df["Region"] == "Yakutiya").sum())

# Create data for R

In [None]:
# Export the methylation matrix and selected covariates of `dnam_df` as pickles.
pathlib.Path(f"{path_save}/data_for_R").mkdir(parents=True, exist_ok=True)

# Transposed so that CpGs are the rows and samples the columns.
betas_R = dnam_df.loc[:, dnam_feats].T.rename_axis("CpG")
betas_R.to_pickle(f"{path_save}/data_for_R/betas.pkl")

pheno_R_columns = [
    "Age", "Sex", "Region", "DNAmPart", "Sentrix_ID", "Sentrix_Position",
    "CD8T", "CD4T", "NK", "Bcell", "Mono", "Gran",
    'DNAmAgeHannum', 'DNAmAge', 'DNAmPhenoAge', 'DNAmGrimAge',
]
pheno_R = dnam_df.loc[:, pheno_R_columns]
pheno_R.to_pickle(f"{path_save}/data_for_R/pheno.pkl")

# Data description

## Participants figure

In [None]:
path_local = "data_description/participants"
pathlib.Path(f"{path_save}/{path_local}").mkdir(parents=True, exist_ok=True)

# Data for figure.
# `.copy()` makes this an independent frame, so adding the "Data" column below
# cannot trigger SettingWithCopyWarning or write through to `immun_df`.
df_figure = immun_df.loc[:, ["Age", "Sex", "Region", "Status"]].copy()
df_figure["Data"] = "Immuno only"
df_figure.loc[indexes_common, "Data"] = "Immuno and DNAm"

# Params for figure
binrange = [0, 105]
bins = 15
hue_order = ['Immuno only', 'Immuno and DNAm']

# One stacked age histogram per region; only the colours differ between regions.
region_palettes = {
    "Central": {
        "Immuno and DNAm": "forestgreen",
        "Immuno only": "lawngreen",
    },
    "Yakutiya": {
        "Immuno and DNAm": "royalblue",
        "Immuno only": "deepskyblue",
    },
}
for region, palette in region_palettes.items():
    sns.set_theme(style='whitegrid')
    sns.histplot(
        data=df_figure.loc[df_figure["Region"] == region, :],
        hue_order=hue_order,
        binrange=binrange,
        bins=bins,
        x="Age",
        hue="Data",
        palette=palette,
        multiple="stack"
    )
    plt.savefig(f"{path_save}/{path_local}/hist_{region}.png", bbox_inches='tight')
    plt.savefig(f"{path_save}/{path_local}/hist_{region}.pdf", bbox_inches='tight')
    # Clear the current figure so the next region starts from an empty canvas.
    plt.clf()

## Immunological features

In [None]:
# Pairwise Pearson correlation between Age and every immunology feature.
# Layout of `df_corr`:
#   upper triangle -> correlation coefficient
#   lower triangle -> raw p-value
#   diagonal       -> NaN
feats_plot = ["Age"] + list(immun_feats)
n_feats = len(feats_plot)
df_corr = pd.DataFrame(data=np.zeros(shape=(n_feats, n_feats)), index=feats_plot, columns=feats_plot)
for f_id_1, f_1 in enumerate(feats_plot):
    df_corr.at[f_1, f_1] = np.nan
    vals_1 = immun_df.loc[:, f_1].values
    for f_2 in feats_plot[f_id_1 + 1:]:
        vals_2 = immun_df.loc[:, f_2].values
        corr, pval = stats.pearsonr(vals_1, vals_2)
        df_corr.at[f_2, f_1] = pval
        df_corr.at[f_1, f_2] = corr

# Strict lower triangle = the p-values. The builtin `bool` is used because the
# `np.bool` alias was removed in NumPy 1.24 and raises AttributeError there.
selection = np.tri(df_corr.shape[0], df_corr.shape[1], -1, dtype=bool)
df_fdr = df_corr.where(selection).stack().reset_index()
df_fdr.columns = ['row', 'col', 'pval']
_, df_fdr['pval_fdr_bh'], _, _ = multipletests(df_fdr.loc[:, 'pval'].values, 0.05, method='fdr_bh')

# Same matrix, but the lower triangle now holds -log10(BH-adjusted p-value).
df_corr_fdr = df_corr.copy()
for row, col, pval_fdr in zip(df_fdr['row'], df_fdr['col'], df_fdr['pval_fdr_bh']):
    df_corr_fdr.loc[row, col] = -np.log10(pval_fdr)

In [None]:
path_local = "data_description/feats_immun"
pathlib.Path(f"{path_save}/{path_local}").mkdir(parents=True, exist_ok=True)

sns.set_theme(style='whitegrid')

df_to_plot = df_corr_fdr.copy()
mtx_to_plot = df_to_plot.to_numpy()

# Upper triangle: correlation coefficients (entries equal to 0 are masked out).
mtx_triu = np.triu(mtx_to_plot, +1)
mtx_triu_mask = np.ma.masked_array(mtx_triu, mtx_triu==0)
cmap_triu = plt.get_cmap("bwr").copy()

# Lower triangle: -log10(adjusted p-value); values below the 0.05 line are drawn black.
mtx_tril = np.tril(mtx_to_plot, -1)
mtx_tril_mask = np.ma.masked_array(mtx_tril, mtx_tril==0)
cmap_tril = plt.get_cmap("viridis").copy()
cmap_tril.set_under('black')

fig, ax = plt.subplots()

im_triu = ax.imshow(mtx_triu_mask, cmap=cmap_triu, vmin=-1, vmax=1)
cbar_triu = ax.figure.colorbar(im_triu, ax=ax, location='right')
cbar_triu.set_label(r"$\mathrm{Correlation\:coefficient}$", horizontalalignment='center', fontsize=10)

im_tril = ax.imshow(mtx_tril_mask, cmap=cmap_tril, vmin=-np.log10(0.05))
cbar_tril = ax.figure.colorbar(im_tril, ax=ax, location='right')
cbar_tril.set_label(r"$-\log_{10}(\mathrm{p-value})$", horizontalalignment='center', fontsize=10)

# Explicitly switch the grid off; `ax.grid(None)` only toggles the current state.
ax.grid(False)
ax.set_aspect("equal")
ax.set_xticks(np.arange(df_to_plot.shape[1]))
ax.set_yticks(np.arange(df_to_plot.shape[0]))
ax.set_xticklabels(df_to_plot.columns.values)
ax.set_yticklabels(df_to_plot.index.values)
plt.setp(ax.get_xticklabels(), rotation=90)
ax.tick_params(axis='both', which='major', labelsize=5)
ax.tick_params(axis='both', which='minor', labelsize=5)

# Lower-triangle cells below half of the value range get white text, the rest black.
threshold = np.ptp(mtx_tril.flatten()) * 0.5
textcolors = ("black", "white")
for i in range(df_to_plot.shape[0]):
    for j in range(df_to_plot.shape[1]):
        value = mtx_to_plot[i, j]
        # Nothing to print for the NaN diagonal or for infinite entries.
        if np.isinf(value) or np.isnan(value):
            continue
        color = "black"
        if i > j:
            color = textcolors[int(mtx_tril[i, j] < threshold)]
        ax.text(j, i, f"{value:0.2f}", ha="center", va="center", color=color, fontsize=1.3)
fig.tight_layout()
plt.savefig(f"{path_save}/{path_local}/corr_mtx_fdr.png", bbox_inches='tight', dpi=400)
plt.savefig(f"{path_save}/{path_local}/corr_mtx_fdr.pdf", bbox_inches='tight', dpi=400)
# Close the figure so it does not stay alive in the kernel.
plt.close(fig)
df_save = df_corr_fdr
df_save.to_excel(f"{path_save}/{path_local}/corr_mtx_fdr.xlsx", index=True)

# DNAm features

In [None]:
# Load the DMP table (read from the data_for_R folder) and derive plotting columns.
dmp_age_csv = f"{path_save}/data_for_R/DMP_age.csv"
df_figure = pd.read_csv(dmp_age_csv, index_col="CpG")
df_figure["CpG"] = df_figure.index.values
# Display label of the form "cgXXXX (GENE)".
df_figure['print'] = [f"{cpg} ({gene})" for cpg, gene in zip(df_figure["CpG"], df_figure["gene"])]
# Labels of the first five rows, in file order.
top_to_hightlight = df_figure["print"].values[:5]
path_local = "data_description/feats_dnam"
pathlib.Path(f"{path_save}/{path_local}").mkdir(parents=True, exist_ok=True)
df_figure['log_pval'] = -np.log10(df_figure["adj.P.Val"])

In [None]:
def _annotate_marker(ax, x, y, label, gfont, gstyle):
    """Draw one marker label at data position (x, y) in the style chosen by ``gstyle``."""
    # Fall back to the current pyplot axes when no axes object is supplied.
    target = ax if ax is not None else plt.gca()
    if gstyle == 1:
        target.text(x, y, label, fontsize=gfont)
    elif gstyle == 2:
        target.annotate(label, xy=(x, y),
                        xycoords='data', xytext=(5, -15), textcoords='offset points', size=6,
                        bbox=dict(boxstyle="round", alpha=0.2),
                        arrowprops=dict(arrowstyle="wedge,tail_width=0.5", alpha=0.2, relpos=(0, 0)))


def geneplot_mhat(df, markeridcol, chr, pv, gwasp, markernames, gfont, gstyle, ax):
    """Label selected markers on a Manhattan plot.

    Parameters
    ----------
    df : pandas.DataFrame
        Plot data; must contain ``markeridcol``, ``pv``, ``'ind'`` (x position)
        and ``'tpval'`` (y position).
    markeridcol : str
        Column holding the marker identifiers.
    chr : str
        Chromosome column name (unused here; kept for signature compatibility).
    pv : str
        P-value column, used only when ``markernames is True``.
    gwasp : float
        Threshold; with ``markernames is True`` every marker whose ``pv`` value
        is ``<= gwasp`` is labelled.
    markernames : None | True | tuple | list | dict
        ``None`` labels nothing, ``True`` labels by threshold, a tuple/list
        labels the listed identifiers, a dict labels its keys using the mapped
        values as text. Any other value labels nothing.
    gfont : int
        Font size for ``gstyle == 1`` (``gstyle == 2`` uses a fixed size of 6).
    gstyle : int
        1 for plain text, 2 for a boxed annotation with an arrow.
    ax : matplotlib axes or None
        Axes to draw on; the current pyplot axes are used when ``None``.

    Raises
    ------
    Exception
        If ``markeridcol`` is ``None``.
    """
    if markeridcol is None:
        raise Exception("provide 'markeridcol' parameter")
    if markernames is None:
        return

    # First row of every marker, in order of first appearance. One pass over
    # the frame replaces a full boolean scan per marker.
    first_rows = df.drop_duplicates(subset=markeridcol)

    if markernames is True:
        selected = first_rows.loc[first_rows[pv] <= gwasp]
        label_of = str
    elif isinstance(markernames, (tuple, list)):
        selected = first_rows.loc[first_rows[markeridcol].isin(list(markernames))]
        label_of = str
    elif isinstance(markernames, dict):
        selected = first_rows.loc[first_rows[markeridcol].isin(list(markernames))]
        label_of = markernames.__getitem__
    else:
        return

    for marker, x, y in zip(selected[markeridcol], selected['ind'], selected['tpval']):
        _annotate_marker(ax, x, y, label_of(marker), gfont, gstyle)


def mhat(df="dataframe", chr=None, pv=None, log_scale=True, color=None, dim=(6,4), ar=90, gwas_sign_line=False,
         gwasp=5E-08, dotsize=1, markeridcol=None, markernames=None, gfont=8, valpha=1,
         axxlabel=None, axylabel=None, axlabelfontsize=9, axlabelfontname="Arial", axtickfontsize=6,
         axtickfontname="Arial", ylm=None, gstyle=1, figname='manhattan', theme=None, path=''):
    """Draw a Manhattan plot and save it as ``{path}/{figname}.png`` and ``.pdf``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified; the function works on a sorted copy.
    chr : str
        Chromosome column. Rows are ordered by its numeric value (non-numeric
        labels sort last) with a stable sort, so the caller's row order is
        preserved inside each chromosome.
    pv : str
        Column with p-values (or with the raw statistic when ``log_scale`` is False).
    log_scale : bool
        Plot ``-log10(pv)`` when True, the raw ``pv`` values otherwise.
    color : sequence or None
        Two colours to alternate between chromosomes, one colour per
        chromosome, or ``None`` for a random sample from a built-in palette.
    dim : tuple
        Figure size passed to ``plt.subplots``.
    ar : int
        Rotation of the x tick labels.
    gwas_sign_line, gwasp
        When ``gwas_sign_line`` is True a dashed line is drawn at ``-log10(gwasp)``.
    dotsize, valpha
        Marker size and opacity.
    markeridcol, markernames, gfont, gstyle
        Forwarded to ``geneplot_mhat`` when ``markernames`` is not None.
    axxlabel, axylabel, axlabelfontsize, axlabelfontname, axtickfontsize, axtickfontname
        Axis label and tick label text/font settings.
    ylm
        Accepted for compatibility; currently has no effect because the
        y-tick override is disabled.
    figname, path
        Output file stem and directory.
    theme : str or None
        ``'dark'`` switches matplotlib to the ``dark_background`` style.

    Raises
    ------
    ValueError
        If ``color`` is given but has neither 2 entries nor one per chromosome.
        (This used to print a message and call ``sys.exit(1)``.)
    """
    _x, _y = 'Chromosomes', r'$ -\log_{10}(\mathrm{p-value})$'
    rand_colors = ('#a7414a', '#282726', '#6a8a82', '#a37c27', '#563838', '#0584f2', '#f28a30', '#f05837',
                   '#6465a5', '#00743f', '#be9063', '#de8cf0', '#888c46', '#c0334d', '#270101', '#8d2f23',
                   '#ee6c81', '#65734b', '#14325c', '#704307', '#b5b3be', '#f67280', '#ffd082', '#ffd800',
                   '#ad62aa', '#21bf73', '#a0855b', '#5edfff', '#08ffc8', '#ca3e47', '#c9753d', '#6c5ce7',
                   '#a997df', '#513b56', '#590925', '#007fff', '#bf1363', '#f39237', '#0a3200', '#8c271e')

    # Order rows by numeric chromosome. mergesort is stable, so whatever order
    # the caller established inside a chromosome (e.g. by position) survives.
    order = pd.to_numeric(df[chr], errors='coerce').sort_values(kind='mergesort').index
    # Work on a copy so the helper columns below never leak into the caller's frame.
    data = df.loc[order].copy()
    if log_scale:
        # minus log10 of P-value
        data['tpval'] = -np.log10(data[pv])
    else:
        # for Fst values
        data['tpval'] = data[pv]
    # Running x position of every point.
    data['ind'] = range(len(data))

    n_chr = data[chr].nunique()
    if color is not None and len(color) == 2:
        # Alternate the two colours; also valid for a single chromosome.
        color_list = [color[k % 2] for k in range(n_chr)]
    elif color is not None and len(color) == n_chr:
        color_list = color
    elif color is None:
        # select colors randomly from the list based in number of chr
        color_list = Random().sample(rand_colors, n_chr)
    else:
        raise ValueError("Error: in color argument")

    xlabels = []
    xticks = []
    if theme == 'dark':
        plt.style.use('dark_background')
    fig, ax = plt.subplots(figsize=dim)
    for (label, df1), chr_color in zip(data.groupby(chr), color_list):
        df1.plot(kind='scatter', x='ind', y='tpval', color=chr_color, s=dotsize, alpha=valpha, ax=ax)
        df1_max_ind = df1['ind'].iloc[-1]
        df1_min_ind = df1['ind'].iloc[0]
        xlabels.append(label)
        # Tick in the middle of the chromosome's x range.
        xticks.append((df1_max_ind - (df1_max_ind - df1_min_ind) / 2))

    # add GWAS significant line
    if gwas_sign_line is True:
        ax.axhline(y=-np.log10(gwasp), linestyle='--', color='#7d7d7d', linewidth=1)
    if markernames is not None:
        geneplot_mhat(data, markeridcol, chr, pv, gwasp, markernames, gfont, gstyle, ax=ax)
    ax.margins(x=0)
    ax.margins(y=0)
    ax.set_xticks(xticks)
    if log_scale:
        ax.set_ylim([0, data['tpval'].max() + 1])
    plt.grid(visible=False, axis='x')
    ax.set_xticklabels(xlabels, rotation=ar, fontsize=axtickfontsize, fontname=axtickfontname)
    if axxlabel:
        _x = axxlabel
    if axylabel:
        _y = axylabel
    ax.set_xlabel(_x, fontsize=axlabelfontsize, fontname=axlabelfontname)
    ax.set_ylabel(_y, fontsize=axlabelfontsize, fontname=axlabelfontname)
    fig.savefig(f"{path}/{figname}.png", bbox_inches='tight', dpi=400)
    fig.savefig(f"{path}/{figname}.pdf", bbox_inches='tight', dpi=400)
    plt.close(fig)

In [None]:
sns.set_theme(style='whitegrid')
# Order CpGs by the MAPINFO column before plotting.
df_figure.sort_values(by=["MAPINFO"], ascending=[True], inplace=True)
manhattan_kwargs = dict(
    chr='CHR',
    pv='adj.P.Val',
    path=f"{path_save}/{path_local}",
    valpha=1,
    # Labels to annotate; an earlier hard-coded choice was ("cg16867657", "cg22454769").
    markernames=tuple(top_to_hightlight),
    markeridcol='print',
    gstyle=2,
)
mhat(df=df_figure, **manhattan_kwargs)

In [None]:

# Interactive (plotly) Manhattan plot: one trace per chromosome, laid side by side on x.

# Number of CpGs per chromosome. `to_frame(name=...)` names the column directly;
# the previous `rename(...)` call discarded its result and therefore did nothing.
df_chr = df_figure['CHR'].value_counts().to_frame(name="n_points")
df_chr.index.name = "CHR"
df_chr = df_chr.sort_index()

fig = go.Figure()
# Horizontal gap between neighbouring chromosomes: 1% of the total point count.
chr_shift = df_figure.shape[0] // 100
curr_start = 0
chr_palette = px.colors.qualitative.Dark24
for row_id, chr_name in enumerate(df_chr.index):
    # Sorting returns a new frame, so no in-place mutation of a slice of `df_figure`.
    df_tmp = df_figure.loc[df_figure["CHR"] == chr_name, :].sort_values(["MAPINFO"], ascending=[True])
    xs = np.linspace(0, df_tmp.shape[0], df_tmp.shape[0]) + curr_start
    # `xs` already includes the running offset, so the next chromosome starts one gap
    # after the last point. The old `+=` counted the offset twice and made the gaps
    # grow geometrically.
    curr_start = xs[-1] + chr_shift
    df_chr.at[chr_name, "middle_position"] = xs[len(xs) // 2]
    ys = df_tmp.loc[:, "log_pval"].values

    # Wrap around the palette so more than 24 groups cannot raise IndexError.
    chr_color = chr_palette[row_id % len(chr_palette)]
    fig.add_trace(
        go.Scatter(
            x=xs,
            y=ys,
            showlegend=True,
            name=str(chr_name),
            mode='markers',
            marker=dict(
                color=chr_color,
                symbol='circle',
                size=2,
                opacity=0.9,
                line=dict(
                    color=chr_color,
                    width=0.1
                )
            )
        )
    )

add_layout(fig,  "Chromosomes", "$\\huge{-\\log_{10}(\\text{p-value})}$", f"")
fig.update_layout(showlegend=False)
# Chromosome labels belong on the x axis, where the positions are plotted
# (they were previously attached to the y axis).
fig.update_layout(
    xaxis = dict(
        tickmode = 'array',
        tickvals = df_chr["middle_position"].values,
        ticktext = df_chr.index.values
    )
)
# NOTE(review): autorange is disabled without an explicit y range (the range line
# below is commented out); confirm that `add_layout` sets one, otherwise drop this line.
fig.update_yaxes(autorange=False)
#fig.update_layout(yaxis_range=[-1, len(pvals_corr)])
fig.update_yaxes(tickfont_size=25)
fig.update_xaxes(tickfont_size=25)
fig.update_layout(
    autosize=False,
    width=800,
    height=1000,
    legend={'itemsizing': 'constant'},
    margin=go.layout.Margin(
        l=175,
        r=20,
        b=100,
        t=40,
        pad=0
    )
)
save_figure(fig, f"{path_save}/{path_local}/mhat")

In [None]:
# NOTE(review): `df_volcano` is not defined anywhere above this cell; in this notebook it
# is only assigned in the later "Volcano plot" section, so on a fresh kernel (Restart &
# Run All) this call raises NameError. The same call is repeated there. Confirm which
# table was meant to be plotted here, or remove this cell.
visuz.GeneExpression.volcano(df=df_volcano, lfc='logFC', pv='adj.P.Val', pv_thr=(1e-8, 1e-8), lfc_thr=(0.1, 0.1),
                             figtype="png")

# Update betas indexes

In [None]:
# Re-key `betas` with the pheno index when its rows are still labelled by the
# original identifiers (same values, same order as pheno["index_origin"]).
origin_ids = list(pheno.loc[:, "index_origin"].values)
betas_ids = list(betas.index.values)
if origin_ids == betas_ids:
    print("Change index")
    pheno.index.name = "index"
    betas.set_index(pheno.index, inplace=True, verify_integrity=False)
    betas.index.name = "index"
    # Persist the re-indexed tables back to the dataset folder.
    save_pheno_betas_to_pkl(pheno, betas, f"{path}/{platform}/{dataset}")

# Update pheno_xtd

In [None]:
# Extend `pheno` with the columns that exist only in the extended phenotype table.
# NOTE(review): read_pickle executes arbitrary code from the file; only load trusted pickles.
pheno_xtd = pd.read_pickle(f"{path}/{platform}/{dataset}/pheno_xtd.pkl")
extra_columns = pheno_xtd.columns.difference(pheno.columns)
pheno_merged = pd.concat([pheno, pheno_xtd.loc[:, extra_columns]], axis=1)
pheno_merged.index.name = 'index'
pheno_merged.to_excel(f"{path}/{platform}/{dataset}/pheno_1.xlsx")
pheno_merged.to_pickle(f"{path}/{platform}/{dataset}/pheno_1.pkl")
# Disabled alternative kept for reference: drop the immunology features from pheno instead.
# with open(f'{path}/{platform}/{dataset}/features/immuno.txt') as f:
#     features = f.read().splitlines()
# pheno.drop(features + ["PhenoAge"], axis=1, inplace=True)
# pheno.to_excel(f"{path}/{platform}/{dataset}/pheno_1.xlsx")
# pheno.to_pickle(f"{path}/{platform}/{dataset}/pheno_1.pkl")

# Select subjects

In [None]:
cpgs = betas.columns.values
df = pd.merge(pheno, betas, left_index=True, right_index=True)
# Keep controls with Sample_Chronology < 2 and COVID == "no".
is_eligible = (df["Status"] == "Control") & (df["Sample_Chronology"] < 2) & (df["COVID"] == "no")
df = df.loc[is_eligible, :]

# Boolean masks over the filtered frame, reused by the comparison definitions below.
# NOTE(review): the immunology table earlier in this notebook spells the region
# "Yakutiya"; confirm that pheno really uses "Yakutia", otherwise this mask is all False.
in_central = df["Region"] == "Central"
in_yakutia = df["Region"] == "Yakutia"

# Each "problem" is a two-group comparison:
#   Color      - plot colour per group
#   Filter     - boolean row mask per group
#   BaseFilter - rows taking part in the comparison at all
#   BasePart   - key of the reference group
problems = {
    "Region": {
        "Color": {"Central": "cyan", "Yakutia": "magenta"},
        "Filter": {"Central": in_central, "Yakutia": in_yakutia},
        "BaseFilter": in_central | in_yakutia,
        "BasePart": "Central",
    },
    "DNAmPart": {
        "Color": {1: "orange", 2: "lime"},
        "Filter": {
            1: in_central & (df["DNAmPart"] == 1),
            2: in_central & (df["DNAmPart"] == 2),
        },
        "BaseFilter": in_central,
        "BasePart": 1,
    },
}

# Histograms

In [None]:
# Age histograms: one split by sex, then one per comparison in `problems`.
hist_dir = f"{path_save}/hist"
pathlib.Path(hist_dir).mkdir(parents=True, exist_ok=True)

sns.histplot(data=df, x="Age", hue="Sex", palette={"F": "r", "M": "b"}, bins=15)
for ext in ("png", "pdf"):
    plt.savefig(f"{hist_dir}/Sex.{ext}", bbox_inches='tight')
plt.clf()

for problem in problems:
    spec = problems[problem]
    sns.histplot(data=df.loc[spec["BaseFilter"]], x="Age", hue=problem, palette=spec["Color"], bins=15)
    for ext in ("png", "pdf"):
        plt.savefig(f"{hist_dir}/{problem}.{ext}", bbox_inches='tight')
    plt.clf()

# Cells
## Region and DNAmPart comparisons

In [None]:
# For every comparison: Mann-Whitney U test per cell-type column, BH correction
# across cell types, then one violin plot per cell type.
for problem in problems:
    group_filters = problems[problem]["Filter"]
    group_colors = problems[problem]["Color"]
    pathlib.Path(f"{path_save}/cells/{problem}").mkdir(parents=True, exist_ok=True)
    df_cells = pd.DataFrame(index=["CD8T", "CD4T", "NK", "Bcell", "Mono", "Gran"], columns=["pval", "pval_fdr_bh"])

    # Values per cell type and group, collected once and reused for the plots.
    vals_by_cell = {}
    for cell in tqdm(df_cells.index.values):
        vals_by_cell[cell] = {group: df.loc[mask, cell].values for group, mask in group_filters.items()}
        stat, pval = mannwhitneyu(*vals_by_cell[cell].values(), alternative='two-sided')
        df_cells.at[cell, "pval"] = pval
    _, df_cells["pval_fdr_bh"], _, _ = multipletests(df_cells["pval"], 0.05, method='fdr_bh')

    dist_num_bins = 15
    for cell in tqdm(df_cells.index.values):
        vals = vals_by_cell[cell]

        fig = go.Figure()
        for group, group_vals in vals.items():
            violin = go.Violin(
                y=group_vals,
                name=group,
                box_visible=True,
                meanline_visible=True,
                showlegend=True,
                line_color='black',
                fillcolor=group_colors[group],
                marker = dict(color=group_colors[group], line=dict(color='black',width=0.3), opacity=0.8),
                points='all',
                # Kernel bandwidth: value range split into `dist_num_bins` parts.
                bandwidth = np.ptp(group_vals) / dist_num_bins,
                opacity=0.8
            )
            fig.add_trace(violin)
        add_layout(fig, "", f"{cell}", f"p-value: {df_cells.at[cell, 'pval_fdr_bh']:0.2e}")
        fig.update_layout(
            title_xref='paper',
            legend=dict(font=dict(size=20), itemsizing='constant', y=1.01),
            margin=go.layout.Margin(l=110, r=20, b=50, t=90, pad=0),
        )
        save_figure(fig, f"{path_save}/cells/{problem}/{cell}")

# Age Accelerations

In [None]:
# Age acceleration = residual of an epigenetic clock against a linear "clock ~ Age"
# model fitted on the reference group only. For every comparison the accelerations
# of the two groups are compared (Mann-Whitney U, BH-corrected across clocks) and
# plotted as violins, next to a scatter of clock vs. age with the fitted line.
age_types = ['DNAmAgeHannum', 'DNAmAge', 'DNAmPhenoAge', 'DNAmGrimAge']
dist_num_bins = 15
for problem in problems:
    group_filters = problems[problem]["Filter"]
    group_colors = problems[problem]["Color"]
    base_part = problems[problem]["BasePart"]
    # NOTE(review): the reference rows are selected by the raw column value, not by
    # problems[problem]["Filter"][base_part]; for "DNAmPart" that filter additionally
    # requires Region == "Central". Confirm which selection is intended.
    base_mask = df[problem] == base_part

    df_aas = pd.DataFrame(index=[f"{x}Acc" for x in age_types], columns=["pval", "pval_fdr_bh"])
    # Acceleration values per clock and group, collected once and reused for the violins.
    acc_vals = {}
    for age_type in tqdm(age_types):
        formula = f"{age_type} ~ Age"
        model = smf.ols(formula=formula, data=df.loc[base_mask]).fit()
        df[f"{problem}_{age_type}_linear_pred"] = model.predict(df)
        df[f"{problem}{age_type}Acc"] = df[age_type] - df[f"{problem}_{age_type}_linear_pred"]

        acc_vals[age_type] = {
            group: df.loc[mask, f"{problem}{age_type}Acc"].values
            for group, mask in group_filters.items()
        }
        stat, pval = mannwhitneyu(*acc_vals[age_type].values(), alternative='two-sided')
        df_aas.at[f"{age_type}Acc", "pval"] = pval
    # The column is object-typed (created empty), so hand multipletests real floats.
    _, df_aas["pval_fdr_bh"], _, _ = multipletests(df_aas["pval"].astype(float), 0.05, method='fdr_bh')

    pathlib.Path(f"{path_save}/accelerations/{problem}").mkdir(parents=True, exist_ok=True)
    for age_type in tqdm(age_types):

        vals = acc_vals[age_type]

        # Violin plot of the acceleration per group.
        fig = go.Figure()
        for group in group_filters:
            fig.add_trace(
                go.Violin(
                    y=vals[group],
                    name=group,
                    box_visible=True,
                    meanline_visible=True,
                    showlegend=True,
                    line_color='black',
                    fillcolor=group_colors[group],
                    marker = dict(color=group_colors[group], line=dict(color='black',width=0.3), opacity=0.8),
                    points='all',
                    bandwidth = np.ptp(vals[group]) / dist_num_bins,
                    opacity=0.8
                )
            )
        add_layout(fig, "", f"{age_type}Acc", f"p-value: {df_aas.at[f'{age_type}Acc', 'pval_fdr_bh']:0.2e}")
        fig.update_layout(title_xref='paper')
        fig.update_layout(legend_font_size=20)
        fig.update_layout(legend= {'itemsizing': 'constant'})
        fig.update_layout(
            margin=go.layout.Margin(
                l=110,
                r=20,
                b=50,
                t=90,
                pad=0
            )
        )
        fig.update_layout(legend_y=1.01)
        save_figure(fig, f"{path_save}/accelerations/{problem}/violin_{age_type}Acc")

        # Scatter of clock vs. chronological age, with the reference-group fit as a line.
        fig = go.Figure()

        fig.add_trace(
            go.Scatter(
                x=df.loc[base_mask, f"Age"].values,
                y=df.loc[base_mask, f"{problem}_{age_type}_linear_pred"].values,
                showlegend=False,
                name="",
                mode="lines",
                marker_color=group_colors[base_part],
                marker=dict(
                    size=8,
                    opacity=0.75,
                    line=dict(
                        color="black",
                        width=0.5
                    )
                )
            )
        )
        for group, mask in group_filters.items():
            fig.add_trace(
                go.Scatter(
                    x=df.loc[mask, f"Age"].values,
                    y=df.loc[mask, f"{age_type}"].values,
                    showlegend=True,
                    name=group,
                    mode="markers",
                    line_color=group_colors[group],
                    marker=dict(
                        size=8,
                        opacity=0.75,
                        line=dict(
                            color="black",
                            width=0.5
                        )
                    )
                )
            )
        add_layout(fig, f"Age", f"{age_type}", f"")
        fig.update_layout(legend_font_size=20)
        fig.update_layout(legend= {'itemsizing': 'constant'})
        fig.update_layout(
            margin=go.layout.Margin(
                l=80,
                r=20,
                b=80,
                t=65,
                pad=0
            )
        )
        save_figure(fig, f"{path_save}/accelerations/{problem}/scatter_{age_type}")


# Significance tests

In [None]:

# Per-CpG Mann-Whitney U test between the two groups of every comparison, with BH
# correction across CpGs; the table is saved and the top hits are drawn as violins.
for problem in problems:
    group_filters = problems[problem]["Filter"]
    group_colors = problems[problem]["Color"]
    pathlib.Path(f"{path_save}/significance_tests/{problem}").mkdir(parents=True, exist_ok=True)
    cpgs = betas.columns.values
    manifest_cols = ['chr', 'Position', 'Relation_to_Island', 'UCSC_RefGene_Group', 'Gene']
    df_sign = pd.DataFrame(index=cpgs, columns=manifest_cols + ['stat', 'pval', 'pval_fdr_bh'])
    # The result table is the one indexed by CpG; the sample frame `df` is left
    # untouched (its index was previously renamed to 'CpG' by mistake).
    df_sign.index.name = 'CpG'
    # Copy the annotation columns in one shot instead of five `.at` writes per CpG.
    df_sign[manifest_cols] = manifest.loc[cpgs, manifest_cols].to_numpy()

    # Positional boolean masks: avoids re-aligning a boolean Series for every CpG.
    # The masks were built from `df`, so their row order matches `df`.
    group_masks = [mask.to_numpy() for mask in group_filters.values()]
    stats_mw = np.empty(len(cpgs), dtype=float)
    pvals_mw = np.empty(len(cpgs), dtype=float)
    for cpg_id, cpg in tqdm(enumerate(cpgs), desc='Mann-Whitney U test', total=len(cpgs)):
        cpg_vals = df[cpg].to_numpy()
        stats_mw[cpg_id], pvals_mw[cpg_id] = mannwhitneyu(
            *(cpg_vals[mask] for mask in group_masks), alternative='two-sided'
        )
    df_sign['stat'] = stats_mw
    df_sign['pval'] = pvals_mw
    _, df_sign['pval_fdr_bh'], _, _ = multipletests(pvals_mw, 0.05, method='fdr_bh')
    df_sign.to_excel(f"{path_save}/significance_tests/{problem}/mw.xlsx")

    n_top = 10
    dist_num_bins = 25
    pathlib.Path(f"{path_save}/significance_tests/{problem}/examples").mkdir(parents=True, exist_ok=True)
    df_sign_top = df_sign.sort_values(['pval_fdr_bh'], ascending=[True]).head(n_top)
    for cpg_id, (cpg, row) in enumerate(df_sign_top.iterrows()):
        pval = row['pval_fdr_bh']
        gene = manifest.at[cpg, 'Gene']

        fig = go.Figure()
        for group, mask in group_filters.items():
            vals = df.loc[mask, cpg].values
            fig.add_trace(
                go.Violin(
                    y=vals,
                    name=group,
                    box_visible=True,
                    meanline_visible=True,
                    showlegend=True,
                    line_color='black',
                    fillcolor=group_colors[group],
                    marker = dict(color=group_colors[group], line=dict(color='black',width=0.3), opacity=0.8),
                    points='all',
                    bandwidth = np.ptp(vals) / dist_num_bins,
                    opacity=0.8
                )
            )
        add_layout(fig, "", "Methylation", f"{cpg} ({gene})<br>p-value: {pval:0.2e}")
        fig.update_layout(title_xref='paper')
        fig.update_layout(legend_font_size=20)
        fig.update_layout(legend= {'itemsizing': 'constant'})
        fig.update_xaxes(tickfont_size=15)
        fig.update_layout(
            margin=go.layout.Margin(
                l=110,
                r=20,
                b=50,
                t=80,
                pad=0
            )
        )
        fig.update_layout(
            legend=dict(
                orientation="h",
                yanchor="bottom",
                y=1.25,
                xanchor="center",
                x=0.5
            )
        )
        save_figure(fig, f"{path_save}/significance_tests/{problem}/examples/{cpg_id}_{cpg}")

# Generate data for R

In [None]:
# Export the selected subjects' methylation matrix and covariates as pickles.
# NOTE(review): this writes to the same betas.pkl / pheno.pkl paths as the earlier
# "Create data for R" cell and therefore overwrites its output; confirm this is intended.
pathlib.Path(f"{path_save}/data_for_R").mkdir(parents=True, exist_ok=True)

# Transposed so that CpGs are the rows and samples the columns.
betas_R = df.loc[:, cpgs].T.rename_axis("CpG")
betas_R.to_pickle(f"{path_save}/data_for_R/betas.pkl")

pheno_R_columns = ["Age", "Region", "DNAmPart", "Sentrix_ID", "Sentrix_Position"]
pheno_R = df.loc[:, pheno_R_columns]
pheno_R.to_pickle(f"{path_save}/data_for_R/pheno.pkl")

# Generate list for GO

In [None]:
# Build the unique gene list from the 'gene' column of for_go.xlsx.
#path_for_go = f"{path_save}/significance_tests/Region"
path_for_go = f"{path_save}/data_for_R"
df_for_go = pd.read_excel(f"{path_for_go}/for_go.xlsx", index_col="CpG")
genes_all = set()
for genes_raw in df_for_go['gene']:
    # Non-string cells (e.g. NaN for an empty cell) carry no gene names.
    if isinstance(genes_raw, str):
        # One cell may list several genes separated by ';'.
        genes_all.update(genes_raw.split(';'))
# Drop the placeholder entries that are not gene names.
genes_all.difference_update({'non-genic', ' '})
# Sort so the exported file is identical between runs (set order is not stable).
genes_all = sorted(genes_all)
genes_df = pd.DataFrame({'gene':genes_all})
genes_df.to_excel(f"{path_for_go}/genes_for_go.xlsx", index=False)

# Volcano plot

In [None]:
# Table consumed by the volcano plot in the next cell.
dmp_csv = f"{path_save}/data_for_R/dmp.csv"
df_volcano = pd.read_csv(dmp_csv, index_col="CpG")

In [None]:
# Volcano plot of `df_volcano` via bioinfokit, using the 'logFC' and 'adj.P.Val' columns.
volcano_kwargs = dict(
    lfc='logFC',
    pv='adj.P.Val',
    pv_thr=(1e-8, 1e-8),
    lfc_thr=(0.1, 0.1),
    figtype="png",
)
visuz.GeneExpression.volcano(df=df_volcano, **volcano_kwargs)