In [None]:
import pandas as pd
import numpy as np
import scipy
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scripts.python.routines.betas import betas_drop_na
from plotly.subplots import make_subplots
from scipy import stats
import pickle
import random
import plotly.express as px
import copy
import statsmodels.formula.api as smf
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scripts.python.pheno.datasets.filter import filter_pheno
from scripts.python.pheno.datasets.features import get_column_name, get_status_dict, get_sex_dict
from scripts.python.routines.plot.scatter import add_scatter_trace
import plotly.graph_objects as go
import pathlib
from scripts.python.routines.manifest import get_manifest
from scripts.python.routines.plot.save import save_figure
from scripts.python.routines.plot.layout import add_layout, get_axis
from scripts.python.routines.plot.p_value import add_p_value_annotation
from statsmodels.stats.multitest import multipletests
from sklearn.metrics import mean_absolute_error
from impyute.imputation.cs import fast_knn, mean, median, random, mice, mode, em
import plotly.io as pio
pio.kaleido.scope.mathjax = None
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=False)
from functools import reduce
from scipy.stats import kruskal, mannwhitneyu
import matplotlib.pyplot as plt
import seaborn as sns
import pathlib
from tqdm import tqdm

# Init data

In [None]:
# Dataset to analyse and the root folder that holds all datasets.
dataset = "GSEUNN"
path = "E:/YandexDisk/Work/pydnameth/datasets"

# The dataset's platform is looked up in the datasets registry (datasets.xlsx)
# and passed to the project helper that loads the manifest.
datasets_info = pd.read_excel(f"{path}/datasets.xlsx", index_col='dataset')
platform = datasets_info.loc[dataset, 'platform']
manifest = get_manifest(platform, path=path)

# Everything this notebook produces is written below `path_save`.
path_save = f"{path}/{platform}/{dataset}/special/034_central_vs_yakutia"
pathlib.Path(path_save).mkdir(parents=True, exist_ok=True)

# Phenotype table and betas table of the selected dataset.
pheno = pd.read_pickle(f"{path}/{platform}/{dataset}/pheno_xtd.pkl")
betas = pd.read_pickle(f"{path}/{platform}/{dataset}/betas.pkl")

# Update pheno_xtd

In [None]:
# Load the extended phenotype table and index it by its 'ID' column.
# NOTE(review): `pheno` in the init cell is read from this same pheno_xtd.pkl file - confirm that is intended.
pheno_xtd = pd.read_pickle(f"{path}/{platform}/{dataset}/pheno_xtd.pkl")
pheno_xtd.set_index('ID', inplace=True)
pheno_xtd.index.name = 'index'
# Append to `pheno` only the columns of `pheno_xtd` that `pheno` does not already have.
pheno_merged = pd.concat([pheno, pheno_xtd.loc[:, pheno_xtd.columns.difference(pheno.columns)]], axis=1)
pheno_merged.index.name = 'index'
pheno_merged.to_excel(f"{path}/{platform}/{dataset}/pheno_1.xlsx")
pheno_merged.to_pickle(f"{path}/{platform}/{dataset}/pheno_1.pkl")
# Read the list of feature names (one per line) from features/immuno.txt.
with open(f'{path}/{platform}/{dataset}/features/immuno.txt') as f:
    features = f.read().splitlines()
# Remove the listed features and "PhenoAge" from `pheno`.
# NOTE(review): the drop is in place, so re-running this cell without reloading `pheno` raises KeyError.
pheno.drop(features + ["PhenoAge"], axis=1, inplace=True)
# NOTE(review): these writes target the same pheno_1.xlsx / pheno_1.pkl files as the
# `pheno_merged` writes above, so the merged table saved a few lines earlier is overwritten - confirm which one is intended.
pheno.to_excel(f"{path}/{platform}/{dataset}/pheno_1.xlsx")
pheno.to_pickle(f"{path}/{platform}/{dataset}/pheno_1.pkl")

# Select subjects

In [None]:
cpgs = betas.columns.values

# Join phenotype and betas tables on their shared index, then keep only
# control subjects with Sample_Chronology < 2 and COVID == "no".
df = pd.merge(pheno, betas, left_index=True, right_index=True)
subject_mask = (df["Status"] == "Control") & (df["Sample_Chronology"] < 2) & (df["COVID"] == "no")
df = df.loc[subject_mask, :]

# Boolean masks over the filtered frame, shared by the comparisons below.
in_central = df["Region"] == "Central"
in_yakutia = df["Region"] == "Yakutia"

# Each "problem" is a two-group comparison described by:
#   Color      - plot colour per group
#   Filter     - boolean mask over `df` per group (insertion order = group order)
#   BaseFilter - mask of all subjects taking part in the comparison
#   BasePart   - key of the group used as the base/reference group
problems = {
    "Region": {
        "Color": {"Central": "cyan", "Yakutia": "magenta"},
        "Filter": {"Central": in_central, "Yakutia": in_yakutia},
        "BaseFilter": in_central | in_yakutia,
        "BasePart": "Central",
    },
    "DNAmPart": {
        "Color": {1: "orange", 2: "lime"},
        "Filter": {
            1: in_central & (df["DNAmPart"] == 1),
            2: in_central & (df["DNAmPart"] == 2),
        },
        "BaseFilter": in_central,
        "BasePart": 1,
    },
}

# Histograms

In [None]:
# All histograms of this section go into one folder.
hist_dir = f"{path_save}/hist"
pathlib.Path(hist_dir).mkdir(parents=True, exist_ok=True)

# Age distribution split by sex.
sns.histplot(data=df, x="Age", hue="Sex", palette={"F": "r", "M": "b"}, bins=15)
for ext in ("png", "pdf"):
    plt.savefig(f"{hist_dir}/Sex.{ext}", bbox_inches='tight')
plt.clf()

# Age distribution for every comparison, restricted to the subjects taking part in it.
for problem in problems:
    subjects = df.loc[problems[problem]["BaseFilter"]]
    sns.histplot(data=subjects, x="Age", hue=problem, palette=problems[problem]["Color"], bins=15)
    for ext in ("png", "pdf"):
        plt.savefig(f"{hist_dir}/{problem}.{ext}", bbox_inches='tight')
    plt.clf()

# Cells
## Region and DNAmPart comparisons

In [None]:
# Columns of `df` compared between groups, and the divisor used for the violin bandwidth.
cell_types = ["CD8T", "CD4T", "NK", "Bcell", "Mono", "Gran"]
dist_num_bins = 15

for problem in problems:
    group_filters = problems[problem]["Filter"]
    group_colors = problems[problem]["Color"]
    pathlib.Path(f"{path_save}/cells/{problem}").mkdir(parents=True, exist_ok=True)

    # Stage 1: two-sided Mann-Whitney U test per cell type, then Benjamini-Hochberg correction.
    df_cells = pd.DataFrame(index=cell_types, columns=["pval", "pval_fdr_bh"])
    for cell in tqdm(df_cells.index.values):
        vals = {group: df.loc[mask, cell].values for group, mask in group_filters.items()}
        stat, pval = mannwhitneyu(*vals.values(), alternative='two-sided')
        df_cells.at[cell, "pval"] = pval
    _, df_cells["pval_fdr_bh"], _, _ = multipletests(df_cells["pval"], 0.05, method='fdr_bh')

    # Stage 2: one violin plot per cell type, titled with the corrected p-value.
    for cell in tqdm(df_cells.index.values):
        vals = {group: df.loc[mask, cell].values for group, mask in group_filters.items()}

        fig = go.Figure()
        for group, group_vals in vals.items():
            violin = go.Violin(
                y=group_vals,
                name=group,
                box_visible=True,
                meanline_visible=True,
                showlegend=True,
                line_color='black',
                fillcolor=group_colors[group],
                marker=dict(color=group_colors[group], line=dict(color='black', width=0.3), opacity=0.8),
                points='all',
                bandwidth=np.ptp(group_vals) / dist_num_bins,
                opacity=0.8,
            )
            fig.add_trace(violin)
        add_layout(fig, "", f"{cell}", f"p-value: {df_cells.at[cell, 'pval_fdr_bh']:0.2e}")
        fig.update_layout(title_xref='paper')
        fig.update_layout(legend_font_size=20)
        fig.update_layout(legend={'itemsizing': 'constant'})
        fig.update_layout(margin=go.layout.Margin(l=110, r=20, b=50, t=90, pad=0))
        fig.update_layout(legend_y=1.01)
        save_figure(fig, f"{path_save}/cells/{problem}/{cell}")

# Age Accelerations

In [None]:
# Epigenetic age estimates for which an age acceleration is computed,
# and the divisor used for the violin bandwidth.
age_types = ['DNAmAgeHannum', 'DNAmAge', 'DNAmPhenoAge', 'DNAmGrimAge']
dist_num_bins = 15

for problem in problems:
    group_filters = problems[problem]["Filter"]
    group_colors = problems[problem]["Color"]
    base_part = problems[problem]["BasePart"]
    # The reference regression is fitted on exactly the subset that is tested and
    # plotted as the base group. Using `df[problem] == base_part` instead would
    # ignore the extra conditions of the group filter (e.g. the Region restriction
    # of the "DNAmPart" comparison).
    base_filter = group_filters[base_part]

    # Stage 1: age acceleration = residual from the base-group linear fit
    # `<age_type> ~ Age`, compared between groups with a two-sided Mann-Whitney U test.
    df_aas = pd.DataFrame(index=[f"{x}Acc" for x in age_types], columns=["pval", "pval_fdr_bh"])
    for age_type in tqdm(age_types):
        pred_col = f"{problem}_{age_type}_linear_pred"
        acc_col = f"{problem}{age_type}Acc"

        model = smf.ols(formula=f"{age_type} ~ Age", data=df.loc[base_filter]).fit()
        df[pred_col] = model.predict(df)
        df[acc_col] = df[age_type] - df[pred_col]

        vals = {group: df.loc[mask, acc_col].values for group, mask in group_filters.items()}
        stat, pval = mannwhitneyu(*vals.values(), alternative='two-sided')
        df_aas.at[f"{age_type}Acc", "pval"] = pval
    # The "pval" column is created with object dtype; hand numeric values to the correction.
    _, df_aas["pval_fdr_bh"], _, _ = multipletests(df_aas["pval"].astype(float), 0.05, method='fdr_bh')

    # Stage 2: per age type, a violin plot of the accelerations and an Age-vs-estimate scatter.
    pathlib.Path(f"{path_save}/accelerations/{problem}").mkdir(parents=True, exist_ok=True)
    for age_type in tqdm(age_types):
        pred_col = f"{problem}_{age_type}_linear_pred"
        acc_col = f"{problem}{age_type}Acc"

        vals = {group: df.loc[mask, acc_col].values for group, mask in group_filters.items()}

        fig = go.Figure()
        for group in group_filters:
            fig.add_trace(
                go.Violin(
                    y=vals[group],
                    name=group,
                    box_visible=True,
                    meanline_visible=True,
                    showlegend=True,
                    line_color='black',
                    fillcolor=group_colors[group],
                    marker=dict(color=group_colors[group], line=dict(color='black', width=0.3), opacity=0.8),
                    points='all',
                    bandwidth=np.ptp(vals[group]) / dist_num_bins,
                    opacity=0.8
                )
            )
        add_layout(fig, "", f"{age_type}Acc", f"p-value: {df_aas.at[f'{age_type}Acc', 'pval_fdr_bh']:0.2e}")
        fig.update_layout(title_xref='paper')
        fig.update_layout(legend_font_size=20)
        fig.update_layout(legend={'itemsizing': 'constant'})
        fig.update_layout(
            margin=go.layout.Margin(
                l=110,
                r=20,
                b=50,
                t=90,
                pad=0
            )
        )
        fig.update_layout(legend_y=1.01)
        save_figure(fig, f"{path_save}/accelerations/{problem}/violin_{age_type}Acc")

        fig = go.Figure()

        # Regression line of the base group (its fitted values against Age).
        fig.add_trace(
            go.Scatter(
                x=df.loc[base_filter, "Age"].values,
                y=df.loc[base_filter, pred_col].values,
                showlegend=False,
                name="",
                mode="lines",
                marker_color=group_colors[base_part],
                marker=dict(
                    size=8,
                    opacity=0.75,
                    line=dict(
                        color="black",
                        width=0.5
                    )
                )
            )
        )
        for group, mask in group_filters.items():
            ages = df.loc[mask, "Age"].values
            estimates = df.loc[mask, f"{age_type}"].values
            # NOTE(review): add_scatter_trace is a project helper that presumably already
            # adds a trace for this group; confirm the explicit second trace below is intended.
            add_scatter_trace(fig, ages, estimates, group)

            fig.add_trace(
                go.Scatter(
                    x=ages,
                    y=estimates,
                    showlegend=False,
                    name=group,
                    mode="markers",
                    line_color=group_colors[group],
                    marker=dict(
                        size=8,
                        opacity=0.75,
                        line=dict(
                            color="black",
                            width=0.5
                        )
                    )
                )
            )
        add_layout(fig, f"Age", f"{age_type}", f"")
        fig.update_layout(legend_font_size=20)
        fig.update_layout(legend={'itemsizing': 'constant'})
        fig.update_layout(
            margin=go.layout.Margin(
                l=80,
                r=20,
                b=80,
                t=65,
                pad=0
            )
        )
        save_figure(fig, f"{path_save}/accelerations/{problem}/scatter_{age_type}")


# Significance tests

In [None]:

# Manifest annotation columns copied into the result table, number of top CpGs
# to plot, and the divisor used for the violin bandwidth.
annotation_cols = ['chr', 'Position', 'Relation_to_Island', 'UCSC_RefGene_Group', 'Gene']
n_top = 10
dist_num_bins = 25

for problem in problems:
    group_colors = problems[problem]["Color"]
    # Convert the boolean group masks to NumPy arrays once per comparison, so the
    # per-CpG loop does not re-align a boolean Series against `df` on every iteration.
    group_masks = {group: np.asarray(mask, dtype=bool) for group, mask in problems[problem]["Filter"].items()}

    pathlib.Path(f"{path_save}/significance_tests/{problem}").mkdir(parents=True, exist_ok=True)
    cpgs = betas.columns.values
    df_sign = pd.DataFrame(index=cpgs, columns=annotation_cols + ['stat', 'pval', 'pval_fdr_bh'])
    # Label the index of the result table (its rows are CpGs); previously the
    # name was set on `df`, whose index holds the samples.
    df_sign.index.name = 'CpG'

    # Two-sided Mann-Whitney U test between the groups for every CpG.
    for cpg in tqdm(cpgs, desc='Mann-Whitney U test', total=len(cpgs)):
        for col in annotation_cols:
            df_sign.at[cpg, col] = manifest.at[cpg, col]
        cpg_values = df[cpg].values
        vals = {group: cpg_values[mask] for group, mask in group_masks.items()}
        stat, pval = mannwhitneyu(*vals.values(), alternative='two-sided')
        df_sign.at[cpg, 'stat'] = stat
        df_sign.at[cpg, 'pval'] = pval
    # The "pval" column is created with object dtype; hand numeric values to the correction.
    _, df_sign['pval_fdr_bh'], _, _ = multipletests(df_sign['pval'].astype(float), 0.05, method='fdr_bh')
    df_sign.to_excel(f"{path_save}/significance_tests/{problem}/mw.xlsx")

    # Violin plots for the `n_top` CpGs with the smallest corrected p-values.
    pathlib.Path(f"{path_save}/significance_tests/{problem}/examples").mkdir(parents=True, exist_ok=True)
    df_sign_top = df_sign.sort_values(['pval_fdr_bh'], ascending=[True]).head(n_top)
    for cpg_id, (cpg, row) in enumerate(df_sign_top.iterrows()):
        pval = row['pval_fdr_bh']
        gene = manifest.at[cpg, 'Gene']

        cpg_values = df[cpg].values
        fig = go.Figure()
        for group, mask in group_masks.items():
            vals = cpg_values[mask]
            fig.add_trace(
                go.Violin(
                    y=vals,
                    name=group,
                    box_visible=True,
                    meanline_visible=True,
                    showlegend=True,
                    line_color='black',
                    fillcolor=group_colors[group],
                    marker=dict(color=group_colors[group], line=dict(color='black', width=0.3), opacity=0.8),
                    points='all',
                    bandwidth=np.ptp(vals) / dist_num_bins,
                    opacity=0.8
                )
            )
        add_layout(fig, "", "Methylation", f"{cpg} ({gene})<br>p-value: {pval:0.2e}")
        fig.update_layout(title_xref='paper')
        fig.update_layout(legend_font_size=20)
        fig.update_layout(legend={'itemsizing': 'constant'})
        fig.update_xaxes(tickfont_size=15)
        fig.update_layout(
            margin=go.layout.Margin(
                l=110,
                r=20,
                b=50,
                t=80,
                pad=0
            )
        )
        fig.update_layout(
            legend=dict(
                orientation="h",
                yanchor="bottom",
                y=1.25,
                xanchor="center",
                x=0.5
            )
        )
        save_figure(fig, f"{path_save}/significance_tests/{problem}/examples/{cpg_id}_{cpg}")