In [None]:
import pandas as pd
import numpy as np
import scipy
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scripts.python.routines.betas import betas_drop_na
from plotly.subplots import make_subplots
from scipy import stats
import pickle
import random
import plotly.express as px
import copy
import statsmodels.formula.api as smf
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scripts.python.pheno.datasets.filter import filter_pheno
from scripts.python.pheno.datasets.features import get_column_name, get_status_dict, get_sex_dict
from scripts.python.routines.plot.scatter import add_scatter_trace
import plotly.graph_objects as go
import pathlib
from scripts.python.routines.manifest import get_manifest
from scripts.python.routines.plot.save import save_figure
from scripts.python.routines.plot.layout import add_layout, get_axis
from scripts.python.routines.plot.p_value import add_p_value_annotation
from statsmodels.stats.multitest import multipletests
from sklearn.metrics import mean_absolute_error
from impyute.imputation.cs import fast_knn, mean, median, random, mice, mode, em
import plotly.io as pio
pio.kaleido.scope.mathjax = None
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=False)
from functools import reduce
from scipy.stats import kruskal, mannwhitneyu
import matplotlib.pyplot as plt
import seaborn as sns
import pathlib
from tqdm.notebook import tqdm

# Init data

In [None]:
# Dataset to analyse and the root folder that holds all datasets.
# NOTE(review): `path` is an absolute, machine-specific location — anyone re-running
# this notebook elsewhere has to change it here.
dataset = "GSEUNN"
path = "E:/YandexDisk/Work/pydnameth/datasets"

# The datasets spreadsheet (indexed by dataset name) provides the platform of the chosen
# dataset; that platform is then passed to get_manifest.
datasets_info = pd.read_excel(f"{path}/datasets.xlsx", index_col='dataset')
platform = datasets_info.loc[dataset, 'platform']
manifest = get_manifest(platform, path=path)

# Folder receiving every artifact produced by this notebook; created if it is missing.
path_save = f"{path}/{platform}/{dataset}/special/034_immuno_generate_subset"
pathlib.Path(path_save).mkdir(parents=True, exist_ok=True)

# Input tables for the selected dataset.
# NOTE(review): `pheno` is read from pheno_xtd.pkl (not pheno.pkl) — confirm this is intended.
pheno = pd.read_pickle(f"{path}/{platform}/{dataset}/pheno_xtd.pkl")
betas = pd.read_pickle(f"{path}/{platform}/{dataset}/betas.pkl")

# Update pheno_xtd

In [None]:
# Load the extended phenotype table and use its 'ID' column as the index, so that the
# column-wise concat below aligns its rows with `pheno` by index label.
# NOTE(review): this is the same pheno_xtd.pkl file that `pheno` is loaded from in the
# init cell — confirm that the two tables are really meant to come from one file.
pheno_xtd = pd.read_pickle(f"{path}/{platform}/{dataset}/pheno_xtd.pkl")
pheno_xtd.set_index('ID', inplace=True)
pheno_xtd.index.name = 'index'
# Append to `pheno` only those pheno_xtd columns that `pheno` does not already contain.
pheno_merged = pd.concat([pheno, pheno_xtd.loc[:, pheno_xtd.columns.difference(pheno.columns)]], axis=1)
pheno_merged.index.name = 'index'
# Write the merged table to pheno_1.xlsx / pheno_1.pkl.
# NOTE(review): both files are overwritten at the end of this cell by the reduced `pheno`,
# so the merged table does not survive on disk — confirm which version should be kept.
pheno_merged.to_excel(f"{path}/{platform}/{dataset}/pheno_1.xlsx")
pheno_merged.to_pickle(f"{path}/{platform}/{dataset}/pheno_1.pkl")
# One feature name per line; these columns (plus "PhenoAge") are removed from `pheno`.
with open(f'{path}/{platform}/{dataset}/features/immuno.txt') as f:
    features = f.read().splitlines()
# In-place drop: `pheno` itself is modified, so running this cell a second time without
# reloading `pheno` raises KeyError because the columns are already gone.
pheno.drop(features + ["PhenoAge"], axis=1, inplace=True)
# Write the reduced table to the same pheno_1 files (replacing the merged version above).
pheno.to_excel(f"{path}/{platform}/{dataset}/pheno_1.xlsx")
pheno.to_pickle(f"{path}/{platform}/{dataset}/pheno_1.pkl")

# Select subjects

In [None]:
# Column labels of the betas table, captured before phenotype columns are merged in.
cpgs = betas.columns.values

# Join phenotype and betas on their indexes (pd.merge defaults to an inner join, so only
# index labels present in both tables are kept).
df = pd.merge(pheno, betas, left_index=True, right_index=True)

# Keep rows with Status == "Control", Sample_Chronology < 2 and COVID == "no".
# .copy() makes the result an independent frame: later cells add columns to `df`, and
# assigning into a .loc-selected slice can otherwise raise SettingWithCopyWarning.
is_selected = (df["Status"] == "Control") & (df["Sample_Chronology"] < 2) & (df["COVID"] == "no")
df = df.loc[is_selected, :].copy()

# Histograms

In [None]:
# Age histograms of the selected subjects, one per grouping column.
# Each entry: (column used for `hue`, mapping from group label to bar colour).
hist_specs = [
    ("Sex", {"F": "r", "M": "b"}),
    ("Region", {"Central": "c", "Yakutia": "m"}),
]
for hue_col, hue_palette in hist_specs:
    sns.histplot(data=df, x="Age", hue=hue_col, palette=hue_palette, bins=15)
    pathlib.Path(f"{path_save}/hist").mkdir(parents=True, exist_ok=True)
    # Save both a raster and a vector version, then clear the current figure so the
    # next histogram starts from an empty canvas.
    for ext in ("png", "pdf"):
        plt.savefig(f"{path_save}/hist/{hue_col}.{ext}", bbox_inches='tight')
    plt.clf()

# Cells

In [None]:
# Two-sided Mann-Whitney U test of every cell-type column between the "Central" and
# "Yakutia" regions, followed by Benjamini-Hochberg FDR correction across the columns.
cell_types = ["CD8T", "CD4T", "NK", "Bcell", "Mono", "Gran"]
df_cells = pd.DataFrame(index=cell_types, columns=["pval", "pval_fdr_bh"])

# Row masks for the two regions; `df` is not modified in this loop, so they are built once.
is_central = df["Region"] == "Central"
is_yakutia = df["Region"] == "Yakutia"

for cell in tqdm(df_cells.index.values):
    vals_central = df.loc[is_central, cell].values
    vals_yakutia = df.loc[is_yakutia, cell].values
    stat, pval = mannwhitneyu(vals_central, vals_yakutia, alternative='two-sided')
    df_cells.at[cell, "pval"] = pval

# multipletests returns (reject, corrected p-values, ...); only the corrected values are kept.
df_cells["pval_fdr_bh"] = multipletests(df_cells["pval"], 0.05, method='fdr_bh')[1]

# One violin figure per cell-type column, comparing the two regions; the FDR-corrected
# p-value from df_cells is passed to add_layout as the figure title text.
dist_num_bins = 15
pathlib.Path(f"{path_save}/cells").mkdir(parents=True, exist_ok=True)

# (region label, colour) for each violin, in drawing order.
region_styles = [("Central", "cyan"), ("Yakutia", "magenta")]

for cell in tqdm(df_cells.index.values):
    fig = go.Figure()
    for region, color in region_styles:
        vals = df.loc[df["Region"] == region, cell].values
        fig.add_trace(
            go.Violin(
                y=vals,
                name=region,
                box_visible=True,
                meanline_visible=True,
                showlegend=True,
                line_color='black',
                fillcolor=color,
                marker=dict(color=color, line=dict(color='black', width=0.3), opacity=0.8),
                points='all',
                # Bandwidth is the value range divided by dist_num_bins.
                bandwidth=np.ptp(vals) / dist_num_bins,
                opacity=0.8,
            )
        )
    add_layout(fig, "", f"{cell}", f"p-value: {df_cells.at[cell, 'pval_fdr_bh']:0.2e}")
    # All layout tweaks applied in a single call.
    fig.update_layout(
        title_xref='paper',
        legend_font_size=20,
        legend_itemsizing='constant',
        margin=go.layout.Margin(l=110, r=20, b=50, t=90, pad=0),
        legend_y=1.01,
    )
    save_figure(fig, f"{path_save}/cells/{cell}")

# Age Accelerations

In [None]:
# Age acceleration for each epigenetic age estimate:
#   1. fit an OLS line `<age_type> ~ Age` on the "Central" subjects only,
#   2. predict that line for every subject in `df`,
#   3. define acceleration as observed value minus the predicted value,
#   4. compare accelerations between regions with a two-sided Mann-Whitney U test.
# The p-values are then FDR-corrected (Benjamini-Hochberg) across the age types.
age_types = ['DNAmAgeHannum', 'DNAmAge', 'DNAmPhenoAge', 'DNAmGrimAge']
df_aas = pd.DataFrame(index=[f"{x}Acc" for x in age_types], columns=["pval", "pval_fdr_bh"])

# Row masks depend only on the Region column, which the loop never changes.
is_central = df["Region"] == "Central"
is_yakutia = df["Region"] == "Yakutia"

for age_type in tqdm(age_types):
    formula = f"{age_type} ~ Age"
    model = smf.ols(formula=formula, data=df.loc[is_central]).fit()
    # The previous extra `model.predict(pheno)` call was removed: its result was never
    # used, and it evaluated the model on the unfiltered phenotype table for nothing.
    df[f"{age_type}_linear_pred"] = model.predict(df)
    df[f"{age_type}Acc"] = df[age_type] - df[f"{age_type}_linear_pred"]

    vals_central = df.loc[is_central, f"{age_type}Acc"].values
    vals_yakutia = df.loc[is_yakutia, f"{age_type}Acc"].values
    stat, pval = mannwhitneyu(vals_central, vals_yakutia, alternative='two-sided')
    df_aas.at[f"{age_type}Acc", "pval"] = pval

# multipletests returns (reject, corrected p-values, ...); only the corrected values are kept.
_, df_aas["pval_fdr_bh"], _, _ = multipletests(df_aas["pval"], 0.05, method='fdr_bh')

# Two figures per age type:
#   * a violin plot of the acceleration in both regions (FDR-corrected p-value from df_aas
#     is passed to add_layout as the title text),
#   * a scatter of the age estimate against Age with the "Central" linear prediction drawn
#     as a line.
dist_num_bins = 15
pathlib.Path(f"{path_save}/accelerations").mkdir(parents=True, exist_ok=True)


def _region_violin(values, name, color, num_bins):
    """Return a violin trace for one region.

    values: 1-D array of the plotted quantity; name: legend label; color: fill and marker
    colour; num_bins: divisor applied to the value range to obtain the bandwidth.
    """
    return go.Violin(
        y=values,
        name=name,
        box_visible=True,
        meanline_visible=True,
        showlegend=True,
        line_color='black',
        fillcolor=color,
        marker=dict(color=color, line=dict(color='black', width=0.3), opacity=0.8),
        points='all',
        bandwidth=np.ptp(values) / num_bins,
        opacity=0.8,
    )


# `df` is only read in this loop, so the region masks can be built once.
is_central = df["Region"] == "Central"
is_yakutia = df["Region"] == "Yakutia"

for age_type in tqdm(age_types):
    acc_col = f"{age_type}Acc"
    vals_central = df.loc[is_central, acc_col].values
    vals_yakutia = df.loc[is_yakutia, acc_col].values

    # Violin plot of the acceleration.
    fig = go.Figure()
    fig.add_trace(_region_violin(vals_central, "Central", "cyan", dist_num_bins))
    fig.add_trace(_region_violin(vals_yakutia, "Yakutia", "magenta", dist_num_bins))
    add_layout(fig, "", f"{age_type}Acc", f"p-value: {df_aas.at[f'{age_type}Acc', 'pval_fdr_bh']:0.2e}")
    fig.update_layout(
        title_xref='paper',
        legend_font_size=20,
        legend_itemsizing='constant',
        margin=go.layout.Margin(l=110, r=20, b=50, t=90, pad=0),
        legend_y=1.01,
    )
    save_figure(fig, f"{path_save}/accelerations/violin_{age_type}Acc")

    # Scatter of the age estimate against chronological Age; the colorway assigns
    # cyan to the two "Central" traces and magenta to the "Yakutia" trace.
    age_central = df.loc[is_central, "Age"].values
    age_yakutia = df.loc[is_yakutia, "Age"].values
    fig = go.Figure()
    add_scatter_trace(fig, age_central, df.loc[is_central, f"{age_type}"].values, "Central")
    add_scatter_trace(fig, age_central, df.loc[is_central, f"{age_type}_linear_pred"].values, "", "lines")
    add_scatter_trace(fig, age_yakutia, df.loc[is_yakutia, f"{age_type}"].values, "Yakutia")
    add_layout(fig, "Age", f"{age_type}", "")
    fig.update_layout(
        colorway=['cyan', 'cyan', 'magenta'],
        legend_font_size=20,
        legend_itemsizing='constant',
        margin=go.layout.Margin(l=80, r=20, b=80, t=65, pad=0),
    )
    # No explicit axis ranges are set here (a fixed-range variant was previously
    # commented out at this point).
    save_figure(fig, f"{path_save}/accelerations/scatter_{age_type}")
