In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import pickle
import plotly.express as px
import statsmodels.formula.api as smf
import plotly.graph_objects as go
from scripts.python.routines.manifest import get_manifest
from scripts.python.routines.plot.save import save_figure
from scripts.python.routines.plot.layout import add_layout
from statsmodels.stats.multitest import multipletests
import plotly.io as pio
pio.kaleido.scope.mathjax = None
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=False)
from scipy.stats import mannwhitneyu, median_test
import matplotlib.pyplot as plt
import pathlib
from tqdm import tqdm
from src.utils.plot.bioinfokit import mhat, volcano
import gseapy as gp
import mygene
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA, TruncatedSVD
from sklearn.decomposition import MiniBatchDictionaryLearning, FastICA
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.manifold import MDS, Isomap, TSNE, LocallyLinearEmbedding
import upsetplot as upset
import missingno as msno
from pyod.models.lunar import LUNAR
from matplotlib_venn import venn2, venn2_circles
from glob import glob
from hydra import compose, initialize
from omegaconf import OmegaConf
import omegaconf
import os
import ast

# 0. Setup

In [None]:
# Root of the local dataset storage; every other location is derived from it.
path = "D:/YandexDisk/Work/pydnameth/datasets"
path_dataset = f"{path}/GPL21145/GSEUNN"
path_save = f"{path_dataset}/special/042_agena"

# Make sure the output directory for this analysis exists before anything is saved.
pathlib.Path(path_save).mkdir(parents=True, exist_ok=True)

In [None]:
# Load the platform manifest, the phenotype table and the beta values of the
# dataset, then join phenotypes and betas on their shared sample index.
dataset = "GSEUNN"
# Reuse the shared storage root instead of repeating the absolute path literal,
# so relocating the data only requires changing `path`.
datasets_info = pd.read_excel(f"{path}/datasets.xlsx", index_col='dataset')
platform = datasets_info.loc[dataset, 'platform']
manifest = get_manifest(platform, path=path)
# Drop the first three characters of 'chr'
# (presumably a "chr" prefix - verify against the manifest contents).
manifest['CHR'] = manifest['chr'].str[3:]

# Suffix selecting which betas file is loaded below.
dnam_suffix = "_harm"

pheno = pd.read_excel(f"{path}/{platform}/{dataset}/pheno.xlsx", index_col="index")
pheno.index.name = "index"
# Exclude the "I64_old" row; raises KeyError if it is absent from the table.
pheno = pheno.drop(index="I64_old")
# NOTE(review): unpickling executes arbitrary code - only load files from a trusted source.
betas = pd.read_pickle(f"{path}/{platform}/{dataset}/betas{dnam_suffix}.pkl")
feats_dnam = betas.columns.values
# Inner join: only samples present in both the phenotype table and the betas are kept.
df_dnam = pd.merge(pheno, betas, left_index=True, right_index=True)

In [None]:
# Load the Agena measurements and restrict them to the CpGs that are also
# present among the methylation-array beta columns.
path_agena = f"{path_dataset}/data/agena"
df_agena = pd.read_excel(f"{path_agena}/source(данные_для_обработки)_date(140123).xlsx", index_col="index")
feats_agena = pd.read_excel(f"{path_agena}/feats.xlsx")['features'].values
# Sort the intersection: iteration order of a set of strings is arbitrary and
# changes between interpreter runs, which would reshuffle exported columns,
# violin order and the numeric prefix of the per-CpG figure file names.
feats_common = sorted(set(feats_dnam).intersection(feats_agena))
# Scale by 0.01 (presumably percent -> 0..1 beta scale - confirm with the data source).
df_agena = df_agena.loc[:, feats_common] * 0.01

# Samples measured on both platforms, and samples that only have Agena data.
index_common = sorted(set(df_agena.index.values).intersection(df_dnam.index.values))
index_agena_only = set(df_agena.index.values) - set(df_dnam.index.values)

In [None]:
pathlib.Path(f"{path_save}/samples").mkdir(parents=True, exist_ok=True)

# Relative Agena-vs-850K difference in percent: one row per shared sample, one
# column per common CpG. Columns are allocated up front so that a CpG without
# any usable Agena value still appears in the table (as NaN) instead of being
# absent, which keeps later column lookups valid.
rel_diff_df = pd.DataFrame(index=index_common, columns=sorted(feats_common), dtype='float64')

for sample in index_common:
    # dropna() returns a new Series; the result must be assigned, otherwise
    # CpGs without an Agena measurement are never filtered out for the sample.
    agena_i = df_agena.loc[sample, feats_common].dropna()
    cpgs_i = sorted(set(agena_i.index.values).intersection(betas.columns.values))
    df_i = df_dnam.loc[[sample], cpgs_i]

    fig = go.Figure()
    for cpg_id, cpg in enumerate(cpgs_i):
        # Background violin: distribution of this CpG over all rows of df_dnam.
        distrib_i = df_dnam.loc[:, cpg].values
        fig.add_trace(
            go.Violin(
                x=[cpg] * len(distrib_i),
                y=distrib_i,
                box_visible=True,
                meanline_visible=True,
                line_color='grey',
                showlegend=False,
                opacity=1.0
            )
        )

        # Only the first CpG contributes legend entries, so each platform is listed once.
        showlegend = cpg_id == 0

        meth_epic = df_i.at[sample, cpg]
        meth_agena = agena_i.at[cpg]
        # Difference of Agena relative to the 850K value, in percent.
        rel_diff_df.at[sample, cpg] = (meth_agena - meth_epic) / meth_epic * 100.0

        # Marker for the 850K measurement of this sample.
        fig.add_trace(
            go.Scatter(
                x=[cpg],
                y=[meth_epic],
                showlegend=showlegend,
                name="850K",
                mode="markers",
                marker=dict(
                    size=15,
                    opacity=0.7,
                    line=dict(
                        width=1
                    ),
                    color='red'
                ),
            )
        )

        # Marker for the Agena measurement of this sample.
        fig.add_trace(
            go.Scatter(
                x=[cpg],
                y=[meth_agena],
                showlegend=showlegend,
                name="Agena",
                mode="markers",
                marker=dict(
                    size=12,
                    opacity=0.7,
                    line=dict(
                        width=1
                    ),
                    color='blue'
                ),
            )
        )

    add_layout(fig, f"", 'Methylation level', f"{sample}")
    fig.update_xaxes(tickangle=270, tickfont_size=15)
    fig.update_layout(margin=go.layout.Margin(
        l=120,
        r=20,
        b=120,
        t=90,
        pad=0
    ))
    save_figure(fig, f"{path_save}/samples/{sample}")

rel_diff_df.to_excel(f"{path_save}/rel_diff.xlsx", index=True)

# Single figure with one violin per CpG showing the distribution of the
# relative Agena-vs-850K difference (percent) across the shared samples.
fig = go.Figure()
for cpg in feats_common:
    # A CpG without a column has no recorded differences at all.
    if cpg not in rel_diff_df.columns:
        continue
    distrib_i = rel_diff_df.loc[index_common, cpg].dropna().astype('float64').values
    # np.ptp raises on an empty array, and an empty violin carries no
    # information, so CpGs without any non-missing difference are skipped.
    if distrib_i.size == 0:
        continue

    fig.add_trace(
        go.Violin(
            x=[cpg] * len(distrib_i),
            y=distrib_i,
            showlegend=False,
            box_visible=True,
            meanline_visible=True,
            line_color='black',
            line=dict(width=0.35),
            fillcolor='grey',
            marker=dict(color='grey', line=dict(color='black', width=0.3), opacity=0.8),
            points=False,
            # Kernel bandwidth set to 1/25 of the observed value range.
            bandwidth=np.ptp(distrib_i) / 25,
            opacity=0.8
        )
    )
add_layout(fig, "", "Relative difference, %", f"")
fig.update_xaxes(tickangle=270, tickfont_size=15)
fig.update_layout(
    title_xref='paper',
    margin=go.layout.Margin(
        l=120,
        r=20,
        b=120,
        t=50,
        pad=0
    ),
    legend=dict(
        itemsizing='constant',
        font=dict(size=20),
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="center",
        x=0.5
    )
)
save_figure(fig, f"{path_save}/rel_diff")

In [None]:

pathlib.Path(f"{path_save}/feats").mkdir(parents=True, exist_ok=True)

# Side-by-side table of 850K and Agena values for the shared samples, plus a
# two-sided Mann-Whitney U test per CpG comparing the two platforms.
# NOTE(review): the same samples are measured on both platforms, so a paired
# test may be more appropriate here - confirm the intended design.
pvals = []
values_dict = {'ID': index_common}
for cpg in feats_common:
    epic_column = df_dnam.loc[index_common, cpg]
    agena_column = df_agena.loc[index_common, cpg]

    # Raw values (missing entries included) go into the exported table.
    values_dict[f"{cpg}_850K"] = epic_column.values
    values_dict[f"{cpg}_agena"] = agena_column.values

    # Missing entries are removed independently per platform before testing.
    _, pval = mannwhitneyu(
        epic_column.dropna(how='all').values,
        agena_column.dropna(how='all').values,
        alternative='two-sided',
    )
    pvals.append(pval)

values_df = pd.DataFrame(values_dict).set_index("ID")
values_df.to_excel(f"{path_save}/values.xlsx", index=True)

# Benjamini-Hochberg FDR correction over all tested CpGs.
_, pvals_corr, _, _ = multipletests(pvals, 0.05, method='fdr_bh')
pvals_df = pd.DataFrame({'pvals': pvals, 'pvals_fdr_bh': pvals_corr}, index=feats_common)
pvals_df.to_excel(f"{path_save}/pvals.xlsx", index=True)

# One figure per CpG comparing the 850K and Agena beta-value distributions
# over the shared samples.
for cpg_id, cpg in enumerate(feats_common):

    # The value shown in the title is the Benjamini-Hochberg adjusted p-value.
    pval = pvals_df.at[cpg, 'pvals_fdr_bh']
    epic_data = df_dnam.loc[index_common, cpg].dropna(how='all').values
    agena_data = df_agena.loc[index_common, cpg].dropna(how='all').values

    fig = go.Figure()
    # Both platforms share the same violin styling; only data, label and colour differ.
    # NOTE(review): colours are swapped relative to the per-sample figures
    # (there 850K is red and Agena is blue) - confirm which mapping is intended.
    for trace_name, trace_values, trace_color in (
        ("850K", epic_data, 'blue'),
        ("Agena", agena_data, 'red'),
    ):
        fig.add_trace(
            go.Violin(
                y=trace_values,
                name=trace_name,
                box_visible=True,
                meanline_visible=True,
                showlegend=False,
                line_color='black',
                fillcolor=trace_color,
                marker=dict(color=trace_color, line=dict(color='black', width=0.3), opacity=0.8),
                points='all',
                # Kernel bandwidth set to 1/25 of the observed value range.
                bandwidth=np.ptp(trace_values) / 25,
                opacity=0.8
            )
        )

    gene = manifest.at[cpg, 'Gene']
    add_layout(fig, "", "Beta value", f"{cpg} ({gene})<br>p-value: {pval:0.2e}")
    fig.update_xaxes(tickfont_size=15)
    fig.update_layout(
        title_xref='paper',
        margin=go.layout.Margin(
            l=110,
            r=20,
            b=50,
            t=80,
            pad=0
        ),
        legend=dict(
            font=dict(size=20),
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="center",
            x=0.5
        )
    )
    # Zero-pad the running number: the "3d" format pads with spaces, which
    # yields file names with leading blanks; "03d" keeps them sortable instead.
    save_figure(fig, f"{path_save}/feats/{cpg_id:03d}_{cpg}")