In [1]:
import pandas as pd
from scripts.python.routines.manifest import get_manifest
from scripts.python.preprocessing.serialization.routines.pheno_betas_checking import get_pheno_betas_with_common_subjects
import pathlib
from scripts.python.meta.tasks.GPL13534_Blood.routines import perform_test_for_controls
from tqdm import tqdm
import numpy as np
from scripts.python.routines.plot.layout import add_layout, get_axis
from scripts.python.routines.plot.save import save_figure
import plotly.graph_objects as go
import plotly.express as px

In [2]:
thld_above = 0.5
thld_below = 0.05

path = f"E:/YandexDisk/Work/pydnameth/datasets"
datasets_info = pd.read_excel(f"{path}/datasets.xlsx", index_col='dataset')
manifest = get_manifest('GPL13534')

# disease = "Schizophrenia"
# dataset_statuses = {
#     'GSE84727': ['Control', 'Schizophrenia'],
#     'GSE80417': ['Control', 'Schizophrenia'],
#     'GSE152027': ['Control', 'Schizophrenia'],
#     'GSE116379': ['Control', 'Schizophrenia'],
#     'GSE41169': ['Control', 'Schizophrenia'],
#     'GSE116378': ['Control', 'Schizophrenia'],
#     'GSE87571': ['Control'],
# }
# datasets_trn_val = ['GSE84727', 'GSE80417']
# datasets_tst = ['GSE152027', 'GSE116379', 'GSE41169', 'GSE116378', 'GSE87571']

disease = "Parkinson"
dataset_statuses = {
    'GSE145361': ['Control', 'Parkinson'],
    'GSE111629': ['Control', 'Parkinson'],
}
datasets_trn_val = ['GSE145361']
datasets_tst = ['GSE111629']

task_name = f"GigaScienceLastHope/{disease}"
path_wd = f"{path}/meta/tasks/{task_name}"
pathlib.Path(f"{path_wd}/harmonized/cpgs/figs").mkdir(parents=True, exist_ok=True)
pathlib.Path(f"{path_wd}/harmonized/cpgs/diffs").mkdir(parents=True, exist_ok=True)
pathlib.Path(f"{path_wd}/non_harmonized/cpgs/figs").mkdir(parents=True, exist_ok=True)

In [3]:
# reading harmonized and non-harmonized data
pheno_all_nh = pd.DataFrame()
pheno_all_nh.index.name = 'subject_id'
mvals_all_nh = pd.DataFrame()
pheno_all_h = pd.DataFrame()
pheno_all_h.index.name = 'subject_id'
mvals_all_h = pd.DataFrame()
for d_id, dataset in enumerate(dataset_statuses):
    print(dataset)
    pheno = pd.read_pickle(f"{path_wd}/origin/pheno_{dataset}.pkl")
    pheno_cols = pheno.columns.values

    # non-harmonized
    mvals_nh = pd.read_pickle(f"{path_wd}/origin/mvalsT_{dataset}.pkl")
    mvals_nh = mvals_nh.T
    mvals_nh_cols = mvals_nh.columns.values
    df_nh = pd.merge(pheno, mvals_nh, left_index=True, right_index=True)
    pheno_nh = df_nh.loc[:, pheno_cols]
    mvals_nh = df_nh.loc[:, mvals_nh_cols]
    pheno_all_nh = pheno_all_nh.append(pheno_nh, verify_integrity=True)
    mvals_nh = mvals_nh.T
    if d_id == 0:
        mvals_all_nh = mvals_nh
    else:
        mvals_all_nh = mvals_all_nh.merge(mvals_nh, how='inner', left_index=True, right_index=True)

    # harmonized
    mvals_h = pd.read_csv(f"{path_wd}/harmonized/r/mvalsT_{dataset}_regRCPqn.txt", delimiter="\t", index_col='ID_REF')
    mvals_h = mvals_h.T
    mvals_h_cols = mvals_h.columns.values
    df_h = pd.merge(pheno, mvals_h, left_index=True, right_index=True)
    pheno_h = df_h.loc[:, pheno_cols]
    mvals_h = df_h.loc[:, mvals_h_cols]
    pheno_all_h = pheno_all_h.append(pheno_h, verify_integrity=True)
    mvals_h = mvals_h.T
    if d_id == 0:
        mvals_all_h = mvals_h
    else:
        mvals_all_h = mvals_all_h.merge(mvals_h, how='inner', left_index=True, right_index=True)

mvals_all_nh = mvals_all_nh.T
mvals_all_nh.index.name = "subject_id"
mvals_all_nh = mvals_all_nh.astype('float32')
print(f"Number of total subjects in mvals_all_nh: {mvals_all_nh.shape[0]}")
print(f"Number of total CpGs in mvals_all_nh: {mvals_all_nh.shape[1]}")
pheno_all_nh, mvals_all_nh = get_pheno_betas_with_common_subjects(pheno_all_nh, mvals_all_nh)
feats_nh = pheno_all_nh.columns.values
cpgs_nh = mvals_all_nh.columns.values
df_all_nh = pd.merge(pheno_all_nh, mvals_all_nh, left_index=True, right_index=True)

mvals_all_h = mvals_all_h.T
mvals_all_h.index.name = "subject_id"
mvals_all_h = mvals_all_h.astype('float32')
print(f"Number of total subjects in mvals_all_h: {mvals_all_h.shape[0]}")
print(f"Number of total CpGs in mvals_all_h: {mvals_all_h.shape[1]}")
pheno_all_h, mvals_all_h = get_pheno_betas_with_common_subjects(pheno_all_h, mvals_all_h)
feats_h = pheno_all_h.columns.values
cpgs_h = mvals_all_h.columns.values
df_all_h = pd.merge(pheno_all_h, mvals_all_h, left_index=True, right_index=True)

# Check indexes
if df_all_nh.shape != df_all_h.shape:
    raise ValueError(f"Wrong shape")
if list(df_all_nh.index.values) != list(df_all_h.index.values):
    raise ValueError(f"Wrong indexes")

GSE145361



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



GSE111629



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Number of total subjects in mvals_all_nh: 2460
Number of total CpGs in mvals_all_nh: 411867
In pheno and betas subjects are the same
In pheno and betas subjects have the same order
Number of total subjects in mvals_all_h: 2460
Number of total CpGs in mvals_all_h: 411867
In pheno and betas subjects are the same
In pheno and betas subjects have the same order


In [4]:
# Save dfs
df_trn_val_nh = df_all_nh.loc[df_all_nh['Dataset'].isin(datasets_trn_val), :]
print(f"df_trn_val_nh shape: {df_trn_val_nh.shape}")
df_trn_val_nh.to_pickle(f"{path_wd}/non_harmonized/data_trn_val.pkl")
df_tst_nh = df_all_nh.loc[df_all_nh['Dataset'].isin(datasets_tst), :]
print(f"df_tst_nh shape: {df_tst_nh.shape}")
df_tst_nh.to_pickle(f"{path_wd}/non_harmonized/data_tst.pkl")

df_trn_val_h = df_all_h.loc[df_all_h['Dataset'].isin(datasets_trn_val), :]
print(f"df_trn_val_h shape: {df_trn_val_h.shape}")
df_trn_val_h.to_pickle(f"{path_wd}/harmonized/data_trn_val.pkl")
df_tst_h = df_all_h.loc[df_all_h['Dataset'].isin(datasets_tst), :]
print(f"df_tst_h shape: {df_tst_h.shape}")
df_tst_h.to_pickle(f"{path_wd}/harmonized/data_tst.pkl")

df_trn_val_nh shape: (1889, 411869)
df_tst_nh shape: (571, 411869)
df_trn_val_h shape: (1889, 411869)
df_tst_h shape: (571, 411869)


In [5]:
# Calc statistics
cpgs_metrics_harmonized_df = perform_test_for_controls(list(dataset_statuses.keys()), manifest, df_all_h, cpgs_h, f"{path_wd}/harmonized/cpgs/figs", "M value")
for cpg_id, cpg in enumerate(tqdm(cpgs_h)):
    cpgs_metrics_harmonized_df.loc[cpg, "mean"] = df_all_h[cpg].mean()
    cpgs_metrics_harmonized_df.loc[cpg, "median"] = df_all_h[cpg].median()
cpgs_metrics_harmonized_df.to_excel(f"{path_wd}/harmonized/cpgs/{cpgs_metrics_harmonized_df.shape[0]}.xlsx", index=True)

cpgs_metrics_origin_df = pd.read_excel(f"{path_wd}/origin/cpgs/{cpgs_metrics_harmonized_df.shape[0]}.xlsx", index_col="features")
cpgs_info = cpgs_metrics_origin_df.merge(cpgs_metrics_harmonized_df, left_index=True, right_index=True, suffixes=('_origin', '_harmonized'))
cpgs_info['log_diff_harmonized'] = np.log10(cpgs_info.loc[:, 'pval_fdr_bh_harmonized'].values) - np.log10(cpgs_info.loc[:, 'pval_fdr_bh_origin'].values)
cpgs_changed = cpgs_info.loc[(cpgs_info['pval_fdr_bh_harmonized'] > thld_above) & (cpgs_info['pval_fdr_bh_origin'] < thld_below), :]
cpgs_changed.sort_values(['log_diff_harmonized'], ascending=[False], inplace=True)
cpgs_changed.to_excel(f"{path_wd}/harmonized/cpgs/cpgs_changed_{thld_above}_{thld_below}.xlsx", index=True)

100%|██████████| 411867/411867 [16:49<00:00, 407.86it/s]
100%|██████████| 411867/411867 [02:46<00:00, 2473.78it/s]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [8]:
# plotting
cpgs_to_plot_df = cpgs_changed.head(20)
for cpg_id, (cpg, row) in enumerate(cpgs_to_plot_df.iterrows()):
    dist_num_bins = 25
    pval = row['pval_fdr_bh_origin']
    gene = manifest.at[cpg, 'Gene']
    fig = go.Figure()
    for dataset in dataset_statuses:
        vals = df_all_nh.loc[(df_all_nh['Status'] == 'Control') & (df_all_nh['Dataset'] == dataset), cpg].values
        fig.add_trace(
            go.Violin(
                y=vals,
                name=dataset,
                box_visible=True,
                meanline_visible=True,
                showlegend=False,
                marker=dict(line=dict(width=0.3), opacity=0.8),
                points='all',
                bandwidth=np.ptp(vals) / dist_num_bins,
                opacity=0.8
            )
        )
    add_layout(fig, "", "M value", f"{cpg} ({gene})<br>p-value: {pval:0.2e}")
    fig.update_layout(title_xref='paper')
    fig.update_layout(legend_font_size=20)
    fig.update_xaxes(tickfont_size=15)
    fig.update_layout({'colorway': px.colors.qualitative.Set1})
    fig.update_layout(
        margin=go.layout.Margin(
            l=110,
            r=20,
            b=50,
            t=80,
            pad=0
        )
    )
    fig.update_layout(
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.25,
            xanchor="center",
            x=0.5
        )
    )
    save_figure(fig, f"{path_wd}/harmonized/cpgs/diffs/{cpg_id:03d}_{cpg}_origin")

    pval = row['pval_fdr_bh_harmonized']
    gene = manifest.at[cpg, 'Gene']
    fig = go.Figure()
    for dataset in dataset_statuses:
        vals = df_all_h.loc[(df_all_h['Status'] == 'Control') & (df_all_h['Dataset'] == dataset), cpg].values
        fig.add_trace(
            go.Violin(
                y=vals,
                name=dataset,
                box_visible=True,
                meanline_visible=True,
                showlegend=False,
                marker=dict(line=dict(width=0.3), opacity=0.8),
                points='all',
                bandwidth=np.ptp(vals) / dist_num_bins,
                opacity=0.8
            )
        )
    add_layout(fig, "", "M value", f"{cpg} ({gene})<br>p-value: {pval:0.2e}")
    fig.update_layout(title_xref='paper')
    fig.update_layout(legend_font_size=20)
    fig.update_xaxes(tickfont_size=15)
    fig.update_layout({'colorway': px.colors.qualitative.Set1})
    fig.update_layout(
        margin=go.layout.Margin(
            l=110,
            r=20,
            b=50,
            t=80,
            pad=0
        )
    )
    fig.update_layout(
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.25,
            xanchor="center",
            x=0.5
        )
    )
    save_figure(fig, f"{path_wd}/harmonized/cpgs/diffs/{cpg_id:03d}_{cpg}_harmonized")