In [1]:
import pandas as pd
from scripts.python.routines.manifest import get_manifest
from scripts.python.preprocessing.serialization.routines.pheno_betas_checking import get_pheno_betas_with_common_subjects
import pathlib
from scripts.python.meta.tasks.GPL13534_Blood.routines import perform_test_for_controls
from tqdm import tqdm
import numpy as np
from scripts.python.routines.plot.layout import add_layout, get_axis
from scripts.python.routines.plot.save import save_figure
import plotly.graph_objects as go
import plotly.express as px

# Setup

In [9]:
# Threshold constants; not referenced by the cells visible in this notebook.
# NOTE(review): presumably p-value cut-offs for the control tests -- confirm.
thld_above, thld_below = 0.5, 0.05

# Root folder holding the dataset registry and the per-task working directories.
path = "E:/YandexDisk/Work/pydnameth/datasets"
datasets_info = pd.read_excel(f"{path}/datasets.xlsx", index_col='dataset')
manifest = get_manifest('GPL13534')

# Disease selector: any value other than "Schizophrenia" selects the Parkinson setup.
disease = "Schizophrenia"
if disease != "Schizophrenia":
    datasets_trn_val = ['GSE145361', 'GSE111629']
    datasets_tst = ['GSE72774']
    target_cps = ['cg23835377', 'cg00488734']
    # Every dataset (train/validation first, then test) carries the same pair of status labels.
    dataset_statuses = {gse: ['Control', 'Parkinson'] for gse in datasets_trn_val + datasets_tst}
else:
    datasets_trn_val = ['GSE84727', 'GSE80417']
    datasets_tst = ['GSE152027', 'GSE116379']
    target_cps = ['cg04210544', 'cg04863850']
    # Every dataset (train/validation first, then test) carries the same pair of status labels.
    dataset_statuses = {gse: ['Control', 'Schizophrenia'] for gse in datasets_trn_val + datasets_tst}

# Working directory of the task and the output folder for figures / CSV exports.
task_name = f"GPL13534_Blood/{disease}"
path_wd = f"{path}/meta/tasks/{task_name}"
path_save = "E:/YandexDisk/Work/pydnameth/draft/03_somewhere/GigaDB/Figure3"
# Create the per-disease output folder up front so later saves cannot fail on a missing directory.
pathlib.Path(path_save, disease).mkdir(parents=True, exist_ok=True)

# Load data

In [11]:
# Empty placeholders kept from the original cell; nothing in this cell fills them.
pheno_trn_val = pd.DataFrame()
pheno_trn_val.index.name = 'subject_id'
mvals_trn_val = pd.DataFrame()

# Build one merged (phenotype + M-values) frame per train/validation dataset and
# concatenate them in a single call. DataFrame.append is deprecated (and removed in
# pandas 2.0); growing a frame inside a loop also re-copies all rows on every step.
# NOTE(review): read_pickle executes arbitrary code from the file -- only load trusted pickles.
origin_frames = []
for dataset in datasets_trn_val:
    pheno_origin = pd.read_pickle(f"{path_wd}/origin/pheno_trn_val_{dataset}.pkl")
    mvals_origin = pd.read_pickle(f"{path_wd}/origin/mvalsT_trn_val_{dataset}.pkl")
    # The M-values are transposed so that their index can be joined against the phenotype index.
    mvals_origin = mvals_origin.T
    origin_df_i = pd.merge(pheno_origin, mvals_origin, left_index=True, right_index=True)
    origin_frames.append(origin_df_i)

# verify_integrity=True raises if the same index label appears in more than one dataset.
# pd.concat rejects an empty list, so fall back to an empty frame in that case.
origin_df = pd.concat(origin_frames, verify_integrity=True) if origin_frames else pd.DataFrame()

# Harmonized train/validation data, loaded as a single pre-built frame.
df_trn_val = pd.read_pickle(f"{path_wd}/harmonized/data_trn_val.pkl")


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



# Plots and data

In [12]:
# Column names of the per-CpG/per-dataset exports ("before" = origin data, "after" = harmonized data).
data_cols = ['GSM'] + [f"{x}_{y}_before" for x in target_cps for y in datasets_trn_val] + [f"{x}_{y}_after" for x in target_cps for y in datasets_trn_val]
# The violin bandwidth is set to (range of the plotted values) / dist_num_bins.
dist_num_bins = 25


def _plot_controls(frame, cpg, stage):
    """Export and plot the control-sample values of one CpG, one violin per dataset.

    For every dataset in ``datasets_trn_val`` the rows of ``frame`` with
    ``Status == 'Control'`` and a matching ``Dataset`` are selected. Their index
    labels (column ``GSM``) and their ``cpg`` values are written to
    ``{path_save}/{disease}/{cpg}_{dataset}_{stage}.csv`` and added to the figure
    as a violin trace. The figure is saved as ``{path_save}/{disease}/{cpg}_{stage}``.

    Args:
        frame: DataFrame with 'Status' and 'Dataset' columns and a column named ``cpg``.
        cpg: CpG identifier; also used to look up the gene name in ``manifest``.
        stage: Suffix for column and file names ("before" or "after").

    Returns:
        The plotly figure that was saved.
    """
    gene = manifest.at[cpg, 'Gene']
    fig = go.Figure()
    for dataset in datasets_trn_val:
        # Build the mask from the same frame it is applied to, so that the exported
        # ids and values always describe the same rows.
        is_control = (frame['Status'] == 'Control') & (frame['Dataset'] == dataset)
        controls = frame.loc[is_control, cpg]
        gsms_i = controls.index.values
        vals_i = controls.values

        # The value column carries the stage suffix, matching the file it is written to.
        column = f"{cpg}_{dataset}_{stage}"
        data_df = pd.DataFrame({'GSM': gsms_i, column: vals_i})
        data_df.to_csv(f"{path_save}/{disease}/{column}.csv", index=False)

        fig.add_trace(
            go.Violin(
                y=vals_i,
                name=dataset,
                box_visible=True,
                meanline_visible=True,
                showlegend=False,
                marker=dict(line=dict(width=0.3), opacity=0.8),
                points='all',
                bandwidth=np.ptp(vals_i) / dist_num_bins,
                opacity=0.8
            )
        )
    add_layout(fig, "", "M value", f"{cpg} ({gene})")
    fig.update_layout(title_xref='paper')
    fig.update_layout(legend_font_size=20)
    fig.update_xaxes(tickfont_size=15)
    fig.update_layout({'colorway': px.colors.qualitative.Set1})
    fig.update_layout(
        margin=go.layout.Margin(
            l=110,
            r=20,
            b=50,
            t=80,
            pad=0
        )
    )
    fig.update_layout(
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.25,
            xanchor="center",
            x=0.5
        )
    )
    save_figure(fig, f"{path_save}/{disease}/{cpg}_{stage}")
    return fig


for cpg in target_cps:
    # Same export and figure for the origin data and for the harmonized data.
    fig = _plot_controls(origin_df, cpg, "before")
    fig = _plot_controls(df_trn_val, cpg, "after")