In [None]:
import pandas as pd
import numpy as np
import scipy
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scripts.python.routines.betas import betas_drop_na
from plotly.subplots import make_subplots
from scipy import stats
import pickle
import random
import plotly.express as px
import copy
import statsmodels.formula.api as smf
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scripts.python.pheno.datasets.filter import filter_pheno
from scripts.python.pheno.datasets.features import get_column_name, get_status_dict, get_sex_dict
from scripts.python.routines.plot.scatter import add_scatter_trace
from scipy.stats import mannwhitneyu
import plotly.graph_objects as go
import pathlib
from scripts.python.routines.manifest import get_manifest
from scripts.python.routines.plot.save import save_figure
from scripts.python.routines.plot.layout import add_layout, get_axis
from scripts.python.routines.plot.p_value import add_p_value_annotation
from statsmodels.stats.multitest import multipletests
from sklearn.metrics import mean_absolute_error
import plotly.io as pio
pio.kaleido.scope.mathjax = None
from tqdm.notebook import tqdm
import upsetplot as upset
import matplotlib.pyplot as plt

# Read data

In [None]:
# Dataset selection and storage layout. `path` is an absolute local folder that holds
# datasets.xlsx plus one sub-folder per platform/dataset.
dataset = "GSEUNN"
path = "E:/YandexDisk/Work/pydnameth/datasets"
datasets_info = pd.read_excel(f"{path}/datasets.xlsx", index_col='dataset')
platform = datasets_info.loc[dataset, 'platform']
manifest = get_manifest(platform)

# Output folder of this report; the violin-plot sub-folder is created up front.
path_save = f"{path}/{platform}/{dataset}/special/029_report_mega_summer_2022"
pathlib.Path(f"{path_save}/1_2_sex_specific/vio").mkdir(parents=True, exist_ok=True)

# Phenotype column names, with spaces replaced by underscores.
status_col, age_col, sex_col = (
    get_column_name(dataset, feature).replace(' ', '_')
    for feature in ('Status', 'Age', 'Sex')
)
status_dict = get_status_dict(dataset)
sex_dict = get_sex_dict(dataset)
status_passed_fields = status_dict['Control'] + status_dict['Case']

# Variables handed to filter_pheno: no continuous ones, status and sex as categorical.
continuous_vars = {}
categorical_vars = {
    status_col: [field.column for field in status_passed_fields],
    sex_col: list(sex_dict.values()),
}

# Load and filter the phenotype table, load the betas, then join them on their shared index.
pheno = filter_pheno(
    dataset,
    pd.read_pickle(f"{path}/{platform}/{dataset}/pheno_xtd.pkl"),
    continuous_vars,
    categorical_vars,
)
betas = betas_drop_na(pd.read_pickle(f"{path}/{platform}/{dataset}/betas.pkl"))
df = pheno.merge(betas, left_index=True, right_index=True)

# Split the merged table by the 'Group' column.
ctrl, esrd = (df.loc[df['Group'] == label] for label in ('Control', 'ESRD'))

# 1.2 Sex-specific

In [None]:
# For every CpG, compare methylation of female vs. male control samples with a
# Mann-Whitney U test, adjust the p-values with Benjamini-Hochberg and export the table.
cpgs = betas.columns.values
df_res = pd.DataFrame(index=cpgs, columns=['Gene', 'stat', 'pval', 'pval_fdr_bh'])
# The results table is the one indexed by CpG IDs, so it is the one whose index gets
# the 'CpG' label (and header in the exported sheet); `df` is indexed by samples.
df_res.index.name = 'CpG'

# The sex masks do not depend on the CpG: build them once instead of on every iteration.
ctrl_is_f = ctrl["Sex"] == "F"
ctrl_is_m = ctrl["Sex"] == "M"

for cpg in tqdm(cpgs, desc='Mann-Whitney U test', total=len(cpgs)):
    df_res.at[cpg, 'Gene'] = manifest.at[cpg, 'Gene']
    data_1 = ctrl.loc[ctrl_is_f, cpg].values
    data_2 = ctrl.loc[ctrl_is_m, cpg].values
    # State the alternative explicitly: the default differs between SciPy versions.
    stat, pval = mannwhitneyu(data_1, data_2, alternative='two-sided')
    df_res.at[cpg, 'stat'] = stat
    df_res.at[cpg, 'pval'] = pval

# Cell-wise assignment leaves these columns with object dtype; make them numeric so
# the correction, sorting and thresholding below operate on real floats.
df_res['stat'] = df_res['stat'].astype(float)
df_res['pval'] = df_res['pval'].astype(float)

_, df_res['pval_fdr_bh'], _, _ = multipletests(df_res['pval'], 0.05, method='fdr_bh')
df_res.to_excel(f"{path_save}/1_2_sex_specific/mw.xlsx")

In [None]:
# Violin plots (female vs. male controls) for the CpGs with the smallest
# Benjamini-Hochberg adjusted p-values; one figure is saved per CpG.
n_top = 10
dist_num_bins = 25  # the violin bandwidth is the value range divided by this number
df_res_top = df_res.sort_values(['pval_fdr_bh'], ascending=[True]).head(n_top)
for cpg_id, (cpg, row) in enumerate(df_res_top.iterrows()):
    pval = row['pval_fdr_bh']  # adjusted value, shown in the title as "p-value"
    gene = manifest.at[cpg, 'Gene']
    fig = go.Figure()
    # One violin per sex; trace order (F, then M) pairs with the red/blue colorway below.
    for sex_label in ("F", "M"):
        vals = ctrl.loc[ctrl["Sex"] == sex_label, cpg].values
        fig.add_trace(
            go.Violin(
                y=vals,
                name=sex_label,
                box_visible=True,
                meanline_visible=True,
                showlegend=False,
                marker=dict(line=dict(width=0.3), opacity=1),
                points='all',
                bandwidth=np.ptp(vals) / dist_num_bins,
                opacity=0.8
            )
        )
    add_layout(fig, "", "Methylation", f"{cpg} ({gene})<br>p-value: {pval:0.2e}")
    fig.update_layout(
        title_xref='paper',
        legend_font_size=20,
        colorway=['red', 'blue'],
        # Passed as a Margin object (not a dict) so the whole margin is replaced.
        margin=go.layout.Margin(l=110, r=20, b=50, t=80, pad=0),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.25,
            xanchor="center",
            x=0.5
        ),
    )
    fig.update_xaxes(tickfont_size=15)
    save_figure(fig, f"{path_save}/1_2_sex_specific/vio/{cpg_id:03d}_{cpg}")

In [None]:
# External CpG lists, read from supplementary tables stored in the results folder
# (labelled 'McCarthy2014' and 'Grant2022' below), plus the CpGs of this analysis
# whose adjusted p-value is below 0.05.
df_sa_2014 = pd.read_excel(f"{path_save}/1_2_sex_specific/12864_2014_6710_MOESM4_ESM.xlsx", index_col="Target ID")
df_sa_2022 = pd.read_csv(f"{path_save}/1_2_sex_specific/13148_2022_1279_MOESM1_ESM.csv", index_col="Row.names")
df_sa_unn = df_res.loc[df_res['pval_fdr_bh'] < 0.05, :]

cpgs_sa_2014 = df_sa_2014.index.values
cpgs_sa_2022 = df_sa_2022.index.values
cpgs_sa_unn = df_sa_unn.index.values

cpgs_lists = dict(
    McCarthy2014=cpgs_sa_2014,
    Grant2022=cpgs_sa_2022,
    UNN=cpgs_sa_unn,
)

# Union of all CpG IDs; both names refer to the same set object.
results_union = set().union(*cpgs_lists.values())
all_cpgs = results_union

# One boolean membership column per list, then moved into the index (the layout
# that upsetplot consumes).
df_upset = pd.DataFrame(index=all_cpgs)
for source_name, source_cpgs in cpgs_lists.items():
    df_upset[source_name] = df_upset.index.isin(source_cpgs)
df_upset = df_upset.set_index(list(cpgs_lists))

fig = upset.UpSet(df_upset, subset_size='count', show_counts=True, min_degree=1).plot()
plt.savefig(f"{path_save}/1_2_sex_specific/upset.png", bbox_inches='tight')
plt.savefig(f"{path_save}/1_2_sex_specific/upset.pdf", bbox_inches='tight')