In [None]:
import pandas as pd
import numpy as np
import scipy
from scripts.python.routines.betas import betas_drop_na
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scripts.python.routines.betas import betas_drop_na
from plotly.subplots import make_subplots
from scipy import stats
import pickle
import random
import plotly.express as px
import copy
import statsmodels.formula.api as smf
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scripts.python.pheno.datasets.filter import filter_pheno
from scripts.python.pheno.datasets.features import get_column_name, get_status_dict, get_sex_dict
from scripts.python.routines.plot.scatter import add_scatter_trace
import plotly.graph_objects as go
import pathlib
from scripts.python.routines.manifest import get_manifest
from scripts.python.routines.plot.save import save_figure
from scripts.python.routines.plot.layout import add_layout, get_axis
from scripts.python.routines.plot.p_value import add_p_value_annotation
from statsmodels.stats.multitest import multipletests
from sklearn.metrics import mean_absolute_error
from impyute.imputation.cs import fast_knn, mean, median, random, mice, mode, em
import plotly.io as pio
pio.kaleido.scope.mathjax = None
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=False)
import seaborn as sns
import plotly.io as pio
pio.kaleido.scope.mathjax = None
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=False)
import matplotlib.pyplot as plt
import pathlib

# Read the data

In [None]:
# Dataset selection and the on-disk root of all dataset files.
dataset = "GSEUNN"
path = "E:/YandexDisk/Work/pydnameth/datasets"
datasets_info = pd.read_excel(f"{path}/datasets.xlsx", index_col='dataset')
platform = datasets_info.loc[dataset, 'platform']
manifest = get_manifest(platform, path=path)

# Biomarker names, plus the suffixed column names used to tell the two variants apart
# once both tables are merged into a single frame.
feats = pd.read_excel(f"{path}/{platform}/{dataset}/data/immuno/immuno_markers_genes.xlsx")['gene'].values.tolist()
feats_raw_dict = {gene: f"{gene}_raw" for gene in feats}
feats_harm_dict = {gene: f"{gene}_harm" for gene in feats}

# First table: biomarker columns only, renamed to "<gene>_raw".
df_raw = (
    pd.read_excel(
        f"{path}/{platform}/{dataset}/special/021_ml_data/immuno/260_imp(fast_knn)_replace(quarter).xlsx",
        index_col="index",
    )
    .loc[:, feats]
    .rename(columns=feats_raw_dict)
)

# Second table: biomarker columns renamed to "<gene>_harm"; the phenotype columns
# (Age, Sex, Status, Region) are not in the rename mapping and keep their names.
df_harm = (
    pd.read_excel(
        f"{path}/{platform}/{dataset}/special/021_ml_data/immuno/global_data/all_1052_121222/df_samples(ctrl_550_from_all_1052_121222)_proc(minmax_left(0.0)_right(1.0)_combat)_imp(fast_knn)_replace(quarter).xlsx",
        index_col="index",
    )
    .loc[:, feats + ["Age", "Sex", "Status", "Region"]]
    .rename(columns=feats_harm_dict)
)

# Join on the sample index; with the default join type only samples present in both
# tables are kept.
df = df_raw.merge(df_harm, left_index=True, right_index=True)
print(df.shape)

# Output directory for everything this notebook saves.
path_save = f"{path}/{platform}/{dataset}/special/039_harmonized_vs_raw"
pathlib.Path(path_save).mkdir(parents=True, exist_ok=True)

# Correlation of biomarkers with age

In [None]:
# Per-biomarker Pearson correlation with age, computed separately for the raw and the
# harmonized values, followed by Benjamini-Hochberg FDR correction across biomarkers.
path_local = "age_corr"
pathlib.Path(f"{path_save}/{path_local}").mkdir(parents=True, exist_ok=True)

# NOTE: the column names contain "mw", but the p-values stored here come from
# stats.pearsonr. The names are kept unchanged because the plotting cell and the
# exported table refer to them.
# dtype=float keeps the p-values numeric; without it the frame is allocated with
# object columns and multipletests would receive an object array.
df_stat = pd.DataFrame(
    index=feats,
    columns=["raw_mw_pval", "raw_mw_pval_fdr_bh", "harm_mw_pval", "harm_mw_pval_fdr_bh"],
    dtype=float,
)
df_stat.index.name = "feat"

# Age does not depend on the biomarker, so it is read once outside the loop.
ages = df.loc[:, "Age"].values
for feat in feats:
    feat_raw = feats_raw_dict[feat]
    feat_harm = feats_harm_dict[feat]
    raw_vals = df.loc[:, feat_raw].values
    harm_vals = df.loc[:, feat_harm].values
    # Only the p-value is kept; the correlation coefficient is discarded.
    _, raw_pval = stats.pearsonr(raw_vals, ages)
    _, harm_pval = stats.pearsonr(harm_vals, ages)
    df_stat.at[feat, "raw_mw_pval"] = raw_pval
    df_stat.at[feat, "harm_mw_pval"] = harm_pval

# Second element of the multipletests result is the array of corrected p-values.
_, df_stat["raw_mw_pval_fdr_bh"], _, _ = multipletests(df_stat["raw_mw_pval"].values, alpha=0.05, method='fdr_bh')
_, df_stat["harm_mw_pval_fdr_bh"], _, _ = multipletests(df_stat["harm_mw_pval"].values, alpha=0.05, method='fdr_bh')
df_stat.to_excel(f"{path_save}/{path_local}/table.xlsx", index=True)

In [None]:
# Horizontal bar charts of -log10(FDR-corrected p-value) per biomarker: one chart for the
# raw values and one for the harmonized values, each saved as PNG and PDF.
df_stat.index.name = "Biomarker"

log_pval_label = r'$ -\log_{10}(\mathrm{p-value})$'
for variant in ("raw", "harm"):
    fdr_column = f"{variant}_mw_pval_fdr_bh"

    # Smallest corrected p-values first. The sort is in place, so after the loop
    # df_stat keeps the ordering (and the helper column) of the last variant.
    df_stat.sort_values([fdr_column], ascending=[True], inplace=True)
    df_stat[log_pval_label] = -np.log10(df_stat.loc[:, fdr_column].values.astype(float))

    plt.figure(figsize=(10, 20))
    sns.set_theme(style='whitegrid', font_scale=2)
    sns.barplot(
        data=df_stat,
        y=df_stat.index,
        x=log_pval_label,
        edgecolor='black',
        orient="h",
        dodge=False
    )

    out_stem = f"{path_save}/{path_local}/{variant}_bar_mw_pval_fdr_bh"
    for extension in ("png", "pdf"):
        plt.savefig(f"{out_stem}.{extension}", bbox_inches='tight')
    plt.close()

# Scatter and violin plots

In [None]:
# For every biomarker save two figures: violins of the raw and harmonized distributions,
# and a raw-vs-harmonized scatter with the y = x line drawn over the shared axis range.
path_local = "plots"
pathlib.Path(f"{path_save}/{path_local}").mkdir(parents=True, exist_ok=True)


def canvas_settings():
    """Return the figure size and margin settings shared by both figure types."""
    return dict(
        width=800,
        height=800,
        margin=go.layout.Margin(l=150, r=50, b=150, t=50, pad=0),
    )


for feat in feats:
    feat_raw = feats_raw_dict[feat]
    feat_harm = feats_harm_dict[feat]

    # Common value range of both variants, widened by 5% of its span on each side.
    both_variants = df[[feat_raw, feat_harm]]
    lowest = both_variants.min().min()
    highest = both_variants.max().max()
    padding = 0.05 * (highest - lowest)
    axis_range = [lowest - padding, highest + padding]

    # --- Violin figure: one violin per variant ---
    fig = go.Figure()
    for label, column, color in (("Raw", feat_raw, "red"), ("Harmonized", feat_harm, "blue")):
        values = df.loc[:, column]
        fig.add_trace(
            go.Violin(
                y=values,
                name=label,
                box_visible=True,
                meanline_visible=True,
                showlegend=False,
                line_color='black',
                fillcolor=color,
                marker=dict(color=color, line=dict(color='black', width=0.3), opacity=0.8),
                points='all',
                # Kernel bandwidth is tied to the value range of the plotted column.
                bandwidth=np.ptp(values) / 25,
                opacity=0.8,
            )
        )
    add_layout(fig, "", f"{feat}", "")
    fig.update_layout(title_xref='paper', legend_font_size=20)
    fig.update_layout(legend={'itemsizing': 'constant'}, **canvas_settings())
    fig.update_layout(legend_y=1.01)
    save_figure(fig, f"{path_save}/{path_local}/violin_{feat}")

    # --- Scatter figure: raw on x, harmonized on y ---
    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=both_variants[feat_raw].values,
            y=both_variants[feat_harm].values,
            showlegend=False,
            name="",
            mode="markers",
            marker_color="red",
            marker=dict(
                size=8,
                opacity=0.85,
                color="red",
                line=dict(color="black", width=0.5),
            ),
        )
    )
    # Reference line y = x spanning the padded range.
    fig.add_trace(
        go.Scatter(
            x=axis_range,
            y=axis_range,
            showlegend=False,
            name="",
            mode="lines",
            marker_color="black",
            marker=dict(
                size=8,
                opacity=0.75,
                line=dict(color="black", width=0.5),
            ),
        )
    )
    add_layout(fig, "Raw", "Harmonized", f"{feat}")
    fig.update_layout(legend_font_size=20)
    # Fixed, identical ranges on both axes instead of plotly's automatic ranging.
    fig.update_xaxes(autorange=False)
    fig.update_yaxes(autorange=False)
    fig.update_layout(title_xref='paper')
    fig.update_layout(xaxis_range=axis_range, yaxis_range=axis_range)
    fig.update_layout(legend={'itemsizing': 'constant'}, **canvas_settings())
    save_figure(fig, f"{path_save}/{path_local}/scatter_{feat}")