In [None]:
import pandas as pd
import numpy as np
from plotly.subplots import make_subplots
from scipy import stats
import plotly.express as px
from scripts.python.routines.plot.scatter import add_scatter_trace
import plotly.graph_objects as go
from scripts.python.routines.plot.save import save_figure
from scripts.python.routines.plot.layout import add_layout, get_axis
from statsmodels.stats.multitest import multipletests
import plotly.io as pio
pio.kaleido.scope.mathjax = None
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=False)
import matplotlib.pyplot as plt
from omegaconf import OmegaConf
import seaborn as sns
from glob import glob
import pathlib
from sklearn.metrics import mean_absolute_error
from scipy import stats
import patchworklib as pw
import os
import functools
from statannotations.Annotator import Annotator
from scipy.stats import mannwhitneyu
import shap
from slugify import slugify
import matplotlib.lines as mlines
from scripts.python.routines.plot.p_value import add_p_value_annotation



def conjunction(conditions):
    """Combine all ``conditions`` with an element-wise logical AND.

    The conditions are folded pairwise, left to right, using
    ``np.logical_and``; a single condition is returned unchanged.
    """
    return functools.reduce(
        lambda combined, condition: np.logical_and(combined, condition),
        conditions,
    )


def disjunction(conditions):
    """Combine all ``conditions`` with an element-wise logical OR.

    The conditions are folded pairwise, left to right, using
    ``np.logical_or``; a single condition is returned unchanged.
    """
    return functools.reduce(
        lambda combined, condition: np.logical_or(combined, condition),
        conditions,
    )

# 1. Setup path

In [None]:
# Root directory for every input and output of this notebook.
# The location can be overridden with the SIMAGE_REVIEW_PATH environment
# variable so the notebook is not tied to one machine's drive layout; when the
# variable is not set, the original location is used.
path = os.environ.get(
    "SIMAGE_REVIEW_PATH",
    "D:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN/special/051_small_immuno_clocks_reviewer_3",
)
# Create the directory (and any missing parents); do nothing if it exists.
pathlib.Path(path).mkdir(parents=True, exist_ok=True)

# 2. Fill data with original sample names and add necessary columns

In [None]:

# Mapping table, indexed by its second column, and the source data table.
df_map = pd.read_excel(f"{path}/data_mapping.xlsx", index_col=1)
df = pd.read_excel(f"{path}/data_origin.xlsx", index_col=0)

# Attach the 'old_index' value of the mapping table to every row as 'sample_name'.
df.loc[df.index, 'sample_name'] = df_map.loc[df.index, 'old_index']

# Short code written to the 'Part' column for each value of 'Dataset'.
part_by_dataset = {
    'Train/Validation': 'trn_val',
    'Test Controls': 'tst_ctrl',
    'Test ESRD': 'tst_esrd',
}
for dataset_name, part_code in part_by_dataset.items():
    df.loc[df['Dataset'] == dataset_name, 'Part'] = part_code

# Persist the enriched table; later cells read it back from this file.
df.to_excel(f"{path}/data.xlsx", index_label='index')

# 3. Collect ML results

In [None]:
model = 'widedeep_ft_transformer_trn_val_tst'

path_runs = f"{path}/models/{model}/multiruns"

# glob returns matches in a file-system dependent order; sorting makes the row
# order of the summary (and the run used for `params`) reproducible.
files = sorted(glob(f"{path_runs}/*/*/metrics_all_best_*.xlsx"))
if not files:
    # Fail with an explicit message instead of an IndexError on files[0].
    raise FileNotFoundError(f"No 'metrics_all_best_*.xlsx' files found under {path_runs}")

# Columns read from every metrics file; each one becomes a "<metric>_<part>" column.
parts = ['val', 'trn', 'tst_ctrl', 'tst_esrd', 'trn_val', 'val_tst_ctrl', 'trn_val_tst_ctrl']

# Names of the overridden parameters of the first run.
# NOTE(review): `df_tmp` and `params` are not used further in this cell; they are
# retained in case other cells read them.
df_tmp = pd.read_excel(files[0], index_col="metric")
head, tail = os.path.split(files[0])
cfg = OmegaConf.load(f"{head}/.hydra/overrides.yaml")
# partition splits on the first '=' only, so values containing '=' stay intact.
params = [param_pair.partition('=')[0] for param_pair in cfg]

# Build one record per run and create the frame once, instead of growing it
# cell by cell with `.at`.
records = []
for file in files:

    head, tail = os.path.split(file)
    record = {}

    # Metrics: one value per (metric, part) pair.
    df_metrics = pd.read_excel(file, index_col="metric")
    for metric in df_metrics.index.values:
        for part in parts:
            record[f"{metric}_{part}"] = df_metrics.at[metric, part]

    # Params: the overrides stored next to the metrics file, kept as strings.
    cfg = OmegaConf.load(f"{head}/.hydra/overrides.yaml")
    for param_pair in cfg:
        param, _, val = param_pair.partition('=')
        record[param] = val

    records.append(record)

df_res = pd.DataFrame(records, index=files)

# Flag runs whose training MAE is larger than their validation MAE.
df_res["train_more_val"] = df_res["mean_absolute_error_trn"] > df_res["mean_absolute_error_val"]
# Placeholder column, initialised to False for every run.
df_res["selected"] = False

# Columns shown first in the summary; all remaining columns follow in their
# existing order.
first_columns = [
    'selected',
    'train_more_val',
    'mean_absolute_error_trn',
    'mean_absolute_error_val',
    'mean_absolute_error_tst_ctrl',
    'mean_absolute_error_val_tst_ctrl',
    'mean_absolute_error_trn_val_tst_ctrl',
    'pearson_corr_coef_trn',
    'pearson_corr_coef_val',
    'pearson_corr_coef_tst_ctrl',
    'pearson_corr_coef_val_tst_ctrl',
    'pearson_corr_coef_trn_val_tst_ctrl',
    'mean_absolute_error_cv_mean_trn',
    'mean_absolute_error_cv_std_trn',
    'mean_absolute_error_cv_mean_val',
    'mean_absolute_error_cv_std_val',
    'pearson_corr_coef_cv_mean_trn',
    'pearson_corr_coef_cv_std_trn',
    'pearson_corr_coef_cv_mean_val',
    'pearson_corr_coef_cv_std_val',
]
df_res = df_res[first_columns + [col for col in df_res.columns if col not in first_columns]]
df_res.to_excel(f"{path_runs}/summary.xlsx", index=True, index_label="file")

# 4. Comment 4: ESRD age acceleration in young group

In [None]:
pathlib.Path(f"{path}/comment_4").mkdir(parents=True, exist_ok=True)

feats = ['CXCL9', 'CSF1', 'IL6']
trgt = 'SImAge'

# Fixed y-axis limits for selected features; other features keep automatic limits.
y_limits = {
    'CSF1': (-100, 2200),
    'IL6': (-2, 21),
}

df = pd.read_excel(f"{path}/data.xlsx", index_col=0)

df_tst_ctrl = df.loc[df['Dataset'] == 'Test Controls', :]
df_tst_esrd = df.loc[df['Dataset'] == 'Test ESRD', :]

# (data, marker, legend label) for each group of samples drawn on the figure.
sample_groups = [
    (df_tst_ctrl, 'o', 'Test Controls'),
    (df_tst_esrd, 'X', 'Test ESRD'),
]

# Set the theme before any figure is created so it applies to the new axes.
sns.set_theme(style='whitegrid')

# A single colour scale over the full range of the target. It is passed to the
# scatter plots as `hue_norm` as well as to the colorbar; without it each
# scatter call would normalise colours over its own subset and the markers
# would not match the colorbar (or each other).
norm = plt.Normalize(df[trgt].min(), df[trgt].max())

for feat in feats:
    sm = plt.cm.ScalarMappable(cmap="spring", norm=norm)
    sm.set_array([])
    fig, ax = plt.subplots(figsize=(8, 6))

    legend_handles = []
    for df_group, marker, label in sample_groups:
        sns.scatterplot(
            data=df_group,
            x='Age',
            y=feat,
            palette='spring',
            hue=trgt,
            hue_norm=norm,
            linewidth=0.5,
            alpha=0.75,
            edgecolor="k",
            marker=marker,
            s=50,
            legend=False,  # the hue legend is replaced by the colorbar below
            ax=ax,
        )
        # Grey proxy marker: the legend only distinguishes the marker shapes.
        legend_handles.append(
            mlines.Line2D(
                [], [],
                marker=marker,
                linestyle='None',
                markeredgecolor='k',
                markerfacecolor='lightgrey',
                markersize=10,
                label=label,
            )
        )

    ax.legend(
        handles=legend_handles,
        title="Samples",
        bbox_to_anchor=(0, 1.02, 1, 0.2),
        loc="lower left",
        mode="expand",
        borderaxespad=0,
        ncol=3,
        frameon=False,
    )
    # `ax` is given explicitly because `sm` is not attached to any Axes.
    fig.colorbar(sm, ax=ax, label=trgt)
    if feat in y_limits:
        ax.set_ylim(*y_limits[feat])
    fig.savefig(f"{path}/comment_4/{feat}.png", bbox_inches='tight', dpi=400)
    fig.savefig(f"{path}/comment_4/{feat}.pdf", bbox_inches='tight')
    plt.close(fig)

# 5. Comment 3: PDGFA and PDGFB

In [None]:
# Output directory for the figures produced in this section.
comment_3_dir = pathlib.Path(path) / "comment_3"
comment_3_dir.mkdir(parents=True, exist_ok=True)

## Scatter plots

In [None]:
palette = {'Control': 'cyan', 'ESRD': 'magenta'}

df = pd.read_excel(f"{path}/data.xlsx", index_col=0)

df_ctrl = df.loc[df['Status'] == 'Control', :]
df_esrd = df.loc[df['Status'] == 'ESRD', :]

# Set the theme before any figure is created so it applies to the new axes.
sns.set_theme(style='whitegrid')

# Joint value range of both features.
# NOTE(review): these three values are not used further in this cell; they are
# retained in case other cells read them.
xy_min = df[["PDGFA", 'PDGFB']].min().min()
xy_max = df[["PDGFA", 'PDGFB']].max().max()
xy_ptp = xy_max - xy_min

# Figure 1: PDGFA vs PDGFB, coloured by status.
fig, ax = plt.subplots()
scatter = sns.scatterplot(
    data=df,
    x="PDGFA",
    y="PDGFB",
    hue="Status",
    palette=palette,
    linewidth=0.2,
    alpha=0.75,
    edgecolor="k",
    s=16,
    hue_order=list(palette.keys()),
    ax=ax,
)
fig.savefig(f"{path}/comment_3/scatter.png", bbox_inches='tight', dpi=400)
fig.savefig(f"{path}/comment_3/scatter.pdf", bbox_inches='tight')
plt.close(fig)

# (data, marker, legend label) for each group of samples drawn on the figures below.
sample_groups = [
    (df_ctrl, 'o', 'Controls'),
    (df_esrd, 'X', 'ESRD'),
]

# Figures 2..n: the same scatter, coloured by a continuous target.
trgts = ['Age', 'SImAge']
for trgt in trgts:
    # A single colour scale over the full range of the target. It is passed to
    # the scatter plots as `hue_norm` as well as to the colorbar; without it
    # each scatter call would normalise colours over its own subset and the
    # markers would not match the colorbar (or each other).
    norm = plt.Normalize(df[trgt].min(), df[trgt].max())
    sm = plt.cm.ScalarMappable(cmap="spring", norm=norm)
    sm.set_array([])
    fig, ax = plt.subplots(figsize=(8, 6))

    legend_handles = []
    for df_group, marker, label in sample_groups:
        sns.scatterplot(
            data=df_group,
            x='PDGFA',
            y='PDGFB',
            palette='spring',
            hue=trgt,
            hue_norm=norm,
            linewidth=0.5,
            alpha=0.75,
            edgecolor="k",
            marker=marker,
            s=50,
            legend=False,  # the hue legend is replaced by the colorbar below
            ax=ax,
        )
        # Grey proxy marker: the legend only distinguishes the marker shapes.
        legend_handles.append(
            mlines.Line2D(
                [], [],
                marker=marker,
                linestyle='None',
                markeredgecolor='k',
                markerfacecolor='lightgrey',
                markersize=10,
                label=label,
            )
        )

    ax.legend(
        handles=legend_handles,
        title="Samples",
        bbox_to_anchor=(0, 1.02, 1, 0.2),
        loc="lower left",
        mode="expand",
        borderaxespad=0,
        ncol=3,
        frameon=False,
    )
    # `ax` is given explicitly because `sm` is not attached to any Axes.
    fig.colorbar(sm, ax=ax, label=trgt)
    fig.savefig(f"{path}/comment_3/scatter_{trgt}.png", bbox_inches='tight', dpi=400)
    fig.savefig(f"{path}/comment_3/scatter_{trgt}.pdf", bbox_inches='tight')
    plt.close(fig)

## Tests for ESRD vs Controls

In [None]:
df = pd.read_excel(f"{path}/data.xlsx", index_col=0)

feats = ['PDGFA', 'PDGFB']
# Fill colour of the violin for each compared dataset.
palette = {'Test Controls': 'cyan', 'Test ESRD': 'magenta'}
# Compared groups, in the order in which their violins are added.
groups = ['Test Controls', 'Test ESRD']

for feat in feats:
    vals = {}
    fig = go.Figure()
    pointpos = 1.5
    for group in groups:
        # Values of the current feature within this dataset.
        vals[group] = df.loc[df['Dataset'] == group, feat].values
        group_color = palette[group]
        violin = go.Violin(
            y=vals[group],
            name=group,
            box_visible=True,
            meanline_visible=True,
            showlegend=False,
            line_color='black',
            fillcolor=group_color,
            marker={
                'color': group_color,
                'line': {'color': 'black', 'width': 0.3},
                'opacity': 0.8,
            },
            points='all',
            pointpos=pointpos,
            # Bandwidth set to one tenth of the value range of the group.
            bandwidth=np.ptp(vals[group]) / 10,
            opacity=0.8,
        )
        fig.add_trace(violin)

    add_layout(fig, "", feat, "")
    fig.update_layout(title_xref='paper')
    fig.update_layout(legend_font_size=20)
    fig.update_xaxes(autorange=True)
    fig.update_layout(legend={'itemsizing': 'constant'})

    # Two-sided Mann-Whitney U test between the two groups; the p-value is
    # annotated between violins 0 and 1.
    stat_01, pval_01 = mannwhitneyu(vals[groups[0]], vals[groups[1]], alternative='two-sided')
    fig = add_p_value_annotation(fig, {(0, 1): pval_01})

    fig.update_layout(
        violingap=0.35,
        violingroupgap=0.35,
        width=800,
        height=600,
        margin=dict(l=120, r=50, b=70, t=50, pad=0),
    )
    save_figure(fig, f"{path}/comment_3/violin_status_{feat}", scale=2)

## Tests for Males vs Females

In [None]:
df = pd.read_excel(f"{path}/data.xlsx", index_col=0)
# Only rows with 'Control' status are compared in this section.
df = df.loc[df['Status'] == 'Control', :]

feats = ['PDGFA', 'PDGFB']
# Fill colour of the violin for each value of 'Sex'.
palette = {'F': 'tomato', 'M': 'slateblue'}
# Compared groups, in the order in which their violins are added.
groups = ['F', 'M']

for feat in feats:
    vals = {}
    fig = go.Figure()
    pointpos = 1.5
    for group in groups:
        # Values of the current feature within this group.
        vals[group] = df.loc[df['Sex'] == group, feat].values
        group_color = palette[group]
        violin = go.Violin(
            y=vals[group],
            name=group,
            box_visible=True,
            meanline_visible=True,
            showlegend=False,
            line_color='black',
            fillcolor=group_color,
            marker={
                'color': group_color,
                'line': {'color': 'black', 'width': 0.3},
                'opacity': 0.8,
            },
            points='all',
            pointpos=pointpos,
            # Bandwidth set to one tenth of the value range of the group.
            bandwidth=np.ptp(vals[group]) / 10,
            opacity=0.8,
        )
        fig.add_trace(violin)

    add_layout(fig, "", feat, "")
    fig.update_layout(title_xref='paper')
    fig.update_layout(legend_font_size=20)
    fig.update_xaxes(autorange=True)
    fig.update_layout(legend={'itemsizing': 'constant'})

    # Two-sided Mann-Whitney U test between the two groups; the p-value is
    # annotated between violins 0 and 1.
    stat_01, pval_01 = mannwhitneyu(vals[groups[0]], vals[groups[1]], alternative='two-sided')
    fig = add_p_value_annotation(fig, {(0, 1): pval_01})

    fig.update_layout(
        violingap=0.35,
        violingroupgap=0.35,
        width=800,
        height=600,
        margin=dict(l=135, r=50, b=70, t=50, pad=0),
    )
    save_figure(fig, f"{path}/comment_3/violin_sex_{feat}", scale=2)

## Tests for age groups

In [None]:
df = pd.read_excel(f"{path}/data.xlsx", index_col=0)
# Work on an explicit copy of the control rows: the 'Age group' column is added
# below, and assigning into a filtered slice can make pandas emit
# SettingWithCopyWarning.
df = df.loc[df['Status'] == 'Control', :].copy()

# Assign every row to an age bin; rows matching none of the conditions below
# keep the initial 'All' label and are not plotted.
df['Age group'] = 'All'
df.loc[df['Age'] < 30, 'Age group'] = 'Age < 30'
df.loc[(df['Age'] < 50) & (df['Age'] >= 30), 'Age group'] = '30 <= Age < 50'
df.loc[(df['Age'] < 70) & (df['Age'] >= 50), 'Age group'] = '50 <= Age < 70'
df.loc[df['Age'] >= 70, 'Age group'] = 'Age >= 70'

# Fill colour of the violin for each age bin; the key order is also the order
# in which the violins are added to the figure.
palette = {
    'Age < 30': 'chartreuse',
    '30 <= Age < 50': 'gold',
    '50 <= Age < 70': 'coral',
    'Age >= 70': 'firebrick'
}
groups = list(palette.keys())
feats = ['PDGFA', 'PDGFB']

for feat in feats:
    fig = go.Figure()
    vals = {}
    pointpos = 1.5
    for group in groups:
        # Values of the current feature within this age bin.
        vals[group] = df.loc[df['Age group'] == group, feat].values
        fig.add_trace(
            go.Violin(
                y=vals[group],
                name=group,
                box_visible=True,
                meanline_visible=True,
                showlegend=False,
                line_color='black',
                fillcolor=palette[group],
                marker=dict(color=palette[group], line=dict(color='black', width=0.3), opacity=0.8),
                points='all',
                pointpos=pointpos,
                # Bandwidth set to one tenth of the value range of the group.
                bandwidth=np.ptp(vals[group]) / 10,
                opacity=0.8
            )
        )
    add_layout(fig, "", feat, "")
    fig.update_layout(title_xref='paper')
    fig.update_layout(legend_font_size=20)
    fig.update_xaxes(autorange=True)
    fig.update_layout(legend={'itemsizing': 'constant'})

    # Two-sided Mann-Whitney U tests between neighbouring age bins; each
    # p-value is annotated between the corresponding pair of violins.
    stat_01, pval_01 = mannwhitneyu(vals['Age < 30'], vals['30 <= Age < 50'], alternative='two-sided')
    stat_12, pval_12 = mannwhitneyu(vals['30 <= Age < 50'], vals['50 <= Age < 70'], alternative='two-sided')
    stat_23, pval_23 = mannwhitneyu(vals['50 <= Age < 70'], vals['Age >= 70'], alternative='two-sided')
    fig = add_p_value_annotation(fig, {(0, 1): pval_01, (1, 2): pval_12, (2, 3): pval_23})

    fig.update_layout(
        violingap=0.35,
        violingroupgap=0.35,
        width=1200,
        height=900,
        margin=go.layout.Margin(
            l=135,
            r=50,
            b=50,
            t=200,
            pad=0,
        )
    )
    save_figure(fig, f"{path}/comment_3/violin_age_{feat}", scale=2)