In [None]:
import pandas as pd
import numpy as np
from plotly.subplots import make_subplots
from scipy import stats
import plotly.express as px
from scripts.python.routines.plot.scatter import add_scatter_trace
import plotly.graph_objects as go
from scripts.python.routines.plot.save import save_figure
from scripts.python.routines.plot.layout import add_layout, get_axis
from statsmodels.stats.multitest import multipletests
import plotly.io as pio
pio.kaleido.scope.mathjax = None
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=False)
import matplotlib.pyplot as plt
from omegaconf import OmegaConf
import seaborn as sns
from glob import glob
import pathlib
from sklearn.metrics import mean_absolute_error
from scipy import stats
import patchworklib as pw
import os
import functools
from statannotations.Annotator import Annotator
from scipy.stats import mannwhitneyu
import shap
from slugify import slugify


def conjunction(conditions):
    """Fold a collection of boolean conditions into one with element-wise AND.

    Parameters
    ----------
    conditions : iterable
        Boolean masks (or anything ``np.logical_and`` accepts) to combine.
        They are combined pairwise, left to right.

    Returns
    -------
    The element-wise logical AND of all conditions. When only one condition
    is supplied it is returned unchanged.

    Raises
    ------
    TypeError
        If ``conditions`` is empty (there is no initial value to fold from).
    """
    def _both(accumulated, current):
        return np.logical_and(accumulated, current)

    return functools.reduce(_both, conditions)


def disjunction(conditions):
    """Fold a collection of boolean conditions into one with element-wise OR.

    Parameters
    ----------
    conditions : iterable
        Boolean masks (or anything ``np.logical_or`` accepts) to combine.
        They are combined pairwise, left to right.

    Returns
    -------
    The element-wise logical OR of all conditions. When only one condition
    is supplied it is returned unchanged.

    Raises
    ------
    TypeError
        If ``conditions`` is empty (there is no initial value to fold from).
    """
    def _either(accumulated, current):
        return np.logical_or(accumulated, current)

    return functools.reduce(_either, conditions)

# 1. Set up path

In [None]:
# Root working directory: the cells below read their inputs from it and write
# their outputs into it.
path = "D:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN/special/051_small_immuno_clocks_reviewer_3"

# Create the directory together with any missing parents; an already existing
# directory is not an error.
pathlib.Path(path).mkdir(parents=True, exist_ok=True)

# 2. Fill data with original sample names and add necessary columns

In [None]:

# Load the original table (indexed by its first column) and the mapping table
# (indexed by its second column).
df = pd.read_excel(f"{path}/data_origin.xlsx", index_col=0)
df_map = pd.read_excel(f"{path}/data_mapping.xlsx", index_col=1)

# Look up every row of `df` in the mapping and store its 'old_index' value as
# the sample name.
df.loc[df.index, 'sample_name'] = df_map.loc[df.index, 'old_index']

# Short 'Part' code for each 'Dataset' label; rows carrying any other label
# are not assigned here.
dataset_to_part = {
    'Train/Validation': 'trn_val',
    'Test Controls': 'tst_ctrl',
    'Test ESRD': 'tst_esrd',
}
for dataset_label, part_code in dataset_to_part.items():
    df.loc[df['Dataset'] == dataset_label, 'Part'] = part_code

# Persist the enriched table next to the inputs.
df.to_excel(f"{path}/data.xlsx", index_label='index')

# 3. Collect ML results

In [None]:
# Gather the metrics and the Hydra overrides of every run of `model` into one
# summary table (one row per metrics file) and save it as an Excel workbook.
model = 'widedeep_ft_transformer_trn_val_tst'

path_runs = f"{path}/models/{model}/multiruns"

# Metrics workbooks are searched exactly two directory levels below `path_runs`.
files = glob(f"{path_runs}/*/*/metrics_all_best_*.xlsx")
if not files:
    # Fail with an explicit message instead of an IndexError on `files[0]`.
    raise FileNotFoundError(f"No metrics files found under {path_runs}")

# Names of the overridden parameters, taken from the first run.
# Split on the first '=' only, so values that themselves contain '=' are kept intact.
params = [
    param_pair.split('=', 1)[0]
    for param_pair in OmegaConf.load(f"{os.path.dirname(files[0])}/.hydra/overrides.yaml")
]

# Columns of each metrics workbook that are copied into the summary; the
# summary column is named "<metric>_<part>".
metric_parts = (
    "val",
    "trn",
    "tst_ctrl",
    "tst_esrd",
    "trn_val",
    "val_tst_ctrl",
    "trn_val_tst_ctrl",
)

# Build one record per file and create the frame once, instead of growing it
# cell by cell.
records = []
for file in files:
    record = {}

    # Metrics
    df_metrics = pd.read_excel(file, index_col="metric")
    for metric in df_metrics.index.values:
        for part in metric_parts:
            record[f"{metric}_{part}"] = df_metrics.at[metric, part]

    # Params: read the overrides that sit next to THIS metrics file, so each
    # row gets the parameters of its own run rather than those of the first run.
    run_dir = os.path.dirname(file)
    cfg = OmegaConf.load(f"{run_dir}/.hydra/overrides.yaml")
    for param_pair in cfg:
        param, val = param_pair.split('=', 1)
        record[param] = val

    records.append(record)

df_res = pd.DataFrame(records, index=files)

# True where the training MAE is larger than the validation MAE
# (comparisons involving NaN evaluate to False).
df_res["train_more_val"] = df_res["mean_absolute_error_trn"] > df_res["mean_absolute_error_val"]
# Initialised to False for every run; nothing in this cell sets it to True.
df_res["selected"] = False

# Columns moved to the front of the summary; the remaining columns keep their order.
first_columns = [
    'selected',
    'train_more_val',
    'mean_absolute_error_trn',
    'mean_absolute_error_val',
    'mean_absolute_error_tst_ctrl',
    'mean_absolute_error_val_tst_ctrl',
    'mean_absolute_error_trn_val_tst_ctrl',
    'pearson_corr_coef_trn',
    'pearson_corr_coef_val',
    'pearson_corr_coef_tst_ctrl',
    'pearson_corr_coef_val_tst_ctrl',
    'pearson_corr_coef_trn_val_tst_ctrl',
    'mean_absolute_error_cv_mean_trn',
    'mean_absolute_error_cv_std_trn',
    'mean_absolute_error_cv_mean_val',
    'mean_absolute_error_cv_std_val',
    'pearson_corr_coef_cv_mean_trn',
    'pearson_corr_coef_cv_std_trn',
    'pearson_corr_coef_cv_mean_val',
    'pearson_corr_coef_cv_std_val',
]
df_res = df_res[first_columns + [col for col in df_res.columns if col not in first_columns]]
df_res.to_excel(f"{path_runs}/summary.xlsx", index=True, index_label="file")