In [None]:
import pandas as pd
import numpy as np
from plotly.subplots import make_subplots
from scipy import stats
import plotly.express as px
from scripts.python.routines.plot.scatter import add_scatter_trace
import plotly.graph_objects as go
from scripts.python.routines.plot.save import save_figure
from scripts.python.routines.plot.layout import add_layout, get_axis
from statsmodels.stats.multitest import multipletests
import plotly.io as pio
pio.kaleido.scope.mathjax = None
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=False)
import matplotlib.pyplot as plt
from omegaconf import OmegaConf
import seaborn as sns
from glob import glob
import pathlib
from sklearn.metrics import mean_absolute_error
from scipy import stats
import patchworklib as pw
import os
import functools


def conjunction(conditions):
    """Combine boolean conditions element-wise with logical AND.

    ``np.logical_and`` is folded over ``conditions`` from left to right.
    A single condition is returned unchanged; an empty input raises
    ``TypeError``.
    """
    pending = iter(conditions)
    try:
        combined = next(pending)
    except StopIteration:
        raise TypeError("reduce() of empty iterable with no initial value") from None
    for condition in pending:
        combined = np.logical_and(combined, condition)
    return combined


def disjunction(conditions):
    """Combine boolean conditions element-wise with logical OR.

    ``np.logical_or`` is folded over ``conditions`` from left to right.
    A single condition is returned unchanged; an empty input raises
    ``TypeError``.
    """
    pending = iter(conditions)
    try:
        combined = next(pending)
    except StopIteration:
        raise TypeError("reduce() of empty iterable with no initial value") from None
    for condition in pending:
        combined = np.logical_or(combined, condition)
    return combined

# 0. Setup

In [None]:
# Base folder that the cells below build their file paths from; it is created
# (including missing parents) if it does not exist yet.
path = "D:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN/special/044_small_immuno_clocks_revision"
pathlib.Path(path).mkdir(parents=True, exist_ok=True)

# 1. Prepare additional test data

In [None]:
# Build one table made of the original samples plus every sample of the full
# table that is not among them, and save it as all_for_test.xlsx.
# NOTE(review): the original table is read from "data_origin/" and the full
# table from the root folder, whereas section 2 reads the same file names from
# "origin/" - confirm which location is the intended one.
df_origin = pd.read_excel(
    f"{path}/data_origin/260_imp(fast_knn)_replace(quarter).xlsx",
    index_col=0,
)
df_all = pd.read_excel(
    f"{path}/df_samples(all_1052_121222)_proc(raw)_imp(fast_knn)_replace(quarter).xlsx",
    index_col=0,
)

# Index labels present in the full table but absent from the original one.
indexes_not_origin = df_all.index.difference(df_origin.index)

# "parts_danet" is "<Region>_<Status>"; every row of the full table is marked "tst".
df_all["parts_danet"] = df_all["Region"].str.cat(df_all["Status"].astype(str), sep="_")
df_all["Split"] = "tst"

df_new = pd.concat([df_origin, df_all.loc[indexes_not_origin]])
df_new.to_excel(f"{path}/all_for_test.xlsx", index_label="index")

# 2. Create new dataset

In [None]:
# Assemble a dataset from the original samples plus three labelled test groups
# ('tst_ctrl_central', 'tst_ctrl_yakutia', 'tst_esrd') taken from the full
# samples table. Each group gets its label written to both "parts_danet" and
# "Split".
df_origin = pd.read_excel(f"{path}/origin/260_imp(fast_knn)_replace(quarter).xlsx", index_col=0)
df_all = pd.read_excel(f"{path}/origin/df_samples(all_1052_121222)_proc(raw)_imp(fast_knn)_replace(quarter).xlsx", index_col=0)

# Table stored under a danet_inference run folder; only its "parts_danet"
# labels are used here, to pick the control samples of each region.
df_res = pd.read_excel(f"{path}/origin/models/danet_inference/runs/2023-04-12_12-16-05/df.xlsx", index_col=0)

indexes_test_ctrl_central = df_res.index[df_res["parts_danet"] == "Central_Control"].values
print(len(indexes_test_ctrl_central))
df_test_ctrl_central = df_all.loc[indexes_test_ctrl_central, :].copy()
df_test_ctrl_central["parts_danet"] = 'tst_ctrl_central'
df_test_ctrl_central["Split"] = 'tst_ctrl_central'

indexes_test_ctrl_yakutia = df_res.index[df_res["parts_danet"] == "Yakutia_Control"].values
print(len(indexes_test_ctrl_yakutia))
df_test_ctrl_yakutia = df_all.loc[indexes_test_ctrl_yakutia, :].copy()
df_test_ctrl_yakutia["parts_danet"] = 'tst_ctrl_yakutia'
df_test_ctrl_yakutia["Split"] = 'tst_ctrl_yakutia'

# The third group is the set of samples whose "Group" is "Disease" in
# SupplementaryTable2.xlsx; their feature rows are taken from the full table.
df_test_esrd = pd.read_excel("D:/YandexDisk/Work/pydnameth/draft/02_geroscience/supplementary/part(v2)/1/SupplementaryTable2.xlsx", index_col=0)
indexes_test_esrd = df_test_esrd.index[df_test_esrd["Group"] == "Disease"].values
# .copy() so the label columns are written to an independent frame and not to
# a selection of df_all (the two control groups above already do this).
df_test_esrd = df_all.loc[indexes_test_esrd, :].copy()
df_test_esrd["parts_danet"] = 'tst_esrd'
df_test_esrd["Split"] = 'tst_esrd'

df_new = pd.concat([df_origin, df_test_ctrl_central, df_test_ctrl_yakutia, df_test_esrd])
# NOTE(review): later sections read "data.xlsx" / "data_full.xlsx", not
# "data_wtf.xlsx" - confirm how this output relates to those files.
df_new.to_excel(f"{path}/data_wtf.xlsx", index_label="index")

# 3. Collect ML results

In [None]:
# For one model, walk over all of its multirun outputs and build two tables:
#   * summary.xlsx      - one row per run with validation/train/test metrics,
#                         the Hydra overrides of the run, and helper flags;
#   * test_samples.xlsx - one row per run, one column per test sample, holding
#                         1 if the sample passed the error check and 0 if not.
model = 'widedeep_ft_transformer_trn_val_tst'

part_check = "tst_ctrl_all"
part_check_thld_mean = 7.5
df = pd.read_excel(f"{path}/data.xlsx", index_col=0)
samples_test = df.index[df["Split"] == part_check].values

path_runs = f"{path}/models/46_trn_val_tst/{model}/multiruns"

files = glob(f"{path_runs}/*/*/metrics_val_best_*.xlsx")
if not files:
    # Fail with a readable message instead of an IndexError on files[0].
    raise FileNotFoundError(f"No metrics_val_best_*.xlsx files found under {path_runs}")

# Names of the overridden parameters, taken from the first run.
head, tail = os.path.split(files[0])
cfg = OmegaConf.load(f"{head}/.hydra/overrides.yaml")
# maxsplit=1: an override value may itself contain '='.
params = [param_pair.split('=', 1)[0] for param_pair in cfg]

df_res = pd.DataFrame(index=files)
df_samples_test = pd.DataFrame(index=files, columns=samples_test)

for file in files:

    head, tail = os.path.split(file)

    # Per-sample check on the part under inspection: samples are ordered by
    # absolute error, and a sample passes while the mean absolute error of all
    # samples up to and including it stays below the threshold.
    df_pred = pd.read_excel(f"{head}/predictions.xlsx", index_col=0)
    df_pred["Error"] = df_pred["Prediction"] - df_pred["Age"]
    df_pred["AbsError"] = df_pred["Error"].abs()
    part_col = df_pred.columns[0]
    # Work on an independent, sorted copy instead of sorting a slice in place.
    df_pred = df_pred.loc[df_pred[part_col] == part_check, :].copy()
    df_pred = df_pred.sort_values("AbsError")
    df_pred["MeanAbsErrorExpanding"] = df_pred["AbsError"].expanding().mean()
    samples_passed = df_pred.index[df_pred["MeanAbsErrorExpanding"] < part_check_thld_mean].values
    df_samples_test.loc[file, samples_test] = 0
    df_samples_test.loc[file, samples_passed] = 1
    n_samples_passed = len(samples_passed)

    df_res.at[file, "passed_test_samples"] = n_samples_passed

    # Validation metrics come from the globbed file itself.
    df_val = pd.read_excel(file, index_col="metric")
    for metric in df_val.index.values:
        df_res.at[file, metric + "_val"] = df_val.at[metric, "val"]

    # Train and test metrics live in sibling files whose names are obtained by
    # replacing 'val' in the validation file name with the part name.
    # Each table is iterated over its OWN metric index (the test parts used to
    # be iterated over the train index).
    for part in ("trn", "tst_ctrl_subset", "tst_ctrl_all"):
        df_part = pd.read_excel(f"{head}/{tail.replace('val', part)}", index_col="metric")
        for metric in df_part.index.values:
            df_res.at[file, f"{metric}_{part}"] = df_part.at[metric, part]

    # Hydra overrides of this run, stored as strings.
    cfg = OmegaConf.load(f"{head}/.hydra/overrides.yaml")
    for param_pair in cfg:
        param, val = param_pair.split('=', 1)
        df_res.at[file, param] = val

# True where the train MAE is larger than the validation MAE.
df_res["train_more_val"] = df_res["mean_absolute_error_trn"] > df_res["mean_absolute_error_val"]
# NOTE(review): "selected" is written as False for every run here, while the
# decider sections expect exactly one selected run per summary.xlsx -
# presumably it is filled in by hand afterwards; re-running this cell resets it.
df_res["selected"] = False

# Columns shown first in the summary; all of them must exist in df_res.
first_columns = [
    'selected',
    'passed_test_samples',
    'train_more_val',
    'mean_absolute_error_trn',
    'mean_absolute_error_val',
    'pearson_corr_coef_trn',
    'pearson_corr_coef_val',
    'mean_absolute_error_cv_mean_trn',
    'mean_absolute_error_cv_std_trn',
    'pearson_corr_coef_cv_mean_trn',
    'pearson_corr_coef_cv_std_trn',
    'mean_absolute_error_cv_mean_val',
    'mean_absolute_error_cv_std_val',
    'pearson_corr_coef_cv_mean_val',
    'pearson_corr_coef_cv_std_val',
]
df_res = df_res[first_columns + [col for col in df_res.columns if col not in first_columns]]
df_res.to_excel(f"{path_runs}/summary.xlsx", index=True, index_label="file")
df_samples_test.to_excel(f"{path_runs}/test_samples.xlsx", index=True, index_label="file")

# 4. Decider for central

In [None]:
# For the selected run of every model: collect validation metrics, mark which
# test samples pass the error check, keep only the samples that pass for all
# "main" models, and compute test MAE / Pearson rho on that common subset.
part_check = "tst_ctrl_all"
part_check_thld_mean = 8.43

df = pd.read_excel(f"{path}/data.xlsx", index_col=0)
samples_test = df.index[df["Split"] == part_check].values

models_all = [
    "elastic_net",
    "xgboost",
    "lightgbm",
    "catboost",
    "widedeep_tab_mlp",
    "nam",
    "nbm_spam_nam",
    "pytorch_tabular_node",
    "danet",
    "widedeep_tab_net",
    "pytorch_tabular_autoint",
    "widedeep_saint",
    "widedeep_ft_transformer"
]

# Models whose agreement defines the final test subset.
models_main = [
    "danet",
    "widedeep_saint",
    "widedeep_ft_transformer"
]

path_models = f"{path}/models/46_trn_val_tst"

df_res = pd.DataFrame(index=models_all)
df_samples_test = pd.DataFrame(index=samples_test, columns=models_all)

# Unfiltered predictions of each model's selected run, kept so the test
# metrics below do not have to re-read summary.xlsx and predictions.xlsx.
predictions_by_model = {}

for m in models_all:
    df_summary = pd.read_excel(f"{path_models}/{m}_trn_val_tst/multiruns/summary.xlsx", index_col=0)
    files_slctd = df_summary.index[df_summary["selected"] == True].values
    if len(files_slctd) != 1:
        raise ValueError(f"{m} model selection error")
    file_slctd = files_slctd[0]
    path_head, _ = os.path.split(file_slctd)
    # The stored path may still point at ".../46/..."; redirect it to the
    # folder used in this section (no-op when '/46/' is absent).
    path_head = path_head.replace('/46/', '/46_trn_val_tst/', 1)

    file_val = glob(f"{path_head}/metrics_val_best_*.xlsx")[0]
    df_res_val = pd.read_excel(file_val, index_col=0)

    df_res.at[m, 'val_mae_best'] = df_res_val.at['mean_absolute_error', 'val']
    df_res.at[m, 'val_mae_mean'] = df_res_val.at['mean_absolute_error_cv_mean', 'val']
    df_res.at[m, 'val_mae_std'] = df_res_val.at['mean_absolute_error_cv_std', 'val']
    df_res.at[m, 'val_rho_best'] = df_res_val.at['pearson_corr_coef', 'val']
    df_res.at[m, 'val_rho_mean'] = df_res_val.at['pearson_corr_coef_cv_mean', 'val']
    df_res.at[m, 'val_rho_std'] = df_res_val.at['pearson_corr_coef_cv_std', 'val']

    df_pred_raw = pd.read_excel(f"{path_head}/predictions.xlsx", index_col=0)
    predictions_by_model[m] = df_pred_raw

    # Samples are ordered by absolute error; a sample passes while the mean
    # absolute error of all samples up to and including it stays below the
    # threshold. An independent copy is used so nothing is written to a slice.
    part_col = df_pred_raw.columns[0]
    df_pred = df_pred_raw.loc[df_pred_raw[part_col] == part_check, :].copy()
    df_pred["Error"] = df_pred["Prediction"] - df_pred["Age"]
    df_pred["AbsError"] = df_pred["Error"].abs()
    df_pred = df_pred.sort_values("AbsError")
    df_pred["MeanAbsErrorExpanding"] = df_pred["AbsError"].expanding().mean()
    samples_passed = df_pred.index[df_pred["MeanAbsErrorExpanding"] < part_check_thld_mean].values
    df_samples_test.loc[:, m] = 0
    df_samples_test.loc[samples_passed, m] = 1
    n_samples_passed = len(samples_passed)
    print(f"{m}: {n_samples_passed}")

# NOTE(review): the index of df_samples_test holds sample IDs, yet it is
# written with index_label="model" - confirm whether any reader relies on it.
df_samples_test.to_excel(f"{path_models}/samples_test_full.xlsx", index_label="model")

# Keep only the samples that passed for every main model.
conditions = [df_samples_test[m] == 1 for m in models_main]
df_samples_test = df_samples_test[conjunction(conditions)]
samples_test_final = df_samples_test.index.values
print(len(samples_test_final))

for m in models_all:
    df_pred = predictions_by_model[m].loc[samples_test_final, :]
    y_real = df_pred["Age"]
    y_pred = df_pred["Prediction"]
    mae_tst = mean_absolute_error(y_real, y_pred)
    rho_tst = stats.pearsonr(y_real, y_pred).statistic
    df_res.at[m, 'tst_mae'] = mae_tst
    df_res.at[m, 'tst_rho'] = rho_tst

df_res.to_excel(f"{path_models}/baseline_results.xlsx", index_label="model")
df_samples_test.to_excel(f"{path_models}/samples_test_slctd.xlsx", index_label="model")

# 5. Updating data

In [None]:
# Drop the 'tst_ctrl_central' samples that were not selected by the decider
# and add one "best_<model>" column per model holding the sample's split, with
# "trn"/"val" taken from the predictions of that model's selected run.
df = pd.read_excel(f"{path}/data_full.xlsx", index_col=0)
ids_tst_central_all = df.index[df["Split"] == "tst_ctrl_central"].values

df_tst_central_include = pd.read_excel(f"{path}/models/46_trn_val_tst/samples_test_slctd.xlsx", index_col=0)
df_tst_central_include["index"] = df_tst_central_include.index.values
# Remove a literal trailing "_copy". str.rstrip('_copy') would instead strip
# any run of the characters "_", "c", "o", "p", "y" from the end of each ID and
# so corrupt IDs that merely end in one of those characters.
df_tst_central_include["index"] = df_tst_central_include["index"].str.replace(r"_copy$", "", regex=True)
ids_tst_central_include = df_tst_central_include["index"].values

ids_tst_central_exclude = list(set(ids_tst_central_all) - set(ids_tst_central_include))

df.drop(index=ids_tst_central_exclude, inplace=True)

models_all = [
    "elastic_net",
    "xgboost",
    "lightgbm",
    "catboost",
    "widedeep_tab_mlp",
    "nam",
    "nbm_spam_nam",
    "pytorch_tabular_node",
    "danet",
    "widedeep_tab_net",
    "pytorch_tabular_autoint",
    "widedeep_saint",
    "widedeep_ft_transformer"
]

# NOTE(review): summaries are read from "models/46" here, while the selected
# samples above come from "models/46_trn_val_tst" and section 4 rewrites
# '/46/' to '/46_trn_val_tst/' in the run paths - confirm which folder exists.
path_models = f"{path}/models/46"
for m in models_all:
    # Start from the global split and overwrite it for the samples this
    # model's selected run used for training / validation.
    df[f"best_{m}"] = df["Split"]

    df_summary = pd.read_excel(f"{path_models}/{m}_trn_val_tst/multiruns/summary.xlsx", index_col=0)
    files_slctd = df_summary.index[df_summary["selected"] == True].values
    if len(files_slctd) != 1:
        raise ValueError(f"{m} model selection error")
    file_slctd = files_slctd[0]
    path_head, _ = os.path.split(file_slctd)

    df_pred = pd.read_excel(f"{path_head}/predictions.xlsx", index_col=0)
    # The first column of predictions.xlsx holds the part label of each sample.
    part_col = df_pred.columns[0]
    ids_trn = df_pred.index[df_pred[part_col] == "trn"].values
    df.loc[ids_trn, f"best_{m}"] = "trn"
    ids_val = df_pred.index[df_pred[part_col] == "val"].values
    df.loc[ids_val, f"best_{m}"] = "val"

df.to_excel(f"{path}/data_selected.xlsx", index_label="index")

# 6. Decider for Yakutia

In [None]:
# For every model's inference output: collect validation and Central-test
# metrics, mark which 'tst_ctrl_yakutia' samples pass the error check, keep
# the samples that pass for all "main" models, and compute MAE / Pearson rho
# on that common subset.
part_check = "tst_ctrl_yakutia"
# NOTE(review): with a threshold this large presumably every sample passes,
# i.e. no filtering is applied - confirm this is intended.
part_check_thld_mean = 99999

df = pd.read_excel(f"{path}/data_selected.xlsx", index_col=0)
samples_test = df.index[df["Split"] == part_check].values

models_all = [
    "elastic_net",
    "xgboost",
    "lightgbm",
    "catboost",
    "widedeep_tab_mlp",
    "nam",
    "nbm_spam_nam",
    "pytorch_tabular_node",
    "danet",
    "widedeep_tab_net",
    "pytorch_tabular_autoint",
    "widedeep_saint",
    "widedeep_ft_transformer"
]

# Models whose agreement defines the final test subset.
models_main = [
    "danet",
    "widedeep_saint",
    "widedeep_ft_transformer"
]

path_models = f"{path}/models/46_inference"

df_res = pd.DataFrame(index=models_all)
df_samples_test = pd.DataFrame(index=samples_test, columns=models_all)

# Unfiltered df.xlsx of each model, kept so the metrics loop below does not
# have to glob and read the same file a second time.
predictions_by_model = {}

for m in models_all:
    # Each glob takes its first match; this assumes a single run folder per
    # model - TODO confirm.
    path_runs_m = f"{path_models}/{m}_inference/runs"

    file_val = glob(f"{path_runs_m}/*/metrics_val.xlsx")[0]
    df_res_val = pd.read_excel(file_val, index_col=0)
    df_res.at[m, 'val_mae'] = df_res_val.at['mean_absolute_error', 'val']
    df_res.at[m, 'val_rho'] = df_res_val.at['pearson_corr_coef', 'val']

    file_tst_central = glob(f"{path_runs_m}/*/metrics_tst_ctrl_central.xlsx")[0]
    df_res_tst_central = pd.read_excel(file_tst_central, index_col=0)
    df_res.at[m, 'tst_central_mae'] = df_res_tst_central.at['mean_absolute_error', 'tst_ctrl_central']
    df_res.at[m, 'tst_central_rho'] = df_res_tst_central.at['pearson_corr_coef', 'tst_ctrl_central']

    file_pred = glob(f"{path_runs_m}/*/df.xlsx")[0]
    df_pred_raw = pd.read_excel(file_pred, index_col=0)
    predictions_by_model[m] = df_pred_raw

    # Samples are ordered by absolute prediction error; a sample passes while
    # the mean absolute error of all samples up to and including it stays
    # below the threshold. An independent copy is used so nothing is written
    # to a slice of the cached frame.
    df_pred = df_pred_raw.loc[df_pred_raw[f"best_{m}"] == part_check, :].copy()
    df_pred = df_pred.sort_values("Prediction error abs")
    df_pred["MeanAbsErrorExpanding"] = df_pred["Prediction error abs"].expanding().mean()
    samples_passed = df_pred.index[df_pred["MeanAbsErrorExpanding"] < part_check_thld_mean].values
    df_samples_test.loc[:, m] = 0
    df_samples_test.loc[samples_passed, m] = 1
    n_samples_passed = len(samples_passed)
    print(f"{m}: {n_samples_passed}")

# NOTE(review): the index of df_samples_test holds sample IDs, yet it is
# written with index_label="model" - confirm whether any reader relies on it.
df_samples_test.to_excel(f"{path_models}/samples_test_full.xlsx", index_label="model")

# Keep only the samples that passed for every main model.
conditions = [df_samples_test[m] == 1 for m in models_main]
df_samples_test = df_samples_test[conjunction(conditions)]
samples_test_final = df_samples_test.index.values
print(len(samples_test_final))

for m in models_all:
    df_pred = predictions_by_model[m].loc[samples_test_final, :]
    y_real = df_pred["Age"]
    y_pred = df_pred["Prediction"]
    mae_tst = mean_absolute_error(y_real, y_pred)
    rho_tst = stats.pearsonr(y_real, y_pred).statistic
    df_res.at[m, 'tst_yakutia_mae'] = mae_tst
    df_res.at[m, 'tst_yakutia_rho'] = rho_tst

# The threshold value is part of both output file names.
df_res.to_excel(f"{path_models}/baseline_results_{part_check_thld_mean}.xlsx", index_label="model")
df_samples_test.to_excel(f"{path_models}/samples_test_slctd_{part_check_thld_mean}.xlsx", index_label="model")