# Debugging autoreload

In [ ]:
# Load IPython's autoreload extension.
%load_ext autoreload
# Mode 2: reload all imported modules before executing each cell, so edits to
# project code are picked up without restarting the kernel.
%autoreload 2

# Load packages

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import plotly.express as px
import statsmodels.formula.api as smf
import plotly.graph_objects as go
from scripts.python.routines.manifest import get_manifest
from scripts.python.routines.plot.save import save_figure
from scripts.python.routines.plot.layout import add_layout
from statsmodels.stats.multitest import multipletests
import plotly.io as pio
pio.kaleido.scope.mathjax = None
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=False)
import itertools
from scipy.stats import mannwhitneyu, median_test, kruskal, wilcoxon, friedmanchisquare
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.patheffects as path_effects
import random
import pathlib
from tqdm import tqdm
from src.utils.plot.bioinfokit import mhat, volcano
import gseapy as gp
import mygene
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA, TruncatedSVD
from sklearn.decomposition import MiniBatchDictionaryLearning, FastICA
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.manifold import MDS, Isomap, TSNE, LocallyLinearEmbedding
import upsetplot
from matplotlib_venn import venn2, venn2_circles
from itertools import chain
from sklearn.metrics import mean_absolute_error
from scripts.python.routines.plot.colorscales import get_continuous_color
from impyute.imputation.cs import fast_knn
import plotly
from scripts.python.routines.plot.p_value import add_p_value_annotation
from scripts.python.routines.sections import get_sections
from statannotations.Annotator import Annotator
from sklearn.model_selection import RepeatedStratifiedKFold
import functools
from src.models.tabular.base import get_model_framework_dict
import matplotlib.lines as mlines
import patchworklib as pw
from glob import glob
from omegaconf import OmegaConf
import os
from pathlib import Path
from src.models.tabular.widedeep.ft_transformer import WDFTTransformerModel
from src.models.tabular.widedeep.tab_mlp import WDTabMLPModel
from src.models.tabular.widedeep.tab_net import WDTabNetModel
import xgboost as xgb
import torch
import lightgbm as lgb
from catboost import CatBoost
import pickle
from src.tasks.metrics import get_reg_metrics

import warnings
warnings.filterwarnings("ignore", ".*will save all targets and predictions in the buffer. For large datasets, this may lead to large memory footprint.*")

def conjunction(conditions):
    """Combine conditions element-wise with a logical AND.

    The items of ``conditions`` are folded left to right with
    ``np.logical_and``. A single condition is returned unchanged; an empty
    iterable raises ``TypeError`` (from ``functools.reduce``).
    """
    combined = functools.reduce(
        lambda acc, cond: np.logical_and(acc, cond),
        conditions,
    )
    return combined


def disjunction(conditions):
    """Combine conditions element-wise with a logical OR.

    The items of ``conditions`` are folded left to right with
    ``np.logical_or``. A single condition is returned unchanged; an empty
    iterable raises ``TypeError`` (from ``functools.reduce``).
    """
    combined = functools.reduce(
        lambda acc, cond: np.logical_or(acc, cond),
        conditions,
    )
    return combined

# Immunomarkers

In [None]:
# Gather the metrics and Hydra overrides of every multirun of one
# immunomarker/model pair into a single table and save it as summary.xlsx.
feat_imm = 'IL27'
model = 'widedeep_tab_mlp'

path = f"D:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN/special/060_EpiSImAge/fimmu_features"
path_runs = f"{path}/{feat_imm}/models/{model}_trn_val_tst/multiruns"

files = glob(f"{path_runs}/*/*/metrics_all_best_*.xlsx")
if not files:
    # Fail early with a readable message instead of an IndexError on files[0].
    raise FileNotFoundError(f"No metrics_all_best_*.xlsx files found under {path_runs}")

# df_tmp and params are not referenced again in this cell; they are still
# defined so that the notebook state after running the cell is unchanged.
df_tmp = pd.read_excel(files[0], index_col="metric")
head, tail = os.path.split(files[0])
cfg = OmegaConf.load(f"{head}/.hydra/overrides.yaml")
# Split on the first '=' only: an override value may itself contain '='.
params = [param_pair.split('=', 1)[0] for param_pair in cfg]
df_res = pd.DataFrame(index=files)

# Columns of each metrics file that are copied into the summary, in the order
# in which the summary columns are created.
data_splits = ("val", "trn", "tst", "trn_val", "val_tst", "trn_val_tst")

for file in files:

    head, tail = os.path.split(file)

    # Run identifier: the run directory with the common multiruns prefix removed.
    df_res.at[file, 'index'] = head.replace(path_runs, '')

    # Metrics
    df_metrics = pd.read_excel(file, index_col="metric")
    for metric in df_metrics.index.values:
        for data_split in data_splits:
            df_res.at[file, f"{metric}_{data_split}"] = df_metrics.at[metric, data_split]

    # Params
    cfg = OmegaConf.load(f"{head}/.hydra/overrides.yaml")
    for param_pair in cfg:
        param, val = param_pair.split('=', 1)
        df_res.at[file, param] = val

df_res.set_index('index', inplace=True)
# Flag runs whose training MAE is larger than their validation MAE.
df_res["train_more_val"] = df_res["mean_absolute_error_trn"] > df_res["mean_absolute_error_val"]
# Column initialised to False for every run.
df_res["selected"] = False

# Columns placed first in the spreadsheet, mapped to their short display names.
first_columns = {
    'selected': 'selected',
    'train_more_val': 'train_more_val',
    'mean_absolute_error_trn': 'MAE trn',
    'mean_absolute_error_val': 'MAE val',
    'mean_absolute_error_tst': 'MAE tst',
    'mean_absolute_error_val_tst': 'MAE val_tst',
    'mean_absolute_error_trn_val_tst': 'MAE trn_val_tst',
    'pearson_corr_coef_trn': 'Pcorr trn',
    'pearson_corr_coef_val': 'Pcorr val',
    'pearson_corr_coef_tst': 'Pcorr tst',
    'pearson_corr_coef_val_tst': 'Pcorr val_tst',
    'pearson_corr_coef_trn_val_tst': 'Pcorr trn_val_tst',
    'mean_absolute_error_cv_mean_trn': 'MAE cv_mean_trn',
    'mean_absolute_error_cv_std_trn': 'MAE cv_std_trn',
    'mean_absolute_error_cv_mean_val': 'MAE cv_mean_val',
    'mean_absolute_error_cv_std_val': 'MAE cv_std_val',
    'pearson_corr_coef_cv_mean_trn': 'Pcorr cv_mean_trn',
    'pearson_corr_coef_cv_std_trn': 'Pcorr cv_std_trn',
    'pearson_corr_coef_cv_mean_val': 'Pcorr cv_mean_val',
    'pearson_corr_coef_cv_std_val': 'Pcorr cv_std_val',
}
other_columns = [col for col in df_res.columns if col not in first_columns]
df_res = df_res[list(first_columns) + other_columns]
df_res = df_res.rename(columns=first_columns)
df_res.to_excel(f"{path_runs}/summary.xlsx", index=True, index_label="file")

# EpiSImAge calculation

In [None]:
# Used by the evaluation cell to pick the feats_con_<n>.xlsx feature list.
n_epi_feats = 100

path = f"D:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN/special/060_EpiSImAge"
# Immunomarker names: the index of the SImAge top-10 feature table.
feats_imm_fimmu = pd.read_excel(
    f"D:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN/data/immuno/models/SImAge/feats_con_top10.xlsx",
    index_col=0,
).index.values

# One model file per immunomarker, used when not all runs are enumerated.
best_files = {
    'CXCL9': f"{path}/fimmu_features/CXCL9/models/widedeep_tab_mlp_trn_val_tst/multiruns/2024-03-07_23-55-29_1337/354/best_fold_0005.ckpt",
    'CCL22': f"{path}/fimmu_features/CCL22/models/widedeep_tab_mlp_trn_val_tst/multiruns/2024-03-09_12-39-48_1337/40/best_fold_0004.ckpt",
    'IL6': f"{path}/fimmu_features/IL6/models/widedeep_tab_mlp_trn_val_tst/multiruns/2024-03-10_02-07-13_1337/110/best_fold_0004.ckpt",
    'PDGFB': f"{path}/fimmu_features/PDGFB/models/lightgbm_trn_val_tst/multiruns/2024-03-10_12-48-20_1337/156/epoch_13_best_0006.model",
    'CD40LG': f"{path}/fimmu_features/CD40LG/models/lightgbm_trn_val_tst/multiruns/2024-03-10_12-37-12_1337/464/epoch_22_best_0000.model",
    'IL27': f"{path}/fimmu_features/IL27/models/widedeep_tab_mlp_trn_val_tst/multiruns/2024-03-10_16-53-13_1337/122/best_fold_0000.ckpt",
    'VEGFA': f"{path}/fimmu_features/VEGFA/models/widedeep_tab_mlp_trn_val_tst/multiruns/2024-03-10_16-06-26_1337/310/best_fold_0005.ckpt",
    'CSF1': f"{path}/fimmu_features/CSF1/models/widedeep_tab_mlp_trn_val_tst/multiruns/2024-03-10_16-50-13_1337/103/best_fold_0005.ckpt",
    'PDGFA': f"{path}/fimmu_features/PDGFA/models/widedeep_tab_mlp_trn_val_tst/multiruns/2024-03-09_23-31-10_1337/27/best_fold_0003.ckpt",
    'CXCL10': f"{path}/fimmu_features/CXCL10/models/widedeep_tab_mlp_trn_val_tst/multiruns/2024-03-09_12-20-47_1337/426/best_fold_0000.ckpt"
}

# Immunomarkers for which every saved model file is a candidate; all the
# others contribute only their entry from best_files.
feats_all_runs = ('CXCL9', 'CCL22', 'IL6', 'PDGFB', 'CD40LG')

files_all_list = []
for feat_imm in feats_imm_fimmu:
    if feat_imm not in feats_all_runs:
        files_all_list.append([best_files[feat_imm]])
        continue
    files = glob(f"{path}/fimmu_features/{feat_imm}/models/*/*/*/*/epoch_*.model")
    files += glob(f"{path}/fimmu_features/{feat_imm}/models/*/*/*/*/best_fold_*.ckpt")
    files_all_list.append(files)

# Cartesian product of the candidates: one {immunomarker: model file} dict
# per combination.
files_all_dicts = [
    dict(zip(feats_imm_fimmu, combo_files))
    for combo_files in itertools.product(*files_all_list)
]

# Load the SImAge checkpoint and switch it to frozen evaluation mode.
model_simage = WDFTTransformerModel.load_from_checkpoint(
    checkpoint_path=f"D:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN/data/immuno/models/SImAge/best_fold_0002.ckpt"
)
model_simage.eval()
model_simage.freeze()

# Per dataset: the Age column joined with the betas on the sample index.
data_parts = ['GSEUNN', 'GSE87571', 'GSE40279']
dfs_tst = {}
for data_part in data_parts:
    df_pheno = pd.read_csv(f"{path}/{data_part}/pheno.csv", index_col=0)
    if data_part == 'GSEUNN':
        df_pheno = df_pheno.set_index("index")
        # Only control samples are kept for GSEUNN.
        df_pheno = df_pheno[df_pheno['Status'] == 'Control']
    elif data_part == 'GSE40279':
        df_pheno = df_pheno.set_index("gsm")
    df_betas = pd.read_pickle(f"{path}/{data_part}/betas.pkl")
    dfs_tst[data_part] = df_pheno[['Age']].merge(df_betas, left_index=True, right_index=True)

In [None]:
def _predict_all_parts(file, X, feats_epi, framework_by_model):
    """Load the model stored in ``file`` and run it on every data part.

    Args:
        file: Path to a saved model; the model type is taken from the fifth
            path component from the end, with '_trn_val_tst' removed.
        X: Dict mapping data part name to the input array for this model.
        feats_epi: Feature names matching the columns of the arrays in ``X``
            (passed to xgboost as ``feature_names``).
        framework_by_model: Dict mapping model type to its framework name
            ("pytorch" or "stand_alone").

    Returns:
        Dict mapping data part name to the array of predictions.

    Raises:
        ValueError: If the model type or framework is not supported.
    """
    model_type = Path(file).parts[-5].replace('_trn_val_tst', '')
    model_framework = framework_by_model[model_type]

    if model_framework == "pytorch":
        if model_type != "widedeep_tab_mlp":
            # Without this check a model left over from a previous iteration
            # (or an unrelated variable) would be used silently.
            raise ValueError(f"Model {model_type} is not supported")
        model = WDTabMLPModel.load_from_checkpoint(checkpoint_path=file)
        model.eval()
        model.freeze()
        return {
            data_part: model(torch.from_numpy(x)).cpu().detach().numpy().ravel()
            for data_part, x in X.items()
        }

    if model_framework != "stand_alone":
        raise ValueError(f"Unsupported model_framework: {model_framework}")

    if model_type == "xgboost":
        model = xgb.Booster()
        model.load_model(file)
        return {
            data_part: model.predict(xgb.DMatrix(x, feature_names=feats_epi, enable_categorical=True))
            for data_part, x in X.items()
        }

    if model_type == "catboost":
        model = CatBoost()
        model.load_model(file)
        return {data_part: model.predict(x).astype('float32') for data_part, x in X.items()}

    if model_type == "lightgbm":
        model = lgb.Booster(model_file=file)
        return {
            data_part: model.predict(x, num_iteration=model.best_iteration).astype('float32')
            for data_part, x in X.items()
        }

    if model_type == "elastic_net":
        # NOTE(review): pickle executes arbitrary code on load; only open
        # model files that come from a trusted source.
        with open(file, 'rb') as model_fh:
            model = pickle.load(model_fh)
        return {data_part: model.predict(x).astype('float32') for data_part, x in X.items()}

    raise ValueError(f"Model {model_type} is not supported")


# Evaluate every combination of immunomarker models: predict each
# immunomarker from the epigenetic features, feed the predictions to the
# SImAge model and store the regression metrics per dataset.

# Everything below up to the main loop does not depend on the combination,
# so it is computed once instead of once per combination.
model_framework_dict = get_model_framework_dict()
feats_epi_by_imm = {}
X_by_imm = {}
for feat_imm in feats_imm_fimmu:
    feats_epi = pd.read_excel(f"{path}/fimmu_features/{feat_imm}/feats_con_{n_epi_feats}.xlsx", index_col=0).index.values
    feats_epi_by_imm[feat_imm] = feats_epi
    X_by_imm[feat_imm] = {
        data_part: dfs_tst[data_part].loc[:, feats_epi].values
        for data_part in data_parts
    }

# The same model file appears in many combinations; its predictions are
# computed once and reused, keyed by (immunomarker, file).
preds_cache = {}

df_combos_metrics = pd.DataFrame(index=range(len(files_all_dicts)))
for combo_id, combo in tqdm(enumerate(files_all_dicts), total=len(files_all_dicts)):
    df_imm_data = {}
    for data_part in data_parts:
        df_imm_data[data_part] = pd.DataFrame(columns=feats_imm_fimmu)
        df_imm_data[data_part]['Age'] = dfs_tst[data_part].loc[:, 'Age'].values

    for feat_imm in feats_imm_fimmu:

        file = combo[feat_imm]
        # Record which model file was used, as its last five path components.
        df_combos_metrics.at[combo_id, feat_imm] = '/'.join(Path(file).parts[-5::])

        cache_key = (feat_imm, file)
        if cache_key not in preds_cache:
            preds_cache[cache_key] = _predict_all_parts(
                file,
                X_by_imm[feat_imm],
                feats_epi_by_imm[feat_imm],
                model_framework_dict,
            )
        y_pred = preds_cache[cache_key]

        for data_part in data_parts:
            # The model outputs are exponentiated before being used as
            # immunomarker values.
            df_imm_data[data_part][feat_imm] = np.exp(y_pred[data_part])

    y = {}
    y_pred = {}
    for data_part in data_parts:
        y[data_part] = df_imm_data[data_part].loc[:, 'Age'].values
        y_pred[data_part] = model_simage(torch.from_numpy(df_imm_data[data_part].loc[:, feats_imm_fimmu].values)).cpu().detach().numpy().ravel()
        # The tensors do not depend on the metric, so build them once.
        y_real_torch = torch.from_numpy(np.float32(y[data_part]))
        y_pred_torch = torch.from_numpy(y_pred[data_part])
        metrics = get_reg_metrics()
        for m in metrics:
            m_val = float(metrics[m][0](y_pred_torch, y_real_torch).numpy())
            metrics[m][0].reset()
            df_combos_metrics.at[combo_id, f"{data_part}_{m}"] = m_val


# Put the per-dataset MAE columns first, then save the table.
first_columns = [
    "GSEUNN_mean_absolute_error",
    "GSE87571_mean_absolute_error",
    "GSE40279_mean_absolute_error",
]
df_combos_metrics = df_combos_metrics[first_columns + [col for col in df_combos_metrics.columns if col not in first_columns]]
df_combos_metrics.to_excel(f"{path}/fimmu_features/combos_metrics.xlsx")

# SImAge2

In [None]:
# Gather the metrics and Hydra overrides of every SImAge2 multirun of one
# model into a single table and save it as summary.xlsx.
model = 'widedeep_tab_mlp'

path = f"D:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN/special/060_EpiSImAge/SImAge2"
path_runs = f"{path}/models/{model}_trn_val_tst/multiruns"

files = glob(f"{path_runs}/*/*/metrics_all_best_*.xlsx")
if not files:
    # Fail early with a readable message instead of an IndexError on files[0].
    raise FileNotFoundError(f"No metrics_all_best_*.xlsx files found under {path_runs}")

# df_tmp and params are not referenced again in this cell; they are still
# defined so that the notebook state after running the cell is unchanged.
df_tmp = pd.read_excel(files[0], index_col="metric")
head, tail = os.path.split(files[0])
cfg = OmegaConf.load(f"{head}/.hydra/overrides.yaml")
# Split on the first '=' only: an override value may itself contain '='.
params = [param_pair.split('=', 1)[0] for param_pair in cfg]
df_res = pd.DataFrame(index=files)

# Columns of each metrics file that are copied into the summary, in the order
# in which the summary columns are created.
data_splits = ("val", "trn", "tst", "trn_val", "val_tst", "trn_val_tst")

for file in files:

    head, tail = os.path.split(file)

    # Run identifier: the run directory with the common multiruns prefix removed.
    df_res.at[file, 'index'] = head.replace(path_runs, '')

    # Metrics
    df_metrics = pd.read_excel(file, index_col="metric")
    for metric in df_metrics.index.values:
        for data_split in data_splits:
            df_res.at[file, f"{metric}_{data_split}"] = df_metrics.at[metric, data_split]

    # Params
    cfg = OmegaConf.load(f"{head}/.hydra/overrides.yaml")
    for param_pair in cfg:
        param, val = param_pair.split('=', 1)
        df_res.at[file, param] = val

df_res.set_index('index', inplace=True)
# Flag runs whose training MAE is larger than their validation MAE.
df_res["train_more_val"] = df_res["mean_absolute_error_trn"] > df_res["mean_absolute_error_val"]
# Column initialised to False for every run.
df_res["selected"] = False

# Columns placed first in the spreadsheet, mapped to their short display names.
first_columns = {
    'selected': 'selected',
    'train_more_val': 'train_more_val',
    'mean_absolute_error_trn': 'MAE trn',
    'mean_absolute_error_val': 'MAE val',
    'mean_absolute_error_tst': 'MAE tst',
    'mean_absolute_error_val_tst': 'MAE val_tst',
    'mean_absolute_error_trn_val_tst': 'MAE trn_val_tst',
    'pearson_corr_coef_trn': 'Pcorr trn',
    'pearson_corr_coef_val': 'Pcorr val',
    'pearson_corr_coef_tst': 'Pcorr tst',
    'pearson_corr_coef_val_tst': 'Pcorr val_tst',
    'pearson_corr_coef_trn_val_tst': 'Pcorr trn_val_tst',
    'mean_absolute_error_cv_mean_trn': 'MAE cv_mean_trn',
    'mean_absolute_error_cv_std_trn': 'MAE cv_std_trn',
    'mean_absolute_error_cv_mean_val': 'MAE cv_mean_val',
    'mean_absolute_error_cv_std_val': 'MAE cv_std_val',
    'pearson_corr_coef_cv_mean_trn': 'Pcorr cv_mean_trn',
    'pearson_corr_coef_cv_std_trn': 'Pcorr cv_std_trn',
    'pearson_corr_coef_cv_mean_val': 'Pcorr cv_mean_val',
    'pearson_corr_coef_cv_std_val': 'Pcorr cv_std_val',
}
other_columns = [col for col in df_res.columns if col not in first_columns]
df_res = df_res[list(first_columns) + other_columns]
df_res = df_res.rename(columns=first_columns)
df_res.to_excel(f"{path_runs}/summary.xlsx", index=True, index_label="file")

# SImAge log

In [None]:
# Gather the metrics and Hydra overrides of every SImAge_log multirun of one
# model into a single table and save it as summary.xlsx.
model = 'widedeep_tab_mlp'

path = f"D:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN/special/060_EpiSImAge/SImAge_log"
path_runs = f"{path}/models/{model}_trn_val_tst/multiruns"

files = glob(f"{path_runs}/*/*/metrics_all_best_*.xlsx")
if not files:
    # Fail early with a readable message instead of an IndexError on files[0].
    raise FileNotFoundError(f"No metrics_all_best_*.xlsx files found under {path_runs}")

# df_tmp and params are not referenced again in this cell; they are still
# defined so that the notebook state after running the cell is unchanged.
df_tmp = pd.read_excel(files[0], index_col="metric")
head, tail = os.path.split(files[0])
cfg = OmegaConf.load(f"{head}/.hydra/overrides.yaml")
# Split on the first '=' only: an override value may itself contain '='.
params = [param_pair.split('=', 1)[0] for param_pair in cfg]
df_res = pd.DataFrame(index=files)

# Columns of each metrics file that are copied into the summary, in the order
# in which the summary columns are created.
data_splits = ("val", "trn", "tst", "trn_val", "val_tst", "trn_val_tst")

for file in files:

    head, tail = os.path.split(file)

    # Run identifier: the run directory with the common multiruns prefix removed.
    df_res.at[file, 'index'] = head.replace(path_runs, '')

    # Metrics
    df_metrics = pd.read_excel(file, index_col="metric")
    for metric in df_metrics.index.values:
        for data_split in data_splits:
            df_res.at[file, f"{metric}_{data_split}"] = df_metrics.at[metric, data_split]

    # Params
    cfg = OmegaConf.load(f"{head}/.hydra/overrides.yaml")
    for param_pair in cfg:
        param, val = param_pair.split('=', 1)
        df_res.at[file, param] = val

df_res.set_index('index', inplace=True)
# Flag runs whose training MAE is larger than their validation MAE.
df_res["train_more_val"] = df_res["mean_absolute_error_trn"] > df_res["mean_absolute_error_val"]
# Column initialised to False for every run.
df_res["selected"] = False

# Columns placed first in the spreadsheet, mapped to their short display names.
first_columns = {
    'selected': 'selected',
    'train_more_val': 'train_more_val',
    'mean_absolute_error_trn': 'MAE trn',
    'mean_absolute_error_val': 'MAE val',
    'mean_absolute_error_tst': 'MAE tst',
    'mean_absolute_error_val_tst': 'MAE val_tst',
    'mean_absolute_error_trn_val_tst': 'MAE trn_val_tst',
    'pearson_corr_coef_trn': 'Pcorr trn',
    'pearson_corr_coef_val': 'Pcorr val',
    'pearson_corr_coef_tst': 'Pcorr tst',
    'pearson_corr_coef_val_tst': 'Pcorr val_tst',
    'pearson_corr_coef_trn_val_tst': 'Pcorr trn_val_tst',
    'mean_absolute_error_cv_mean_trn': 'MAE cv_mean_trn',
    'mean_absolute_error_cv_std_trn': 'MAE cv_std_trn',
    'mean_absolute_error_cv_mean_val': 'MAE cv_mean_val',
    'mean_absolute_error_cv_std_val': 'MAE cv_std_val',
    'pearson_corr_coef_cv_mean_trn': 'Pcorr cv_mean_trn',
    'pearson_corr_coef_cv_std_trn': 'Pcorr cv_std_trn',
    'pearson_corr_coef_cv_mean_val': 'Pcorr cv_mean_val',
    'pearson_corr_coef_cv_std_val': 'Pcorr cv_std_val',
}
other_columns = [col for col in df_res.columns if col not in first_columns]
df_res = df_res[list(first_columns) + other_columns]
df_res = df_res.rename(columns=first_columns)
df_res.to_excel(f"{path_runs}/summary.xlsx", index=True, index_label="file")