# Debugging autoreload

In [None]:
%load_ext autoreload
%autoreload 2

# Load packages

In [8]:
from pytorch_tabular.utils import load_covertype_dataset
from rich.pretty import pprint
from sklearn.model_selection import BaseCrossValidator, ParameterGrid, ParameterSampler
from sklearn.impute import KNNImputer
import torch
import pickle
import shutil
import shap
from sklearn.model_selection import RepeatedStratifiedKFold
from glob import glob
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import copy
from sklearn.model_selection import train_test_split
import numpy as np
from pytorch_tabular.utils import make_mixed_dataset, print_metrics
from pytorch_tabular import available_models
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig, GANDALFConfig, TabNetModelConfig, FTTransformerConfig, DANetConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig
from pytorch_tabular.tabular_model_tuner import TabularModelTuner
from sklearn.metrics import mean_absolute_error
from pytorch_tabular import MODEL_SWEEP_PRESETS
import pandas as pd
from pytorch_tabular import model_sweep
from src.pt.model_sweep import model_sweep_custom
import warnings
from src.utils.configs import read_parse_config
from src.utils.hash import dict_hash
from src.pt.hyper_opt import train_hyper_opt
import pathlib
from tqdm import tqdm
import distinctipy
import matplotlib.patheffects as pe
import matplotlib.colors as mcolors
from statannotations.Annotator import Annotator
from scipy.stats import mannwhitneyu
from regression_bias_corrector import LinearBiasCorrector
import optuna
from sklearn.preprocessing import LabelEncoder
from plottable import ColumnDefinition, Table
from plottable.plots import bar
from plottable.cmap import normed_cmap, centered_cmap
import matplotlib.lines as mlines
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import scipy.stats


def make_rgb_transparent(rgb, bg_rgb, alpha):
    """Blend a colour over a background at the given opacity.

    Every channel of ``rgb`` is mixed with the matching channel of ``bg_rgb``
    as ``alpha * fg + (1 - alpha) * bg``.

    Returns a list with one blended value per channel pair; channels without
    a partner in the other sequence are dropped (``zip`` semantics).
    """
    blended = []
    for fg, bg in zip(rgb, bg_rgb):
        blended.append(alpha * fg + (1 - alpha) * bg)
    return blended

# Best models plots

In [None]:
# Root directory that holds one sub-folder per feature set.
path = "E:/Git/MillenniumAge/data/small_models"

# Which SHAP values to plot: 'current' uses the stored explanation.xlsx,
# 'recalc_gradient' / 'recalc_sampling' recompute them in the plotting loop.
expl_type = 'current'

# Name of the target column (chronological age).
feat_trgt = 'Возраст'

# One row per feature set: (figure title, folder relative to `path`, plot colour).
_FEATS_SETS = (
    ('Биохимические исследования, женщины', 'Биохимические исследования/F', 'crimson'),
    ('Биохимические исследования, мужчины', 'Биохимические исследования/M', 'darkred'),
    ('Гематологические исследования, женщины', 'Гематологические исследования/F', 'orchid'),
    ('Гематологические исследования, мужчины', 'Гематологические исследования/M', 'fuchsia'),
    ('Оценка состава тела, женщины', 'Оценка состава тела/F', 'mediumblue'),
    ('Оценка состава тела, мужчины', 'Оценка состава тела/M', 'darkblue'),
    ('Половые гормоны, женщины', 'Половые гормоны/F', 'gold'),
    ('Половые гормоны, мужчины', 'Половые гормоны/M', 'orange'),
    ('Электрокардиограмма', 'Электрокардиограмма', '#088F8F'),
)

# Figure title -> model folder (relative to `path`).
feats_sets_models = {title: folder for title, folder, _ in _FEATS_SETS}

# Figure title -> matplotlib colour used for that feature set.
colors_feats_sets = {title: colour for title, _, colour in _FEATS_SETS}

# For every feature set: load the stored model artefacts and render one
# summary figure (metrics table, age-vs-prediction panel, error distribution,
# feature-importance panel), saved as model.png / model.pdf in the model folder.
for feats_set, feats_set_path in feats_sets_models.items():
    # Artefacts written by the training run for this feature set.
    data = pd.read_excel(f"{path}/{feats_set_path}/data.xlsx", index_col=0)
    feats = pd.read_excel(f"{path}/{feats_set_path}/feats.xlsx", index_col=0)
    results = pd.read_excel(f"{path}/{feats_set_path}/model/df.xlsx", index_col=0)
    metrics = pd.read_excel(f"{path}/{feats_set_path}/model/metrics.xlsx", index_col=0)
    df_shap = pd.read_excel(f"{path}/{feats_set_path}/model/explanation.xlsx", index_col=0)
    # NOTE: `model` and `corrector` are only consumed by the 'recalc_*'
    # explanation branches further down.
    model = TabularModel.load_model(f"{path}/{feats_set_path}/model")
    corrector = LinearBiasCorrector()
    # Bias corrector is fitted on the Train rows only (true target vs. raw prediction).
    corrector.fit(results.loc[results['Group'] == 'Train', feat_trgt].values, results.loc[results['Group'] == 'Train', 'Prediction'].values)
    
    
    # ---- Figure skeleton -------------------------------------------------
    # The height of the lower (importance) sub-figure grows with the number of features.
    sns.set_theme(style='ticks')
    fig = plt.figure(
        figsize=(8, 5 + 1.5 + 0.15 * feats.shape[0]),
        layout="constrained"
    )
    subfigs = fig.subfigures(
        nrows=2,
        ncols=1,
        height_ratios=[5, 1.5 + 0.15 * feats.shape[0]],
        wspace=0.01,
        hspace=0.01,
    )
    
    subfigs_row = subfigs[0].subfigures(
        nrows=1,
        ncols=1,
        # width_ratios=[1, 1],
        wspace=0.15,
        hspace=0.01,
    )
    
    # Upper sub-figure: metrics table across the top, scatter and violin below it.
    axs = subfigs_row.subplot_mosaic(
        [
            ['table', 'table'],
            ['scatter', 'violin'],
        ],
        # figsize=(6, 1.5 + 6),
        height_ratios=[1, 4],
        width_ratios=[3, 1.5],
        gridspec_kw={
            # "bottom": 0.14,
            # "top": 0.95,
            # "left": 0.1,
            # "right": 0.5,
            "wspace": 0.01,
            "hspace": 0.01,
        },
    )
    subfigs_row.suptitle(feats_set, fontsize='large')

    # ---- Metrics table ---------------------------------------------------
    # Values are pre-formatted to strings (3 decimals) from the '*_unbiased'
    # columns of metrics.xlsx, one column per data split.
    df_table = pd.DataFrame(index=['MAE', fr"Pearson $\mathbf{{\rho}}$", "Bias"], columns=['Train', 'Validation', 'Test'])
    for part in ['Train', 'Validation', 'Test']:
        df_table.at['MAE', part] = f"{metrics.at[part, 'mean_absolute_error_unbiased']:0.3f}"
        df_table.at[fr"Pearson $\mathbf{{\rho}}$", part] = f"{metrics.at[part, 'pearson_corrcoef_unbiased']:0.3f}"
        df_table.at["Bias", part] = f"{metrics.at[part, 'bias_unbiased']:0.3f}"

    col_defs = [
        ColumnDefinition(
            name="index",
            title='',
            textprops={"ha": "center", "weight": "bold"},
            width=2.5,
            # border="both",
        ),
        ColumnDefinition(
            name="Train",
            textprops={"ha": "left"},
            width=1.5,
            border="left",
        ),
        ColumnDefinition(
            name="Validation",
            textprops={"ha": "left"},
            width=1.5,
        ),
        ColumnDefinition(
            name="Test",
            textprops={"ha": "left"},
            width=1.5,
        )
    ]
    table = Table(
        df_table,
        column_definitions=col_defs,
        row_dividers=True,
        footer_divider=False,
        ax=axs['table'],
        textprops={"fontsize": 8},
        row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
        col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
        column_border_kw={"linewidth": 1, "linestyle": "-"},
    ).autoset_fontcolors(colnames=['Train', 'Validation', 'Test'])

    # ---- Age vs. unbiased prediction -------------------------------------
    # Shared axis range: 1%..99% quantiles of target and prediction pooled
    # together; the limits below pad this range by 15% on each side.
    xy_min, xy_max = np.quantile(results[[feat_trgt, 'Prediction Unbiased']].values.flatten(), [0.01, 0.99])
    xy_ptp = xy_max - xy_min

    # Train + Validation rows are drawn as a filled density ...
    kdeplot = sns.kdeplot(
        data=results.loc[results['Group'].isin(['Train', 'Validation']), :],
        x=feat_trgt,
        y='Prediction Unbiased',
        fill=True,
        cbar=False,
        thresh=0.05,
        color=colors_feats_sets[feats_set],
        legend=False,
        ax=axs['scatter']
    )
    # ... and Test rows as individual points on top of it.
    scatter = sns.scatterplot(
        data=results.loc[results['Group'] == 'Test', :],
        x=feat_trgt,
        y="Prediction Unbiased",
        linewidth=0.5,
        alpha=0.8,
        edgecolor="k",
        s=25,
        color=colors_feats_sets[feats_set],
        ax=axs['scatter'],
    )
    # Dashed identity line (prediction == target).
    bisect = sns.lineplot(
        x=[xy_min - 0.15 * xy_ptp, xy_max + 0.15 * xy_ptp],
        y=[xy_min - 0.15 * xy_ptp, xy_max + 0.15 * xy_ptp],
        linestyle='--',
        color='black',
        linewidth=1.0,
        ax=axs['scatter']
    )
    # Regression line fitted on the Train rows only (points themselves are not drawn).
    regplot = sns.regplot(
        data=results.loc[results['Group'] == 'Train', :],
        x=feat_trgt,
        y='Prediction Unbiased',
        color='k',
        scatter=False,
        truncate=False,
        ax=axs['scatter']
    )
    axs['scatter'].set_xlim(xy_min - 0.15 * xy_ptp, xy_max + 0.15 * xy_ptp)
    axs['scatter'].set_ylim(xy_min - 0.15 * xy_ptp, xy_max + 0.15 * xy_ptp)
    axs['scatter'].set_ylabel("Биологический возраст")
    axs['scatter'].set_xlabel("Возраст")

    # ---- Error distribution ----------------------------------------------
    # Violin: 'Error Unbiased' of Train + Validation rows; a constant x of 0
    # puts everything into a single violin.
    violin = sns.violinplot(
        data=results.loc[results['Group'].isin(['Train', 'Validation']), :],
        x=[0] * results.loc[results['Group'].isin(['Train', 'Validation']), :].shape[0],
        y='Error Unbiased',
        color=make_rgb_transparent(mcolors.to_rgb(colors_feats_sets[feats_set]), (1, 1, 1), 0.5),
        density_norm='width',
        saturation=0.75,
        linewidth=1.0,
        ax=axs['violin'],
        legend=False,
    )
    # Swarm: Test rows overlaid on the same axis; marker size shrinks with
    # the square root of the number of Test rows.
    swarm = sns.swarmplot(
        data=results.loc[results['Group'] == 'Test', :],
        x=[0] * results.loc[results['Group'] == 'Test', :].shape[0],
        y='Error Unbiased',
        color=colors_feats_sets[feats_set],
        linewidth=0.5,
        ax=axs['violin'],
        size= 50 / np.sqrt(results.loc[results['Group'] == 'Test', :].shape[0]),
        legend=False,
    )
    axs['violin'].set_ylabel('Возрастная акселерация')
    axs['violin'].set_xlabel('')
    axs['violin'].set(xticklabels=[]) 
    axs['violin'].set(xticks=[])
    
    
    # ---- Optional SHAP recomputation -------------------------------------
    # With any other `expl_type` (e.g. 'current') the values loaded from
    # explanation.xlsx above are used unchanged.
    if expl_type == 'recalc_gradient':
        df_shap = model.explain(data, method="GradientShap", baselines="b|100000")
        df_shap.index = data.index
    elif expl_type == 'recalc_sampling':
        # Every feature column is label-encoded to integer codes for the
        # explainer; predict_func decodes the codes back before predicting.
        ds_data_shap = data.copy()
        ds_cat_encoders = {}
        for f in feats.index:
            ds_cat_encoders[f] = LabelEncoder()
            ds_data_shap[f] = ds_cat_encoders[f].fit_transform(ds_data_shap[f])
        def predict_func(X):
            # X: 2-D array of encoded feature values, columns ordered as feats.index.
            X_df = pd.DataFrame(data=X, columns=feats.index.to_list())
            for f in feats.index:
                X_df[f] = ds_cat_encoders[f].inverse_transform(X_df[f].astype(int).values)
            y = model.predict(X_df)[f'{feat_trgt}_prediction'].values
            # Explanations are computed for the bias-corrected prediction.
            y = corrector.predict(y)
            return y
        explainer = shap.SamplingExplainer(predict_func, ds_data_shap.loc[:, feats.index.to_list()].values)
        print(explainer.expected_value)
        shap_values = explainer.shap_values(ds_data_shap.loc[:, feats.index.to_list()].values)
        df_shap = pd.DataFrame(index=data.index, columns=feats.index.to_list(), data=shap_values)

    
    # ---- Feature-importance table ----------------------------------------
    # Per feature: mean absolute SHAP value and Pearson correlation with the
    # target, computed on rows where both are present (needs at least 2 rows;
    # otherwise 'rho' stays NaN).
    ds_fi = pd.DataFrame(index=feats.index.to_list(), columns=['mean(|SHAP|)', 'rho'])
    for f in feats.index.to_list():
        ds_fi.at[f, 'mean(|SHAP|)'] = df_shap[f].abs().mean()
        df_tmp = data.loc[:, [feat_trgt, f]].dropna(axis=0, how='any')
        if df_tmp.shape[0] > 1:
            vals_1 = df_tmp.loc[:, feat_trgt].values
            vals_2 = df_tmp.loc[:, f].values
            ds_fi.at[f, 'rho'], _ = scipy.stats.pearsonr(vals_1, vals_2)
    # Most important feature first; both panels below follow this order.
    ds_fi.sort_values(['mean(|SHAP|)'], ascending=[False], inplace=True)
    ds_fi['Features'] = ds_fi.index.values
    
    # Lower sub-figure: narrow correlation heatmap (left), importance bars (right).
    axs_importance = subfigs[1].subplots(1, 2, width_ratios=[1, 4], gridspec_kw={'wspace':0.02, 'hspace': 0.02}, sharey=False, sharex=False)
    
    # The frame was created without data, so 'rho' is converted to numeric
    # before being handed to the heatmap.
    heatmap = sns.heatmap(
        ds_fi.loc[:, ['rho']].apply(pd.to_numeric).values,
        yticklabels=ds_fi.index.to_list(),
        annot=True,
        fmt=".2f",
        vmin=-1.0,
        vmax=1.0,
        cmap='coolwarm',
        linewidth=0.1,
        linecolor='black',
        cbar=False,
        #annot_kws={"fontsize": 15},
        # cbar_kws={
        #     # "shrink": 0.9,
        #     # "aspect": 30,
        #     #'fraction': 0.046, 
        #     #'pad': 0.04,
        # },
        ax=axs_importance[0]
    )
    # axs_importance[0].set(yticklabels=ds_fi.index.to_list())
    # heatmap_pos = axs_importance[2].get_position()
    # axs_importance[2].figure.axes[-1].set_position([heatmap_pos.x1 + 0.05, heatmap_pos.y0, 0.1, heatmap_pos.height])
    # axs_importance[2].figure.axes[-1].set_ylabel(r"Pearson $\rho$")
    # for spine in axs_importance[2].figure.axes[-1].spines.values():
    #     spine.set(visible=True, lw=0.25, edgecolor="black")
    # axs_importance[2].set_xlabel('')
    # axs_importance[2].set_ylabel('')
    # axs_importance[2].set(xticklabels=[])
    # axs_importance[2].set(xticks=[])
    
    
    # Horizontal bars of mean(|SHAP|); feature names are shown on the heatmap
    # axis only, so the bar panel's own y tick labels are blanked below.
    barplot = sns.barplot(
        data=ds_fi,
        x='mean(|SHAP|)',
        y='Features',
        color=colors_feats_sets[feats_set],
        edgecolor='black',
        dodge=False,
        ax=axs_importance[1]
    )
    for container in barplot.containers:
        barplot.bar_label(container, label_type='edge', color='gray', fmt='%0.2f', fontsize=12, padding=4.0)
    axs_importance[1].set_ylabel('')
    # axs_importance[1].set(yticklabels=ds_fi.index.to_list())
    axs_importance[1].set(yticklabels=[])

    # Disabled: per-feature SHAP strip plot with a value colour bar (it
    # targets a third importance axis that the layout above does not create).
    # is_colorbar = False
    # f_legends = []
    # for f in ds_fi.index:
        
    #     if df_shap[f].abs().max() > 10:
    #         f_shap_ll = df_shap[f].quantile(0.01)
    #         f_shap_hl = df_shap[f].quantile(0.99)
    #     else:
    #         f_shap_ll = df_shap[f].min()
    #         f_shap_hl = df_shap[f].max()
        
    #     f_index = df_shap.index[(df_shap[f] >= f_shap_ll) & (df_shap[f] <= f_shap_hl)].values
    #     f_shap = df_shap.loc[f_index, f].values
    #     f_vals = data.loc[f_index, f].values
        
    #     f_cmap = sns.color_palette("Spectral_r", as_cmap=True)
    #     f_norm = mcolors.Normalize(vmin=min(f_vals), vmax=max(f_vals)) 
    #     f_colors = {}
    #     for cval in f_vals:
    #         f_colors.update({cval: f_cmap(f_norm(cval))})

    #     strip = sns.stripplot(
    #         x=f_shap,
    #         y=[f]*len(f_shap),
    #         hue=f_vals,
    #         palette=f_colors,
    #         jitter=0.35,
    #         alpha=0.5,
    #         edgecolor='gray',
    #         linewidth=0.1,
    #         size=25 / np.sqrt(results.loc[results['Group'] == 'Test', :].shape[0]),
    #         legend=False,
    #         ax=axs_importance[2],
    #     )
        
    #     if not is_colorbar:
    #         sm = plt.cm.ScalarMappable(cmap=f_cmap, norm=f_norm)
    #         sm.set_array([])
    #         cbar = strip.figure.colorbar(sm)
    #         # cbar.set_label('Значения\nчисленных\nпризнаков', labelpad=-8, fontsize='large')
    #         cbar.set_ticks([min(f_vals), max(f_vals)])
    #         cbar.set_ticklabels(["Min", "Max"])
    #         is_colorbar = True 
    # # axs_importance[2].set(yticklabels=[])
    # axs_importance[2].set_xlabel('SHAP')
    
    # df_shap.to_excel(f"{path}/{feats_set_path}/{feats_set}/model/model_importance.xlsx")
    
    # Write the figure next to the model and release it before the next feature set.
    fig.savefig(f"{path}/{feats_set_path}/model/model.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{path}/{feats_set_path}/model/model.pdf", bbox_inches='tight')
    plt.close(fig)

# Best models inference

In [None]:
# Repository root; every component folder lives under data/small_models.
dir_root = "E:/Git/MillenniumAge"

# Name of the target column (chronological age).
feat_trgt = 'Возраст'

# One row per sex-specific model:
#   (component key, domain name, folder under data/small_models,
#    bkg_count, likelihood, mae_thld)
# `likelihood` scales the per-component weight `rho` and `mae_thld` scales the
# printed threshold in the cells below; `bkg_count` is not read anywhere in
# this part of the notebook.
_COMPONENT_ROWS = (
    ('Оценка состава тела, женщины', 'Оценка состава тела', 'Оценка состава тела/F', 10, 0.60, 0.6),
    ('Оценка состава тела, мужчины', 'Оценка состава тела', 'Оценка состава тела/M', 10, 0.60, 0.6),

    ('Электрокардиограмма, все', 'Электрокардиограмма', 'Электрокардиограмма', 15, 0.50, 0.6),

    ('Гематологические исследования, женщины', 'Гематологические исследования', 'Гематологические исследования/F', 20, 0.4, 1.0),
    ('Гематологические исследования, мужчины', 'Гематологические исследования', 'Гематологические исследования/M', 20, 0.4, 0.8),

    ('Биохимические исследования, женщины', 'Биохимические исследования', 'Биохимические исследования/F', 12, 0.90, 0.4),
    ('Биохимические исследования, мужчины', 'Биохимические исследования', 'Биохимические исследования/M', 12, 0.90, 0.6),

    ('Половые гормоны, женщины', 'Половые гормоны', 'Половые гормоны/F', 15, 0.90, 0.4),
    ('Половые гормоны, мужчины', 'Половые гормоны', 'Половые гормоны/M', 12, 0.90, 0.4),
)

# Component key -> settings dict; later cells attach the loaded data, model
# and derived tables to these same dicts.
components = {
    comp_key: {
        'name': domain,
        'path': f"{dir_root}/data/small_models/{folder}",
        'bkg_count': bkg_count,
        'likelihood': likelihood,
        'mae_thld': mae_thld,
    }
    for comp_key, domain, folder, bkg_count, likelihood, mae_thld in _COMPONENT_ROWS
}

# Domain name -> display colour.
component_color_dict = dict(
    [
        ('Биохимические исследования', "#880808"),
        ('Гематологические исследования', "#DA70D6"),
        ('Половые гормоны', "#E49B0F"),
        ('Оценка состава тела', "#000080"),
        ('Электрокардиограмма', "#088F8F"),
    ]
)

# Column bookkeeping for the inference table built later:
#   feats_all       - every output column (sex, inputs, per-model and per-domain results)
#   feats_pred_all  - per-domain result columns (keyed by component 'name')
#   feats_input_all - model input features only
#   feats_aux       - per-model result columns (keyed by component key)
feats_all = ['Пол']
feats_pred_all = []
feats_input_all = []
feats_aux = []
for comp, comp_entry in components.items():
    # Load the stored artefacts of this component and attach them to its dict.
    comp_dir = comp_entry['path']
    comp_entry['data'] = pd.read_excel(f"{comp_dir}/data.xlsx", index_col=0)
    comp_entry['feats'] = pd.read_excel(f"{comp_dir}/feats.xlsx", index_col=0)
    comp_entry['results'] = pd.read_excel(f"{comp_dir}/model/df.xlsx", index_col=0)
    comp_entry['metrics'] = pd.read_excel(f"{comp_dir}/model/metrics.xlsx", index_col=0)
    comp_entry['model'] = TabularModel.load_model(f"{comp_dir}/model")
    comp_entry['corrector'] = LinearBiasCorrector()

    # Fit the bias corrector on the Train rows (true target vs. raw prediction).
    comp_results = comp_entry['results']
    is_train = comp_results['Group'] == 'Train'
    comp_entry['corrector'].fit(
        comp_results.loc[is_train, feat_trgt].values,
        comp_results.loc[is_train, 'Prediction'].values,
    )

    # Copy the stored split labels and predictions onto the data table (aligned by index).
    res_cols = ['Group', 'Prediction', 'Error', 'Prediction Unbiased', 'Error Unbiased']
    comp_data = comp_entry['data']
    comp_data.loc[comp_data.index, res_cols] = comp_results.loc[comp_data.index, res_cols]
    comp_entry['data_shap'] = comp_data.copy()

    # Input features of this component, with the target removed.
    feats = comp_entry['feats'].index.values
    feats = feats[feats != feat_trgt]
    comp_name = comp_entry['name']
    per_model_cols = [f"Модель {comp}", f"Предсказание {comp}", f"Возрастная Акселерация {comp}"]

    feats_all.extend(feats)
    feats_pred_all.extend([
        f"Предсказание {comp_name}",
        f"Возрастная Акселерация {comp_name}",
        f"Модель {comp_name}",
    ])
    feats_all.extend(per_model_cols)
    feats_aux.extend(per_model_cols)
    feats_input_all.extend(feats)

    # Pearson correlation of every input feature with the target on the
    # component's own data table.
    feats_corr = pd.DataFrame(index=feats, columns=['Correlation'])
    trgt_vals = comp_data.loc[:, feat_trgt].values
    for f in feats:
        feats_corr.at[f, 'Correlation'] = scipy.stats.pearsonr(comp_data.loc[:, f].values, trgt_vals)[0]
    comp_entry['feats_corr'] = feats_corr

# De-duplicate while keeping first-seen order; per-domain columns go last in feats_all.
feats_all = list(dict.fromkeys(feats_all)) + list(dict.fromkeys(feats_pred_all))
feats_input_all = list(dict.fromkeys(feats_input_all))
feats_aux = list(dict.fromkeys(feats_aux))

# Report, per component, the number of rows in its data table, the Test MAE,
# the weight `rho` and the resulting threshold.
for comp, comp_entry in components.items():
    print(f"{comp}: {comp_entry['data'].shape[0]}")
    comp_metrics = comp_entry['metrics']
    mae = comp_metrics.at['Test', 'mean_absolute_error_unbiased']
    # Weight = Test Pearson rho * strongest |feature-target correlation| * manual likelihood.
    test_rho = comp_metrics.at['Test', 'pearson_corrcoef_unbiased']
    max_abs_corr = comp_entry['feats_corr']['Correlation'].abs().max()
    rho = test_rho * max_abs_corr * comp_entry['likelihood']
    curr_threshold = rho * mae * comp_entry['mae_thld']
    print(f'MAE: {mae}, rho: {rho}, threshold (rho*MAE*mae_thld): {curr_threshold}')

In [None]:
# Cohort table to score; the first spreadsheet column becomes the sample index.
# The inference cell below reads the 'Пол' and 'Возраст' columns plus each model's input features from it.
data = pd.read_excel("E:/YandexDisk/Work/bbd/millennium/models/data.xlsx", index_col=0)

In [None]:
# Per-sample inference of biological age.
#
# For every sample:
#   1. pick the models that apply to the sample's sex (plus the shared ECG model);
#   2. skip a model when its inputs are absent from `data` or when the share of
#      missing inputs is not below `nan_part`;
#   3. KNN-impute remaining gaps against the model's own data table, predict,
#      bias-correct, and store prediction and weighted age acceleration;
#   4. sum the weighted accelerations of all models that ran.
# Samples for which no model ran are dropped at the end.

# A model is used only when the share of its missing inputs is below this value.
nan_part = 0.1

# Body-composition model per sex; it is not pre-screened and goes straight to
# the scoring loop, which applies the availability checks itself.
body_composition_models = {
    'Ж': 'Оценка состава тела, женщины',
    'М': 'Оценка состава тела, мужчины',
}

# Per sex and domain: candidate models in priority order.  The first candidate
# with few enough missing inputs is used; rejected candidates are flagged False.
screened_candidates = {
    'Ж': {
        'Половые гормоны': ['Половые гормоны, женщины'],
        'Гематологические исследования': ['Гематологические исследования, женщины'],
        'Биохимические исследования': ['Биохимические исследования, женщины'],
    },
    'М': {
        'Половые гормоны': ['Половые гормоны, мужчины'],
        'Гематологические исследования': ['Гематологические исследования, мужчины'],
        'Биохимические исследования': ['Биохимические исследования, мужчины'],
    },
}

# Per-sample summary columns, in the order they must appear at the end of the table.
col_n_models = "Число моделей"
col_n_neg = "Число моделей c отрицательной аккселерацией"
col_n_pos = "Число моделей c положительной аккселерацией"
col_accel = "Возрастная Акселерация"
summary_cols = [col_n_models, col_n_neg, col_n_pos, col_accel]

data_models = pd.DataFrame(columns=feats_all)
for sample_id in tqdm(data.index.values, desc='Вычисление биологического возраста'):

    sex = data.at[sample_id, "Пол"]
    data_models.at[sample_id, "Пол"] = sex

    # Domain -> model chosen for this sample (insertion order is the scoring order).
    components_trgt = {}

    if sex in screened_candidates:
        components_trgt['Оценка состава тела'] = body_composition_models[sex]
        for domain, candidates in screened_candidates[sex].items():
            for comp in candidates:
                feats = components[comp]['feats'].index.values
                n_feats = len(feats)
                n_nans = data.loc[sample_id, feats].isna().sum()
                if n_nans / n_feats < nan_part:
                    components_trgt[domain] = comp
                    break
                data_models.at[sample_id, f"Модель {comp}"] = False
    else:
        # Unknown sex: only the sex-independent ECG model is attempted.
        print(f"Пол для {sample_id}: {sex}")

    # The ECG model is shared by everyone.  (The original code branched on
    # age < 15 here, but both branches selected this same model.)
    components_trgt['Электрокардиограмма'] = 'Электрокардиограмма, все'

    n_pos = 0
    n_neg = 0
    comp_present = []
    for comp in components_trgt.values():
        data_models.at[sample_id, f"Модель {comp}"] = False
        feats = components[comp]['feats'].index.values
        if not set(feats).issubset(data.columns):
            continue
        n_feats = len(feats)
        n_nans = data.loc[sample_id, feats].isna().sum()
        if not (n_nans / n_feats < nan_part):
            continue

        comp_present.append(comp)
        comp_name = components[comp]['name']
        feats_w_trgt = list(feats) + [feat_trgt]

        # Model weight: Test Pearson rho * strongest |feature-target
        # correlation| * manually assigned likelihood.
        rho = (
            components[comp]['metrics'].at['Test', 'pearson_corrcoef_unbiased']
            * components[comp]['feats_corr']['Correlation'].abs().max()
            * components[comp]['likelihood']
        )

        data_sample = data.loc[[sample_id], feats_w_trgt]
        if n_nans != 0:
            # Fill the gaps with KNN imputation: the sample is stacked on top
            # of the model's own data table (target column included) and only
            # its imputed row (row 0) is kept.
            data_bkcg = components[comp]['data'].loc[:, feats_w_trgt]
            data_imp = pd.concat([data_sample, data_bkcg], axis=0, ignore_index=True)
            imputer = KNNImputer(n_neighbors=5)
            data_sample.loc[sample_id, feats_w_trgt] = imputer.fit_transform(data_imp.loc[:, feats_w_trgt].values)[0, :]

        pred = components[comp]['model'].predict(data_sample)[f'{feat_trgt}_prediction'].values
        pred = components[comp]['corrector'].predict(pred)
        data_sample.at[sample_id, f"Предсказание {comp_name}"] = pred
        data_sample.at[sample_id, f"Предсказание {comp}"] = pred

        # Age acceleration = corrected prediction - chronological age.
        gt = data_sample.at[sample_id, feat_trgt]
        aa = pred - gt
        if aa > 0:
            n_pos += 1
        else:
            n_neg += 1

        # The stored acceleration is down-weighted by the model weight.
        data_sample.at[sample_id, f"Возрастная Акселерация {comp_name}"] = aa * rho
        data_sample.at[sample_id, f"Возрастная Акселерация {comp}"] = aa * rho

        data_sample.at[sample_id, f"Модель {comp_name}"] = comp

        data_models.loc[sample_id, data_sample.columns] = data_sample.loc[sample_id, data_sample.columns]

    if comp_present:
        data_models.at[sample_id, col_n_models] = len(comp_present)
        data_models.at[sample_id, col_n_neg] = n_neg
        data_models.at[sample_id, col_n_pos] = n_pos
        # Total acceleration = sum of the weighted accelerations of the models that ran.
        data_models.at[sample_id, col_accel] = 0.0
        for comp in comp_present:
            data_models.at[sample_id, f"Модель {comp}"] = True
            data_models.at[sample_id, col_accel] += data_models.at[sample_id, f"Возрастная Акселерация {comp}"]

# When no sample could be scored, the age and summary columns were never
# created; add them empty so the steps below yield an empty table instead of
# raising KeyError.
for col in [feat_trgt, *summary_cols]:
    if col not in data_models.columns:
        data_models[col] = np.nan

# Move the summary columns to the end of the table, in this order.
for col in summary_cols:
    data_models[col] = data_models.pop(col)

data_models['Биологический возраст'] = data_models[feat_trgt] + data_models[col_accel]

# Keep only the samples for which at least one model produced a result.
data_models = data_models.dropna(subset=['Биологический возраст'])

Вычисление биологического возраста: 100%|██████████| 3825/3825 [03:24<00:00, 18.69it/s]


In [None]:
# Persist the per-sample inference table (inputs, per-model predictions,
# age accelerations and biological age) next to the cohort data.
data_models.to_excel("E:/YandexDisk/Work/bbd/millennium/models/data_with_results.xlsx")

In [22]:
sns.set_theme(style='ticks')
fig = plt.figure(
    figsize=(7, 5),
    layout="constrained"
)

axs = fig.subplot_mosaic(
    [
        ['table', 'none'],
        ['scatter', 'violin'],
    ],
    # figsize=(6, 1.5 + 6),
    height_ratios=[1, 4],
    width_ratios=[3, 1.5],
    gridspec_kw={
        # "bottom": 0.14,
        # "top": 0.95,
        # "left": 0.1,
        # "right": 0.5,
        "wspace": 0.01,
        "hspace": 0.01,
    },
)
axs['none'].axis('off')

df_table = pd.DataFrame(index=['MAE', fr"Pearson $\mathbf{{\rho}}$", "Bias"], columns=['Женщины', 'Мужчины'])

y_true = data_models.loc[data_models['Пол'] == 'Ж', 'Возраст'].values
y_pred = data_models.loc[data_models['Пол'] == 'Ж', 'Биологический возраст'].values
y_error = data_models.loc[data_models['Пол'] == 'Ж', 'Биологический возраст'].values
mae = mean_absolute_error(y_true, y_pred)
pearsonr, _ = scipy.stats.pearsonr(y_true, y_pred)
bias = np.mean(data_models.loc[data_models['Пол'] == 'Ж', 'Возрастная Акселерация'].values)
df_table.at['MAE', 'Женщины'] = f"{mean_absolute_error(y_true, y_pred):0.3f}"
df_table.at[fr"Pearson $\mathbf{{\rho}}$", 'Женщины'] = f"{pearsonr:0.3f}"
df_table.at["Bias", 'Женщины'] = f"{bias:0.3f}"

y_true = data_models.loc[data_models['Пол'] == 'М', 'Возраст'].values
y_pred = data_models.loc[data_models['Пол'] == 'М', 'Биологический возраст'].values
y_error = data_models.loc[data_models['Пол'] == 'М', 'Биологический возраст'].values
mae = mean_absolute_error(y_true, y_pred)
pearsonr, _ = scipy.stats.pearsonr(y_true, y_pred)
bias = np.mean(data_models.loc[data_models['Пол'] == 'М', 'Возрастная Акселерация'].values)
df_table.at['MAE', 'Мужчины'] = f"{mean_absolute_error(y_true, y_pred):0.3f}"
df_table.at[fr"Pearson $\mathbf{{\rho}}$", 'Мужчины'] = f"{pearsonr:0.3f}"
df_table.at["Bias", 'Мужчины'] = f"{bias:0.3f}"

col_defs = [
    ColumnDefinition(
        name="index",
        title='',
        textprops={"ha": "center", "weight": "bold"},
        width=2.5,
        # border="both",
    ),
    ColumnDefinition(
        name="Женщины",
        textprops={"ha": "left"},
        width=1.5,
        border="left",
    ),
    ColumnDefinition(
        name="Мужчины",
        textprops={"ha": "left"},
        width=1.5,
    ),
]
table = Table(
    df_table,
    column_definitions=col_defs,
    row_dividers=True,
    footer_divider=False,
    ax=axs['table'],
    textprops={"fontsize": 8},
    row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
    col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
    column_border_kw={"linewidth": 1, "linestyle": "-"},
).autoset_fontcolors(colnames=['Женщины', 'Мужчины'])

# Axis window shared by both axes of the scatter panel: the 1%-99% quantile
# range of chronological age and biological age pooled together, widened by
# 15% of that range on each side.
pooled_ages = data_models[['Возраст', 'Биологический возраст']].values.flatten()
xy_min, xy_max = np.quantile(pooled_ages, [0.01, 0.99])
xy_ptp = xy_max - xy_min
axis_window = [xy_min - 0.15 * xy_ptp, xy_max + 0.15 * xy_ptp]

ax_scatter = axs['scatter']

# NOTE: a filled sns.kdeplot of the same two columns (hue="Пол", thresh=0.01)
# used to be drawn underneath the points; it is currently disabled.

# One point per row of data_models, coloured by sex.
scatter = sns.scatterplot(
    data=data_models,
    x='Возраст',
    y='Биологический возраст',
    hue="Пол",
    palette={'Ж': 'crimson', 'М': 'dodgerblue'},
    s=20,
    alpha=0.75,
    edgecolor="k",
    linewidth=0.3,
    ax=ax_scatter,
)

# Dashed identity line (y = x) spanning the whole padded window.
bisect = sns.lineplot(
    x=axis_window,
    y=axis_window,
    color='black',
    linestyle='--',
    linewidth=1.0,
    ax=ax_scatter,
)

# NOTE: an sns.regplot trend line (scatter=False, truncate=False) is also
# currently disabled for this panel.

# Same limits on both axes so the identity line runs corner to corner.
ax_scatter.set_xlim(axis_window[0], axis_window[1])
ax_scatter.set_ylim(axis_window[0], axis_window[1])
ax_scatter.set_ylabel("Биологический возраст")
ax_scatter.set_xlabel("Возраст")

# Only rows whose age acceleration lies strictly between its 1% and 99%
# quantiles are passed to the violin plot.
age_accel = data_models['Возрастная Акселерация']
q01 = age_accel.quantile(0.01)
q99 = age_accel.quantile(0.99)
within_tails = (age_accel > q01) & (age_accel < q99)

ax_violin = axs['violin']

# One violin per sex, using the same colours as the scatter panel.
violin = sns.violinplot(
    data=data_models.loc[within_tails, :],
    x='Пол',
    y='Возрастная Акселерация',
    hue="Пол",
    palette={'Ж': 'crimson', 'М': 'dodgerblue'},
    density_norm='width',
    saturation=0.75,
    linewidth=1.0,
    legend=False,
    ax=ax_violin,
)

# NOTE: an sns.swarmplot overlay on this axes is currently disabled.

# Keep the y label, but strip the x label, tick labels and ticks.
ax_violin.set_ylabel('Возрастная Акселерация')
ax_violin.set_xlabel('')
ax_violin.set_xticklabels([])
ax_violin.set_xticks([])

# Export the figure as a 200-dpi PNG and as a PDF, then close it.
figure_png = "E:/YandexDisk/Work/bbd/millennium/models/data_with_results.png"
figure_pdf = "E:/YandexDisk/Work/bbd/millennium/models/data_with_results.pdf"
fig.savefig(figure_png, dpi=200, bbox_inches='tight')
fig.savefig(figure_pdf, bbox_inches='tight')
plt.close(fig)

# Legacy

In [None]:
# --- Settings for this cell -------------------------------------------------
# Dataset folder and the specific trained model run inside it.
path = "E:/YandexDisk/Work/bbd/millennium/models/Электрокардиограмма (чекап)"
path_model = f"{path}/models/DANet/424"
dataset = 'Электрокардиограмма'  # used as the figure title
expl_type = 'current'  # 'recalc_gradient' / 'recalc_sampling' recompute SHAP further down
color = 'olive'  # single accent colour for all panels

feat_trgt = 'Возраст'  # target column (age)

# --- Inputs -----------------------------------------------------------------
# Every workbook is read with its first column as the index.
data = pd.read_excel(f"{path}/data.xlsx", index_col=0)
feats = pd.read_excel(f"{path}/feats.xlsx", index_col=0)
results = pd.read_excel(f"{path_model}/df.xlsx", index_col=0)
metrics = pd.read_excel(f"{path_model}/metrics.xlsx", index_col=0)
df_shap = pd.read_excel(f"{path_model}/explanation.xlsx", index_col=0)
model = TabularModel.load_model(path_model)

# Fit the bias corrector on Train-group rows only: true target vs. raw
# prediction. It is applied later inside the 'recalc_sampling' branch.
train_rows = results.loc[results['Group'] == 'Train', :]
corrector = LinearBiasCorrector()
corrector.fit(train_rows[feat_trgt].values, train_rows['Prediction'].values)

# 1%-99% quantile range of target and unbiased prediction pooled together;
# the plotting code below derives the shared axis limits from it.
pooled_target_pred = results[[feat_trgt, 'Prediction Unbiased']].values.flatten()
xy_min, xy_max = np.quantile(pooled_target_pred, [0.01, 0.99])
xy_ptp = xy_max - xy_min


sns.set_theme(style='ticks')

# The figure is three stacked sub-figures. The top one has a fixed relative
# height of 5; the two lower ones each get 1.5 plus 0.15 per row of `feats`.
n_feats = feats.shape[0]
feats_panel_height = 1.5 + 0.15 * n_feats

fig = plt.figure(
    figsize=(8, 5 + 1.5 + 0.15 * n_feats + 1.5 + 0.15 * n_feats),
    layout="constrained",
)
subfigs = fig.subfigures(
    nrows=3,
    ncols=1,
    height_ratios=[5, feats_panel_height, feats_panel_height],
    wspace=0.001,
    hspace=0.001,
)

# Top sub-figure: a table strip spanning the full width, above a scatter
# panel (left) and a narrower violin panel (right).
top_mosaic = [
    ['table', 'table'],
    ['scatter', 'violin'],
]
axs = subfigs[0].subplot_mosaic(
    top_mosaic,
    height_ratios=[1, 4],
    width_ratios=[3, 1.5],
    gridspec_kw={"wspace": 0.01, "hspace": 0.01},
)

# Table row label -> column of `metrics` that the row is read from.
pearson_label = r"Pearson $\mathbf{\rho}$"
metric_by_row = {
    'MAE': 'mean_absolute_error_unbiased',
    pearson_label: 'pearson_corrcoef_unbiased',
    "Bias": 'bias_unbiased',
}
split_names = ['Train', 'Validation', 'Test']

# Every cell holds the metric already formatted to three decimals.
df_table = pd.DataFrame(index=list(metric_by_row), columns=split_names)
for part in split_names:
    for row_label, metric_name in metric_by_row.items():
        df_table.at[row_label, part] = f"{metrics.at[part, metric_name]:0.3f}"

# Column layout: a bold, centred, untitled label column, then one
# left-aligned column per split. Only 'Train' carries a left border.
col_defs = [
    ColumnDefinition(
        name="index",
        title='',
        width=2.5,
        textprops={"ha": "center", "weight": "bold"},
    ),
    ColumnDefinition(
        name="Train",
        width=1.5,
        border="left",
        textprops={"ha": "left"},
    ),
]
col_defs += [
    ColumnDefinition(name=split, width=1.5, textprops={"ha": "left"})
    for split in ('Validation', 'Test')
]

# Dotted row dividers; solid header divider and column borders.
table = Table(
    df_table,
    ax=axs['table'],
    column_definitions=col_defs,
    textprops={"fontsize": 8},
    row_dividers=True,
    footer_divider=False,
    row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
    col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
    column_border_kw={"linewidth": 1, "linestyle": "-"},
)
# `table` ends up bound to whatever autoset_fontcolors returns, as before.
table = table.autoset_fontcolors(colnames=['Train', 'Validation', 'Test'])

# Scatter panel of target vs. unbiased prediction:
#   - filled density for the Train + Validation rows,
#   - individual points for the Test rows,
#   - dashed identity line, and a regression line over all rows.
ax_scatter = axs['scatter']
axis_window = [xy_min - 0.15 * xy_ptp, xy_max + 0.15 * xy_ptp]
fit_rows = results.loc[results['Group'].isin(['Train', 'Validation']), :]
test_rows = results.loc[results['Group'] == 'Test', :]

kdeplot = sns.kdeplot(
    data=fit_rows,
    x=feat_trgt,
    y='Prediction Unbiased',
    color=color,
    fill=True,
    thresh=0.05,
    cbar=False,
    legend=False,
    ax=ax_scatter,
)
scatter = sns.scatterplot(
    data=test_rows,
    x=feat_trgt,
    y="Prediction Unbiased",
    color=color,
    s=25,
    alpha=0.8,
    edgecolor="k",
    linewidth=0.5,
    ax=ax_scatter,
)
# Identity line (y = x) across the padded window.
bisect = sns.lineplot(
    x=axis_window,
    y=axis_window,
    color='black',
    linestyle='--',
    linewidth=1.0,
    ax=ax_scatter,
)
# Regression line only (scatter=False), fitted on every row of `results`.
regplot = sns.regplot(
    data=results,
    x=feat_trgt,
    y='Prediction Unbiased',
    color='k',
    scatter=False,
    truncate=False,
    ax=ax_scatter,
)

# Identical limits on both axes: the quantile range padded by 15% per side.
ax_scatter.set_xlim(axis_window[0], axis_window[1])
ax_scatter.set_ylim(axis_window[0], axis_window[1])
ax_scatter.set_ylabel("Биологический возраст")
ax_scatter.set_xlabel("Возраст")

# Error panel: one violin for the Train + Validation errors, with the Test
# errors drawn on top as a swarm. Both use a constant x of 0, so they share
# a single category position.
ax_violin = axs['violin']
fit_rows = results.loc[results['Group'].isin(['Train', 'Validation']), :]
test_rows = results.loc[results['Group'] == 'Test', :]
n_test = test_rows.shape[0]

# make_rgb_transparent is defined outside this chunk; presumably it blends
# the accent colour with white (1, 1, 1) at weight 0.5 - TODO confirm.
violin_color = make_rgb_transparent(mcolors.to_rgb(color), (1, 1, 1), 0.5)

violin = sns.violinplot(
    data=fit_rows,
    x=[0] * fit_rows.shape[0],
    y='Error Unbiased',
    color=violin_color,
    density_norm='width',
    saturation=0.75,
    linewidth=1.0,
    legend=False,
    ax=ax_violin,
)
swarm = sns.swarmplot(
    data=test_rows,
    x=[0] * n_test,
    y='Error Unbiased',
    color=color,
    linewidth=0.5,
    # Marker size shrinks with the square root of the number of Test rows.
    size=50 / np.sqrt(n_test),
    legend=False,
    ax=ax_violin,
)

# Keep the y label, but strip the x label, tick labels and ticks.
ax_violin.set_ylabel('Возрастная акселерация')
ax_violin.set_xlabel('')
ax_violin.set_xticklabels([])
ax_violin.set_xticks([])

# Heatmap of the Pearson correlation between each feature and the target.
#
# The original code called `scipy.stats.pearsonr`, but the notebook's import
# cell only does `from scipy.stats import mannwhitneyu`, so the bare name
# `scipy` is not guaranteed to be bound. Import the function locally instead.
from scipy.stats import pearsonr

ax_heatmap = subfigs[1].subplots()

df_corr = pd.DataFrame(index=feats.index.to_list(), columns=['rho'])
for f in tqdm(feats.index.to_list()):
    # Correlate only over rows where both the target and the feature exist.
    df_tmp = data.loc[:, [feat_trgt, f]].dropna(axis=0, how='any')
    # Features with fewer than two complete rows keep a missing rho and are
    # dropped just below.
    if df_tmp.shape[0] > 1:
        # pearsonr returns (statistic, p-value); only the statistic is kept.
        df_corr.at[f, 'rho'] = pearsonr(
            df_tmp.loc[:, feat_trgt].values,
            df_tmp.loc[:, f].values,
        )[0]
df_corr.dropna(axis=0, how='any', inplace=True)

# Order features by descending absolute correlation.
df_corr.insert(1, "abs(rho)", df_corr['rho'].abs())
df_corr.sort_values(["abs(rho)"], ascending=[False], inplace=True)
feats_cnt_wo_age = df_corr.index.to_list()
feats_cnt = ['Возраст'] + feats_cnt_wo_age

# The frame was created without a dtype and filled cell by cell; convert to
# numeric before handing it to seaborn.
df_corr = df_corr.apply(pd.to_numeric)
heatmap = sns.heatmap(
    df_corr.loc[:, ['rho']],
    annot=True,
    fmt=".2f",
    vmin=-1.0,
    vmax=1.0,
    cmap='coolwarm',
    linewidth=0.1,
    linecolor='black',
    cbar_kws={},
    ax=ax_heatmap,
)

# The last axes of the heatmap's figure is taken to be the colour bar that
# sns.heatmap just added. Move it to the right of the heatmap, label it, and
# give it a thin visible frame.
heatmap_pos = ax_heatmap.get_position()
cbar_ax = ax_heatmap.figure.axes[-1]
cbar_ax.set_position([heatmap_pos.x1 + 0.05, heatmap_pos.y0, 0.1, heatmap_pos.height])
cbar_ax.set_ylabel(r"Pearson $\rho$")
for spine in cbar_ax.spines.values():
    spine.set(visible=True, lw=0.25, edgecolor="black")

# There is a single 'rho' column, so the x axis carries no information.
ax_heatmap.set_xlabel('')
ax_heatmap.set_ylabel('')
ax_heatmap.set(xticklabels=[])
ax_heatmap.set(xticks=[])


# Optionally recompute the SHAP values. For any other `expl_type` the
# df_shap that is already loaded is left untouched.
if expl_type == 'recalc_gradient':
    # The model's own explain() with GradientShap; re-index to match `data`.
    df_shap = model.explain(data, method="GradientShap", baselines="b|100000")
    df_shap.index = data.index
elif expl_type == 'recalc_sampling':
    shap_feats = feats.index.to_list()

    # The explainer is given a plain array, so every feature column is
    # label-encoded first and each fitted encoder is kept for decoding.
    ds_data_shap = data.copy()
    ds_cat_encoders = {}
    for f in feats.index:
        encoder = LabelEncoder()
        ds_data_shap[f] = encoder.fit_transform(ds_data_shap[f])
        ds_cat_encoders[f] = encoder

    def predict_func(X):
        """Decode an encoded feature matrix and return corrected predictions.

        X holds one column per feature, in `feats.index` order, containing
        the label codes produced above. Each column is cast to int, mapped
        back through its encoder, passed to the model, and the model output
        is run through `corrector.predict`.
        """
        X_df = pd.DataFrame(data=X, columns=feats.index.to_list())
        for f in feats.index:
            codes = X_df[f].astype(int).values
            X_df[f] = ds_cat_encoders[f].inverse_transform(codes)
        raw_prediction = model.predict(X_df)[f'{feat_trgt}_prediction'].values
        return corrector.predict(raw_prediction)

    # The same encoded matrix is used as background data and as the samples
    # to explain.
    explainer = shap.SamplingExplainer(predict_func, ds_data_shap.loc[:, shap_feats].values)
    print(explainer.expected_value)
    shap_values = explainer.shap_values(ds_data_shap.loc[:, shap_feats].values)
    df_shap = pd.DataFrame(data=shap_values, index=data.index, columns=shap_feats)

# Feature importance = mean absolute SHAP value per feature, sorted from
# highest to lowest.
importance_col = 'mean(|SHAP|)'
feat_names = feats.index.to_list()
ds_fi = pd.DataFrame(index=feat_names, columns=[importance_col])
for f in feat_names:
    ds_fi.at[f, importance_col] = df_shap[f].abs().mean()
ds_fi = ds_fi.sort_values([importance_col], ascending=[False])
ds_fi['Features'] = ds_fi.index.values

# Bottom sub-figure: importance bars (left) and a wider SHAP strip panel
# (right) sharing the y axis, so both list the features in the same order.
axs_importance = subfigs[2].subplots(
    1,
    2,
    sharex=False,
    sharey=True,
    width_ratios=[4, 8],
    gridspec_kw={'wspace': 0.02, 'hspace': 0.02},
)
ax_bars = axs_importance[0]

barplot = sns.barplot(
    data=ds_fi,
    x=importance_col,
    y='Features',
    color=color,
    edgecolor='black',
    dodge=False,
    ax=ax_bars,
)
# Print each bar's value (two decimals) just past the end of the bar.
for container in barplot.containers:
    barplot.bar_label(container, label_type='edge', color='gray', fmt='%0.2f', fontsize=12, padding=4.0)
ax_bars.set_ylabel('')
ax_bars.set_yticklabels(ds_fi.index.to_list())

# SHAP strip panel: one horizontal strip per feature, in importance order.
# Each point is a sample's SHAP value, coloured by that sample's feature
# value on a per-feature min-max scale.
is_colorbar = False
f_legends = []  # not used anywhere in this cell

# Loop-invariant pieces, computed once.
f_cmap = sns.color_palette("Spectral_r", as_cmap=True)
ax_strip = axs_importance[1]
# Marker size shrinks with the square root of the number of Test rows.
strip_marker_size = 25 / np.sqrt(results.loc[results['Group'] == 'Test', :].shape[0])

for f in ds_fi.index:

    # Features whose largest |SHAP| exceeds 10 are clipped to their 1%-99%
    # quantile range; all others keep their full range.
    if df_shap[f].abs().max() > 10:
        f_shap_ll = df_shap[f].quantile(0.01)
        f_shap_hl = df_shap[f].quantile(0.99)
    else:
        f_shap_ll = df_shap[f].min()
        f_shap_hl = df_shap[f].max()

    f_index = df_shap.index[(df_shap[f] >= f_shap_ll) & (df_shap[f] <= f_shap_hl)].values
    f_shap = df_shap.loc[f_index, f].values
    f_vals = data.loc[f_index, f].values

    # Map this feature's own value range onto the colormap.
    f_val_min, f_val_max = min(f_vals), max(f_vals)
    f_norm = mcolors.Normalize(vmin=f_val_min, vmax=f_val_max)
    f_colors = {cval: f_cmap(f_norm(cval)) for cval in f_vals}

    strip = sns.stripplot(
        x=f_shap,
        y=[f]*len(f_shap),
        hue=f_vals,
        palette=f_colors,
        jitter=0.35,
        alpha=0.5,
        edgecolor='gray',
        linewidth=0.1,
        size=strip_marker_size,
        legend=False,
        ax=ax_strip,
    )

    # A single colour bar is added, built from the first feature's scale and
    # labelled only "Min"/"Max" because every feature has its own range.
    if not is_colorbar:
        sm = plt.cm.ScalarMappable(cmap=f_cmap, norm=f_norm)
        sm.set_array([])
        # `sm` is not attached to any axes, so the axes to take space from
        # must be named explicitly. Without `ax=` Matplotlib warns from 3.6
        # and raises ValueError from 3.8.
        cbar = strip.figure.colorbar(sm, ax=ax_strip)
        cbar.set_ticks([f_val_min, f_val_max])
        cbar.set_ticklabels(["Min", "Max"])
        is_colorbar = True

axs_importance[1].set_xlabel('SHAP')

# Save the SHAP values that were plotted next to the dataset files.
shap_xlsx = f"{path}/model_importance.xlsx"
df_shap.to_excel(shap_xlsx)

# Title the figure with the dataset name, export it as a 200-dpi PNG and as
# a PDF, then close it.
fig.suptitle(dataset, fontsize='large')
figure_png = f"{path}/model.png"
figure_pdf = f"{path}/model.pdf"
fig.savefig(figure_png, dpi=200, bbox_inches='tight')
fig.savefig(figure_pdf, bbox_inches='tight')
plt.close(fig)