# Debugging autoreload

In [None]:
%load_ext autoreload
%autoreload 2

# Load packages

In [None]:
from pytorch_tabular.utils import load_covertype_dataset
from rich.pretty import pprint
from sklearn.model_selection import BaseCrossValidator, ParameterGrid, ParameterSampler
import torch
import pickle
import shutil
from scipy import stats
import shap
from sklearn.model_selection import RepeatedStratifiedKFold
from glob import glob
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import itertools
from sklearn.model_selection import train_test_split
import numpy as np
from pytorch_tabular.utils import make_mixed_dataset, print_metrics
from pytorch_tabular import available_models
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig, GANDALFConfig, TabNetModelConfig, FTTransformerConfig, DANetConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig
from pytorch_tabular.tabular_model_tuner import TabularModelTuner
from sklearn.metrics import mean_absolute_error
from pytorch_tabular import MODEL_SWEEP_PRESETS
import pandas as pd
from pytorch_tabular import model_sweep
from src.pt.model_sweep import model_sweep_custom
import warnings
from src.utils.configs import read_parse_config
from src.pt.hyper_opt import train_hyper_opt
from src.utils.hash import dict_hash
import pathlib
from tqdm import tqdm
import distinctipy
import matplotlib.patheffects as pe
import matplotlib.colors as mcolors
from statannotations.Annotator import Annotator
from scipy.stats import mannwhitneyu
from plottable import ColumnDefinition, Table
from plottable.plots import bar
from plottable.cmap import normed_cmap, centered_cmap
import optuna
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.cm
import matplotlib as mpl
from statsmodels.stats.multitest import multipletests
import re
from itertools import chain
from pathlib import Path
import requests
from matplotlib.ticker import MaxNLocator
from regression_bias_corrector import LinearBiasCorrector
from scipy.stats import mannwhitneyu, variation, levene
from copy import deepcopy


def make_rgb_transparent(rgb, bg_rgb, alpha):
    """Blend a foreground colour over a background colour.

    Every channel is computed as ``alpha * fg + (1 - alpha) * bg``. Channels
    are paired positionally, so the result is as long as the shorter input.

    Returns:
        A list with the blended channel values.
    """
    blended = []
    for fg_channel, bg_channel in zip(rgb, bg_rgb):
        blended.append(alpha * fg_channel + (1 - alpha) * bg_channel)
    return blended

def form_bar(base):
    """Build a formatter that renders a fraction ``x`` as ``'<count>/<base>'``.

    The returned callable multiplies ``x`` by ``base``, rounds to the nearest
    integer and formats it together with ``base``.
    """
    def formatter(x):
        count = int(round(x * base))
        return f'{count}/{base}'
    return formatter


# Load data

In [None]:
# Root directory of the project data; every path below is built from it.
path = "E:/YandexDisk/Work/bbd/mriya"

## Load original file

In [None]:
# Download the subjects spreadsheet from a public Yandex.Disk link:
# first ask the public-resources API for a direct download URL (read from the
# 'href' field of its JSON answer), then fetch the file itself.
yadisk_file_url = "https://disk.yandex.ru/i/CEaw3cqI2Y7J0A"
response = requests.get(
    "https://cloud-api.yandex.net/v1/disk/public/resources/download",
    params={'public_key': yadisk_file_url},
    timeout=60,
)
# Fail loudly on an HTTP error instead of hitting a KeyError on 'href' below.
response.raise_for_status()
res = response.json()
download_url = res['href']
response = requests.get(download_url, timeout=60)
# Never write an error page to disk under the spreadsheet's name.
response.raise_for_status()
yadisk_subject_file_name = Path(f"{path}/Испытуемые Яндекс.xlsx")
with open(yadisk_subject_file_name, 'wb') as f:
    f.write(response.content)

## Load samples info

In [None]:
# Load the subject table and derive each subject's age (in years) at sampling time.
df_info = pd.read_excel(f"{path}/Испытуемые Яндекс.xlsx", sheet_name='Info', index_col=0)
df_info.rename(columns={'sex': 'Sex'}, inplace=True)
df_info.insert(3, 'Age', (df_info['sample_date'] - df_info['birthday']) / np.timedelta64(1, 'D') / 365.25)
# Rows for which no age could be computed are dropped.
df_info = df_info[df_info['Age'].notna()]

# Subject ids per (risk, sex) group; 'Низкий' / 'Высокий' are the values
# matched in the 'РИСК' column.
groups_ids = {
    'Low Risk M': df_info.index[(df_info['Sex'] == 'M') & (df_info['РИСК'] == 'Низкий')].values,
    'Low Risk F': df_info.index[(df_info['Sex'] == 'F') & (df_info['РИСК'] == 'Низкий')].values,
    'High Risk M': df_info.index[(df_info['Sex'] == 'M') & (df_info['РИСК'] == 'Высокий')].values,
    'High Risk F': df_info.index[(df_info['Sex'] == 'F') & (df_info['РИСК'] == 'Высокий')].values,
}

groups_colors = {
    'Low Risk M': 'crimson',
    'Low Risk F': 'dodgerblue',
    'High Risk M': 'lawngreen',
    'High Risk F': 'darkorchid',
}

n_rows = 2
n_cols = 2
fig_width = 10
fig_height = 8
# 22 bins of 5 years each, from 5 to 115.
hist_bins = np.linspace(5, 115, 23)

# The figures below are written into this directory, so make sure it exists
# before the first savefig call.
pathlib.Path(f"{path}/select_samples").mkdir(parents=True, exist_ok=True)

sns.set_theme(style='ticks')
fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={}, sharey=True, sharex=True)
for g_id, (g, g_ids) in enumerate(groups_ids.items()):
    # One panel per group, filled row by row.
    row_id, col_id = divmod(g_id, n_cols)

    histplot = sns.histplot(
        data=df_info.loc[g_ids, ],
        bins=hist_bins,
        edgecolor='k',
        linewidth=1,
        x="Age",
        color=groups_colors[g],
        ax=axs[row_id, col_id]
    )
    axs[row_id, col_id].set(xlim=(20, 90))
    # The title carries the group size.
    axs[row_id, col_id].set_title(f"{g} ({len(g_ids)})")
fig.tight_layout()
fig.savefig(f"{path}/select_samples/hist_age_basic.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path}/select_samples/hist_age_basic.pdf", bbox_inches='tight')
plt.close(fig)


## Load models

In [None]:
feat_trgt = 'Age'

# One row per component: (key, display name, last segment of the model
# directory, plot colour). The key doubles as the component's sub-directory
# name under `path`.
components_spec = [
    ('Anthropometry', 'Антропометрия', 127, 'dodgerblue'),
    ('Complete Blood Count', 'Общий анализ крови', 113, 'crimson'),
    ('Blood Biochemical', 'Биохимия крови', 125, 'cyan'),
    ('Electrocardiography', 'Электрокардиография', 70, 'gold'),
    ('Echocardiography', 'Эхокардиография', 108, 'olive'),
    ('Sphygmocardiography', 'Сфигмография', 112, 'chocolate'),
    ('Arterial Stiffness', 'Cосудистая жесткость', 86, 'lawngreen'),
    ('Best Correlation', 'Лучшие из разных групп', 45, 'gray'),
]
components = {
    comp_key: {
        'name': comp_name,
        'path': f"{path}/{comp_key}",
        'path_model': f"{path}/{comp_key}/models/DANet/1/{model_dir}",
        'color': comp_color,
    }
    for comp_key, comp_name, model_dir, comp_color in components_spec
}

feats_all = []
# Result columns copied from the model output table into the component data.
res_cols = ['Group', 'Prediction', 'Error', 'Prediction Unbiased', 'Error Unbiased']
for comp in components:
    comp_info = components[comp]
    dir_data = comp_info['path']
    dir_model = comp_info['path_model']

    # Input tables of the component and the stored outputs of its model.
    comp_info['data'] = pd.read_excel(f"{dir_data}/data.xlsx", index_col=0)
    comp_info['feats'] = pd.read_excel(f"{dir_data}/feats.xlsx", index_col=0)
    comp_info['results'] = pd.read_excel(f"{dir_model}/df.xlsx", index_col=0)
    comp_info['metrics'] = pd.read_excel(f"{dir_model}/metrics.xlsx", index_col=0)
    comp_info['shap'] = pd.read_excel(f"{dir_model}/explanation.xlsx", index_col=0)
    comp_info['model'] = TabularModel.load_model(f"{dir_model}")

    # The bias corrector is fitted on the 'Train' rows only: target vs. prediction.
    comp_info['corrector'] = LinearBiasCorrector()
    comp_results = comp_info['results']
    train_mask = comp_results['Group'] == 'Train'
    comp_info['corrector'].fit(
        comp_results.loc[train_mask, feat_trgt].values,
        comp_results.loc[train_mask, 'Prediction'].values,
    )

    # Attach the result columns to the data table, aligned on its own index.
    data_index = comp_info['data'].index
    comp_info['data'].loc[data_index, res_cols] = comp_results.loc[data_index, res_cols]

In [None]:
samples_special = ['I1 (4)', 'I14 (2)', 'I117 (3)']

# Each panel column of the spreadsheet lists the sample ids chosen for that
# panel; empty cells are discarded.
df_samples_to_select = pd.read_excel(f"{path}/select_samples/samples_to_select.xlsx")
samples_to_select = {
    panel: df_samples_to_select[panel].dropna().values
    for panel in ('ИФА', 'Эпи')
}


In [None]:
# Number of sample ids that appear in both panels.
len(set(samples_to_select['ИФА']) & set(samples_to_select['Эпи']))


In [None]:
# Number of distinct sample ids listed for the 'Эпи' panel.
len(set(samples_to_select['Эпи']))

In [None]:

# For each panel: plot age histograms of the selected groups, export the
# selected subjects, and compare per-component age acceleration between the
# risk groups (statistics table + one figure per panel).
for group in ['ИФА', 'Эпи']:
    # Create the output directory first: figures and tables are written into
    # it from the very beginning of the loop body.
    pathlib.Path(f"{path}/select_samples").mkdir(parents=True, exist_ok=True)

    # Low-risk groups are taken whole. The 'High Risk' slots hold the subjects
    # listed for this panel, split by sex (no filter on 'РИСК' is applied here).
    in_panel = df_info.index.isin(samples_to_select[group])
    groups_selected = {
        'Low Risk M': deepcopy(groups_ids['Low Risk M']),
        'Low Risk F': deepcopy(groups_ids['Low Risk F']),
        'High Risk M': df_info.index[in_panel & (df_info['Sex'] == 'M')].values,
        'High Risk F': df_info.index[in_panel & (df_info['Sex'] == 'F')].values,
    }

    n_rows = 2
    n_cols = 2
    fig_width = 10
    fig_height = 8
    hist_bins = np.linspace(5, 115, 23)

    # --- Age histograms, one panel per group ---
    sns.set_theme(style='ticks')
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={}, sharey=True, sharex=True)
    for g_id, (g, g_ids) in enumerate(groups_selected.items()):
        row_id, col_id = divmod(g_id, n_cols)

        histplot = sns.histplot(
            data=df_info.loc[g_ids, ],
            bins=hist_bins,
            edgecolor='k',
            linewidth=1,
            x="Age",
            color=groups_colors[g],
            ax=axs[row_id, col_id]
        )
        axs[row_id, col_id].set(xlim=(20, 90))
        axs[row_id, col_id].set_title(f"{g} ({len(g_ids)})")
        # Counts are integers, so only integer ticks are shown.
        axs[row_id, col_id].yaxis.set_major_locator(MaxNLocator(integer=True))
    fig.tight_layout()
    fig.savefig(f"{path}/select_samples/hist_age_{group}.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{path}/select_samples/hist_age_{group}.pdf", bbox_inches='tight')
    plt.close(fig)

    # --- Export the selected subjects ---
    samples_selected = list(chain.from_iterable(groups_selected.values()))
    data_selected = df_info.loc[samples_selected, :]
    data_selected.to_excel(f"{path}/select_samples/{group}.xlsx")

    # Explicit copy: the per-component columns are added to this frame only.
    df_samples = df_info.loc[samples_selected, :].copy()

    # --- Attach each component's unbiased prediction and error ---
    for comp in components:
        data_comp = components[comp]['data']
        samples_cmn = data_comp.index.intersection(df_samples.index).values
        print(f"{components[comp]['name']}: {len(samples_cmn)}")
        df_samples.loc[samples_cmn, f"{components[comp]['name']}"] = data_comp.loc[samples_cmn, 'Prediction Unbiased']
        df_samples.loc[samples_cmn, f"{components[comp]['name']} акселерация"] = data_comp.loc[samples_cmn, 'Error Unbiased']

    # --- Risk-group comparison of the acceleration, per component ---
    ages = [components[comp]['name'] for comp in components]
    df_ages_stat = pd.DataFrame(index=ages)
    for comp in components:
        name_comp = components[comp]['name']
        # Only rows with all four values present take part in the tests.
        df_comp = df_samples.loc[:, ["Age", f"{name_comp}", f"{name_comp} акселерация", 'РИСК']].dropna(axis=0, how='any')
        vals = {
            'Низкий': df_comp.loc[df_comp['РИСК'] == 'Низкий', f"{name_comp} акселерация"].values,
            'Высокий': df_comp.loc[df_comp['РИСК'] == 'Высокий', f"{name_comp} акселерация"].values
        }
        _, df_ages_stat.at[name_comp, "mannwhitneyu_pval"] = mannwhitneyu(vals['Низкий'], vals['Высокий'], alternative='two-sided')
        _, df_ages_stat.at[name_comp, "levene_pval"] = levene(vals['Низкий'], vals['Высокий'])
    # Benjamini-Hochberg correction across the components.
    _, df_ages_stat.loc[ages, "mannwhitneyu_pval_fdr_bh"], _, _ = multipletests(df_ages_stat.loc[ages, "mannwhitneyu_pval"].values, 0.05, method='fdr_bh')
    _, df_ages_stat.loc[ages, "levene_pval_fdr_bh"], _, _ = multipletests(df_ages_stat.loc[ages, "levene_pval"].values, 0.05, method='fdr_bh')
    df_ages_stat.sort_values([f"mannwhitneyu_pval"], ascending=[True], inplace=True)

    colors_risk = {
        'Низкий': 'dodgerblue',
        'Высокий': 'crimson'
    }

    # --- Figure: a 2x4 grid of sub-figures, one per component ---
    sns.set_theme(style='ticks')
    fig = plt.figure(
        figsize=(24, 10),
        layout="constrained"
    )
    subfigs = fig.subfigures(
        nrows=2,
        ncols=4,
        wspace=0.1,
        hspace=0.1,
    )
    for epiage_id, comp in enumerate(components):
        epiage = components[comp]['name']
        df_comp = df_samples.loc[:, ["Age", f"{epiage}", f"{epiage} акселерация", 'РИСК']].dropna(axis=0, how='any')
        row_id, col_id = divmod(epiage_id, 4)

        # Mosaic: '11' metrics table, '12' unused, '21' scatter, '22' violins.
        axs = subfigs[row_id, col_id].subplot_mosaic(
            [
                ['11', '12'],
                ['21', '22'],
            ],
            height_ratios=[1, 5],
            width_ratios=[3, 1.5],
            gridspec_kw={
                "bottom": 0.14,
                "top": 0.9,
                "hspace": 0.001,
            },
        )

        # Metrics of the component's estimate against chronological age.
        ds_table = pd.DataFrame(index=['MAE', fr"Pearson $\rho$", "Bias"], columns=[epiage])
        mae = mean_absolute_error(df_comp['Age'].values, df_comp[epiage].values)
        rho, _ = stats.pearsonr(df_comp['Age'].values, df_comp[epiage].values)
        bias = np.mean(df_comp[epiage] - df_comp['Age'])
        ds_table.at['MAE', epiage] = f"{mae:0.2f}"
        ds_table.at[fr"Pearson $\rho$", epiage] = f"{rho:0.2f}"
        ds_table.at["Bias", epiage] = f"{bias:0.2f}"
        col_defs = [
            ColumnDefinition(
                name="index",
                title=epiage,
                textprops={"ha": "left"},
                width=4.5,
            ),
            ColumnDefinition(
                name=epiage,
                title='',
                textprops={"ha": "center"},
                width=2.0,
            ),
        ]
        table = Table(
            ds_table,
            column_definitions=col_defs,
            row_dividers=True,
            footer_divider=False,
            ax=axs['11'],
            textprops={"fontsize": 7},
            row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
            col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
            column_border_kw={"linewidth": 1, "linestyle": "-"},
        ).autoset_fontcolors(colnames=[epiage])

        axs['12'].axis('off')

        # Common range for both axes, padded by 10% on each side.
        xy_min = df_comp[['Age', epiage]].min().min()
        xy_max = df_comp[['Age', epiage]].max().max()
        xy_ptp = xy_max - xy_min
        # Dashed identity line (estimate == chronological age).
        bisect = sns.lineplot(
            x=[xy_min - 0.1 * xy_ptp, xy_max + 0.1 * xy_ptp],
            y=[xy_min - 0.1 * xy_ptp, xy_max + 0.1 * xy_ptp],
            linestyle='--',
            color='black',
            linewidth=1.0,
            ax=axs['21']
        )
        regplot = sns.regplot(
            data=df_comp,
            x='Age',
            y=epiage,
            color='black',
            scatter=False,
            truncate=False,
            ax=axs['21']
        )
        scatter = sns.scatterplot(
            data=df_comp,
            x='Age',
            y=epiage,
            hue='РИСК',
            palette=colors_risk,
            linewidth=0.5,
            alpha=0.75,
            edgecolor="k",
            s=20,
            hue_order=list(colors_risk.keys()),
            legend=True,
            ax=axs['21'],
        )
        axs['21'].set_ylabel(f"{epiage} возраст")
        axs['21'].set_xlabel(f"Возраст")
        axs['21'].set_xlim(xy_min - 0.1 * xy_ptp, xy_max + 0.1 * xy_ptp)
        axs['21'].set_ylim(xy_min - 0.1 * xy_ptp, xy_max + 0.1 * xy_ptp)

        sns.violinplot(
            data=df_comp,
            x='РИСК',
            y=f"{epiage} акселерация",
            hue='РИСК',
            palette=colors_risk,
            density_norm='width',
            order=list(colors_risk.keys()),
            saturation=0.75,
            linewidth=1.0,
            ax=axs['22'],
            legend=False,
            cut=0,
        )
        axs['22'].set_ylabel(f"{epiage} акселерация")
        # The title shows the uncorrected p-values of both tests.
        mannwhitneyu_pval = df_ages_stat.at[epiage, "mannwhitneyu_pval"]
        levene_pval = df_ages_stat.at[epiage, "levene_pval"]
        axs['22'].set_title(f'Mann-Whitney: {mannwhitneyu_pval:.2e}\nLevene: {levene_pval:.2e}')

    fig.savefig(f"{path}/select_samples/ages_{group}.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{path}/select_samples/ages_{group}.pdf", bbox_inches='tight')
    plt.close(fig)


## Select random samples

In [None]:
from copy import deepcopy

# Samples that are always part of the high-risk groups.
special_m = ['I1 (4)', 'I14 (2)']
special_f = ['I117 (3)']

samples_type = 'эпи'

# How many additional samples must be drawn at random per high-risk group.
needed_samples = {
    'High Risk M': 16 - len(special_m),
    'High Risk F': 17 - len(special_f),
}

# 22 age bins of 5 years each (5..115) and one sampling weight per bin.
age_bin_edges = np.linspace(5, 115, 23)
age_prob = np.asarray([1/22] * 22)
# Alternative, non-uniform weighting:
# age_prob = np.asarray([3.0]*6 + [1.0]*4 +  [3.0]*12)
age_prob /= np.sum(age_prob)
bin_diff = 5  # bin width in years (spacing of age_bin_edges)
special_all = set(special_m + special_f)

# Figures and tables below are written into this directory.
pathlib.Path(f"{path}/select_samples").mkdir(parents=True, exist_ok=True)

# One candidate selection per seed.
for seed in range(1, 31):

    groups_selected = {
        'Low Risk M': deepcopy(groups_ids['Low Risk M']),
        'Low Risk F': deepcopy(groups_ids['Low Risk F']),
        'High Risk M': deepcopy(special_m),
        'High Risk F': deepcopy(special_f),
    }

    for g in ['High Risk M', 'High Risk F']:
        # Keep the candidates in their original order (duplicates removed).
        # Going through a set here would make the row order, and with it the
        # seeded draw below, depend on string hashing, which can differ between
        # interpreter runs.
        cand_ids = [s for s in dict.fromkeys(groups_ids[g]) if s not in special_all]
        data_cands = df_info.loc[cand_ids, :].copy()
        print(f"data_cands: {data_cands.shape[0]}")
        # Bin index of every candidate's age: digitize against the interior
        # edges yields 0..21, so ages outside 5..115 fall into the outer bins
        # instead of indexing past the end of age_prob.
        age_bin_ids = np.digitize(data_cands['Age'].values, age_bin_edges[1:-1])
        data_cands['Prob Age'] = age_prob[age_bin_ids]

        n_same_age = needed_samples[g]
        print(n_same_age)
        index_selected = data_cands.sample(n=n_same_age, replace=False, weights='Prob Age', random_state=seed).index
        if index_selected.is_unique:
            ids_selected = index_selected.to_list()
            print(ids_selected)
            groups_selected[g] += ids_selected
            print(len(groups_selected[g]))
        else:
            # The group keeps only its special samples in this case.
            print("Not unique index")

    n_rows = 2
    n_cols = 2
    fig_width = 10
    fig_height = 8
    hist_bins = np.linspace(5, 115, 23)

    # Age histograms of the four groups for this seed.
    sns.set_theme(style='ticks')
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={}, sharey=True, sharex=True)
    for g_id, (g, g_ids) in enumerate(groups_selected.items()):
        row_id, col_id = divmod(g_id, n_cols)

        histplot = sns.histplot(
            data=df_info.loc[g_ids, ],
            bins=hist_bins,
            edgecolor='k',
            linewidth=1,
            x="Age",
            color=groups_colors[g],
            ax=axs[row_id, col_id]
        )
        axs[row_id, col_id].set(xlim=(20, 90))
        axs[row_id, col_id].set_title(f"{g} ({len(g_ids)})")
        axs[row_id, col_id].yaxis.set_major_locator(MaxNLocator(integer=True))
    fig.tight_layout()
    fig.savefig(f"{path}/select_samples/hist_age_{samples_type}_{seed}.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{path}/select_samples/hist_age_{samples_type}_{seed}.pdf", bbox_inches='tight')
    plt.close(fig)

    # Export all selected subjects for this seed.
    samples_selected = list(chain.from_iterable(groups_selected.values()))
    data_selected = df_info.loc[samples_selected, :]
    data_selected.to_excel(f"{path}/select_samples/{samples_type}_{seed}.xlsx")

# Models inference

In [None]:
path = "E:/YandexDisk/Work/bbd/mriya"

feat_trgt = 'Age'

# One row per component: (key, display name, last segment of the model
# directory, plot colour). The key doubles as the component's sub-directory
# name under `path`.
components_spec = [
    ('Anthropometry', 'Антропометрия', 127, 'dodgerblue'),
    ('Complete Blood Count', 'Общий анализ крови', 113, 'crimson'),
    ('Blood Biochemical', 'Биохимия крови', 125, 'cyan'),
    ('Electrocardiography', 'Электрокардиография', 70, 'gold'),
    ('Echocardiography', 'Эхокардиография', 108, 'olive'),
    ('Sphygmocardiography', 'Сфигмография', 112, 'chocolate'),
    ('Arterial Stiffness', 'Cосудистая жесткость', 86, 'lawngreen'),
    ('Best Correlation', 'Лучшие из разных групп', 45, 'gray'),
]
components = {
    comp_key: {
        'name': comp_name,
        'path': f"{path}/{comp_key}",
        'path_model': f"{path}/{comp_key}/models/DANet/1/{model_dir}",
        'color': comp_color,
    }
    for comp_key, comp_name, model_dir, comp_color in components_spec
}

feats_all = []
# Result columns copied from the model output table into the component data.
res_cols = ['Group', 'Prediction', 'Error', 'Prediction Unbiased', 'Error Unbiased']
for comp in components:
    comp_info = components[comp]
    dir_data = comp_info['path']
    dir_model = comp_info['path_model']

    # Input tables of the component and the stored outputs of its model.
    comp_info['data'] = pd.read_excel(f"{dir_data}/data.xlsx", index_col=0)
    comp_info['feats'] = pd.read_excel(f"{dir_data}/feats.xlsx", index_col=0)
    comp_info['results'] = pd.read_excel(f"{dir_model}/df.xlsx", index_col=0)
    comp_info['metrics'] = pd.read_excel(f"{dir_model}/metrics.xlsx", index_col=0)
    comp_info['shap'] = pd.read_excel(f"{dir_model}/explanation.xlsx", index_col=0)
    comp_info['model'] = TabularModel.load_model(f"{dir_model}")

    # The bias corrector is fitted on the 'Train' rows only: target vs. prediction.
    comp_info['corrector'] = LinearBiasCorrector()
    comp_results = comp_info['results']
    train_mask = comp_results['Group'] == 'Train'
    comp_info['corrector'].fit(
        comp_results.loc[train_mask, feat_trgt].values,
        comp_results.loc[train_mask, 'Prediction'].values,
    )

    # Attach the result columns to the data table, aligned on its own index.
    data_index = comp_info['data'].index
    comp_info['data'].loc[data_index, res_cols] = comp_results.loc[data_index, res_cols]

In [None]:
# Reload the subject table and derive each subject's age (in years) at sampling time.
df_info = pd.read_excel(f"{path}/Испытуемые Яндекс.xlsx", sheet_name='Info', index_col=0)
df_info.rename(columns={'sex': 'Sex'}, inplace=True)
df_info.insert(3, 'Age', (df_info['sample_date'] - df_info['birthday']) / np.timedelta64(1, 'D') / 365.25)
# Explicit copy: new columns are assigned into the filtered frame below.
df_info = df_info[df_info['Age'].notna()].copy()

# Add, per component, the unbiased prediction and its error ("акселерация")
# for the subjects that the component's data table shares with df_info.
for comp in components:
    data_comp = components[comp]['data']
    samples_cmn = data_comp.index.intersection(df_info.index).values
    print(f"{components[comp]['name']}: {len(samples_cmn)}")
    df_info.loc[samples_cmn, f"{components[comp]['name']}"] = data_comp.loc[samples_cmn, 'Prediction Unbiased']
    df_info.loc[samples_cmn, f"{components[comp]['name']} акселерация"] = data_comp.loc[samples_cmn, 'Error Unbiased']

# Make sure the output directory exists before writing into it.
pathlib.Path(f"{path}/select_samples").mkdir(parents=True, exist_ok=True)
df_info.to_excel(f"{path}/select_samples/info.xlsx")

In [None]:
# Compare the age acceleration of the two risk groups for every component and
# store raw and Benjamini-Hochberg corrected p-values in one table.
ages = [comp_info['name'] for comp_info in components.values()]
df_ages_stat = pd.DataFrame(index=ages)
pathlib.Path(f"{path}/select_samples").mkdir(parents=True, exist_ok=True)
for comp in components:
    name_comp = components[comp]['name']
    col_accel = f"{name_comp} акселерация"
    # Only rows with all four values present take part in the tests.
    df_comp = df_info.loc[:, ["Age", f"{name_comp}", col_accel, 'РИСК']].dropna(axis=0, how='any')
    vals = {
        risk: df_comp.loc[df_comp['РИСК'] == risk, col_accel].values
        for risk in ('Низкий', 'Высокий')
    }
    _, pval_mwu = mannwhitneyu(vals['Низкий'], vals['Высокий'], alternative='two-sided')
    df_ages_stat.at[name_comp, "mannwhitneyu_pval"] = pval_mwu
    _, pval_levene = levene(vals['Низкий'], vals['Высокий'])
    df_ages_stat.at[name_comp, "levene_pval"] = pval_levene

# Multiple-testing correction across the components, one test at a time.
for test_name in ("mannwhitneyu", "levene"):
    pvals_raw = df_ages_stat.loc[ages, f"{test_name}_pval"].values
    _, pvals_adj, _, _ = multipletests(pvals_raw, 0.05, method='fdr_bh')
    df_ages_stat.loc[ages, f"{test_name}_pval_fdr_bh"] = pvals_adj

df_ages_stat.sort_values(["mannwhitneyu_pval"], ascending=[True], inplace=True)
df_ages_stat.to_excel(f"{path}/select_samples/ages.xlsx")

In [None]:
# Figure with one sub-figure per component: metrics table, estimate-vs-age
# scatter with regression line, and acceleration violins per risk group.
colors_risk = {
    'Низкий': 'dodgerblue',
    'Высокий': 'crimson'
}
risk_order = list(colors_risk.keys())
metric_rho_label = fr"Pearson $\rho$"

sns.set_theme(style='ticks')
fig = plt.figure(figsize=(24, 10), layout="constrained")
subfigs = fig.subfigures(nrows=2, ncols=4, wspace=0.1, hspace=0.1)
for epiage_id, comp in enumerate(components):
    epiage = components[comp]['name']
    col_accel = f"{epiage} акселерация"
    df_comp = df_info.loc[:, ["Age", f"{epiage}", col_accel, 'РИСК']]
    df_comp.dropna(axis=0, how='any', inplace=True)
    row_id, col_id = divmod(epiage_id, 4)

    # Mosaic: '11' metrics table, '12' unused, '21' scatter, '22' violins.
    axs = subfigs[row_id, col_id].subplot_mosaic(
        [['11', '12'], ['21', '22']],
        height_ratios=[1, 5],
        width_ratios=[3, 1.5],
        gridspec_kw={"bottom": 0.14, "top": 0.9, "hspace": 0.001},
    )
    ax_table = axs['11']
    ax_scatter = axs['21']
    ax_violin = axs['22']

    # Metrics of the component's estimate against chronological age.
    mae = mean_absolute_error(df_comp['Age'].values, df_comp[epiage].values)
    rho, _ = stats.pearsonr(df_comp['Age'].values, df_comp[epiage].values)
    bias = np.mean(df_comp[epiage] - df_comp['Age'])
    metric_values = {'MAE': mae, metric_rho_label: rho, "Bias": bias}
    ds_table = pd.DataFrame(index=list(metric_values), columns=[epiage])
    for metric_label, metric_value in metric_values.items():
        ds_table.at[metric_label, epiage] = f"{metric_value:0.2f}"

    col_defs = [
        ColumnDefinition(name="index", title=epiage, textprops={"ha": "left"}, width=4.5),
        ColumnDefinition(name=epiage, title='', textprops={"ha": "center"}, width=2.0),
    ]
    table = Table(
        ds_table,
        column_definitions=col_defs,
        row_dividers=True,
        footer_divider=False,
        ax=ax_table,
        textprops={"fontsize": 7},
        row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
        col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
        column_border_kw={"linewidth": 1, "linestyle": "-"},
    ).autoset_fontcolors(colnames=[epiage])

    axs['12'].axis('off')

    # Common range for both axes, padded by 10% on each side.
    xy_min = df_comp[['Age', epiage]].min().min()
    xy_max = df_comp[['Age', epiage]].max().max()
    xy_ptp = xy_max - xy_min
    lim_low = xy_min - 0.1 * xy_ptp
    lim_high = xy_max + 0.1 * xy_ptp

    # Drawing order is kept: identity line, regression line, then the points.
    bisect = sns.lineplot(
        x=[lim_low, lim_high],
        y=[lim_low, lim_high],
        linestyle='--',
        color='black',
        linewidth=1.0,
        ax=ax_scatter,
    )
    regplot = sns.regplot(
        data=df_comp,
        x='Age',
        y=epiage,
        color='black',
        scatter=False,
        truncate=False,
        ax=ax_scatter,
    )
    scatter = sns.scatterplot(
        data=df_comp,
        x='Age',
        y=epiage,
        hue='РИСК',
        palette=colors_risk,
        linewidth=0.5,
        alpha=0.75,
        edgecolor="k",
        s=20,
        hue_order=risk_order,
        legend=True,
        ax=ax_scatter,
    )
    ax_scatter.set_ylabel(f"{epiage} возраст")
    ax_scatter.set_xlabel("Возраст")
    ax_scatter.set_xlim(lim_low, lim_high)
    ax_scatter.set_ylim(lim_low, lim_high)

    sns.violinplot(
        data=df_comp,
        x='РИСК',
        y=col_accel,
        hue='РИСК',
        palette=colors_risk,
        density_norm='width',
        order=risk_order,
        saturation=0.75,
        linewidth=1.0,
        ax=ax_violin,
        legend=False,
        cut=0,
    )
    ax_violin.set_ylabel(col_accel)
    # The title shows the uncorrected p-values of both tests.
    mannwhitneyu_pval = df_ages_stat.at[epiage, "mannwhitneyu_pval"]
    levene_pval = df_ages_stat.at[epiage, "levene_pval"]
    ax_violin.set_title(f'Mann-Whitney: {mannwhitneyu_pval:.2e}\nLevene: {levene_pval:.2e}')

for fig_ext, fig_kwargs in (("png", {"dpi": 200}), ("pdf", {})):
    fig.savefig(f"{path}/select_samples/ages.{fig_ext}", bbox_inches='tight', **fig_kwargs)
plt.close(fig)

# Create data subsets for web

In [None]:
path = "E:/YandexDisk/Work/bbd/mriya"

# Subject table with string ids and the age (in years) at sampling time.
df_info = pd.read_excel(f"{path}/Испытуемые Яндекс.xlsx", sheet_name='Info', index_col=0)
df_info.index = df_info.index.astype(str)
df_info.rename(columns={'sex': 'Sex'}, inplace=True)
age_days = (df_info['sample_date'] - df_info['birthday']) / np.timedelta64(1, 'D')
df_info.insert(3, 'Age', age_days / 365.25)
df_info = df_info[df_info['Age'].notna()]

df_xmpl = pd.read_excel(f"{path}/data_bioage.xlsx", index_col=0)
df_xmpl.index = df_xmpl.index.astype(str)

# Keep only the rows of df_xmpl that belong to low-risk subjects.
low_risk_ids = df_info.index[df_info['РИСК'] == 'Низкий']
df_lowrisk = df_xmpl.loc[df_xmpl.index.intersection(low_risk_ids), :]
df_lowrisk.to_excel(f"{path}/data_lowrisk.xlsx")

# Check selected samples from "Эпи" and "ИФА"

In [None]:
path = "E:/YandexDisk/Work/bbd/mriya"

df_epi = pd.read_excel(f"{path}/select_samples/Эпи.xlsx", index_col=0)
df_ifa = pd.read_excel(f"{path}/select_samples/ИФА.xlsx", index_col=0)

# Sizes of both selections, then the number of ids they share.
n_shared = len(df_epi.index.intersection(df_ifa.index))
for n_samples in (len(df_epi), len(df_ifa), n_shared):
    print(n_samples)