# Debugging autoreload

In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell runs, so edits to imported project code are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

# Load packages

In [1]:
from pytorch_tabular.utils import load_covertype_dataset
from rich.pretty import pprint
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.model_selection import BaseCrossValidator, ParameterGrid, ParameterSampler
import torch
import pickle
import shutil
from scipy import stats
import shap
from sklearn.model_selection import RepeatedStratifiedKFold
from glob import glob
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import itertools
from sklearn.model_selection import train_test_split
import numpy as np
from pytorch_tabular.utils import make_mixed_dataset, print_metrics
from pytorch_tabular import available_models
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig, GANDALFConfig, TabNetModelConfig, FTTransformerConfig, DANetConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig
from pytorch_tabular.tabular_model_tuner import TabularModelTuner
from torchmetrics.functional.regression import mean_absolute_error, pearson_corrcoef
from pytorch_tabular import MODEL_SWEEP_PRESETS
import pandas as pd
from pytorch_tabular import model_sweep
from src.pt.model_sweep import model_sweep_custom
import warnings
from src.utils.configs import read_parse_config
from src.pt.hyper_opt import train_hyper_opt
from src.utils.hash import dict_hash
import pathlib
from tqdm import tqdm
import distinctipy
import matplotlib.patheffects as pe
import matplotlib.colors as mcolors
from statannotations.Annotator import Annotator
from scipy.stats import mannwhitneyu
from plottable import ColumnDefinition, Table
from plottable.plots import bar
from plottable.cmap import normed_cmap, centered_cmap
import optuna
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.cm
import matplotlib as mpl
from statsmodels.stats.multitest import multipletests
import re
import pathlib
import matplotlib.lines as mlines
import mrmr

from sklearn.decomposition import PCA
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.manifold import MDS, Isomap, TSNE

import missingno as msno

def make_rgb_transparent(rgb, bg_rgb, alpha):
    """Blend a colour over a background as if it were drawn with opacity ``alpha``.

    Each output channel is ``alpha * foreground + (1 - alpha) * background``.
    Channels are paired with ``zip``, so the result has as many entries as
    the shorter of ``rgb`` and ``bg_rgb``.
    """
    blended = []
    for fg_channel, bg_channel in zip(rgb, bg_rgb):
        blended.append(alpha * fg_channel + (1 - alpha) * bg_channel)
    return blended

def form_bar(base):
    """Return a formatter that renders a fraction ``x`` as the string ``"k/base"``.

    ``k`` is ``x * base`` rounded to the nearest integer with the built-in
    ``round``.
    """
    def formatter(x):
        numerator = int(round(x * base))
        return f'{numerator}/{base}'
    return formatter


# Load data

In [2]:
# Root folder holding the raw exports and every derived table/figure.
path = "E:/YandexDisk/Work/bbd/atlas"

## Выгрузка биовозраст 2.2

### Initial preprocessing

In [None]:
# Raw v2.2 export; rows are indexed by the "№ карты" (card number) column.
data_raw = pd.read_excel(f"{path}/Выгрузка биовозраст 2.2.xlsx", sheet_name='Итоговая выгрузка', index_col="№ карты")

# Age: the first run of digits is read as whole years and the optional second
# run as months; the stored value is years + months / 12.
age_parts = data_raw['Возраст'].astype(str).str.extract(r"^(\d+)\D*(\d*)\D*$")
age_parts.columns = ['Years', 'Month']
age_parts['Month'] = age_parts['Month'].replace({'': 0})
# `age_parts` is row-for-row parallel to `data_raw`, so assign by position.
# A label-based `.loc[data_raw.index]` round trip breaks as soon as a card
# number is repeated in the index.
data_raw['Возраст'] = (pd.to_numeric(age_parts['Years']) + pd.to_numeric(age_parts['Month']) / 12.0).to_numpy()

# Blood pressure: split "systolic/diastolic" into two numeric columns.
# Values that do not match the pattern become NaN.
blood_pressure = data_raw['АД'].astype(str).str.extract(r"^(\d+)/(\d+)$")
blood_pressure.columns = ['Cистолическое АД', 'Диастолическое АД']
for bp_col in blood_pressure.columns:
    data_raw[bp_col] = pd.to_numeric(blood_pressure[bp_col]).to_numpy()

# Free PSA percentage: drop the redundant unit suffix.
data_raw.rename(columns={'% свободного ПСА, %': '% свободного ПСА'}, inplace=True)

# Column-name clean-up, applied in this order:
#   1. repeated percent signs, 2. doubled commas,
#   3-4. double spaces (two passes, so runs of up to four spaces collapse).
for old, new in (
    (" %, %", ", %"),
    (",,", ","),
    ("  ", " "),
    ("  ", " "),
):
    data_raw.columns = data_raw.columns.str.replace(old, new, regex=True)

# Trailing ", " at the end of a column name.
data_raw.columns = [x[:-2] if x.endswith(', ') else x for x in data_raw.columns]

# Space before a comma.
data_raw.columns = data_raw.columns.str.replace(" ,", ",", regex=True)

# Replace "no value" placeholders with NaN: exact-match cells first, then
# whole-cell regex matches (textual remarks and censored "<x" / ">x" results).
for placeholder in ('.', '-'):
    data_raw.replace({placeholder: np.nan}, inplace=True)
for pattern in (
    r"^.*Необходимо повторить исследование.*$",
    r"^.*расчет невозможен.*$",
    r"^.*нет результата.*$",
    r"^.*без особенностей.*$",
    r'^([<>].*)$',
):
    data_raw.replace({pattern: np.nan}, regex=True, inplace=True)
data_raw['Тироксин свободный (Т4 свободный), пмоль/л'] = pd.to_numeric(data_raw['Тироксин свободный (Т4 свободный), пмоль/л'], errors='coerce')

# Report column names that are no longer unique after the clean-up.
cols_dupl = data_raw.columns.values[data_raw.columns.duplicated()]
if len(cols_dupl) > 0:
    print(cols_dupl)

### Save available features for manual selection

In [None]:
# One row per cleaned column name. 'Selected' starts at 1 for every feature;
# the workbook is saved so the selection can be edited by hand.
df_feats = pd.DataFrame({'Selected': 1}, index=data_raw.columns.values)
df_feats.to_excel(f"{path}/feats.xlsx")

In [None]:
# List the columns whose dtype is reported as object/category, then display them.
# NOTE(review): `isin` compares dtype objects against these string names —
# confirm it selects the same columns as `select_dtypes` below.
col_dtypes = data_raw.dtypes
is_text_like = col_dtypes.isin(['object','category'])
print(col_dtypes[is_text_like].index)
data_raw.select_dtypes(include=['object', 'category'])

### Old data: load processed features and apply filters

In [None]:
# Columns whose 'Source' is 'Inbody' use a decimal comma: swap it for a dot and
# parse as numbers; anything still unparsable becomes NaN.
df_feats = pd.read_excel(f"{path}/feats.xlsx", index_col=0)
inbody_cols = df_feats.index[df_feats['Source'] == 'Inbody'].values
for col in inbody_cols:
    as_text = data_raw[col].astype(str)
    data_raw[col] = pd.to_numeric(as_text.str.replace(',', '.'), errors='coerce')

### Filter features and save preprocessed data

In [None]:
# Keep only the features whose 'Selected' flag equals 1 in feats.xlsx and save the result.
df_feats = pd.read_excel(f"{path}/feats.xlsx", index_col=0)
selected_cols = df_feats.index[df_feats['Selected'] == 1].to_list()
data = data_raw.loc[:, selected_cols]
data.to_excel(f"{path}/data.xlsx")

## Выгрузка биовозраст 3.0_20241126

### Initial preprocessing

In [None]:
# Raw v3.0 export; rows are indexed by the "№ карты" (card number) column.
data_raw = pd.read_excel(f"{path}/Выгрузка биовозраст 3.0_20241126.xlsx", sheet_name='Выгрузка', index_col="№ карты")

# Age: the first run of digits is read as whole years and the optional second
# run as months; the stored value is years + months / 12.
age_parts = data_raw['Возраст'].astype(str).str.extract(r"^(\d+)\D*(\d*)\D*$")
age_parts.columns = ['Years', 'Month']
age_parts['Month'] = age_parts['Month'].replace({'': 0})
# `age_parts` is row-for-row parallel to `data_raw`, so assign by position.
# A label-based `.loc[data_raw.index]` round trip breaks as soon as a card
# number is repeated in the index.
data_raw['Возраст'] = (pd.to_numeric(age_parts['Years']) + pd.to_numeric(age_parts['Month']) / 12.0).to_numpy()

# Blood pressure: split "systolic/diastolic" into two numeric columns.
# Values that do not match the pattern become NaN.
blood_pressure = data_raw['АД'].astype(str).str.extract(r"^(\d+)/(\d+)$")
blood_pressure.columns = ['Cистолическое АД', 'Диастолическое АД']
for bp_col in blood_pressure.columns:
    data_raw[bp_col] = pd.to_numeric(blood_pressure[bp_col]).to_numpy()

# Free PSA percentage: drop the redundant unit suffix.
data_raw.rename(columns={'% свободного ПСА, %': '% свободного ПСА'}, inplace=True)

# Column-name clean-up, applied in this order:
#   1. repeated percent signs, 2. doubled commas,
#   3-4. double spaces (two passes, so runs of up to four spaces collapse).
for old, new in (
    (" %, %", ", %"),
    (",,", ","),
    ("  ", " "),
    ("  ", " "),
):
    data_raw.columns = data_raw.columns.str.replace(old, new, regex=True)

# Trailing ", " at the end of a column name.
data_raw.columns = [x[:-2] if x.endswith(', ') else x for x in data_raw.columns]

# Space before a comma.
data_raw.columns = data_raw.columns.str.replace(" ,", ",", regex=True)

# Replace "no value" placeholders with NaN: exact-match cells first, then
# whole-cell regex matches (textual remarks and censored "<x" / ">x" results).
for placeholder in ('.', '-'):
    data_raw.replace({placeholder: np.nan}, inplace=True)
for pattern in (
    r"^.*Необходимо повторить исследование.*$",
    r"^.*расчет невозможен.*$",
    r"^.*нет результата.*$",
    r"^.*без особенностей.*$",
    r'^([<>].*)$',
):
    data_raw.replace({pattern: np.nan}, regex=True, inplace=True)
data_raw['Тироксин свободный (Т4 свободный), пмоль/л'] = pd.to_numeric(data_raw['Тироксин свободный (Т4 свободный), пмоль/л'], errors='coerce')

# Report column names that are no longer unique after the clean-up.
cols_dupl = data_raw.columns.values[data_raw.columns.duplicated()]
if len(cols_dupl) > 0:
    print(cols_dupl)

In [None]:
# Compare the feature sets of the v2 dataset and the freshly cleaned v3 export.
feats_v2 = pd.read_excel(f"{path}/data_v2.xlsx", index_col=0).columns.values
feats_v3 = data_raw.columns.values

# pd.Index set operations return sorted results, so the saved table and the
# `feats_v*_only` lists have the same order on every run (plain `set`
# arithmetic on strings does not guarantee that).
cols_v2 = pd.Index(feats_v2).unique()
cols_v3 = pd.Index(feats_v3).unique()
feats_v2_only = cols_v2.difference(cols_v3).to_list()
feats_v3_only = cols_v3.difference(cols_v2).to_list()

# Label every feature as shared or specific to one version.
df_feats_all = pd.DataFrame(index=cols_v2.union(cols_v3))
df_feats_all['Set'] = 'Common'
df_feats_all.loc[feats_v2_only, 'Set'] = 'V2 Only'
df_feats_all.loc[feats_v3_only, 'Set'] = 'V3 Only'
df_feats_all.to_excel(f"{path}/feats_v2_v3_cmn.xlsx")

In [None]:
# Merge v3 into v2: v2 values take precedence, and v3 fills the gaps and adds
# the rows that are missing from v2. Columns found only in v3 are dropped first.
v3_common = data_raw.drop(columns=feats_v3_only)
data_2 = pd.read_excel(f"{path}/data_v2.xlsx", index_col=0)
data_3 = data_2.combine_first(v3_common)
data_3.to_excel(f"{path}/data_v3.xlsx")

In [None]:
# Save the column names of the merged dataset as an index-only feature table.
merged_columns = data_3.columns.values
df_feats_3 = pd.DataFrame(index=merged_columns)
df_feats_3.to_excel(f"{path}/feats_v3.xlsx")

## Load preprocessed data

In [3]:
# Load the preprocessed table and its feature description for the chosen data version.
suffix = '_v3'
data, feats = (
    pd.read_excel(f"{path}/{stem}{suffix}.xlsx", index_col=0)
    for stem in ("data", "feats")
)

In [None]:
# Show the age of every row whose index value repeats an earlier one.
repeated_rows = data.index.duplicated()
data.loc[repeated_rows, ['Возраст']]

# NaNs analysis

In [None]:
# Share of missing cells in the whole table, in percent.
n_missing_cells = data.isna().sum().sum()
nan_pct = n_missing_cells / data.size * 100
print(nan_pct)

In [None]:
# Per-feature missing-value summary, sorted from the most to the least complete feature.
nan_counts = data.isna().sum(axis=0)
nan_feats = nan_counts.to_frame(name="Number of NaNs")
nan_feats["% of NaNs"] = nan_feats["Number of NaNs"] / data.shape[0] * 100
nan_feats["Number of not-NaNs"] = data.notna().sum(axis=0)
nan_feats = nan_feats.sort_values(["% of NaNs"], ascending=[True])
nan_feats.to_excel(f"{path}/nan_feats{suffix}.xlsx", index_label="Features")

# Histogram of the missing-value percentage with one bin per integer percent (0..100).
sns.set_theme(style='ticks')
fig, ax = plt.subplots(figsize=(16, 4))
pct_bin_edges = np.linspace(-0.5, 100.5, 102)
sns.histplot(
    data=nan_feats,
    x="% of NaNs",
    bins=pct_bin_edges,
    edgecolor='k',
    linewidth=0.5,
    color='crimson',
    ax=ax,
)
ax.set(xlim=(-0.5, 100.5))
ax.set_ylabel("Количество признаков")
ax.set_xlabel("% пропущенных значений")
plt.savefig(f"{path}/nan_feats_hist{suffix}.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path}/nan_feats_hist{suffix}.pdf", bbox_inches='tight')
plt.close(fig)

In [None]:
# Horizontal bar chart of the missing-value percentage, restricted to features
# with less than 85% missing values.
sns.set_theme(style='ticks')
plt.figure(figsize=(4, 15))
below_cutoff = nan_feats["% of NaNs"] < 85
df_fig = nan_feats.loc[below_cutoff, :]
barplot = sns.barplot(
    data=df_fig,
    x="% of NaNs",
    y=df_fig.index,
    edgecolor='black',
    dodge=False,
)
# Print the exact percentage next to each bar.
for bars in barplot.containers:
    barplot.bar_label(bars, label_type='edge', fmt='%.2f', fontsize=10, padding=2.5)
barplot.set_xlabel("% пропущенных значений")
plt.savefig(f"{path}/nan_feats_bar{suffix}.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path}/nan_feats_bar{suffix}.pdf", bbox_inches='tight')
plt.close()

# Correlation with Age of all biomarkers

In [None]:
suffix = '_v3'
data = pd.read_excel(f"{path}/data{suffix}.xlsx", index_col=0)
feats = pd.read_excel(f"{path}/feats{suffix}.xlsx", index_col=0)
nan_feats =  pd.read_excel(f"{path}/nan_feats{suffix}.xlsx", index_col=0)
# Reorder the feature table like the NaN summary and attach the summary columns to it.
feats = feats.loc[nan_feats.index, :]
feats.loc[feats.index, nan_feats.columns.to_list()] = nan_feats.loc[feats.index, nan_feats.columns.to_list()]
# Continuous features in alphabetical order, without the age column itself.
feats_cnt_wo_age = np.sort(feats.index[feats['Type'] == 'continuous'].to_list())
feats_cnt_wo_age = list(feats_cnt_wo_age[feats_cnt_wo_age != 'Возраст'])

# Age distribution of the whole dataset in 5-year bins.
hist_bins = np.linspace(5, 115, 23)
sns.set_theme(style='ticks')
fig, ax = plt.subplots(figsize=(6, 3.5))
histplot = sns.histplot(
    data=data,
    bins=hist_bins,
    edgecolor='k',
    linewidth=1,
    x="Возраст",
    color='crimson',
    ax=ax
)
histplot.set(xlim=(0, 120))
plt.savefig(f"{path}/age_hist{suffix}.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path}/age_hist{suffix}.pdf", bbox_inches='tight')
plt.close(fig)

# Pearson correlation of every continuous feature with age, computed on the
# rows where both values are present. Results are collected in plain lists and
# turned into a frame once, so the columns get numeric dtypes instead of the
# object dtype produced by filling an empty frame cell by cell.
corr_feats, corr_rows = [], []
for f in tqdm(feats_cnt_wo_age):
    df_tmp = data.loc[:, ['Возраст', f]].dropna(axis=0, how='any')
    if df_tmp.shape[0] > 1:
        rho, pval = stats.pearsonr(df_tmp.loc[:, 'Возраст'].values, df_tmp.loc[:, f].values)
        corr_feats.append(f)
        corr_rows.append((df_tmp.shape[0], rho, pval))
df_corr = pd.DataFrame(corr_rows, index=corr_feats, columns=['count', 'rho', 'pval'])
# Undefined correlations (NaN) are dropped before the correction.
df_corr.dropna(axis=0, how='any', inplace=True)
# Benjamini-Hochberg correction at alpha = 0.05; element [1] holds the adjusted p-values.
df_corr['pval_fdr_bh'] = multipletests(df_corr.loc[:, 'pval'].values, 0.05, method='fdr_bh')[1]
df_corr.insert(1, "abs(rho)", df_corr['rho'].abs())
df_corr.sort_values(["abs(rho)"], ascending=[False], inplace=True)
# Attach the feature metadata to the ranking.
df_corr.loc[df_corr.index, feats.columns] = feats.loc[df_corr.index, feats.columns]
df_corr.to_excel(f"{path}/age_pearson{suffix}.xlsx", index_label="Features")

# Check unused features intersection

In [10]:
# Restrict the data to the features listed in the "unused" table and keep only
# the rows that have at least one value among them.
data_suffix = '_v3'
data = pd.read_excel(f"{path}/data{data_suffix}.xlsx", index_col=0)
feats = pd.read_excel(f"{path}/feats{data_suffix}_unused.xlsx", index_col=0)
unused_cols = feats.index.values
data = data[unused_cols].dropna(axis=0, how='all')
data.to_excel(f"{path}/data{data_suffix}_unused.xlsx", index_label="№ карты")

# Generate datasets

In [None]:
data_suffix = '_v3'
feats_set = 'RheumatologyScreening_no-sex'

data = pd.read_excel(f"{path}/data{data_suffix}.xlsx", index_col=0)
feats = pd.read_excel(f"{path}/feats{data_suffix}.xlsx", index_col=0)
# Despite its name, `nan_feats` holds the feature table of the chosen subset here.
nan_feats =  pd.read_excel(f"{path}/feats_{feats_set}.xlsx", index_col=0)
feats = feats.loc[nan_feats.index, :]
feats.loc[feats.index, nan_feats.columns.to_list()] = nan_feats.loc[feats.index, nan_feats.columns.to_list()]
# Complete cases only: drop every row with a missing value in any subset feature.
# (Chained instead of `inplace=True`, which would act on a slice of `data`.)
data = data[feats.index.values].dropna(axis=0, how='any')
pathlib.Path(f"{path}/subset_{feats_set}").mkdir(parents=True, exist_ok=True)
feats.to_excel(f"{path}/subset_{feats_set}/feats.xlsx", index_label="Features")
data.to_excel(f"{path}/subset_{feats_set}/data.xlsx", index_label="№ карты")
print(data.shape)

# Pairwise Pearson matrix with age first: correlations go above the diagonal,
# raw p-values below it, and the diagonal is NaN.
feats_cnt = np.sort(feats.index[feats['Type'] == 'continuous'].to_list())
feats_cnt = ['Возраст'] + list(feats_cnt[feats_cnt != 'Возраст'])
df_corr = pd.DataFrame(data=np.zeros(shape=(len(feats_cnt), len(feats_cnt))), index=feats_cnt, columns=feats_cnt)
for f_id_1, f_1 in enumerate(feats_cnt):
    df_corr.at[f_1, f_1] = np.nan
    for f_2 in feats_cnt[f_id_1 + 1:]:
        corr, pval = stats.pearsonr(data.loc[:, f_1].values, data.loc[:, f_2].values)
        df_corr.at[f_2, f_1] = pval
        df_corr.at[f_1, f_2] = corr

# Benjamini-Hochberg correction over the lower-triangle p-values.
selection = np.tri(df_corr.shape[0], df_corr.shape[1], -1, dtype=bool)
df_fdr = df_corr.where(selection).stack().reset_index()
df_fdr.columns = ['row', 'col', 'pval']
_, df_fdr['pval_fdr_bh'], _, _ = multipletests(df_fdr.loc[:, 'pval'].values, 0.05, method='fdr_bh')
# Exact zeros would give -log10(0) = inf: replace them with half of the smallest
# positive adjusted p-value. The result is assigned back explicitly because an
# in-place `replace` on `df_fdr[...]` acts on a temporary under Copy-on-Write.
nzmin = df_fdr['pval_fdr_bh'][df_fdr['pval_fdr_bh'].gt(0)].min() * 0.5
df_fdr['pval_fdr_bh'] = df_fdr['pval_fdr_bh'].replace({0.0: nzmin})
df_corr_fdr = df_corr.copy()
for row, col, pval_adj in zip(df_fdr['row'], df_fdr['col'], df_fdr['pval_fdr_bh']):
    df_corr_fdr.loc[row, col] = -np.log10(pval_adj)
df_corr_fdr.to_excel(f"{path}/subset_{feats_set}/feats_pearsonr.xlsx")

# Two heatmaps share one axes: correlations above the diagonal and
# -log10(adjusted p-value) below it.
sns.set_theme(style='ticks')
fig, ax = plt.subplots(figsize=(4.5 + 0.3 * len(feats_cnt), 2.5 + 0.2 * len(feats_cnt)), layout='constrained')
annot_kws = {"fontsize": 32 / np.sqrt(len(df_corr_fdr.values) + 8)}
cmap_triu = plt.get_cmap("seismic").copy()
# The mask is True on the strictly lower triangle, i.e. it hides the p-values.
mask_triu=np.tri(len(feats_cnt), len(feats_cnt), -1, dtype=bool)
heatmap_diff = sns.heatmap(
    df_corr_fdr,
    mask=mask_triu,
    annot=True,
    fmt=".2f",
    center=0.0,
    cmap=cmap_triu,
    linewidth=0.1,
    linecolor='black',
    annot_kws=annot_kws,
    ax=ax
)
# The colorbar that was just added is the last axes of the figure.
ax.figure.axes[-1].set_ylabel(r"Pearson $\rho$")
for spine in ax.figure.axes[-1].spines.values():
    spine.set(visible=True, lw=0.25, edgecolor="black")
cmap_tril = plt.get_cmap("viridis").copy()
# Cells below vmin (adjusted p-value above 0.05) are drawn in black.
cmap_tril.set_under('black')
# The mask is True on the strictly upper triangle, i.e. it hides the correlations.
mask_tril=np.tri(len(feats_cnt), len(feats_cnt), -1, dtype=bool).T
heatmap_pval = sns.heatmap(
    df_corr_fdr,
    mask=mask_tril,
    annot=True,
    fmt=".1f",
    vmin=-np.log10(0.05),
    cmap=cmap_tril,
    linewidth=0.1,
    linecolor='black',
    annot_kws=annot_kws,
    ax=ax
)
ax.figure.axes[-1].set_ylabel(r"$-\log_{10}(\mathrm{p-value})$")
for spine in ax.figure.axes[-1].spines.values():
    spine.set(visible=True, lw=0.25, edgecolor="black")
ax.set_xlabel('', fontsize=16)
ax.set_ylabel('', fontsize=16)
ax.set_title('', fontsize=16)
plt.savefig(f"{path}/subset_{feats_set}/feats_pearsonr.pdf", bbox_inches='tight')
plt.close(fig)

# Feature selection for InBody

In [None]:
# Number of continuous features to keep and the name of the derived subset.
n_feats = 20
new_subset_name = 'inbody_portable'

ds_data = pd.read_excel(f"{path}/subset_inbody/data.xlsx", index_col=0)
ds_feats = pd.read_excel(f"{path}/subset_inbody/feats.xlsx", index_col=0)

# Candidate predictors: all continuous features except age, in alphabetical order.
feats_cnt_wo_age = np.sort(ds_feats.index[ds_feats['Type'] == 'continuous'].to_list())
feats_cnt_wo_age = list(feats_cnt_wo_age[feats_cnt_wo_age != 'Возраст'])

# mRMR selection with age as the regression target.
selected_features = mrmr.mrmr_regression(
    X=ds_data.loc[:, feats_cnt_wo_age],
    y=ds_data.loc[:, 'Возраст'],
    K=n_feats,
    relevance='f',
    show_progress=True
)

feats_cat = ds_feats.index[ds_feats['Type'] == 'categorical'].to_list()

# The new subset keeps age, every categorical feature and the selected continuous ones.
kept_feats = ['Возраст'] + feats_cat + selected_features
new_data = ds_data.loc[:, kept_feats]
new_feats = ds_feats.loc[kept_feats, :]

subset_dir = pathlib.Path(f"{path}/subset_{new_subset_name}")
subset_dir.mkdir(parents=True, exist_ok=True)
new_feats.to_excel(f"{path}/subset_{new_subset_name}/feats.xlsx", index_label="Features")
new_data.to_excel(f"{path}/subset_{new_subset_name}/data.xlsx", index_label="№ карты")

# Separate markers analysis

## Number of NaNs and correlation with age

In [5]:
data_suffix = '_v3'
data = pd.read_excel(f"{path}/data{data_suffix}.xlsx", index_col=0)
feats = pd.read_excel(f"{path}/feats{data_suffix}.xlsx", index_col=0)

feats_sprt = pd.read_excel(f"{path}/feats_separate{data_suffix}.xlsx", index_col=0)
feats_sprt.drop(['Возраст', 'Пол'], inplace=True)
# Start from a float column: initialising with integer 0 and then writing float
# correlations into it is an incompatible-dtype assignment that pandas deprecates.
feats_sprt[r"Pearson $\rho$"] = 0.0
feats_sprt['Features'] = feats_sprt.index
# Correlation of each marker with age on the rows where both are present.
# Markers with fewer than two rows or a single distinct value keep 0.0.
for f in feats_sprt.index:
    df_tmp = data.loc[:, ['Возраст', f]].dropna(axis=0, how='any')
    if df_tmp.shape[0] > 1 and df_tmp[f].nunique() > 1:
        vals_1 = df_tmp.loc[:, 'Возраст'].values
        vals_2 = df_tmp.loc[:, f].values
        rho, _ = stats.pearsonr(vals_1, vals_2)
        feats_sprt.at[f, r"Pearson $\rho$"] = rho

# Diverging colour scale centred on zero correlation; each bar is coloured by
# its own correlation value.
f_cmap = sns.color_palette("coolwarm", as_cmap=True)
f_norm = mcolors.TwoSlopeNorm(vcenter=0.0, vmin=feats_sprt[r"Pearson $\rho$"].min(), vmax=feats_sprt[r"Pearson $\rho$"].max())
f_colors = {cval: f_cmap(f_norm(cval)) for cval in feats_sprt[r"Pearson $\rho$"]}

sns.set_theme(style='ticks')
fig, ax = plt.subplots(figsize=(4, 16), layout='constrained')
barplot = sns.barplot(
    data=feats_sprt,
    x='Number of not-NaNs',
    y='Features',
    hue=r"Pearson $\rho$",
    edgecolor='black',
    palette=f_colors,
    dodge=False,
    ax=ax
)
for container in barplot.containers:
    barplot.bar_label(container, label_type='edge', color='gray', fmt='%d', fontsize=8, padding=4.0)
ax.set_ylabel('')
ax.set(yticklabels=feats_sprt.index.to_list())
# The per-value hue legend is replaced by a continuous colorbar.
ax.get_legend().remove()
ax.xaxis.tick_top()
ax.xaxis.set_label_position('top')
ax.set_xlabel('Количество записей not-NaN')
sm = plt.cm.ScalarMappable(cmap=f_cmap, norm=f_norm)
sm.set_array([])
cbar = barplot.figure.colorbar(sm, orientation="horizontal")
cbar.set_label("Корреляция с возрастом")
plt.savefig(f"{path}/feats_separate{data_suffix}.pdf", bbox_inches='tight')
plt.savefig(f"{path}/feats_separate{data_suffix}.png", bbox_inches='tight', dpi=200)
plt.close(fig)

# Dataset processing

## Setup datasets

In [5]:
# Registry of the datasets processed by the cells below. Each key is a short
# dataset id; its value holds:
#   "name"  - title drawn on the figures,
#   "path"  - folder with data.xlsx / feats.xlsx, also used for the outputs,
#   "color" - matplotlib colour used for this dataset's plots.
# Commented-out entries are alternative datasets that are currently disabled.
datasets = {
    "RheumatologyScreening": {
        "name": "Ревматологический тест",
        "path": f"{path}/subset_RheumatologyScreening_no-sex",
        "color": "gray",
    },
    # "ProstateSpecificAntigenTest": {
    #     "name": "Простатический специфический антиген",
    #     "path": f"{path}/subset_ProstateSpecificAntigenTest_no-sex",
    #     "color": "lawngreen",
    # },
    # "HormoneProfile": {
    #     "name": "Гормональный профиль",
    #     "path": f"{path}/subset_HormoneProfile_no-sex",
    #     "color": "chocolate",
    # },
    # "BloodBiochemical": {
    #     "name": "Биохимия крови",
    #     "path": f"{path}/subset_BloodBiochemical_no-sex",
    #     "color": "cyan",
    # },
    # "BloodPressure": {
    #     "name": "Кровяное Давление",
    #     "path": f"{path}/subset_BloodPressure_no-sex",
    #     "color": "orchid",
    # },
    # "CoagulationTest_no-sex": {
    #     "name": "Коагулограмма",
    #     "path": f"{path}/subset_CoagulationTest_no-sex",
    #     "color": "olive",
    # },
    # "LipidProfile": {
    #     "name": "Липидный профиль",
    #     "path": f"{path}/subset_LipidProfile_no-sex",
    #     "color": "gold",
    # },
    # "CBC": {
    #     "name": "Общий анализ крови",
    #     "path": f"{path}/subset_CBC_no-sex",
    #     "color": "crimson",
    # },
    # "InBody-mRMR": {
    #     "name": "Биоимпеданс (InBody)",
    #     "path": f"{path}/subset_InBody-mRMR_no-sex",
    #     "color": "dodgerblue",
    # },
    
    
    # "inbody": {
    #     "name": "Биоимпеданс (InBody)",
    #     "path": f"{path}/subset_inbody",
    #     "color": "dodgerblue",
    # },
    # "inbody_portable": {
    #     "name": "Биоимпеданс (InBody), mRMR",
    #     "path": f"{path}/subset_inbody_portable",
    #     "color": "lawngreen",
    # },
    # "inbody_mrmr": {
    #     "name": "Биоимпеданс (InBody), mRMR",
    #     "path": f"{path}/subset_inbody_mrmr",
    #     "color": "lawngreen",
    # },
    # "lab": {
    #     "name": "Анализ Крови",
    #     "path": f"{path}/subset_lab",
    #     "color": "crimson"
    # },
}

# Attach the data table and the feature description to every registered dataset.
for ds, ds_cfg in datasets.items():
    ds_dir = ds_cfg['path']
    ds_cfg['data'] = pd.read_excel(f"{ds_dir}/data.xlsx", index_col=0)
    ds_cfg['feats'] = pd.read_excel(f"{ds_dir}/feats.xlsx", index_col=0)

## Age histograms

In [6]:
# One age histogram (5-year bins) per dataset, saved into the dataset folder.
for ds, ds_cfg in datasets.items():
    hist_bins = np.linspace(5, 115, 23)
    sns.set_theme(style='ticks')
    fig, ax = plt.subplots(figsize=(6, 3.5), layout='constrained')
    sns.histplot(
        data=ds_cfg['data'],
        x="Возраст",
        bins=hist_bins,
        color=ds_cfg['color'],
        edgecolor='k',
        linewidth=1,
        ax=ax,
    )
    ax.set(xlim=(0, 120))
    ax.set_ylabel('Количество')
    ax.set_title(ds_cfg['name'])
    plt.savefig(f"{ds_cfg['path']}/age_hist.png", bbox_inches='tight', dpi=200)
    plt.savefig(f"{ds_cfg['path']}/age_hist.pdf", bbox_inches='tight')
    plt.close(fig)

## Features correlations

In [None]:
for ds in datasets:
    ds_feats = datasets[ds]['feats']
    ds_data = datasets[ds]['data']
    feats_cnt_wo_age = np.sort(ds_feats.index[ds_feats['Type'] == 'continuous'].to_list())
    feats_cnt_wo_age = list(feats_cnt_wo_age[feats_cnt_wo_age != 'Возраст'])

    # Rank the continuous features by |Pearson rho| with age. The values are
    # collected in a float Series so abs() and sorting work on numeric data
    # rather than on an object-dtype frame filled cell by cell.
    rho_by_feat = {}
    for f in tqdm(feats_cnt_wo_age):
        df_tmp = ds_data.loc[:, ['Возраст', f]].dropna(axis=0, how='any')
        if df_tmp.shape[0] > 1:
            vals_1 = df_tmp.loc[:, 'Возраст'].values
            vals_2 = df_tmp.loc[:, f].values
            rho_by_feat[f], _ = stats.pearsonr(vals_1, vals_2)
    df_corr = pd.DataFrame({'rho': pd.Series(rho_by_feat, dtype=float)})
    df_corr.dropna(axis=0, how='any', inplace=True)
    df_corr.insert(1, "abs(rho)", df_corr['rho'].abs())
    df_corr.sort_values(["abs(rho)"], ascending=[False], inplace=True)
    feats_cnt_wo_age = df_corr.index.to_list()
    feats_cnt = ['Возраст'] + feats_cnt_wo_age

    # The 'inbody' dataset is split into positively and negatively age-correlated
    # feature groups, each with its own heatmap; every other dataset gets one heatmap.
    if ds == 'inbody':
        feats_corr = {
            '_pos': ['Возраст'] + df_corr.index[df_corr['rho'] > 0].to_list(),
            '_neg': ['Возраст'] + df_corr.index[df_corr['rho'] < 0].to_list()
        }
    else:
        feats_corr = {'': feats_cnt}

    for fc_name, feats_cnt in feats_corr.items():

        # Pairwise Pearson matrix: correlations above the diagonal, raw
        # p-values below it, NaN on the diagonal.
        df_corr = pd.DataFrame(data=np.zeros(shape=(len(feats_cnt), len(feats_cnt))), index=feats_cnt, columns=feats_cnt)
        for f_id_1, f_1 in enumerate(feats_cnt):
            df_corr.at[f_1, f_1] = np.nan
            for f_2 in feats_cnt[f_id_1 + 1:]:
                corr, pval = stats.pearsonr(ds_data.loc[:, f_1].values, ds_data.loc[:, f_2].values)
                df_corr.at[f_2, f_1] = pval
                df_corr.at[f_1, f_2] = corr

        # Benjamini-Hochberg correction over the lower-triangle p-values.
        selection = np.tri(df_corr.shape[0], df_corr.shape[1], -1, dtype=bool)
        df_fdr = df_corr.where(selection).stack().reset_index()
        df_fdr.columns = ['row', 'col', 'pval']
        _, df_fdr['pval_fdr_bh'], _, _ = multipletests(df_fdr.loc[:, 'pval'].values, 0.05, method='fdr_bh')
        # Exact zeros would give -log10(0) = inf: replace them with half of the
        # smallest positive adjusted p-value. The result is assigned back
        # explicitly because an in-place `replace` on `df_fdr[...]` acts on a
        # temporary under Copy-on-Write.
        nzmin = df_fdr['pval_fdr_bh'][df_fdr['pval_fdr_bh'].gt(0)].min() * 0.5
        df_fdr['pval_fdr_bh'] = df_fdr['pval_fdr_bh'].replace({0.0: nzmin})
        df_corr_fdr = df_corr.copy()
        for row, col, pval_adj in zip(df_fdr['row'], df_fdr['col'], df_fdr['pval_fdr_bh']):
            df_corr_fdr.loc[row, col] = -np.log10(pval_adj)
        df_corr_fdr.to_excel(f"{datasets[ds]['path']}/feats_pearsonr{fc_name}.xlsx")

        # Two heatmaps share one axes: correlations above the diagonal and
        # -log10(adjusted p-value) below it.
        sns.set_theme(style='ticks')
        fig, ax = plt.subplots(figsize=(8.5 + 0.35 * len(feats_cnt), 6.5 + 0.25 * len(feats_cnt)), layout='constrained')
        annot_kws = {"fontsize": 32 / np.sqrt(len(df_corr_fdr.values) + 10)}
        cmap_triu = plt.get_cmap("seismic").copy()
        # The mask is True on the strictly lower triangle, i.e. it hides the p-values.
        mask_triu=np.tri(len(feats_cnt), len(feats_cnt), -1, dtype=bool)
        heatmap_diff = sns.heatmap(
            df_corr_fdr,
            mask=mask_triu,
            annot=True,
            fmt=".2f",
            center=0.0,
            cmap=cmap_triu,
            linewidth=0.1,
            linecolor='black',
            annot_kws=annot_kws,
            ax=ax
        )
        # The colorbar that was just added is the last axes of the figure.
        ax.figure.axes[-1].set_ylabel(r"Pearson $\rho$")
        for spine in ax.figure.axes[-1].spines.values():
            spine.set(visible=True, lw=0.25, edgecolor="black")
        cmap_tril = plt.get_cmap("viridis").copy()
        # Cells below vmin (adjusted p-value above 0.05) are drawn in black.
        cmap_tril.set_under('black')
        # The mask is True on the strictly upper triangle, i.e. it hides the correlations.
        mask_tril=np.tri(len(feats_cnt), len(feats_cnt), -1, dtype=bool).T
        heatmap_pval = sns.heatmap(
            df_corr_fdr,
            mask=mask_tril,
            annot=True,
            fmt=".1f",
            vmin=-np.log10(0.05),
            cmap=cmap_tril,
            linewidth=0.1,
            linecolor='black',
            annot_kws=annot_kws,
            ax=ax
        )
        ax.figure.axes[-1].set_ylabel(r"$-\log_{10}(\mathrm{p-value})$")
        for spine in ax.figure.axes[-1].spines.values():
            spine.set(visible=True, lw=0.25, edgecolor="black")
        ax.set_xlabel('')
        ax.set_ylabel('')
        ax.set_title(datasets[ds]['name'])
        plt.savefig(f"{datasets[ds]['path']}/feats_pearsonr{fc_name}.pdf", bbox_inches='tight')
        plt.close(fig)

## Dimensionality reduction

In [None]:
# Project the continuous features of every dataset to 2-D with six methods and
# draw the embeddings on a 2 x 3 grid.
for ds in datasets:
    ds_feats = datasets[ds]['feats']
    ds_data = datasets[ds]['data']
    feats_cnt = ds_feats.index[ds_feats['Type'] == 'continuous'].to_list()

    dim_red_models = {
        't-SNE': TSNE(n_components=2),
        'PCA': PCA(n_components=2, whiten=False),
        'IsoMap': Isomap(n_components=2, n_neighbors=5),
        'MDS': MDS(n_components=2, metric=True),
        'GRP': GaussianRandomProjection(n_components=2, eps=0.5),
        'SRP': SparseRandomProjection(n_components=2, density='auto', eps=0.5, dense_output=False),
    }

    # The two components of each embedding are stored as new columns of the dataset table.
    for drm, model in dim_red_models.items():
        embedding = model.fit_transform(ds_data.loc[:, feats_cnt].values)
        ds_data.loc[:, f"{drm} 1"] = embedding[:, 0]
        ds_data.loc[:, f"{drm} 2"] = embedding[:, 1]

    n_rows = 2
    n_cols = 3
    fig_height = 10
    fig_width = 15

    sns.set_theme(style='ticks')
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={}, sharey=False, sharex=False, layout='constrained')

    # Panels are filled row by row, in the order the models are declared above.
    for panel, drm in zip(axs.flat, dim_red_models):
        sns.scatterplot(
            data=ds_data,
            x=f"{drm} 1",
            y=f"{drm} 2",
            color=datasets[ds]['color'],
            s=40,
            alpha=0.75,
            edgecolor="k",
            linewidth=0.25,
            ax=panel,
        )
        panel.set_title(drm)
    fig.suptitle(datasets[ds]['name'], fontsize='large')
    fig.savefig(f"{datasets[ds]['path']}/dim_red.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{datasets[ds]['path']}/dim_red.pdf", bbox_inches='tight')
    plt.close(fig)

## Outliers detection

### IQR

In [None]:
for ds in datasets:
    ds_feats = datasets[ds]['feats']
    ds_data = datasets[ds]['data']
    feats_cnt = ds_feats.index[ds_feats['Type'] == 'continuous'].to_list()

    # Flag a value as an outlier when it lies outside [Q1 - 1.5 IQR, Q3 + 1.5 IQR].
    # A NaN fails both comparisons and is therefore flagged as well.
    out_columns = []
    for f in tqdm(feats_cnt):
        q1 = ds_data[f].quantile(0.25)
        q3 = ds_data[f].quantile(0.75)
        iqr = q3 - q1
        within_fences = (ds_data[f] >= q1 - 1.5 * iqr) & (ds_data[f] <= q3 + 1.5 * iqr)
        ds_data[f"{f}_out_iqr"] = ~within_fences
        out_columns.append(f"{f}_out_iqr")
    # Number of flagged features per record.
    ds_data["n_outs_iqr"] = ds_data.loc[:, out_columns].sum(axis=1)

    # Histogram of the per-record outlier count, one bin per integer count.
    hist_bins = np.linspace(-0.5, len(feats_cnt) + 0.5, len(feats_cnt) + 2)
    fig = plt.figure(figsize=(5, 3))
    sns.set_theme(style='ticks')
    histplot = sns.histplot(
        data=ds_data,
        x="n_outs_iqr",
        multiple="stack",
        bins=hist_bins,
        edgecolor='k',
        linewidth=1.0,
        color=datasets[ds]['color'],
    )
    histplot.set(xlim=(-0.5, ds_data['n_outs_iqr'].max() + 0.5))
    histplot.set_title(datasets[ds]['name'])
    histplot.set_xlabel("Количество IQR выбросов")
    histplot.set_ylabel("Количество записей")
    plt.savefig(f"{datasets[ds]['path']}/outs_iqr_hist.png", bbox_inches='tight', dpi=200)
    plt.savefig(f"{datasets[ds]['path']}/outs_iqr_hist.pdf", bbox_inches='tight')
    plt.close(fig)

    if ds != 'inbody':

        # missingno visualises NaN patterns, so outliers are encoded as NaN and
        # the flag columns are renamed back to the feature names.
        df_msno = ds_data.loc[:, out_columns].copy()
        df_msno.replace({True: np.nan}, inplace=True)
        df_msno.rename(columns=dict(zip(out_columns, feats_cnt)), inplace=True)

        # Bar plot: number of non-outlier records per feature.
        msno_bar = msno.bar(
            df=df_msno,
            label_rotation=90,
            color=datasets[ds]['color'],
            figsize=(0.4 * len(feats_cnt), 4),
        )
        plt.xticks(ha='center')
        plt.setp(msno_bar.xaxis.get_majorticklabels(), ha="center")
        msno_bar.set_title(datasets[ds]['name'], fontsize='large')
        msno_bar.set_ylabel("Записи без выбросов", fontsize='large')
        plt.savefig(f"{datasets[ds]['path']}/outs_iqr_bar.png", bbox_inches='tight', dpi=200)
        plt.savefig(f"{datasets[ds]['path']}/outs_iqr_bar.pdf", bbox_inches='tight')
        # Close rather than clear, so figures do not pile up across datasets.
        plt.close(msno_bar.figure)

        # Matrix plot: where the outliers sit across records.
        msno_mtx = msno.matrix(
            df=df_msno,
            label_rotation=90,
            color=mcolors.to_rgb(datasets[ds]['color']),
            figsize=(0.7 * len(feats_cnt), 5),
        )
        plt.xticks(ha='center')
        # Centre the tick labels of the matrix axes itself (this previously
        # targeted the bar-plot axes of the figure that had just been cleared).
        plt.setp(msno_mtx.xaxis.get_majorticklabels(), ha="center")
        msno_mtx.set_title(datasets[ds]['name'], fontsize='large')
        msno_mtx.set_ylabel("Записи", fontsize='large')
        plt.savefig(f"{datasets[ds]['path']}/outs_iqr_matrix.png", bbox_inches='tight', dpi=200)
        plt.savefig(f"{datasets[ds]['path']}/outs_iqr_matrix.pdf", bbox_inches='tight')
        plt.close(msno_mtx.figure)

        # Heatmap: correlation between the outlier patterns of different features.
        msno_heatmap = msno.heatmap(
            df=df_msno,
            label_rotation=90,
            cmap="bwr",
            fontsize=12,
            figsize=(0.6 * len(feats_cnt), 0.6 * len(feats_cnt))
        )
        msno_heatmap.set_title(datasets[ds]['name'], fontsize='large')
        plt.setp(msno_heatmap.xaxis.get_majorticklabels(), ha="center")
        msno_heatmap.collections[0].colorbar.ax.tick_params(labelsize=20)
        plt.savefig(f"{datasets[ds]['path']}/outs_iqr_heatmap.png", bbox_inches='tight', dpi=200)
        plt.savefig(f"{datasets[ds]['path']}/outs_iqr_heatmap.pdf", bbox_inches='tight')
        plt.close(msno_heatmap.figure)
    

### PyOD

In [None]:
from pyod.models.ecod import ECOD
from pyod.models.copod import COPOD
from pyod.models.sos import SOS
from pyod.models.qmcd import QMCD as QMCDOD
from pyod.models.sampling import Sampling
from pyod.models.gmm import GMM
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.cd import CD
from pyod.models.lmdd import LMDD
from pyod.models.lof import LOF
from pyod.models.cof import COF
from pyod.models.cblof import CBLOF
from pyod.models.hbos import HBOS
from pyod.models.knn import KNN
from pyod.models.sod import SOD
from pyod.models.rod import ROD
from pyod.models.iforest import IForest
from pyod.models.inne import INNE
from pyod.models.dif import DIF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.loda import LODA
from pyod.models.lunar import LUNAR

from pythresh.thresholds.iqr import IQR
from pythresh.thresholds.mad import MAD
from pythresh.thresholds.fwfm import FWFM
from pythresh.thresholds.yj import YJ
from pythresh.thresholds.zscore import ZSCORE
from pythresh.thresholds.aucp import AUCP
from pythresh.thresholds.qmcd import QMCD
from pythresh.thresholds.fgd import FGD
from pythresh.thresholds.dsn import DSN
from pythresh.thresholds.clf import CLF
from pythresh.thresholds.filter import FILTER
from pythresh.thresholds.wind import WIND
from pythresh.thresholds.eb import EB
from pythresh.thresholds.regr import REGR
from pythresh.thresholds.boot import BOOT
from pythresh.thresholds.mcst import MCST
from pythresh.thresholds.hist import HIST
from pythresh.thresholds.moll import MOLL
from pythresh.thresholds.chau import CHAU
from pythresh.thresholds.gesd import GESD
from pythresh.thresholds.mtt import MTT
from pythresh.thresholds.karch import KARCH
from pythresh.thresholds.ocsvm import OCSVM
from pythresh.thresholds.clust import CLUST
from pythresh.thresholds.decomp import DECOMP
from pythresh.thresholds.meta import META
from pythresh.thresholds.vae import VAE
from pythresh.thresholds.cpd import CPD
from pythresh.thresholds.gamgmm import GAMGMM
from pythresh.thresholds.mixmod import MIXMOD

# For every dataset: fit each PyOD detector on the continuous features, threshold
# its outlier scores with each PyThresh thresholder, and save a heatmap of the
# resulting percentage of flagged samples (detectors in rows, thresholders in columns).
for ds in datasets:
    ds_feats = datasets[ds]['feats']
    ds_data = datasets[ds]['data']
    feats_cnt = ds_feats.index[ds_feats['Type'] == 'continuous'].to_list()
    # The detector input does not depend on the detector, so extract it once
    X_cnt = ds_data.loc[:, feats_cnt].values

    # Fresh (unfitted) detectors are created for every dataset.
    # Commented-out entries are detectors that are currently switched off.
    classifiers = {
        'ECDF-Based (ECOD)': ECOD(),
        'Copula-Based (COPOD)': COPOD(),
        # 'Stochastic (SOS)': SOS(),
        # 'Quasi-Monte Carlo Discrepancy (QMCD)': QMCDOD(),
        'Rapid distance-based via Sampling': Sampling(),
        'Probabilistic Mixture Modeling (GMM)': GMM(),
        'Principal Component Analysis (PCA)': PCA(),
        'Minimum Covariance Determinant (MCD)': MCD(),
        'Cook\'s Distance (CD)': CD(),
        # 'Deviation-based Outlier Detection (LMDD)': LMDD(),
        'Local Outlier Factor (LOF)': LOF(),
        'Connectivity-Based Outlier Factor (COF)': COF(),
        'Clustering-Based Local Outlier Factor (CBLOF)': CBLOF(),
        # 'Histogram-based Outlier Score (HBOS)': HBOS(),
        'k Nearest Neighbors (kNN)': KNN(),
        # 'Subspace Outlier Detection (SOD)': SOD(),
        # 'Rotation-based Outlier Detection (ROD)': ROD(),
        # 'Isolation Forest': IForest(),
        # 'Isolation-Based with Nearest-Neighbor Ensembles (INNE)': INNE(),
        # 'Deep Isolation Forest for Anomaly Detection (DIF)': DIF(),
        'Feature Bagging': FeatureBagging(),
        'Lightweight On-line Detector of Anomalies (LODA)': LODA(),
        'LUNAR': LUNAR()
    }

    # Thresholders turn continuous outlier scores into labels.
    # Commented-out entries are thresholders that are currently switched off.
    thresholders = {
        'Inter-Quartile Region (IQR)': IQR(),
        'Median Absolute Deviation (MAD)': MAD(),
        'Full Width at Full Minimum (FWFM)': FWFM(),
        'Yeo-Johnson Transformation (YJ)': YJ(),
        # 'Z Score (ZSCORE)': ZSCORE(),
        # 'AUC Percentage (AUCP)': AUCP(),
        'Quasi-Monte Carlo Discreperancy (QMCD)': QMCD(),
        'Fixed Gradient Descent (FGD)': FGD(),
        'Distance Shift from Normal (DSN)': DSN(),
        'Trained Classifier (CLF)': CLF(),
        # 'Filtering Based (FILTER)': FILTER(),
        # 'Topological Winding Number (WIND)': WIND(),
        # 'Elliptical Boundary (EB)': EB(),
        'Regression Intercept (REGR)': REGR(),
        # 'Bootstrap Method (BOOT)': BOOT(),
        # 'Monte Carlo Statistical Tests (MCST)': MCST(),
        # 'Histogram Based Methods (HIST)': HIST(),
        'Mollifier (MOLL)': MOLL(),
        # "Chauvenet's Criterion (CHAU)": CHAU(),
        # 'Generalized Extreme Studentized Deviate (GESD)': GESD(),
        # 'Modified Thompson Tau Test (MTT)': MTT(),
        # 'Karcher Mean (KARCH)': KARCH(),
        # 'One-Class SVM (OCSVM)': OCSVM(),
        # 'Clustering (CLUST)': CLUST(),
        # 'Decomposition (DECOMP)': DECOMP(),
        'Meta-model (META)': META(),
        # 'Variational Autoencoder (VAE)': VAE(),
        # 'Change Point Detection (CPD)': CPD(),
        # 'Bayesian Gamma GMM (GAMGMM)': GAMGMM(skip=True),
        'Mixture Models (MIXMOD)': MIXMOD(),
    }

    # Rows: detectors, columns: thresholders, cells: percentage of flagged samples
    df_outs = pd.DataFrame(index=list(classifiers.keys()), columns=list(thresholders.keys()))
    for pyod_m_name, pyod_m in (pbar := tqdm(classifiers.items())):
        pbar.set_description(f"{pyod_m_name}")
        # Each detector is fitted once; its scores are reused by every thresholder
        scores = pyod_m.fit(X_cnt).decision_scores_
        for pythresh_m_name, pythresh_m in thresholders.items():
            labels = pythresh_m.eval(scores)
            # Mean of the labels times 100 (same value as sum / len * 100)
            df_outs.at[pyod_m_name, pythresh_m_name] = np.mean(labels) * 100

    # Cells were filled one by one into an object-typed frame; the heatmap needs floats
    df_fig = df_outs.astype(float)
    sns.set_theme(style='ticks', font_scale=1.0)
    fig, ax = plt.subplots(figsize=(8, 7))
    sns.heatmap(
        df_fig,
        annot=True,
        fmt=".1f",
        cmap='hot',
        linewidth=0.1,
        linecolor='black',
        cbar_kws={
            'orientation': 'horizontal',
            'location': 'top',
            'pad': 0.025,
            'aspect': 30
        },
        annot_kws={"size": 10},
        ax=ax
    )
    ax.set_ylabel('Алгоритмы детекции выбросов')
    ax.set_xlabel('Алгоритмы выставления пороговых значений')
    # The colorbar is drawn on the last axes of the figure
    cbar_ax = fig.axes[-1]
    cbar_ax.set_title("Процент выбросов")
    for spine in cbar_ax.spines.values():
        spine.set_linewidth(1)
    fig.suptitle(datasets[ds]['name'], fontsize='xx-large')
    plt.savefig(f"{datasets[ds]['path']}/outs_pyod.png", bbox_inches='tight', dpi=200)
    plt.savefig(f"{datasets[ds]['path']}/outs_pyod.pdf", bbox_inches='tight')
    plt.close(fig)

# PhenoAge in Atlas

In [None]:
data_suffix = 'v3'

# Load the Atlas measurements, the feature table and the PhenoAge coefficient table.
# The coefficient table maps each of its rows to an Atlas column ('Atlas') and a weight ('Coeff').
data = pd.read_excel(f"{path}/data_{data_suffix}.xlsx", index_col=0)
feats = pd.read_excel(f"{path}/feats_{data_suffix}.xlsx", index_col=0)
phenoage = pd.read_excel("E:/YandexDisk/Work/bbd/phenoage/phenoage.xlsx", index_col=0)

# Keep only the columns used by PhenoAge and drop records with any missing value
atlas_cols = phenoage['Atlas'].values
data = data.loc[:, atlas_cols].dropna(axis=0, how='any')

# Preserve the raw CRP values in a separate column, then replace the original column
# by log(CRP / 10). NOTE(review): the division by 10 presumably converts mg/L to mg/dL - confirm.
crp_raw = data['С-реактивный белок ультрачувствительный, мг/л'].values
data['NonLog С-реактивный белок ультрачувствительный, мг/л'] = crp_raw
data['С-реактивный белок ультрачувствительный, мг/л'] = np.log(crp_raw / 10.0)

# Linear predictor: intercept plus the weighted sum of the (transformed) Atlas columns
gamma = 0.0077
data['LinearComb'] = -19.9067
for coeff, atlas_col in zip(phenoage['Coeff'].values, atlas_cols):
    data['LinearComb'] += coeff * data[atlas_col].values

# Mortality score from the linear predictor, then its conversion to PhenoAge
linear_comb = data['LinearComb'].values
data['MortalityScore'] = 1 - np.exp(-np.exp(linear_comb) * (np.exp(120 * gamma) - 1) / gamma)
mortality_score = data['MortalityScore'].values
data['PhenoAge'] = 141.50225 + np.log(-0.00553 * np.log(1 - mortality_score)) / 0.090165

# Acceleration is the difference between PhenoAge and chronological age; print its mean
data['PhenoAge acceleration'] = data['PhenoAge'] - data['Возраст']
print(np.mean(data['PhenoAge acceleration'].values))
data.to_excel(f"{path}/PhenoAge/data_PhenoAge.xlsx")

In [None]:
# Two-panel figure: PhenoAge versus chronological age (left) and the distribution
# of PhenoAge acceleration (right).
fig = make_subplots(rows=1, cols=2, shared_yaxes=False, shared_xaxes=False, column_widths=[5, 3], horizontal_spacing=0.15)

# Common range for both axes of the left panel, padded by 10% of the span on each side
min_plot_age = data[["Возраст", "PhenoAge"]].min().min()
max_plot_age = data[["Возраст", "PhenoAge"]].max().max()
shift_plot_age = max_plot_age - min_plot_age
min_plot_age -= 0.1 * shift_plot_age
max_plot_age += 0.1 * shift_plot_age

# Styling shared by the three fully decorated axes. `title_font` (i.e. title.font)
# is used instead of the legacy `titlefont` alias, which recent Plotly releases reject.
axis_style = dict(
    automargin=True,
    showgrid=False,
    linecolor='black',
    showline=True,
    gridcolor='gainsboro',
    gridwidth=0.05,
    mirror=True,
    ticks='outside',
    title_font=dict(color='black', size=20),
    showticklabels=True,
    tickangle=0,
    tickfont=dict(color='black', size=16),
    exponentformat='e',
    showexponent='all',
)

# Left panel: identity line (PhenoAge == Age) as a visual reference
fig.add_trace(
    go.Scatter(
        x=[min_plot_age, max_plot_age],
        y=[min_plot_age, max_plot_age],
        showlegend=False,
        mode='lines',
        line=dict(color='black', width=2, dash='dot')
    ),
    row=1,
    col=1
)
# Left panel: one marker per record, labelled by the record index on hover
fig.add_trace(
    go.Scatter(
        name='Scatter',
        x=data.loc[:, 'Возраст'].values,
        y=data.loc[:, 'PhenoAge'].values,
        text=data.index.values,
        hovertext=data.index.values,
        showlegend=False,
        mode='markers',
        marker=dict(
            size=10,
            opacity=0.75,
            line=dict(
                width=1,
                color='black'
            ),
            color='crimson'
        )
    ),
    row=1,
    col=1
)
fig.update_xaxes(
    row=1,
    col=1,
    title_text="Age",
    autorange=False,
    range=[min_plot_age, max_plot_age],
    zeroline=False,
    **axis_style
)
fig.update_yaxes(
    row=1,
    col=1,
    title_text="PhenoAge",
    # scaleanchor="x",
    # scaleratio=1,
    autorange=False,
    range=[min_plot_age, max_plot_age],
    zeroline=False,
    **axis_style
)

# Right panel: violin with all points shown; the kernel bandwidth is 1/32 of the value range
fig.add_trace(
    go.Violin(
        y=data.loc[:, 'PhenoAge acceleration'].values,
        hovertext=data.index.values,
        name="Violin",
        box_visible=True,
        meanline_visible=True,
        showlegend=False,
        line_color='black',
        fillcolor='crimson',
        marker=dict(color='crimson', line=dict(color='black', width=0.5), opacity=0.75),
        points='all',
        bandwidth=np.ptp(data.loc[:, 'PhenoAge acceleration'].values) / 32,
        opacity=0.75
    ),
    row=1,
    col=2
)
fig.update_yaxes(
    row=1,
    col=2,
    title_text="PhenoAge acceleration",
    autorange=True,
    zeroline=True,
    **axis_style
)
# The x axis of the violin panel carries no information: fixed range, no ticks, no labels
fig.update_xaxes(
    row=1,
    col=2,
    automargin=True,
    autorange=False,
    range=[-0.5, 0.3],
    showgrid=False,
    showline=True,
    zeroline=False,
    showticklabels=False,
    mirror=True,
    ticks='outside',
    tickvals=[],
)
fig.update_layout(
    template="simple_white",
    width=800,
    height=450,
    margin=go.layout.Margin(l=100, r=20, b=50, t=50, pad=0),
)
fig.show()
fig.write_image(f"{path}/PhenoAge/PhenoAge.png")
fig.write_image(f"{path}/PhenoAge/PhenoAge.pdf", format="pdf")

## SHAP for PhenoAge

In [23]:
def predict_func(X):
    """Compute PhenoAge for a batch of samples (model callback for SHAP).

    Reads the module-level ``phenoage`` table: row ``i`` provides the weight
    (``'Coeff'``) of the feature named in ``'Atlas'``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature values; column ``i`` corresponds to ``phenoage['Atlas'][i]``.
        A DataFrame is first aligned to those column labels.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        PhenoAge of every sample.
    """
    if isinstance(X, pd.DataFrame):
        # Select/align columns by label, as constructing a frame with `columns=` would
        X = X.reindex(columns=phenoage['Atlas'].values)
    X = np.asarray(X, dtype=float)
    coeffs = phenoage['Coeff'].values.astype(float)

    # Linear predictor as one matrix-vector product instead of a per-feature
    # accumulation on a temporary DataFrame (this function is called many times by SHAP)
    gamma = 0.0077
    linear_comb = -19.9067 + X @ coeffs
    mortality_score = 1 - np.exp(-np.exp(linear_comb) * (np.exp(120 * gamma) - 1) / gamma)
    y = 141.50225 + np.log(-0.00553 * np.log(1 - mortality_score)) / 0.090165
    return y

In [None]:
# The same PhenoAge feature matrix serves as the background set of the sampling
# explainer and as the set of records to explain.
atlas_cols = phenoage['Atlas'].values
background = data.loc[:, atlas_cols].values
explainer = shap.SamplingExplainer(predict_func, background)
print(explainer.expected_value)
explained = data.loc[:, atlas_cols].values
shap_values = explainer.shap_values(explained)

In [25]:
# Violin-style SHAP summary of the ten most important features.
# show=False keeps the figure open so it can be written to disk below.
atlas_cols = phenoage['Atlas'].values
summary_kwargs = dict(
    shap_values=shap_values,
    features=data.loc[:, atlas_cols].values,
    feature_names=atlas_cols,
    max_display=10,
    plot_type="violin",
    show=False,
    plot_size=[15, 6],
)
shap.summary_plot(**summary_kwargs)
plt.savefig(f"{path}/PhenoAge/PhenoAge_SHAP.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path}/PhenoAge/PhenoAge_SHAP.pdf", bbox_inches='tight')
plt.close()

In [None]:
# Positional index of the record to explain
trgt_id = 0

# Waterfall plot for one record. The plot starts at the base value and adds the
# SHAP values, so the base must be the explainer's expected model output for the
# end point to equal the predicted PhenoAge of the record.
atlas_cols = phenoage['Atlas'].values
base_value = float(np.ravel(explainer.expected_value)[0])
shap.plots.waterfall(
    shap.Explanation(
        values=shap_values[trgt_id],
        base_values=base_value,
        # Feature values of the same record the SHAP values belong to
        data=data.loc[:, atlas_cols].values[trgt_id],
        feature_names=atlas_cols
    ),
    max_display=10,
    show=True,
)

## Comparison with UNN data

In [None]:
# Load the UNN table and rename its columns from the index labels of the
# coefficient table to the matching Atlas column names.
# NOTE(review): this path is on drive D: while the coefficient table is read from E: - confirm.
data_unn = pd.read_excel("D:/YandexDisk/Work/bbd/phenoage/result.xlsx")
rename_dict = dict(zip(phenoage.index, phenoage['Atlas']))
data_unn = data_unn.rename(columns=rename_dict)

# Tag every record with its origin and stack both sources, keeping only the PhenoAge features
data_unn['Source'] = "UNN"
data['Source'] = "Atlas"
compare_cols = phenoage['Atlas'].to_list() + ['Source']
df_compare = pd.concat([
    data.loc[:, compare_cols],
    data_unn.loc[:, compare_cols],
])

In [None]:
# Per-feature descriptive statistics for both sources plus a two-sided
# Mann-Whitney U test between them, followed by multiple-testing corrections.
atlas_feats = phenoage['Atlas'].values
df_stat = pd.DataFrame(index=phenoage['Atlas'])
for feat in list(phenoage['Atlas']):
    vals = {}
    for group in ['UNN', 'Atlas']:
        group_vals = df_compare.loc[df_compare['Source'] == group, feat].values
        vals[group] = group_vals
        df_stat.at[feat, f"mean_{group}"] = np.mean(group_vals)
        df_stat.at[feat, f"median_{group}"] = np.median(group_vals)
        q75, q25 = np.percentile(group_vals, [75, 25])
        df_stat.at[feat, f"q75_{group}"] = q75
        df_stat.at[feat, f"q25_{group}"] = q25
        df_stat.at[feat, f"iqr_{group}"] = df_stat.at[feat, f"q75_{group}"] - df_stat.at[feat, f"q25_{group}"]
    mw_result = mannwhitneyu(vals['UNN'], vals['Atlas'], alternative='two-sided')
    df_stat.at[feat, "mw_pval"] = mw_result[1]

# Corrected p-values: target column -> correction method, all at alpha = 0.05
corrections = {
    "mw_pval_fdr_bh": 'fdr_bh',
    "mw_pval_bonferroni": 'bonferroni',
    "mw_pval_simes-hochberg": 'simes-hochberg',
}
for corrected_col, method in corrections.items():
    raw_pvals = df_stat.loc[atlas_feats, "mw_pval"].values
    df_stat.loc[atlas_feats, corrected_col] = multipletests(raw_pvals, 0.05, method=method)[1]

# Most significant features (by the Benjamini-Hochberg corrected p-value) first
df_stat.sort_values(["mw_pval_fdr_bh"], ascending=[True], inplace=True)

In [None]:
# Grid of violin plots comparing the UNN and Atlas distributions of every feature
# in df_stat; each panel title shows the feature's FDR-corrected p-value.
n_rows = 2
n_cols = 5
fig_width = 15
fig_height = 8

pallette = {
    'UNN': 'dodgerblue',
    'Atlas': 'crimson'
}

sns.set_theme(style='ticks')
# squeeze=False keeps `axs` two-dimensional even for a single row or column
fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), squeeze=False)

for f_id, f in enumerate(df_stat.index.values):
    # Fill the grid row by row
    row_id, col_id = divmod(f_id, n_cols)
    ax = axs[row_id, col_id]

    sns.violinplot(
        data=df_compare,
        x='Source',
        y=f,
        hue='Source',
        palette=pallette,
        density_norm='width',
        order=['UNN', 'Atlas'],
        saturation=0.75,
        linewidth=1.0,
        ax=ax,
        legend=False,
    )
    ax.set_ylabel(f)
    ax.set_xlabel('')
    # With legend=False no per-axes legend is created and get_legend() returns None,
    # so only hide a legend when one actually exists.
    if (ax_legend := ax.get_legend()) is not None:
        ax_legend.set_visible(False)
    # The groups are identified by the shared figure legend, not by x tick labels
    ax.set(xticklabels=[])
    ax.set(xticks=[])
    mw_pval = df_stat.at[f, "mw_pval_fdr_bh"]
    ax.set_title(f'{mw_pval:.2e}')

# Single figure-level legend placed above the grid
legend_handles = [
    mlines.Line2D([], [], marker='o', linestyle='None', markeredgecolor='k', markerfacecolor=pallette['UNN'], markersize=10, label='UNN'),
    mlines.Line2D([], [], marker='o', linestyle='None', markeredgecolor='k', markerfacecolor=pallette['Atlas'], markersize=10, label='Atlas')
]
fig.legend(handles=legend_handles, bbox_to_anchor=(0.5, 1.0), loc="lower center", ncol=2, frameon=False, fontsize='large')
fig.tight_layout()
plt.savefig(f"{path}/PhenoAge_feats_VS_UNN.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path}/PhenoAge_feats_VS_UNN.pdf", bbox_inches='tight')
plt.close(fig)