# Debugging autoreload

In [None]:
%load_ext autoreload
%autoreload 2

# Load packages

In [None]:
from pytorch_tabular.utils import load_covertype_dataset
from rich.pretty import pprint
from sklearn.model_selection import BaseCrossValidator, ParameterGrid, ParameterSampler
import torch
import pickle
import shutil
from scipy import stats
import shap
from sklearn.model_selection import RepeatedStratifiedKFold
from glob import glob
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import itertools
from sklearn.model_selection import train_test_split
import numpy as np
from pytorch_tabular.utils import make_mixed_dataset, print_metrics
from pytorch_tabular import available_models
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig, GANDALFConfig, TabNetModelConfig, FTTransformerConfig, DANetConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig
from pytorch_tabular.tabular_model_tuner import TabularModelTuner
from torchmetrics.functional.regression import mean_absolute_error, pearson_corrcoef
from pytorch_tabular import MODEL_SWEEP_PRESETS
import pandas as pd
from pytorch_tabular import model_sweep
from src.pt.model_sweep import model_sweep_custom
import warnings
from src.utils.configs import read_parse_config
from src.pt.hyper_opt import train_hyper_opt
from src.utils.hash import dict_hash
import pathlib
from tqdm import tqdm
import distinctipy
import matplotlib.patheffects as pe
import matplotlib.colors as mcolors
from statannotations.Annotator import Annotator
from scipy.stats import mannwhitneyu
from plottable import ColumnDefinition, Table
from plottable.plots import bar
from plottable.cmap import normed_cmap, centered_cmap
import optuna
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.cm
import matplotlib as mpl
from statsmodels.stats.multitest import multipletests
import re
from itertools import chain
from pathlib import Path
import requests
from sklearn.decomposition import PCA
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.manifold import MDS, Isomap, TSNE
import missingno as msno
from collections import Counter
import functools
from sklearn.cluster import DBSCAN, HDBSCAN


def conjunction(conditions):
    """Fold ``np.logical_and`` over ``conditions`` from left to right.

    Args:
        conditions: Non-empty iterable of boolean values or boolean arrays.

    Returns:
        The element-wise AND of all conditions; a single condition is
        returned as-is.

    Raises:
        TypeError: If ``conditions`` is empty (raised by ``functools.reduce``).
    """
    def both(accumulated, condition):
        return np.logical_and(accumulated, condition)

    return functools.reduce(both, conditions)


def disjunction(conditions):
    """Fold ``np.logical_or`` over ``conditions`` from left to right.

    Args:
        conditions: Non-empty iterable of boolean values or boolean arrays.

    Returns:
        The element-wise OR of all conditions; a single condition is
        returned as-is.

    Raises:
        TypeError: If ``conditions`` is empty (raised by ``functools.reduce``).
    """
    def either(accumulated, condition):
        return np.logical_or(accumulated, condition)

    return functools.reduce(either, conditions)


def make_rgb_transparent(rgb, bg_rgb, alpha):
    """Blend a foreground colour over a background colour.

    Args:
        rgb: Foreground colour channels.
        bg_rgb: Background colour channels, paired with ``rgb`` by position.
        alpha: Weight of the foreground; the background gets ``1 - alpha``.

    Returns:
        List with one blended value per channel pair (``zip`` stops at the
        shorter of the two inputs).
    """
    blended = []
    for foreground, background in zip(rgb, bg_rgb):
        blended.append(alpha * foreground + (1 - alpha) * background)
    return blended

def form_bar(base):
    """Build a formatter that renders a fraction as ``"<count>/<base>"``.

    Args:
        base: Denominator shown in the label and used to scale the input.

    Returns:
        A one-argument function mapping a fraction ``x`` to the string
        ``f"{int(round(x * base))}/{base}"``.
    """
    def formatter(x):
        numerator = int(round(x * base))
        return f'{numerator}/{base}'
    return formatter


# Load data

In [None]:
# Download the subjects workbook shared through a public Yandex.Disk link
# and store it next to the rest of the project data.
path = "E:/YandexDisk/Work/bbd/mriya"

yadisk_file_url = "https://disk.yandex.ru/i/CEaw3cqI2Y7J0A"
# Ask the Yandex.Disk public API for the download link ('href') of the shared file.
response = requests.get(
    "https://cloud-api.yandex.net/v1/disk/public/resources/download",
    params={'public_key': yadisk_file_url},
    timeout=60,
)
# Fail here on an HTTP error instead of on a missing 'href' key below.
response.raise_for_status()
res = response.json()
download_url = res['href']
response = requests.get(download_url, timeout=60)
# Without this check an error page would silently overwrite the workbook.
response.raise_for_status()
yadisk_subject_file_name = Path(f"{path}/Испытуемые Яндекс.xlsx")
with open(yadisk_subject_file_name, 'wb') as f:
    f.write(response.content)

In [None]:
# Load the Parameters / Blood / Heart / Info sheets, merge Blood and Heart
# per subject, report disagreements between the two sheets, and attach the
# cardiovascular risk category.
path = "E:/YandexDisk/Work/bbd/mriya"
path_subjects = f"{path}/Испытуемые Яндекс.xlsx"

df_params = pd.read_excel(path_subjects, sheet_name='Parameters', index_col=0)
# Assign the result back: ``replace(..., inplace=True)`` on a column selection
# is chained assignment and does not reliably modify ``df_params``.
df_params['analysis_type'] = df_params['analysis_type'].replace(
    {
        'Sphygmocardiography': 'Сфигмография',
        'Echocardiography': 'Эхокардиография',
        'ECG': 'Электрокардиография',
        'BP': 'Биохимический анализ крови',
        'CBC': 'Общий анализ крови',
        'Anthropometry': 'Антропометрия',
    }
)
df_blood = pd.read_excel(path_subjects, sheet_name='Blood', index_col=0)
df_blood.index = df_blood.index.astype(str)
df_heart = pd.read_excel(path_subjects, sheet_name='Heart', index_col=0)
df_heart.index = df_heart.index.astype(str)
# Parse the date columns of both sheets (a no-op for columns that are
# already datetime) and derive the age in years at sampling time.
for df_sheet in (df_blood, df_heart):
    for col_date in ('sample_date', 'birthday'):
        df_sheet[col_date] = pd.to_datetime(df_sheet[col_date])
    df_sheet.insert(3, 'Age', (df_sheet['sample_date'] - df_sheet['birthday']) / np.timedelta64(1, 'D') / 365.25)

# Outer merge keeps subjects present in only one sheet; columns shared by
# both sheets keep the Blood value unsuffixed and the Heart value as '<col>_heart'.
suffixes = ('', '_heart')
df = pd.merge(df_blood, df_heart, left_index=True, right_index=True, how='outer', suffixes=suffixes)
cols_cmn = df_blood.columns.intersection(df_heart.columns).to_list()
with pd.ExcelWriter(f"{path}/conflicts.xlsx", engine='xlsxwriter') as writer:
    for col in cols_cmn:
        col_blood = f'{col}{suffixes[0]}'
        col_heart = f'{col}{suffixes[1]}'
        # ``!=`` is True for NaN vs NaN, so rows missing in both sheets are
        # excluded explicitly; one-sided missing values are still reported.
        both_missing = df[col_blood].isna() & df[col_heart].isna()
        is_conflict = (df[col_blood] != df[col_heart]) & ~both_missing
        non_eq_ids = df.index[is_conflict].to_list()
        df_col = df.loc[is_conflict, [col_blood, col_heart]]
        df_col = df_col.rename(columns={col_blood: 'Blood', col_heart: 'Heart'})
        df_col.to_excel(writer, sheet_name=col)

# Group the available feature columns by analysis type, then keep only the
# shared columns plus those features, for subjects with a known age.
cols_types = df_params['analysis_type'].dropna().unique()
cols_sets = {x: df.columns.intersection(df_params.index[df_params['analysis_type'] == x]).to_list() for x in cols_types}
df = df.loc[:, cols_cmn + list(chain.from_iterable(cols_sets.values()))]
# Explicit copy: new columns are written into this frame below and in later cells.
df = df[df['Age'].notna()].copy()

col_risk = 'РИСК, рассчитанный по Шкале глобальной оценки 10-летнего ССР (версия с использованием SCORE 2) с учётом, что все, кто не прошёл анкету НЕ курят и меньше 40 лет - низкий/умеренный риск, если нет модификаторы'

df_info = pd.read_excel(path_subjects, sheet_name='Info', index_col=0)
# Same ID normalisation as for Blood/Heart so the intersection compares like with like.
df_info.index = df_info.index.astype(str)
indexes_info = df.index.intersection(df_info.index[df_info[col_risk].notna()])
df.loc[indexes_info, 'Риск ССЗ'] = df_info.loc[indexes_info, col_risk]

In [None]:
# Copy the ICD-10 diagnosis string from the Info sheet and strip spaces so
# that it is a plain comma-separated list of codes.
indexes_info = df.index.intersection(df_info.index[df_info['Код по МКБ'].notna()])
df.loc[indexes_info, 'ICD-10 Code'] = df_info.loc[indexes_info, 'Код по МКБ']
df['ICD-10 Code'] = df['ICD-10 Code'].str.replace(' ', '')

# Per-subject list of codes. Empty tokens (e.g. from a trailing or doubled
# comma) are dropped here, so they are neither counted nor turned into an
# indicator column named ''.
srs_statuses = df['ICD-10 Code'].dropna().str.split(',').apply(
    lambda codes: [code for code in codes if code != '']
)

# Frequency of every code over all subjects, most frequent first.
# Flattening with a comprehension also works when no subject has a code.
statuses = np.array([code for codes in srs_statuses for code in codes])
statuses_counter = Counter(statuses)
df_statuses_counter = pd.DataFrame.from_dict(statuses_counter, orient='index', columns=['Count'])
df_statuses_counter = df_statuses_counter.sort_values(['Count'], ascending=[False])

# One 0/1 indicator column per code, in order of decreasing frequency.
for icd_code in df_statuses_counter.index:
    df[f"{icd_code}"] = 0
for sample_id, sample_diseases in srs_statuses.items():
    for sample_disease in sample_diseases:
        df.at[sample_id, f"{sample_disease}"] = 1

In [None]:
# Save the ICD-10 code frequency table (index: code, column: 'Count').
df_statuses_counter.to_excel(f"{path}/ICD10.xlsx")

In [None]:
# Write the full table, then one workbook per cardiovascular-risk category.
df.to_excel(f"{path}/data_bioage_all.xlsx")
risk_exports = {
    'Высокий': f"{path}/data_bioage_high_risk.xlsx",
    'Очень высокий': f"{path}/data_bioage_very_high_risk.xlsx",
    'Низкий/умеренный': f"{path}/data_bioage_low_medium_risk.xlsx",
}
for risk_level, risk_file in risk_exports.items():
    df.loc[df['Риск ССЗ'] == risk_level, :].to_excel(risk_file)

# NaNs analysis

In [None]:
# For every analysis type, tabulate per-feature missingness (count, percent,
# and number of present values) and write it as one sheet of nans.xlsx,
# sorted from the most complete feature to the least complete one.
with pd.ExcelWriter(f"{path}/nans.xlsx", engine='xlsxwriter') as writer:
    for col_set, cols in cols_sets.items():
        data = df.loc[:, cols]
        nan_feats = (
            data.isna().sum(axis=0)
            .to_frame(name="Number of NaNs")
            .assign(**{
                "% of NaNs": lambda tbl: tbl["Number of NaNs"] / data.shape[0] * 100,
                "Number of not-NaNs": data.notna().sum(axis=0),
            })
            .sort_values(["% of NaNs"], ascending=[True])
        )
        nan_feats.to_excel(writer, sheet_name=col_set)

# Correlation with Age

In [None]:
# Age distribution of all subjects in the merged table.
hist_bins = np.linspace(5, 115, 23)
sns.set_theme(style='ticks')
fig, ax = plt.subplots(figsize=(6, 3.5))
histplot = sns.histplot(
    data=df[['Age']].dropna(),
    bins=hist_bins,
    edgecolor='k',
    linewidth=1,
    x="Age",
    color='crimson',
    ax=ax
)
histplot.set(xlim=(0, 120))
fig.savefig(f"{path}/age_hist.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path}/age_hist.pdf", bbox_inches='tight')
plt.close(fig)

# Pearson correlation with Age for every numeric ('decimal' / 'integer')
# feature, computed on the rows where both Age and the feature are present.
feats_cnt = df_params.index[df_params['data_type'].isin(['decimal', 'integer'])].intersection(list(chain.from_iterable(cols_sets.values()))).to_list()
df_corr = pd.DataFrame(index=feats_cnt, columns=['count', 'rho', 'pval', ])
for f in tqdm(feats_cnt):
    df_tmp = df.loc[:, ['Age', f]].dropna(axis=0, how='any')
    # pearsonr needs at least two paired observations.
    if df_tmp.shape[0] > 1:
        df_corr.at[f, 'count'] = df_tmp.shape[0]
        vals_1 = df_tmp.loc[:, 'Age'].values
        vals_2 = df_tmp.loc[:, f].values
        df_corr.at[f, 'rho'], df_corr.at[f, 'pval'] = stats.pearsonr(vals_1, vals_2)
# Drop features that were skipped or produced NaN statistics.
df_corr.dropna(axis=0, how='any', inplace=True)
# The frame was created empty, so its columns are object dtype; convert to
# numeric before the FDR correction and abs() so they work on float arrays.
df_corr = df_corr.apply(pd.to_numeric)
# Benjamini-Hochberg adjusted p-values across all tested features.
_, df_corr['pval_fdr_bh'], _, _ = multipletests(df_corr.loc[:, 'pval'].values, 0.05, method='fdr_bh')
df_corr.insert(1, "abs(rho)", df_corr['rho'].abs())
df_corr.sort_values(["abs(rho)"], ascending=[False], inplace=True)
df_corr[['feature_name_ru', 'analysis_type', 'description']] = df_params.loc[df_corr.index, ['feature_name_ru', 'analysis_type', 'description']]
df_corr.to_excel(f"{path}/age_pearson.xlsx", index_label="Features")

# Generate datasets

In [None]:
# Configuration of the dataset to generate: where it lives, which feature
# set it uses, the target column and the plot-grid width / colours.
path = 'E:/YandexDisk/Work/bbd/mriya'
path_to_models = f"{path}/models/тест"

feats_set = 'Тест'

feat_out = 'Age'

n_cols = 6

colors_feats_sets = {
    'Эхокардиография': 'darkcyan',
    'Сфигмография': 'mediumorchid',
    'Биохимический анализ крови': 'goldenrod',
    'Электрокардиография': 'dodgerblue',
    'Антропометрия': 'chartreuse',
    'Общий анализ крови': 'crimson',
    'Все': 'gray',
    'Тест': 'crimson',
}

df = pd.read_excel(f"{path}/data_bioage_all.xlsx", index_col=0)

df_params = pd.read_excel(f"{path}/Испытуемые Яндекс.xlsx", sheet_name='Parameters', index_col=0)
# Assign the result back: ``replace(..., inplace=True)`` on a column selection
# is chained assignment and does not reliably modify ``df_params``.
df_params['analysis_type'] = df_params['analysis_type'].replace(
    {
        'Sphygmocardiography': 'Сфигмография',
        'Echocardiography': 'Эхокардиография',
        'ECG': 'Электрокардиография',
        'BP': 'Биохимический анализ крови',
        'CBC': 'Общий анализ крови',
        'Anthropometry': 'Антропометрия',
    }
)

# feats.xlsx lists the input features (its index); refresh their metadata
# from the Parameters sheet and write the file back.
df_feats = pd.read_excel(f"{path_to_models}/{feats_set}/feats.xlsx", index_col=0)
feats_to_copy = ['feature_name', 'feature_name_ru', 'units', 'units_ru', 'analysis_type', 'description', 'data_type']
df_feats.loc[df_feats.index, feats_to_copy] = df_params.loc[df_feats.index, feats_to_copy]
feats_in = df_feats.index.to_list()
df_feats.to_excel(f"{path_to_models}/{feats_set}/feats.xlsx", index_label="Features")

# Keep target, sex, inputs and risk category; require complete target and
# inputs. Built as a new independent frame (no inplace dropna on a slice)
# because later steps add columns to ``df``.
df = (
    df[[feat_out, 'Sex'] + feats_in + ['Риск ССЗ']]
    .dropna(axis=0, subset=[feat_out] + feats_in, how='any')
    .copy()
)
df.to_excel(f"{path_to_models}/{feats_set}/data.xlsx", index_label="IDs")

# Table view: missingno nullity matrix of the input features.
df_msno = df[feats_in].copy()
msno_mtx = msno.matrix(
    df=df_msno,
    label_rotation=90,
    color=mcolors.to_rgb(colors_feats_sets[feats_set]),
    figsize=(0.7 * len(feats_in), 5),
)
plt.xticks(ha='center')
plt.setp(msno_mtx.xaxis.get_majorticklabels(), ha="center")
msno_mtx.set_title(feats_set, fontsize='large')
msno_mtx.set_ylabel("IDs", fontsize='large')
plt.savefig(f"{path_to_models}/{feats_set}/msno.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path_to_models}/{feats_set}/msno.pdf", bbox_inches='tight')
# Close the figure instead of only clearing it: a cleared figure stays
# registered with pyplot and keeps its memory.
plt.close(msno_mtx.figure)

# Age histogram of the subjects kept in this dataset.
hist_bins = np.linspace(5, 115, 23)
sns.set_theme(style='ticks')
fig, ax = plt.subplots(figsize=(6, 3.5), layout='constrained')
hist_style = dict(bins=hist_bins, edgecolor='k', linewidth=1, color=colors_feats_sets[feats_set])
histplot = sns.histplot(data=df, x=feat_out, ax=ax, **hist_style)
histplot.set_xlim(0, 120)
histplot.set_ylabel('Количество')
histplot.set_xlabel('Возраст')
histplot.set_title(feats_set)
fig.savefig(f"{path_to_models}/{feats_set}/age_hist.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path_to_models}/{feats_set}/age_hist.pdf", bbox_inches='tight')
plt.close(fig)

# Pearson correlation of each input feature with the target, drawn as a
# single-column heatmap ordered by decreasing |rho|.
df_corr = pd.DataFrame(index=feats_in, columns=['rho'])
for f in tqdm(feats_in):
    df_tmp = df[[feat_out, f]].dropna(axis=0, how='any')
    if df_tmp.shape[0] <= 1:
        # Not enough paired observations; the row stays NaN and is dropped below.
        continue
    vals_1 = df_tmp[feat_out].values
    vals_2 = df_tmp[f].values
    df_corr.at[f, 'rho'], _ = stats.pearsonr(vals_1, vals_2)
df_corr.dropna(axis=0, how='any', inplace=True)
df_corr.insert(1, "abs(rho)", df_corr['rho'].abs())
df_corr.sort_values(["abs(rho)"], ascending=[False], inplace=True)
feats_cnt_wo_age = df_corr.index.to_list()
feats_cnt = [feat_out] + feats_cnt_wo_age
df_corr = df_corr.apply(pd.to_numeric)

# Figure size grows with the number of features and the longest feature name.
longest_name = df_corr.index.str.len().max()
fig_w = 0.8 + 0.038 * longest_name
fig_h = 0.9 + 0.4 * len(feats_cnt_wo_age) + 0.04 * longest_name
sns.set_theme(style='ticks')
fig, ax = plt.subplots(figsize=(fig_w, fig_h), layout='constrained')
heatmap = sns.heatmap(
    df_corr.loc[:, ['rho']],
    annot=True,
    fmt=".2f",
    vmin=-1.0,
    vmax=1.0,
    cmap='coolwarm',
    linewidth=0.1,
    linecolor='black',
    cbar_kws={},
    ax=ax
)
# The colorbar is the last axes of the figure: place it to the right of the
# heatmap, label it and outline it.
heatmap_pos = ax.get_position()
cbar_ax = ax.figure.axes[-1]
cbar_ax.set_position([heatmap_pos.x1 + 0.05, heatmap_pos.y0, 0.1, heatmap_pos.height])
cbar_ax.set_ylabel(r"Pearson $\rho$")
for spine in cbar_ax.spines.values():
    spine.set(visible=True, lw=0.25, edgecolor="black")
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title(feats_set, fontsize=16)
ax.set(xticklabels=[])
ax.set(xticks=[])
fig.savefig(f"{path_to_models}/{feats_set}/age_feats_pearsonr.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path_to_models}/{feats_set}/age_feats_pearsonr.pdf", bbox_inches='tight')
plt.close(fig)

# Input vs output distribution: one regression scatter plot per feature,
# in the order of ``df_corr`` (features with a valid correlation to the target).
feats_plot = df_corr.index.to_list()
# Size the grid from the features that are actually plotted, so that no
# fully blank rows are created when some inputs have no valid correlation.
n_rows = max(1, int(np.ceil(len(feats_plot) / n_cols)))
n_empty = n_rows * n_cols - len(feats_plot)
sns.set_theme(style='ticks')
fig, axs = plt.subplots(
    nrows=n_rows,
    ncols=n_cols,
    figsize=(n_cols * 3.0, n_rows * 2.5),
    gridspec_kw={'wspace': 0.10, 'hspace': 0.05},
    sharex=True,
    # Always return a 2-D array so single-row and multi-row grids share one code path.
    squeeze=False,
    layout='constrained'
)
axs_flat = axs.ravel()
for feat_ax, feat in zip(axs_flat, feats_plot):
    regplot = sns.regplot(
        data=df,
        x=feat_out,
        y=feat,
        color=colors_feats_sets[feats_set],
        scatter_kws=dict(
            linewidth=0.5,
            alpha=0.75,
            edgecolor="k",
            s=16,
        ),
        ax=feat_ax
    )
    feat_ax.set_title(fr"Pearson $\rho$: {df_corr.loc[feat, 'rho']:0.3f}")
    # Shrink the y-label font for long feature names (capped at 13 pt).
    y_label_fontsize = min(15 / (len(feat) / 20), 13)
    feat_ax.set_ylabel(feat, fontsize=y_label_fontsize)
    # sharex hides inner tick labels; show them on every panel.
    feat_ax.xaxis.set_tick_params(which='both', labelbottom=True)
# Hide every unused panel, including those of a single-row grid.
for empty_ax in axs_flat[len(feats_plot):]:
    empty_ax.axis('off')
fig.savefig(f"{path_to_models}/{feats_set}/age_feats_regplot.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path_to_models}/{feats_set}/age_feats_regplot.pdf", bbox_inches='tight')
plt.close(fig)

# Correlation heatmap: upper triangle holds Pearson rho, lower triangle
# holds -log10 of the Benjamini-Hochberg adjusted p-value, diagonal is NaN.
feats_cnt = [feat_out] + feats_in
df_corr = pd.DataFrame(data=np.zeros(shape=(len(feats_cnt), len(feats_cnt))), index=feats_cnt, columns=feats_cnt)
for f_1, f_2 in itertools.combinations(feats_cnt, 2):
    # f_1 precedes f_2 in feats_cnt: (f_1, f_2) is an upper-triangle cell,
    # (f_2, f_1) the matching lower-triangle cell.
    vals_1 = df.loc[:, f_1].values
    vals_2 = df.loc[:, f_2].values
    corr, pval = stats.pearsonr(vals_1, vals_2)
    df_corr.at[f_2, f_1] = pval
    df_corr.at[f_1, f_2] = corr
for f_diag in feats_cnt:
    df_corr.at[f_diag, f_diag] = np.nan

# Collect the raw p-values of the lower triangle in long format and adjust them.
selection = np.tri(df_corr.shape[0], df_corr.shape[1], -1, dtype=bool)
df_fdr = df_corr.where(selection).stack().reset_index()
df_fdr.columns = ['row', 'col', 'pval']
_, df_fdr['pval_fdr_bh'], _, _ = multipletests(df_fdr.loc[:, 'pval'].values, 0.05, method='fdr_bh')
# An adjusted p-value of exactly 0 would become inf after -log10; replace it
# with half of the smallest positive adjusted p-value. The result is
# assigned back because inplace replace on a column selection is chained
# assignment and may leave ``df_fdr`` unchanged.
nzmin = df_fdr['pval_fdr_bh'][df_fdr['pval_fdr_bh'].gt(0)].min() * 0.5
df_fdr['pval_fdr_bh'] = df_fdr['pval_fdr_bh'].replace({0.0: nzmin})
df_corr_fdr = df_corr.copy()
for row, col, pval_adj in zip(df_fdr['row'], df_fdr['col'], df_fdr['pval_fdr_bh']):
    df_corr_fdr.at[row, col] = -np.log10(pval_adj)
df_corr_fdr.to_excel(f"{path_to_models}/{feats_set}/feats_pearsonr.xlsx")

sns.set_theme(style='ticks')
fig, ax = plt.subplots(figsize=(8.5 + 0.35 * len(feats_cnt), 6.5 + 0.25 * len(feats_cnt)), layout='constrained')
annot_fontsize = 32 / np.sqrt(len(df_corr_fdr.values) + 8)

# First pass: correlations (lower triangle masked out).
cmap_triu = plt.get_cmap("seismic").copy()
mask_triu = np.tri(len(feats_cnt), len(feats_cnt), -1, dtype=bool)
heatmap_diff = sns.heatmap(
    df_corr_fdr,
    mask=mask_triu,
    annot=True,
    fmt=".2f",
    center=0.0,
    cmap=cmap_triu,
    linewidth=0.1,
    linecolor='black',
    annot_kws={"fontsize": annot_fontsize},
    ax=ax
)
# The colorbar just created is the last axes of the figure.
ax.figure.axes[-1].set_ylabel(r"Pearson $\rho$")
for spine in ax.figure.axes[-1].spines.values():
    spine.set(visible=True, lw=0.25, edgecolor="black")

# Second pass: adjusted p-values (upper triangle masked out); values below
# -log10(0.05) fall under vmin and are drawn in black.
cmap_tril = plt.get_cmap("viridis").copy()
cmap_tril.set_under('black')
mask_tril = np.tri(len(feats_cnt), len(feats_cnt), -1, dtype=bool).T
heatmap_pval = sns.heatmap(
    df_corr_fdr,
    mask=mask_tril,
    annot=True,
    fmt=".1f",
    vmin=-np.log10(0.05),
    cmap=cmap_tril,
    linewidth=0.1,
    linecolor='black',
    annot_kws={"fontsize": annot_fontsize},
    ax=ax
)
ax.figure.axes[-1].set_ylabel(r"$-\log_{10}(\mathrm{p-value})$")
for spine in ax.figure.axes[-1].spines.values():
    spine.set(visible=True, lw=0.25, edgecolor="black")
ax.set_xlabel('', fontsize=16)
ax.set_ylabel('', fontsize=16)
ax.set_title(feats_set, fontsize=16)
fig.savefig(f"{path_to_models}/{feats_set}/feats_pearsonr.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path_to_models}/{feats_set}/feats_pearsonr.pdf", bbox_inches='tight')
plt.close(fig)

# IQR outliers: flag, per feature, the values outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] and count the flags per record.
feats_cnt = [feat_out] + feats_in
out_columns = []
for f in tqdm(feats_cnt):
    q1 = df[f].quantile(0.25)
    q3 = df[f].quantile(0.75)
    iqr = q3 - q1
    col_outlier = f"{f} IQR Outlier"
    # Named ``is_inlier`` rather than ``filter`` so the builtin is not
    # shadowed for the rest of the session. Bounds are inclusive; values
    # that fail the comparison (including NaN) are flagged as outliers.
    is_inlier = df[f].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)
    df[col_outlier] = (~is_inlier).astype('int64')
    out_columns.append(col_outlier)
df["Number of IQR Outliers"] = df.loc[:, out_columns].sum(axis=1)

# Histogram of the number of outlying features per record (one bin per count).
hist_bins = np.linspace(-0.5, len(feats_cnt) + 0.5, len(feats_cnt) + 2)
fig = plt.figure(figsize=(5, 3))
sns.set_theme(style='ticks')
histplot = sns.histplot(
    data=df,
    x="Number of IQR Outliers",
    multiple="stack",
    bins=hist_bins,
    edgecolor='k',
    linewidth=1.0,
    color=colors_feats_sets[feats_set],
)
histplot.set(xlim=(-0.5, df['Number of IQR Outliers'].max() + 0.5))
histplot.set_title(feats_set)
histplot.set_xlabel("Количество IQR выбросов")
histplot.set_ylabel("Количество записей")
fig.savefig(f"{path_to_models}/{feats_set}/outs_iqr_hist.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path_to_models}/{feats_set}/outs_iqr_hist.pdf", bbox_inches='tight')
plt.close(fig)

# Re-encode the outlier flags for missingno: an outlier (1) becomes NaN, so
# missingno's "missing value" plots show outliers instead.
out_columns = [f"{f} IQR Outlier" for f in feats_cnt]
df_msno = df.loc[:, out_columns].copy()
df_msno = df_msno.replace({1: np.nan})
df_msno = df_msno.rename(columns=dict(zip(out_columns, feats_cnt)))

# Plot barplot for features with outliers
msno_bar = msno.bar(
    df=df_msno,
    label_rotation=90,
    color=colors_feats_sets[feats_set],
)
plt.xticks(ha='center')
plt.setp(msno_bar.xaxis.get_majorticklabels(), ha="center")
msno_bar.set_title(feats_set, fontsize='large')
msno_bar.set_ylabel("Записи без выбросов", fontsize='large')
plt.savefig(f"{path_to_models}/{feats_set}/outs_iqr_bar.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path_to_models}/{feats_set}/outs_iqr_bar.pdf", bbox_inches='tight')
# Close (not just clear) each figure so it does not stay open in pyplot.
plt.close(msno_bar.figure)

# Plot matrix of samples outliers distribution
msno_mtx = msno.matrix(
    df=df_msno,
    label_rotation=90,
    color=mcolors.to_rgb(colors_feats_sets[feats_set]),
)
plt.xticks(ha='center')
# Centre the tick labels of the matrix axes itself (the original targeted
# the axes of the already finished bar plot).
plt.setp(msno_mtx.xaxis.get_majorticklabels(), ha="center")
msno_mtx.set_title(feats_set, fontsize='large')
msno_mtx.set_ylabel("Записи", fontsize='large')
plt.savefig(f"{path_to_models}/{feats_set}/outs_iqr_matrix.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path_to_models}/{feats_set}/outs_iqr_matrix.pdf", bbox_inches='tight')
plt.close(msno_mtx.figure)

# Plot heatmap of features outliers correlations
msno_heatmap = msno.heatmap(
    df=df_msno,
    label_rotation=90,
    cmap="bwr",
    fontsize=12,
)
msno_heatmap.set_title(feats_set, fontsize='large')
plt.setp(msno_heatmap.xaxis.get_majorticklabels(), ha="center")
msno_heatmap.collections[0].colorbar.ax.tick_params(labelsize=20)
plt.savefig(f"{path_to_models}/{feats_set}/outs_iqr_heatmap.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path_to_models}/{feats_set}/outs_iqr_heatmap.pdf", bbox_inches='tight')
plt.close(msno_heatmap.figure)

# Dimensionality reduction: project target + input features to 2-D with
# several methods, cluster each projection with HDBSCAN, and plot the
# projections coloured by sex.
feats_cnt = [feat_out] + feats_in
dim_red_models = {
    't-SNE': TSNE(n_components=2),
    'PCA': PCA(n_components=2, whiten=False),
    'IsoMap': Isomap(n_components=2, n_neighbors=5),
    'MDS': MDS(n_components=2, metric=True),
    'GRP': GaussianRandomProjection(n_components=2, eps=0.5),
    'SRP': SparseRandomProjection(n_components=2, density='auto', eps=0.5, dense_output=False),
}
# The input matrix is the same for every model, so extract it once.
data_dim_red = df.loc[:, feats_cnt].values
# 5% of the records per cluster, but never below 2 (presumably the smallest
# value HDBSCAN accepts - verify against the installed scikit-learn),
# which matters for datasets with fewer than 40 records.
hdbscan_min_cluster_size = max(2, int(df.shape[0] * 0.05))
feats_dim_red = []
for drm, dim_red_model in dim_red_models.items():
    dim_red_res = dim_red_model.fit_transform(data_dim_red)
    df.loc[:, f"{drm} 1"] = dim_red_res[:, 0]
    df.loc[:, f"{drm} 2"] = dim_red_res[:, 1]
    df.loc[:, f"{drm} HDBSCAN"] = HDBSCAN(min_cluster_size=hdbscan_min_cluster_size).fit(df.loc[:, [f"{drm} 1", f"{drm} 2"]].values).labels_
    feats_dim_red += [f"{drm} 1", f"{drm} 2"]

# 2 x 3 grid: one panel per model in ``dim_red_models``.
n_rows = 2
n_cols = 3
fig_height = 10
fig_width = 15
sns.set_theme(style='ticks')
fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={}, sharey=False, sharex=False, layout='constrained')
for drm_id, drm in enumerate(dim_red_models):
    row_id, col_id = divmod(drm_id, n_cols)
    scatter = sns.scatterplot(
        data=df,
        x=f"{drm} 1",
        y=f"{drm} 2",
        hue='Sex',
        palette={'M': 'deepskyblue', 'F': 'hotpink'},
        linewidth=0.25,
        alpha=0.75,
        edgecolor="k",
        s=40,
        ax=axs[row_id, col_id],
    )
    axs[row_id, col_id].set_title(drm)
fig.suptitle(feats_set, fontsize='large')
fig.savefig(f"{path_to_models}/{feats_set}/dim_red.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path_to_models}/{feats_set}/dim_red.pdf", bbox_inches='tight')
# Persist the processed table, including outlier flags, embeddings and cluster labels.
df.to_excel(f"{path_to_models}/{feats_set}/df_proc.xlsx", index_label="ID")
plt.close(fig)


# Legacy datasets generation

## Generate datasets for each feature group

In [None]:
# Sub-directory of ``path`` that receives the per-feature-group outputs.
models_type = 'models_v2'

# Plot colour per feature group; the keys are also the groups iterated below
# and the sheet names read from nans.xlsx.
colors_feats_sets = {
    'Эхокардиография': 'darkcyan',
    'Сфигмография': 'mediumorchid',
    'Биохимический анализ крови': 'goldenrod',
    'Электрокардиография': 'dodgerblue',
    'Антропометрия': 'chartreuse',
    'Общий анализ крови': 'crimson',
}

for fs in tqdm(colors_feats_sets):
    Path(f"{path}/{models_type}/{fs}").mkdir(parents=True, exist_ok=True)
    ds_feats = pd.read_excel(f"{path}/nans.xlsx", sheet_name=fs, index_col=0)      
    feats_to_copy = ['feature_name', 'feature_name_ru', 'units', 'units_ru', 'analysis_type', 'description', 'data_type']
    ds_feats.loc[ds_feats.index, feats_to_copy] = df_params.loc[ds_feats.index, feats_to_copy]
    ds_feats.to_excel(f"{path}/{models_type}/{fs}/feats.xlsx", index_label="Features")
    ds_df = df[['Age'] + ds_feats.index.to_list() + ['Сердечно-сосудистый риск']]
    ds_df.dropna(axis=0, subset=['Age'] + ds_feats.index.to_list(), how='any', inplace=True)
    ds_df.to_excel(f"{path}/{models_type}/{fs}/data.xlsx", index_label="IDs")
    
    # Age histogramm
    hist_bins = np.linspace(5, 115, 23)
    sns.set_theme(style='ticks')
    fig, ax = plt.subplots(figsize=(6, 3.5), layout='constrained')
    histplot = sns.histplot(
        data=df,
        bins=hist_bins,
        edgecolor='k',
        linewidth=1,
        x="Age",
        color=colors_feats_sets[fs],
        ax=ax
    )
    histplot.set(xlim=(0, 120))
    histplot.set_ylabel('Количество')
    histplot.set_title(fs)
    plt.savefig(f"{path}/{models_type}/{fs}/age_hist.png", bbox_inches='tight', dpi=200)
    plt.savefig(f"{path}/{models_type}/{fs}/age_hist.pdf", bbox_inches='tight')
    plt.close(fig)
    
    # Input features and output feature correlations
    feats_cnt_wo_age = ds_feats.index[ds_feats['data_type'].isin(['decimal', 'integer'])].values
    df_corr = pd.DataFrame(index=feats_cnt_wo_age, columns=['rho'])
    for f in feats_cnt_wo_age:
        df_tmp = ds_df.loc[:, ['Age', f]].dropna(axis=0, how='any')
        if df_tmp.shape[0] > 1:
            vals_1 = df_tmp.loc[:, 'Age'].values
            vals_2 = df_tmp.loc[:, f].values
            df_corr.at[f, 'rho'], _ = stats.pearsonr(vals_1, vals_2)
    df_corr.dropna(axis=0, how='any', inplace=True)
    df_corr.insert(1, "abs(rho)", df_corr['rho'].abs())
    df_corr.sort_values(["abs(rho)"], ascending=[False], inplace=True)
    feats_cnt_wo_age = df_corr.index.to_list()
    feats_cnt = ['Age'] + feats_cnt_wo_age
    df_corr = df_corr.apply(pd.to_numeric)
    sns.set_theme(style='ticks')
    fig, ax = plt.subplots(figsize=(0.3 + 0.03 * df_corr.index.str.len().max(), 0.5 + 0.4 * len(feats_cnt_wo_age) + 0.04 * df_corr.index.str.len().max()) , layout='constrained')
    heatmap = sns.heatmap(
        df_corr.loc[:, ['rho']],
        annot=True,
        fmt=".2f",
        vmin=-1.0,
        vmax=1.0,
        cmap='coolwarm',
        linewidth=0.1,
        linecolor='black',
        #annot_kws={"fontsize": 15},
        cbar_kws={
            # "shrink": 0.9,
            # "aspect": 30,
            #'fraction': 0.046, 
            #'pad': 0.04,
        },
        ax=ax
    )
    heatmap_pos = ax.get_position()
    ax.figure.axes[-1].set_position([heatmap_pos.x1 + 0.05, heatmap_pos.y0, 0.1, heatmap_pos.height])
    ax.figure.axes[-1].set_ylabel(r"Pearson $\rho$")
    for spine in ax.figure.axes[-1].spines.values():
        spine.set(visible=True, lw=0.25, edgecolor="black")
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set(xticklabels=[])
    ax.set(xticks=[])
    plt.savefig(f"{path}/{models_type}/{fs}/age_feats_pearsonr.png", bbox_inches='tight', dpi=200)
    plt.savefig(f"{path}/{models_type}/{fs}/age_feats_pearsonr.pdf", bbox_inches='tight')
    plt.close(fig)
    
    # Correlation heatmap
    feats_cnt = ['Age'] + ds_feats.index[ds_feats['data_type'].isin(['decimal', 'integer'])].to_list()
    df_corr = pd.DataFrame(data=np.zeros(shape=(len(feats_cnt), len(feats_cnt))), index=feats_cnt, columns=feats_cnt)
    for f_id_1 in range(len(feats_cnt)):
        for f_id_2 in range(f_id_1, len(feats_cnt)):
            f_1 = feats_cnt[f_id_1]
            f_2 = feats_cnt[f_id_2]
            if f_id_1 != f_id_2:
                vals_1 = ds_df.loc[:, f_1].values
                vals_2 = ds_df.loc[:, f_2].values
                corr, pval = stats.pearsonr(vals_1, vals_2)
                df_corr.at[f_2, f_1] = pval
                df_corr.at[f_1, f_2] = corr
            else:
                df_corr.at[f_2, f_1] = np.nan
    selection = np.tri(df_corr.shape[0], df_corr.shape[1], -1, dtype=bool)
    df_fdr = df_corr.where(selection).stack().reset_index()
    df_fdr.columns = ['row', 'col', 'pval']
    _, df_fdr['pval_fdr_bh'], _, _ = multipletests(df_fdr.loc[:, 'pval'].values, 0.05, method='fdr_bh')
    nzmin = df_fdr['pval_fdr_bh'][df_fdr['pval_fdr_bh'].gt(0)].min(0) * 0.5
    df_fdr['pval_fdr_bh'].replace({0.0: nzmin}, inplace=True)
    df_corr_fdr = df_corr.copy()
    for line_id in range(df_fdr.shape[0]):
        df_corr_fdr.loc[df_fdr.at[line_id, 'row'], df_fdr.at[line_id, 'col']] = -np.log10(df_fdr.at[line_id, 'pval_fdr_bh'])
    df_corr_fdr.to_excel(f"{path}/{models_type}/{fs}/feats_pearsonr.xlsx")
    sns.set_theme(style='ticks')
    fig, ax = plt.subplots(figsize=(4.5 + 0.25 * len(feats_cnt), 2.5 + 0.25 * len(feats_cnt)))
    cmap_triu = plt.get_cmap("seismic").copy()
    mask_triu=np.tri(len(feats_cnt), len(feats_cnt), -1, dtype=bool)
    heatmap_diff = sns.heatmap(
        df_corr_fdr,
        mask=mask_triu,
        annot=True,
        fmt=".2f",
        center=0.0,
        cmap=cmap_triu,
        linewidth=0.1,
        linecolor='black',
        annot_kws={"fontsize": 25 / np.sqrt(len(df_corr_fdr.values))},
        ax=ax
    )
    ax.figure.axes[-1].set_ylabel(r"Pearson $\rho$", size=13)
    for spine in ax.figure.axes[-1].spines.values():
        spine.set(visible=True, lw=0.25, edgecolor="black")
        
    cmap_tril = plt.get_cmap("viridis").copy()
    cmap_tril.set_under('black')
    mask_tril=np.tri(len(feats_cnt), len(feats_cnt), -1, dtype=bool).T
    heatmap_pval = sns.heatmap(
        df_corr_fdr,
        mask=mask_tril,
        annot=True,
        fmt=".1f",
        vmin=-np.log10(0.05),
        cmap=cmap_tril,
        linewidth=0.1,
        linecolor='black',
        annot_kws={"fontsize": 25 / np.sqrt(len(df_corr_fdr.values))},
        ax=ax
    )
    ax.figure.axes[-1].set_ylabel(r"$-\log_{10}(\mathrm{p-value})$", size=13)
    for spine in ax.figure.axes[-1].spines.values():
        spine.set(visible=True, lw=0.25, edgecolor="black")
    ax.set_xlabel('', fontsize=16)
    ax.set_ylabel('', fontsize=16)
    ax.set_title('', fontsize=16)
    # ax.set_xticklabels(ax.get_xticklabels(), path_effects=[pe.withStroke(linewidth=0.5, foreground="black")])
    # for tick_label in ax.get_xticklabels():
        # tick_label.set_color(colors_tissues[tick_label.get_text()])
        # ax.set_xticklabels(ax.get_xticklabels(), path_effects=[pe.withStroke(linewidth=0.5, foreground="black")])
    # for tick_label in ax.get_yticklabels():
        # tick_label.set_color(colors_tissues[tick_label.get_text()])
        # ax.set_yticklabels(ax.get_yticklabels(), path_effects=[pe.withStroke(linewidth=0.5, foreground="black")])
    plt.savefig(f"{path}/{models_type}/{fs}/feats_pearsonr.png", bbox_inches='tight', dpi=200)
    plt.savefig(f"{path}/{models_type}/{fs}/feats_pearsonr.pdf", bbox_inches='tight')
    plt.close(fig)
    
    df_proc = ds_df.copy()

    # IQR outliers
    out_columns = []
    for f in tqdm(feats_cnt):
        q1 = df_proc[f].quantile(0.25)
        q3 = df_proc[f].quantile(0.75)
        iqr = q3 - q1
        df_proc[f"{f} IQR Outlier"] = 1
        out_columns.append(f"{f} IQR Outlier")
        filter = (df_proc[f] >= q1 - 1.5 * iqr) & (df_proc[f] <= q3 + 1.5 * iqr)
        df_proc.loc[filter, f"{f} IQR Outlier"] = 0
    df_proc[f"Number of IQR Outliers"] = df_proc.loc[:, out_columns].sum(axis=1)

    # Histogram of the per-record outlier count. The bin edges sit on the
    # half-integers, giving one unit-wide bin per count 0..len(feats_cnt).
    n_feats = len(feats_cnt)
    hist_bins = np.linspace(-0.5, n_feats + 0.5, n_feats + 2)
    fig = plt.figure(figsize=(5, 3))
    sns.set_theme(style='ticks')
    histplot = sns.histplot(
        data=df_proc,
        x="Number of IQR Outliers",
        bins=hist_bins,
        multiple="stack",
        color=colors_feats_sets[fs],
        edgecolor='k',
        linewidth=1.0,
    )
    # Crop the x axis just past the largest count that actually occurs.
    x_upper = max(df_proc['Number of IQR Outliers']) + 0.5
    histplot.set_xlim(-0.5, x_upper)
    histplot.set(
        title=fs,
        xlabel="Количество IQR выбросов",
        ylabel="Количество записей",
    )
    fig.savefig(f"{path}/{models_type}/{fs}/outs_iqr_hist.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{path}/{models_type}/{fs}/outs_iqr_hist.pdf", bbox_inches='tight')
    plt.close(fig)

    # Re-express the outlier flags as missing values (1 -> NaN) so the
    # missingno plots can be reused to visualise outliers. Columns are renamed
    # back to the plain feature names for readable axis labels.
    out_columns = [f"{f} IQR Outlier" for f in feats_cnt]
    df_msno = df_proc.loc[:, out_columns].copy()
    df_msno.replace({1: np.nan}, inplace=True)
    df_msno.rename(columns=dict(zip(out_columns, feats_cnt)), inplace=True)

    # Plot barplot for features with outliers
    # (presumably each bar counts the non-NaN entries, i.e. records without
    # an outlier in that feature, which is what the y label says)
    msno_bar = msno.bar(
        df=df_msno,
        label_rotation=90,
        color=colors_feats_sets[fs],
        # figsize=(0.4 * len(feats_cnt), 4),
    )
    plt.xticks(ha='center')
    plt.setp(msno_bar.xaxis.get_majorticklabels(), ha="center")
    msno_bar.set_title(fs, fontsize='large')
    msno_bar.set_ylabel("Записи без выбросов", fontsize='large')
    plt.savefig(f"{path}/{models_type}/{fs}/outs_iqr_bar.png", bbox_inches='tight', dpi=200)
    plt.savefig(f"{path}/{models_type}/{fs}/outs_iqr_bar.pdf", bbox_inches='tight')
    # Close (not just clear) the current figure: pyplot keeps every figure
    # alive until it is closed, so clf() would leak one figure per iteration.
    plt.close()

    # Plot matrix of samples outliers distribution
    # (df_msno holds NaN where a record was flagged as an IQR outlier)
    msno_mtx = msno.matrix(
        df=df_msno,
        label_rotation=90,
        color=mcolors.to_rgb(colors_feats_sets[fs]),
        # figsize=(0.7 * len(feats_cnt), 5),
    )
    plt.xticks(ha='center')
    # Centre the tick labels of the matrix axes itself; the original code
    # mistakenly re-styled the axes of the previous bar plot here.
    plt.setp(msno_mtx.xaxis.get_majorticklabels(), ha="center")
    msno_mtx.set_title(fs, fontsize='large')
    msno_mtx.set_ylabel("Записи", fontsize='large')
    plt.savefig(f"{path}/{models_type}/{fs}/outs_iqr_matrix.png", bbox_inches='tight', dpi=200)
    plt.savefig(f"{path}/{models_type}/{fs}/outs_iqr_matrix.pdf", bbox_inches='tight')
    # Close (not just clear) the current figure: pyplot keeps every figure
    # alive until it is closed, so clf() would leak one figure per iteration.
    plt.close()

    # Plot heatmap of features outliers correlations
    # (df_msno holds NaN where a record was flagged as an IQR outlier)
    msno_heatmap = msno.heatmap(
        df=df_msno,
        label_rotation=90,
        cmap="bwr",
        fontsize=12,
        # figsize=(0.6 * len(feats_cnt), 0.6 * len(feats_cnt))
    )
    msno_heatmap.set_title(fs, fontsize='large')
    plt.setp(msno_heatmap.xaxis.get_majorticklabels(), ha="center")
    # Enlarge the tick labels of the colorbar attached to the heatmap mesh.
    msno_heatmap.collections[0].colorbar.ax.tick_params(labelsize=20)
    plt.savefig(f"{path}/{models_type}/{fs}/outs_iqr_heatmap.png", bbox_inches='tight', dpi=200)
    plt.savefig(f"{path}/{models_type}/{fs}/outs_iqr_heatmap.pdf", bbox_inches='tight')
    # Close (not just clear) the current figure: pyplot keeps every figure
    # alive until it is closed, so clf() would leak one figure per iteration.
    plt.close()
        
    # Dimensionality reduction: project the continuous features to 2D with
    # several methods, store the coordinates in df_proc and draw one scatter
    # panel per method.
    dim_red_models = {
        't-SNE': TSNE(n_components=2),
        'PCA': PCA(n_components=2, whiten=False),
        'IsoMap': Isomap(n_components=2, n_neighbors=5),
        'MDS': MDS(n_components=2, metric=True),
        'GRP': GaussianRandomProjection(n_components=2, eps=0.5),
        'SRP': SparseRandomProjection(n_components=2, density='auto', eps=0.5, dense_output=False),
    }
    feats_dim_red = []
    for drm, reducer in dim_red_models.items():
        embedding = reducer.fit_transform(df_proc.loc[:, feats_cnt].values)
        x_col, y_col = f"{drm} 1", f"{drm} 2"
        df_proc.loc[:, x_col] = embedding[:, 0]
        df_proc.loc[:, y_col] = embedding[:, 1]
        feats_dim_red.extend([x_col, y_col])

    # 2 x 3 grid: one panel per model, filled row by row.
    n_rows, n_cols = 2, 3
    fig_width, fig_height = 15, 10
    sns.set_theme(style='ticks')
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), layout='constrained')
    for panel, drm in zip(axs.flat, dim_red_models):
        sns.scatterplot(
            data=df_proc,
            x=f"{drm} 1",
            y=f"{drm} 2",
            color=colors_feats_sets[fs],
            s=40,
            alpha=0.75,
            edgecolor="k",
            linewidth=0.25,
            ax=panel,
        )
        panel.set_title(drm)
    fig.suptitle(fs, fontsize='large')
    fig.savefig(f"{path}/{models_type}/{fs}/dim_red.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{path}/{models_type}/{fs}/dim_red.pdf", bbox_inches='tight')
    # Persist the processed table (outlier flags + embeddings) next to the plots.
    df_proc.to_excel(f"{path}/{models_type}/{fs}/df_proc.xlsx", index_label="ID")
    plt.close(fig)
    

## Generate datasets with selected features

In [None]:
# Feature sets to export. Only 'MaxSamplesMaxFeatures' is active; the other
# known sets are kept here, switched off:
#   'Best Correlation', 'Arterial Stiffness'
feats_sets = ['MaxSamplesMaxFeatures']

# 'Info' sheet of the subjects workbook, indexed by its first column.
df_info = pd.read_excel(
    f"{path}/Испытуемые Яндекс.xlsx",
    sheet_name='Info',
    index_col=0,
)

for fs in feats_sets:
    # Refresh the feature table of this set with the metadata columns from
    # df_params and write it back to the same file.
    dataset_feats = pd.read_excel(f"{path}/{fs}/feats.xlsx", index_col=0)
    feats_to_copy = ['units', 'units_ru', 'analysis_type', 'description', 'data_type']
    dataset_feats.loc[dataset_feats.index, feats_to_copy] = df_params.loc[dataset_feats.index, feats_to_copy]
    dataset_feats.to_excel(f"{path}/{fs}/feats.xlsx", index_label="Features")
    feats = dataset_feats.index.to_list()

    # Keep only complete records for 'Age' + selected features. An explicit,
    # independent copy is built so the column assignment below modifies ds_df
    # itself instead of a (warned-about) view/copy of df.
    ds_df = df[['Age'] + feats].dropna(axis=0, how='any').copy()

    # Attach the 'РИСК' value for every record that has one in df_info.
    indexes_info = ds_df.index.intersection(df_info.index[df_info['РИСК'].notna()])
    ds_df.loc[indexes_info, 'РИСК'] = df_info.loc[indexes_info, 'РИСК']
    ds_df.to_excel(f"{path}/{fs}/data.xlsx", index_label="IDs")
    
    # Continuous features of this set: 'Age' plus every selected feature whose
    # data_type in df_params is 'decimal' or 'integer'.
    feats_cnt = ['Age'] + df_params.index[df_params['data_type'].isin(['decimal', 'integer'])].intersection(feats).to_list()

    # Pairwise Pearson matrix with a split layout:
    #   upper triangle -> correlation coefficient,
    #   lower triangle -> p-value,
    #   diagonal       -> NaN.
    df_corr = pd.DataFrame(data=np.zeros(shape=(len(feats_cnt), len(feats_cnt))), index=feats_cnt, columns=feats_cnt)
    for f in feats_cnt:
        df_corr.at[f, f] = np.nan
    # combinations() yields (f_1, f_2) with f_1 preceding f_2 in feats_cnt,
    # so [f_1, f_2] addresses the upper and [f_2, f_1] the lower triangle.
    for f_1, f_2 in itertools.combinations(feats_cnt, 2):
        corr, pval = stats.pearsonr(ds_df.loc[:, f_1].values, ds_df.loc[:, f_2].values)
        df_corr.at[f_2, f_1] = pval
        df_corr.at[f_1, f_2] = corr

    # Collect the lower-triangle p-values in long form. The explicit dropna()
    # removes the masked cells regardless of whether stack() itself drops NaN.
    selection = np.tri(df_corr.shape[0], df_corr.shape[1], -1, dtype=bool)
    df_fdr = df_corr.where(selection).stack().dropna().reset_index()
    df_fdr.columns = ['row', 'col', 'pval']

    # Benjamini-Hochberg correction at alpha = 0.05 (second return value holds
    # the corrected p-values).
    df_fdr['pval_fdr_bh'] = multipletests(df_fdr.loc[:, 'pval'].values, 0.05, method='fdr_bh')[1]

    # Replace exact zeros by half of the smallest positive corrected p-value so
    # that -log10 stays finite. Assigning the result back (instead of a chained
    # in-place replace on the column) guarantees df_fdr is really updated.
    nzmin = df_fdr['pval_fdr_bh'][df_fdr['pval_fdr_bh'].gt(0)].min(0) * 0.5
    df_fdr['pval_fdr_bh'] = df_fdr['pval_fdr_bh'].replace({0.0: nzmin})

    # Same layout as df_corr, but the lower triangle now holds
    # -log10(FDR-corrected p-value).
    df_corr_fdr = df_corr.copy()
    for row_label, col_label, pval_fdr in zip(df_fdr['row'], df_fdr['col'], df_fdr['pval_fdr_bh']):
        df_corr_fdr.loc[row_label, col_label] = -np.log10(pval_fdr)
    df_corr_fdr.to_excel(f"{path}/{fs}/feats_pearsonr.xlsx")
        
    # Combined heatmap drawn in two passes on the same axes: correlations in
    # the upper triangle, -log10(p-value) in the lower one. Each pass adds its
    # own colorbar, which is then the last axes of the figure.
    sns.set_theme(style='ticks')
    n_feats = len(feats_cnt)
    fig, ax = plt.subplots(figsize=(4.5 + 0.25 * n_feats, 2.5 + 0.25 * n_feats))

    # True strictly below the diagonal; seaborn hides cells where the mask is True.
    below_diag = np.tri(n_feats, n_feats, -1, dtype=bool)

    corr_cmap = plt.get_cmap("seismic").copy()
    pval_cmap = plt.get_cmap("viridis").copy()
    # Values below vmin (i.e. p-value above 0.05) are painted black.
    pval_cmap.set_under('black')

    passes = [
        (
            r"Pearson $\rho$",
            dict(mask=below_diag, fmt=".2f", center=0.0, cmap=corr_cmap),
        ),
        (
            r"$-\log_{10}(\mathrm{p-value})$",
            dict(mask=below_diag.T, fmt=".1f", vmin=-np.log10(0.05), cmap=pval_cmap),
        ),
    ]
    for cbar_label, pass_kws in passes:
        sns.heatmap(
            df_corr_fdr,
            annot=True,
            linewidth=0.1,
            linecolor='black',
            annot_kws={"fontsize": 25 / np.sqrt(len(df_corr_fdr.values))},
            ax=ax,
            **pass_kws,
        )
        # Label and outline the colorbar that this pass has just appended.
        cbar_ax = ax.figure.axes[-1]
        cbar_ax.set_ylabel(cbar_label, size=13)
        for spine in cbar_ax.spines.values():
            spine.set(visible=True, lw=0.25, edgecolor="black")

    # Blank out the axis labels and the title.
    ax.set_xlabel('', fontsize=16)
    ax.set_ylabel('', fontsize=16)
    ax.set_title('', fontsize=16)
    fig.savefig(f"{path}/{fs}/feats_pearsonr.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{path}/{fs}/feats_pearsonr.pdf", bbox_inches='tight')
    plt.close(fig)