# Debugging autoreload

In [None]:
%load_ext autoreload
%autoreload 2

# Load packages

In [1]:
import pickle
from scipy import stats
from glob import glob
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import itertools
import numpy as np
import pandas as pd
import warnings
import pathlib
from tqdm import tqdm
import distinctipy
import matplotlib.patheffects as pe
import matplotlib.colors as mcolors
from statannotations.Annotator import Annotator
from scipy.stats import mannwhitneyu
import optuna
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.cm
import matplotlib as mpl
from statsmodels.stats.multitest import multipletests
import re
from itertools import chain
from pathlib import Path
from pypdf import PdfReader
import os
from matplotlib_venn import venn3, venn3_circles
import upsetplot
from sklearn.decomposition import PCA
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.manifold import MDS, Isomap, TSNE
import missingno as msno
from sklearn.cluster import DBSCAN, HDBSCAN
import glob
import os


def list_files_with_extension_glob(directory, extension):
    """
    Lists all files with a given extension using the glob module.

    Args:
        directory (str): The root directory to start searching from.
        extension (str): The file extension (e.g., '.py', 'py').

    Returns:
        list: A list of strings for the matching file paths.
    """
    # Ensure the extension starts with a dot
    if not extension.startswith('.'):
        extension = f'.{extension}'

    # The pattern '**/*{extension}' matches files recursively
    search_pattern = os.path.join(directory, f'**/*{extension}')
    # The recursive=True flag enables the '**' wildcard for subdirectories
    files = glob.glob(search_pattern, recursive=True)
    return files

# Load data from files

In [None]:
# Path to main directory
path = 'E:/YandexDisk/Work/bbd/millennium'

# Get all possible files with analysis
files = list_files_with_extension_glob(f"{path}/Результаты анализов/files", 'pdf') + \
    list_files_with_extension_glob(f"{path}/Результаты анализов/files_test", 'pdf') + \
    list_files_with_extension_glob(f"{path}/Половые гормоны", 'pdf') + \
    list_files_with_extension_glob(f"{path}/Результаты чекап", 'pdf')
display(len(files))

# Load feats
feats = pd.read_excel(f"{path}/Результаты анализов/features.xlsx")
if feats['prefix'].is_unique:
    feats.set_index('prefix', inplace=True)
else:
    raise ValueError(f"Features' prefixes are not unique!")
feats_same_line = feats.loc[feats['line'] == 0, :]
feats_diff_line = feats.loc[feats['line'] != 0, :]
feats_same_line_dict = dict(zip(feats_same_line.index.values, feats_same_line['feature'].values))
feats_diff_line_dict = dict(zip(feats_diff_line.index.values, feats_diff_line['feature'].values))

In [None]:
df = pd.DataFrame(columns=['Sample ID', 'Pages', 'File', '№ направления', 'ФИО', 'Дата рождения', 'Дата обследования', 'Пол'] + list(feats['feature'].unique()))

missed_pages = {}
missed_lines = []

with open(f"{path}/Результаты анализов/files/skip_starts.txt") as f:
    skip_starts = tuple(list(set(f.read().splitlines())))
    
for file in files:
    print(file)
    
    missed_pages[file] = []
    
    reader = PdfReader(file)
    
    for page_id, page in tqdm(enumerate(reader.pages)):
        lines = page.extract_text().splitlines()
        
        if lines[0] == 'Фамилия:':
            
            line_sex = 7
            line_birth = 4
            if lines[1] == 'Дата рождения:ЛПУ:':
                line_sex = 6
                line_birth = 3
            
            if lines[10].startswith('Дата:'):
                line_date = 10
                line_name = 12
            elif lines[9].startswith('Дата:'):
                line_date = 9
                line_name = 11
            else:
                raise ValueError(f"Wrong ID parsing: {file} {page_id}")
                
            sample_name = lines[line_name].capitalize() + ' ' + re.findall(r"(.*)Имя", lines[line_name - 1])[0]
            sample_date = re.findall(r"Дата: (.*)", lines[line_date])[0]
            sample_number = lines[3]
            
            sample_id = f"{sample_name} {sample_date} {sample_number}"
            
            if sample_id in df.index:
                df.at[sample_id, 'Pages'] += f' {page_id}'
            else:
                df.at[sample_id, 'Pages'] = f'{page_id}'

            df.at[sample_id, 'File'] = file
            df.at[sample_id, '№ направления'] = sample_number
            df.at[sample_id, 'ФИО'] = sample_name
            df.at[sample_id, 'Дата рождения'] = lines[line_birth].replace('17.03.7979', '17.03.1979')
            df.at[sample_id, 'Дата обследования'] = sample_date
            df.at[sample_id, 'Пол'] = re.findall(r"Пол: (.+)", lines[line_sex])[0][0]

            for line_id, line in enumerate(lines):
                line = line.replace("не обнаружено", "0.0")
                line = line.replace("\u2009", "")
                line = line.replace("в 1 мл", "в мл")
                if line in feats_diff_line_dict:
                    target_line = lines[line_id + feats_diff_line.at[line, 'line']]
                    line_parse = re.findall(fr"([-+]?(?:\d+\.\d+|\d+|\.\d+)).*", target_line)
                    df.at[sample_id, feats_diff_line_dict[line]] = line_parse[0]
                else:
                    line = line.replace(" - ", "-")
                    # line_parse_w_units = re.findall(fr"(.*\S)\s+([-+]?(?:\d+\.\d+|\d+|\.\d+)) (.*)", line)
                    # line_parse_wo_units = re.findall(fr"(.*\S)\s+([-+]?(?:\d+\.\d+|\d+|\.\d+)) ", line)
                    line_parse_w_units = re.findall(fr"(.*)\s([-+]?(?:\d+\.\d+|\d+|\.\d+)) (.*)", line)
                    line_parse_wo_units = re.findall(fr"(.*)\s([-+]?(?:\d+\.\d+|\d+|\.\d+)) ", line)
                    if len(line_parse_w_units) > 0:
                        if line_parse_w_units[0][0] in feats_same_line_dict:
                            feat_unit = feats_same_line.at[line_parse_w_units[0][0], 'unit']
                            if not pd.isna(feat_unit):
                                if feat_unit in line_parse_w_units[0][2] or feat_unit.replace('МЕ/', 'Ед/') in line_parse_w_units[0][2]:
                                    df.at[sample_id, feats_same_line_dict[line_parse_w_units[0][0]]] = line_parse_w_units[0][1]
                                else:
                                    print(f"{line} ({file} {page_id} {line_id})")
                            else:
                                df.at[sample_id, feats_same_line_dict[line_parse_w_units[0][0]]] = line_parse_w_units[0][1]
                        else:
                            if not line.startswith(skip_starts):
                                missed_lines.append(f"{line} ({file} {page_id} {line_id})")
                    elif len(line_parse_wo_units) > 0:
                        print(f"hello: {line_parse_wo_units}")
                        df.at[sample_id, feats_same_line_dict[line_parse_wo_units[0][0]]] = line_parse_wo_units[0][1]
        else:
            missed_pages[file].append(page_id)

In [None]:
df = df.apply(pd.to_numeric, errors='ignore')
df['Дата рождения'] = pd.to_datetime(df['Дата рождения'], format="%d.%m.%Y").dt.date
df['Дата обследования'] = pd.to_datetime(df['Дата обследования'], format="%d.%m.%Y").dt.date
df.insert(7, 'Возраст', (df['Дата обследования'] - df['Дата рождения']) / np.timedelta64(1, 'D') / 365.25)
df.to_excel(f"{path}/paper_sex_hormones/data.xlsx")

# Initial features precessing

In [2]:
path = 'E:/YandexDisk/Work/bbd/millennium'

df = pd.read_excel(f"{path}/paper_sex_hormones/data_all.xlsx", index_col=0)
feats = pd.read_excel(f"{path}/paper_sex_hormones/features_all.xlsx", index_col='feature')
feats = feats[~feats.index.duplicated(keep='first')]

nan_feats = df[feats.index.to_list()].isna().sum(axis=0).to_frame(name="Number of NaNs")
feats.loc[feats.index, "Number of NaNs"] = nan_feats.loc[feats.index, "Number of NaNs"]
feats["% of NaNs"] = nan_feats["Number of NaNs"] / df.shape[0] * 100
feats["Number of not-NaNs"] = df[feats.index.to_list()].notna().sum(axis=0)
feats.sort_values(["% of NaNs"], ascending=[True], inplace=True)

feats[r"Pearson $\rho$"] = 0.0
for f in feats.index:
    df_tmp = df.loc[:, ['Возраст', f]].dropna(axis=0, how='any')
    if df_tmp.shape[0] > 1:
        if df_tmp[f].nunique() > 1:
            vals_1 = df_tmp.loc[:, 'Возраст'].values
            vals_2 = df_tmp.loc[:, f].values
            rho, _ = stats.pearsonr(vals_1, vals_2)
            feats.at[f, r"Pearson $\rho$"] = rho
        else:
            feats.at[f, r"Pearson $\rho$"] = 0.0

feats.to_excel(f"{path}/paper_sex_hormones/feats_all_with_metrics.xlsx", index_label="Features")

df_fig = feats.loc[feats['Number of not-NaNs'] >= 200, :]
df_fig['Features'] = df_fig.index
f_cmap = sns.color_palette("coolwarm", as_cmap=True)
# f_norm = mcolors.Normalize(vmin=min(df_fig[r"Pearson $\rho$"]), vmax=max(df_fig[r"Pearson $\rho$"])) 
f_norm = mcolors.TwoSlopeNorm(vcenter=0.0, vmin=min(df_fig[r"Pearson $\rho$"]), vmax=max(df_fig[r"Pearson $\rho$"]))
f_colors = {}
for cval in df_fig[r"Pearson $\rho$"]:
    f_colors.update({cval: f_cmap(f_norm(cval))})
    
sns.set_theme(style='ticks')
fig, ax = plt.subplots(figsize=(4, 30), layout='constrained')
barplot = sns.barplot(
    data=df_fig,
    x='Number of not-NaNs',
    y='Features',
    hue=r"Pearson $\rho$",
    edgecolor='black',
    palette=f_colors,
    dodge=False,
    ax=ax
)
for container in barplot.containers:
    barplot.bar_label(container, label_type='edge', color='gray', fmt='%d', fontsize=8, padding=4.0)
ax.set_ylabel('')
ax.set(yticklabels=df_fig.index.to_list())
ax.get_legend().remove()
ax.xaxis.tick_top()
ax.xaxis.set_label_position('top')
ax.set_xlabel('Количество записей not-NaN')
sm = plt.cm.ScalarMappable(cmap=f_cmap, norm=f_norm)
sm.set_array([])
cbar = barplot.figure.colorbar(sm, orientation="horizontal")
cbar.set_label("Корреляция с возрастом")
plt.savefig(f"{path}/paper_sex_hormones/feats_nans_and_age_correlation.pdf", bbox_inches='tight')
plt.savefig(f"{path}/paper_sex_hormones/feats_nans_and_age_correlation.png", bbox_inches='tight', dpi=200)
plt.close(fig)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fig['Features'] = df_fig.index
  ax.set(yticklabels=df_fig.index.to_list())
  cbar = barplot.figure.colorbar(sm, orientation="horizontal")
  plt.savefig(f"{path}/paper_sex_hormones/feats_nans_and_age_correlation.pdf", bbox_inches='tight')
  plt.savefig(f"{path}/paper_sex_hormones/feats_nans_and_age_correlation.pdf", bbox_inches='tight')
  plt.savefig(f"{path}/paper_sex_hormones/feats_nans_and_age_correlation.png", bbox_inches='tight', dpi=200)


# Select features

In [68]:
path = 'E:/YandexDisk/Work/bbd/millennium'

df = pd.read_excel(f"{path}/paper_sex_hormones/data_m_cleared.xlsx", index_col=0)

feats = pd.read_excel(f"{path}/paper_sex_hormones/feats_all_with_metrics.xlsx", index_col=0)
feats_used = pd.read_excel(f"{path}/paper_sex_hormones/feats_used.xlsx", index_col=0)
feats_unused = feats.drop(feats_used.index.values, axis=0)

feats_cands = [
    # 'Фолликулостимулирующий гормон (ФСГ), мМЕ/мл',
    # 'Лютеинизирующий гормон (ЛГ), мМЕ/мл',
    # 'Пролактин, мМЕ/л',
    'Тестостерон общий, нмоль/л',
    'Тестостерон свободный, нмоль/л',
    'Индекс свободных андрогенов, %',
    'ГСПГ (глобулин, связывающий половые гормоны), нмоль/л',
    # 'Эстрадиол (Е2), пмоль/л'
    'Простат-специфический антиген (ПСА) общий, нг/мл',
    'Простата-специфический антиген (ПСА) свободный, нг/мл',
    '% свободного ПСА',
]

feats.loc[feats_cands, :].to_excel(f"{path}/paper_sex_hormones/feats_new.xlsx")
feats_to_add = feats_unused.drop(feats_cands, axis=0).index.values
dict_n_rows = {}
for f in feats_to_add:
    dict_n_rows[f] = df.dropna(subset=feats_cands+[f]).shape[0]
df_n_rows = pd.Series(dict_n_rows, name='Count').to_frame()
df_n_rows.sort_values(["Count"], ascending=[False], inplace=True)
print(df.dropna(subset=feats_cands).shape[0])
print(df_n_rows.head(20))

165
                                                    Count
Ферритин, мкг/л                                       164
Эритроциты, 10*12/л                                   163
Нейтрофилы, %                                         163
Коэффициент атерогенности                             163
Холестерин липопротеидов низкой плотности (ЛПНП...    163
Триглицериды, ммоль/л                                 163
Тромбоциты, 10*9/л                                    163
Холестерин общий, ммоль/л                             163
25-OH витамин D, суммарный (кальциферол), нг/мл       163
Средний объем тромбоцитов (MPV), фл                   163
Тромбокрит (PCT), %                                   163
Относит.ширина распред.тромбоцитов по объему (P...    163
Эозинофилы, %                                         163
Нейтрофилы, 10*9/л                                    163
Эозинофилы, 10*9/л                                    163
Общий белок, г/л                                      163
Базофилы, 

# Generate datasets

In [69]:
path = 'E:/YandexDisk/Work/bbd/millennium'

color = 'dodgerblue'
title = 'M'
n_cols = 3

df = pd.read_excel(f"{path}/paper_sex_hormones/data_m_cleared.xlsx", index_col=0)

df_feats = pd.read_excel(f"{path}/paper_sex_hormones/feats_new.xlsx", index_col=0)
feats_in = df_feats.index.to_list()
feats_out = ['Возраст']
feats_add = ['Pages', 'File', '№ направления', 'ФИО', 'Дата рождения', 'Дата обследования', 'Пол']
feats_all = feats_add + feats_out + feats_in

df = df[feats_all]

df_msno = df[feats_all].copy()
# df_msno.sort_values([feats_in[0]], ascending=[False], inplace=True)
msno_mtx = msno.matrix(
    df=df_msno,
    label_rotation=90,
    color=mcolors.to_rgb(color),
    figsize=(0.7 * len(feats_in), 5),
)
plt.xticks(ha='center')
plt.setp(msno_mtx.xaxis.get_majorticklabels(), ha="center")
msno_mtx.set_title(title, fontsize='large')
msno_mtx.set_ylabel("IDs", fontsize='large')
plt.savefig(f"{path}/paper_sex_hormones/{title}/msno.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path}/paper_sex_hormones/{title}/msno.pdf", bbox_inches='tight')
plt.clf()
df.dropna(axis=0, how='any', inplace=True)
df.to_excel(f"{path}/paper_sex_hormones/{title}/data.xlsx", index_label="ID")
print(df.shape)

# Age histogramm
hist_bins = np.linspace(5, 115, 23)
sns.set_theme(style='ticks')
fig, ax = plt.subplots(figsize=(6, 3.5), layout='constrained')
histplot = sns.histplot(
    data=df,
    bins=hist_bins,
    edgecolor='k',
    linewidth=1,
    x="Возраст",
    color=color,
    ax=ax
)
histplot.set(xlim=(0, 120))
histplot.set_ylabel('Количество')
histplot.set_title(title)
plt.savefig(f"{path}/paper_sex_hormones/{title}/age_hist.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path}/paper_sex_hormones/{title}/age_hist.pdf", bbox_inches='tight')
plt.close(fig)

# Input features and output feature correlations
df_corr = pd.DataFrame(index=feats_in, columns=['rho'])
for f in tqdm(feats_in):
    df_tmp = df.loc[:, ['Возраст', f]].dropna(axis=0, how='any')
    if df_tmp.shape[0] > 1:
        vals_1 = df_tmp.loc[:, 'Возраст'].values
        vals_2 = df_tmp.loc[:, f].values
        df_corr.at[f, 'rho'], _ = stats.pearsonr(vals_1, vals_2)
df_corr.to_excel(f"{path}/paper_sex_hormones/{title}/feats.xlsx")
df_corr.dropna(axis=0, how='any', inplace=True)
df_corr.insert(1, "abs(rho)", df_corr['rho'].abs())
df_corr.sort_values(["abs(rho)"], ascending=[False], inplace=True)
feats_cnt_wo_age = df_corr.index.to_list()
feats_cnt = ['Возраст'] + feats_cnt_wo_age
df_corr = df_corr.apply(pd.to_numeric)
sns.set_theme(style='ticks')
fig, ax = plt.subplots(figsize=(0.8 + 0.038 * df_corr.index.str.len().max(), 0.9 + 0.4 * len(feats_cnt_wo_age) + 0.04 * df_corr.index.str.len().max()) , layout='constrained')
heatmap = sns.heatmap(
    df_corr.loc[:, ['rho']],
    annot=True,
    fmt=".2f",
    vmin=-1.0,
    vmax=1.0,
    cmap='coolwarm',
    linewidth=0.1,
    linecolor='black',
    #annot_kws={"fontsize": 15},
    cbar_kws={
        # "shrink": 0.9,
        # "aspect": 30,
        #'fraction': 0.046, 
        #'pad': 0.04,
    },
    ax=ax
)
heatmap_pos = ax.get_position()
ax.figure.axes[-1].set_position([heatmap_pos.x1 + 0.05, heatmap_pos.y0, 0.1, heatmap_pos.height])
ax.figure.axes[-1].set_ylabel(r"Pearson $\rho$")
for spine in ax.figure.axes[-1].spines.values():
    spine.set(visible=True, lw=0.25, edgecolor="black")
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title(title, fontsize=16)
ax.set(xticklabels=[])
ax.set(xticks=[])
plt.savefig(f"{path}/paper_sex_hormones/{title}/age_feats_pearsonr.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path}/paper_sex_hormones/{title}/age_feats_pearsonr.pdf", bbox_inches='tight')
plt.close(fig)

n_rows = int(np.ceil(len(feats_in) / n_cols))
n_empty = n_rows * n_cols - len(feats_in)
sns.set_theme(style='ticks')
fig, axs = plt.subplots(
    nrows=n_rows,
    ncols=n_cols,
    figsize=(n_cols * 3.0, n_rows * 2.5),
    gridspec_kw={'wspace':0.10, 'hspace': 0.05}, 
    sharex=True,
    layout='constrained'
)
if axs.ndim > 1:
    for feat_id, feat in enumerate(df_corr.index.values):
        row_id, col_id = divmod(feat_id, n_cols)
        regplot = sns.regplot(
            data=df,
            x="Возраст",
            y=feat,
            color='crimson',
            scatter_kws=dict(
                linewidth=0.5,
                alpha=0.75,
                edgecolor="k",
                s=16,
            ),
            ax=axs[row_id, col_id]
        )
        axs[row_id, col_id].set_title(fr"Pearson $\rho$: {df_corr.loc[feat, 'rho']:0.3f}")
        y_labe_fontsize = min(15 / (len(feat) / 20), 13)
        axs[row_id, col_id].set_ylabel(feat, fontsize=y_labe_fontsize)
        axs[row_id, col_id].xaxis.set_tick_params(which='both', labelbottom=True)
    for empty_id in range(n_empty):   
        axs[n_rows - 1, n_cols - 1 - empty_id].axis('off')
else:
    for feat_id, feat in enumerate(df_corr.index.values):
        row_id, col_id = divmod(feat_id, n_cols)
        regplot = sns.regplot(
            data=df,
            x="Возраст",
            y=feat,
            color='crimson',
            scatter_kws=dict(
                linewidth=0.5,
                alpha=0.75,
                edgecolor="k",
                s=16,
            ),
            ax=axs[max(row_id, col_id)]
        )
        axs[max(row_id, col_id)].set_title(fr"Pearson $\rho$: {df_corr.loc[feat, 'rho']:0.3f}")
        y_labe_fontsize = min(15 / (len(feat) / 20), 13)
        axs[max(row_id, col_id)].set_ylabel(feat, fontsize=y_labe_fontsize)
        axs[max(row_id, col_id)].xaxis.set_tick_params(which='both', labelbottom=True)
fig.savefig(f"{path}/paper_sex_hormones/{title}/age_feats_regplot.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path}/paper_sex_hormones/{title}/age_feats_regplot.pdf", bbox_inches='tight')
plt.close(fig)

# Correlation heatmap
feats_cnt = ['Возраст'] + feats_in
df_corr = pd.DataFrame(data=np.zeros(shape=(len(feats_cnt), len(feats_cnt))), index=feats_cnt, columns=feats_cnt)
for f_id_1 in range(len(feats_cnt)):
    for f_id_2 in range(f_id_1, len(feats_cnt)):
        f_1 = feats_cnt[f_id_1]
        f_2 = feats_cnt[f_id_2]
        if f_id_1 != f_id_2:
            vals_1 = df.loc[:, f_1].values
            vals_2 = df.loc[:, f_2].values
            corr, pval = stats.pearsonr(vals_1, vals_2)
            df_corr.at[f_2, f_1] = pval
            df_corr.at[f_1, f_2] = corr
        else:
            df_corr.at[f_2, f_1] = np.nan
selection = np.tri(df_corr.shape[0], df_corr.shape[1], -1, dtype=bool)
df_fdr = df_corr.where(selection).stack().reset_index()
df_fdr.columns = ['row', 'col', 'pval']
_, df_fdr['pval_fdr_bh'], _, _ = multipletests(df_fdr.loc[:, 'pval'].values, 0.05, method='fdr_bh')
nzmin = df_fdr['pval_fdr_bh'][df_fdr['pval_fdr_bh'].gt(0)].min(0) * 0.5
df_fdr['pval_fdr_bh'].replace({0.0: nzmin}, inplace=True)
df_corr_fdr = df_corr.copy()
for line_id in range(df_fdr.shape[0]):
    df_corr_fdr.loc[df_fdr.at[line_id, 'row'], df_fdr.at[line_id, 'col']] = -np.log10(df_fdr.at[line_id, 'pval_fdr_bh'])
df_corr_fdr.to_excel(f"{path}/paper_sex_hormones/{title}/feats_pearsonr.xlsx")
sns.set_theme(style='ticks')
fig, ax = plt.subplots(figsize=(8.5 + 0.35 * len(feats_cnt), 6.5 + 0.25 * len(feats_cnt)), layout='constrained')
cmap_triu = plt.get_cmap("seismic").copy()
mask_triu=np.tri(len(feats_cnt), len(feats_cnt), -1, dtype=bool)
heatmap_diff = sns.heatmap(
    df_corr_fdr,
    mask=mask_triu,
    annot=True,
    fmt=".2f",
    center=0.0,
    cmap=cmap_triu,
    linewidth=0.1,
    linecolor='black',
    annot_kws={"fontsize": 32 / np.sqrt(len(df_corr_fdr.values) + 8)},
    ax=ax
)
ax.figure.axes[-1].set_ylabel(r"Pearson $\rho$")
for spine in ax.figure.axes[-1].spines.values():
    spine.set(visible=True, lw=0.25, edgecolor="black")
cmap_tril = plt.get_cmap("viridis").copy()
cmap_tril.set_under('black')
mask_tril=np.tri(len(feats_cnt), len(feats_cnt), -1, dtype=bool).T
heatmap_pval = sns.heatmap(
    df_corr_fdr,
    mask=mask_tril,
    annot=True,
    fmt=".1f",
    vmin=-np.log10(0.05),
    cmap=cmap_tril,
    linewidth=0.1,
    linecolor='black',
    annot_kws={"fontsize": 32 / np.sqrt(len(df_corr_fdr.values) + 8)},
    ax=ax
)
ax.figure.axes[-1].set_ylabel(r"$-\log_{10}(\mathrm{p-value})$")
for spine in ax.figure.axes[-1].spines.values():
    spine.set(visible=True, lw=0.25, edgecolor="black")
ax.set_xlabel('', fontsize=16)
ax.set_ylabel('', fontsize=16)
ax.set_title(title, fontsize=16)
plt.savefig(f"{path}/paper_sex_hormones/{title}/feats_pearsonr.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path}/paper_sex_hormones/{title}/feats_pearsonr.pdf", bbox_inches='tight')
plt.close(fig)

df_proc = df.copy()

# IQR outliers
feats_cnt = ['Возраст'] + feats_cnt_wo_age
out_columns = []
for f in tqdm(feats_cnt):
    q1 = df_proc[f].quantile(0.25)
    q3 = df_proc[f].quantile(0.75)
    iqr = q3 - q1
    df_proc[f"{f} IQR Outlier"] = 1
    out_columns.append(f"{f} IQR Outlier")
    filter = (df_proc[f] >= q1 - 1.5 * iqr) & (df_proc[f] <= q3 + 1.5 * iqr)
    df_proc.loc[filter, f"{f} IQR Outlier"] = 0
df_proc[f"Number of IQR Outliers"] = df_proc.loc[:, out_columns].sum(axis=1)

hist_bins = np.linspace(-0.5, len(feats_cnt) + 0.5, len(feats_cnt) + 2)
fig = plt.figure(figsize=(5, 3))
sns.set_theme(style='ticks')
histplot = sns.histplot(
    data=df_proc,
    x=f"Number of IQR Outliers",
    multiple="stack",
    bins=hist_bins,
    edgecolor='k',
    linewidth=1.0,
    color=color,
)
histplot.set(xlim=(-0.5, max(df_proc['Number of IQR Outliers'] + 0.5)))
histplot.set_title(title)
histplot.set_xlabel("Количество IQR выбросов")
histplot.set_ylabel("Количество записей")
plt.savefig(f"{path}/paper_sex_hormones/{title}/outs_iqr_hist.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path}/paper_sex_hormones/{title}/outs_iqr_hist.pdf", bbox_inches='tight')
plt.close(fig)

out_columns = [f"{f} IQR Outlier" for f in feats_cnt]
df_msno = df_proc.loc[:, out_columns].copy()
df_msno.replace({1: np.nan}, inplace=True)
df_msno.rename(columns=dict(zip(out_columns, feats_cnt)), inplace=True)

# Plot barplot for features with outliers
msno_bar = msno.bar(
    df=df_msno,
    label_rotation=90,
    color=color,
    # figsize=(0.4 * len(feats_cnt), 4),
)
plt.xticks(ha='center')
plt.setp(msno_bar.xaxis.get_majorticklabels(), ha="center")
msno_bar.set_title(title, fontsize='large')
msno_bar.set_ylabel("Записи без выбросов", fontsize='large')
plt.savefig(f"{path}/paper_sex_hormones/{title}/outs_iqr_bar.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path}/paper_sex_hormones/{title}/outs_iqr_bar.pdf", bbox_inches='tight')
plt.clf()

# Plot matrix of samples outliers distribution
msno_mtx = msno.matrix(
    df=df_msno,
    label_rotation=90,
    color=mcolors.to_rgb(color),
    # figsize=(0.7 * len(feats_cnt), 5),
)
plt.xticks(ha='center')
plt.setp(msno_bar.xaxis.get_majorticklabels(), ha="center")
msno_mtx.set_title(title, fontsize='large')
msno_mtx.set_ylabel("Записи", fontsize='large')
plt.savefig(f"{path}/paper_sex_hormones/{title}/outs_iqr_matrix.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path}/paper_sex_hormones/{title}/outs_iqr_matrix.pdf", bbox_inches='tight')
plt.clf()

# Plot heatmap of features outliers correlations
msno_heatmap = msno.heatmap(
    df=df_msno,
    label_rotation=90,
    cmap="bwr",
    fontsize=12,
    # figsize=(0.6 * len(feats_cnt), 0.6 * len(feats_cnt))
)
msno_heatmap.set_title(title, fontsize='large')
plt.setp(msno_heatmap.xaxis.get_majorticklabels(), ha="center")
msno_heatmap.collections[0].colorbar.ax.tick_params(labelsize=20)
plt.savefig(f"{path}/paper_sex_hormones/{title}/outs_iqr_heatmap.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path}/paper_sex_hormones/{title}/outs_iqr_heatmap.pdf", bbox_inches='tight')
plt.clf()
    
# Dimensionality reduction
feats_cnt = ['Возраст'] + feats_in
dim_red_models = {
    't-SNE': TSNE(n_components=2),
    'PCA': PCA(n_components=2, whiten=False),
    'IsoMap': Isomap(n_components=2, n_neighbors=5),
    'MDS': MDS(n_components=2, metric=True),
    'GRP': GaussianRandomProjection(n_components=2, eps=0.5),
    'SRP': SparseRandomProjection(n_components=2, density='auto', eps=0.5, dense_output=False),
}
feats_dim_red = []
for drm in dim_red_models:
    dim_red_res = dim_red_models[drm].fit_transform(df_proc.loc[:, feats_cnt].values)
    df_proc.loc[:, f"{drm} 1"] = dim_red_res[:, 0]
    df_proc.loc[:, f"{drm} 2"] = dim_red_res[:, 1]
    df_proc.loc[:, f"{drm} HDBSCAN"] = HDBSCAN(min_cluster_size=int(df_proc.shape[0] * 0.05)).fit(df_proc.loc[:, [f"{drm} 1", f"{drm} 2"]].values).labels_
    feats_dim_red += [ f"{drm} 1",  f"{drm} 2"]
n_rows = 2
n_cols = 3
fig_height = 10
fig_width = 15
sns.set_theme(style='ticks')
fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={}, sharey=False, sharex=False, layout='constrained')
for drm_id, drm in enumerate(dim_red_models.keys()):
    row_id, col_id = divmod(drm_id, n_cols)
    scatter = sns.scatterplot(
        data=df_proc,
        x=f"{drm} 1",
        y=f"{drm} 2",
        # hue=f"{drm} HDBSCAN",
        hue='Пол',
        palette={'М': 'deepskyblue', 'Ж': 'hotpink'},
        linewidth=0.25,
        alpha=0.75,
        edgecolor="k",
        s=40,
        # color=colors[feats_set],
        ax=axs[row_id, col_id],
    )
    axs[row_id, col_id].set_title(drm)
    # axs[n_rows - 1, n_cols - 1].axis('off')
fig.suptitle(title, fontsize='large')   
fig.savefig(f"{path}/paper_sex_hormones/{title}/dim_red.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path}/paper_sex_hormones/{title}/dim_red.pdf", bbox_inches='tight')
df_proc.to_excel(f"{path}/paper_sex_hormones/{title}/df_proc.xlsx", index_label="ID")
plt.close(fig)


(165, 15)


100%|██████████| 7/7 [00:00<00:00, 880.50it/s]
  plt.savefig(f"{path}/paper_sex_hormones/{title}/age_feats_pearsonr.png", bbox_inches='tight', dpi=200)
  plt.savefig(f"{path}/paper_sex_hormones/{title}/age_feats_pearsonr.pdf", bbox_inches='tight')
  plt.savefig(f"{path}/paper_sex_hormones/{title}/age_feats_pearsonr.pdf", bbox_inches='tight')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_fdr['pval_fdr_bh'].replace({0.0: nzmin}, inplace=True)
100%|██████████| 8/8 [00:00<00:00, 1105.84it/s]


<Figure size 2500x1000 with 0 Axes>

<Figure size 2500x1000 with 0 Axes>

<Figure size 2000x1200 with 0 Axes>