# Debugging autoreload

In [None]:
# Load IPython's autoreload extension; mode 2 reloads imported modules before each cell runs,
# so edits to the local `src` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Load packages

In [None]:
from pytorch_tabular.utils import load_covertype_dataset
from rich.pretty import pprint
from plotly.subplots import make_subplots
from pytorch_tabular import TabularModel
import torch
import plotly.graph_objects as go
from scipy import stats
import shap
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.impute import KNNImputer
from glob import glob
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import itertools
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from pytorch_tabular import model_sweep
from src.pt.model_sweep import model_sweep_custom
import warnings
from src.utils.configs import read_parse_config
from src.pt.hyper_opt import train_hyper_opt
from src.utils.hash import dict_hash
import pathlib
from tqdm import tqdm
import distinctipy
import matplotlib.patheffects as pe
import matplotlib.colors as mcolors
from statannotations.Annotator import Annotator
from scipy.stats import mannwhitneyu, variation, levene
from plottable import ColumnDefinition, Table
from plottable.plots import bar
from plottable.cmap import normed_cmap, centered_cmap
import optuna
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.cm
import matplotlib as mpl
from statsmodels.stats.multitest import multipletests
import re
import datetime
from collections import Counter
from matplotlib.ticker import MaxNLocator
from itertools import chain
from sklearn.metrics import mean_absolute_error
import pyaging as pya
import matplotlib.lines as mlines
from src.models.simage.tabular.widedeep.ft_transformer import WDFTTransformerModel
import statsmodels.formula.api as smf
from itertools import chain
from pingouin import ancova
from sklearn.preprocessing import LabelEncoder 
import upsetplot


def make_rgb_transparent(rgb, bg_rgb, alpha):
    """Blend a foreground colour over a background colour, channel by channel.

    Parameters
    ----------
    rgb : sequence of float
        Foreground colour channels.
    bg_rgb : sequence of float
        Background colour channels.
    alpha : float
        Weight of the foreground; the background gets ``1 - alpha``.

    Returns
    -------
    list of float
        ``alpha * fg + (1 - alpha) * bg`` for every paired channel. Channels are
        paired with ``zip``, so the result is as long as the shorter input.
    """
    blended = []
    for fg_channel, bg_channel in zip(rgb, bg_rgb):
        blended.append(alpha * fg_channel + (1 - alpha) * bg_channel)
    return blended


def form_bar(base):
    """Build a formatter that renders a fraction as ``"<count>/<base>"``.

    Parameters
    ----------
    base : int
        Denominator shown in the label and used to scale the fraction.

    Returns
    -------
    callable
        ``formatter(x)`` returning ``round(x * base)`` (as an int) over ``base``.
    """
    def formatter(x):
        count = int(round(x * base))
        return f'{count}/{base}'

    return formatter


def get_sections(sets):
    """
    Split a list of sets into the mutually exclusive sections of their Venn diagram.

    Every non-empty choice of "included" sets yields one section: the elements
    that belong to all included sets and to none of the excluded ones. For N
    input sets the result has (2**N)-1 entries (some may be empty sets).

    Parameters
    ----------
    sets : list of set

    Returns
    -------
    combo_sets : dict of {str: set}
        tag : str
            Binary string with one character per input set, in input order
            ('1' = the set is included, '0' = it is excluded).
        set : set
            The elements found in exactly that combination of input sets.
        Entries are inserted from the "all sets included" combination downwards.
    """
    bit_flags = [2 ** n for n in range(len(sets))]

    combo_sets = {}
    # Iterate over every non-zero bit mask; bit n set means sets[n] is included.
    for bits in range(2 ** len(sets) - 1, 0, -1):
        include_sets = [s for flag, s in zip(bit_flags, sets) if bits & flag]
        exclude_sets = [s for flag, s in zip(bit_flags, sets) if not bits & flag]
        combo = set.intersection(*include_sets)
        combo = set.difference(combo, *exclude_sets)
        tag = ''.join('1' if bits & flag else '0' for flag in bit_flags)
        combo_sets[tag] = combo
    return combo_sets


# Preprocess data

## Load all aux data

In [None]:
path = "E:/YandexDisk/Work/bbd/immunology/004_data_processing"
path_unn = "E:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN"


def _read_old_table(xlsx_file, label):
    """Read an Excel table with its first column as a string index and report index uniqueness."""
    table = pd.read_excel(xlsx_file, index_col=0)
    table.index = table.index.astype(str)
    print(f"{label} index unique: {table.index.is_unique}")
    return table


# Feature lists are stored as the index of one-column Excel sheets.
feats_imm = pd.read_excel(f"{path_unn}/data/immuno/feats_con.xlsx", index_col=0).index.values
feats_imm_fimmu = pd.read_excel(f"{path_unn}/data/immuno/models/SImAge/feats_con_top10.xlsx", index_col=0).index.values
feats_imm_slctd = pd.read_excel(f"{path_unn}/special/059_imm_data_selection/feats_selected.xlsx", index_col=0).index.values
# Mapping from the marker names used in the sheet index to the values of its 'gene' column.
feats_imm_genes = pd.read_excel(f"{path_unn}/data/immuno/immuno_markers_genes.xlsx", index_col=0)
feats_imm_rename = dict(zip(feats_imm_genes.index, feats_imm_genes['gene']))

# Previously processed tables: full data, the variant that keeps NaNs, and the selected subset.
imm_old = _read_old_table(f"{path}/data_GSEUNN_fmba.xlsx", "imm_old")
imm_old_w_nans = _read_old_table(f"{path}/data_GSEUNN_fmba_with_nans.xlsx", "imm_old_w_nans")
imm_old_selected = _read_old_table(f"{path_unn}/special/059_imm_data_selection/df_imm.xlsx", "imm_old_selected")

# Sample ids of the previously processed cohorts, keyed by group name.
# Central/Yakutia come from the selected subset; Mirny comes from the full table;
# rows of the full table without a Region value are treated as FMBA.
# NOTE(review): the Central/Yakutia ids are used below as labels into imm_old — confirm all of them exist there.
ids_groups = {
    'Old Central': imm_old_selected.index[imm_old_selected['Region'] == 'Central'].values,
    'Old Yakutia': imm_old_selected.index[imm_old_selected['Region'] == 'Yakutia'].values,
    'Old Mirny': imm_old.index[imm_old['Region'] == 'Mirny'].values,
    'Old FMBA': imm_old.index[imm_old['Region'].isna()].values,
}
imm_old.loc[ids_groups['Old FMBA'], 'Sex'] = 'M'
# The ids are index labels, not positions: indexing `imm_old.index[...]` with them raises IndexError.
# The Subject ID of an FMBA row is simply its own index label.
imm_old.loc[ids_groups['Old FMBA'], 'Subject ID'] = ids_groups['Old FMBA']
# Both columns are inserted at position 2, so 'Group detailed' ends up before 'Group'.
imm_old.insert(2, 'Group', None)
imm_old.insert(2, 'Group detailed', None)
for group_name, ids_group in ids_groups.items():
    imm_old.loc[ids_group, 'Group'] = group_name
    imm_old.loc[ids_group, 'Group detailed'] = group_name

def _load_new_samples(xlsx_file, label, group_detailed, sex=None):
    """Read one sample-selection sheet and tag it as part of the 'New 2025' batch.

    The first column becomes a string index. When `sex` is given, it overwrites the
    'Sex' column before the group columns are added. Index uniqueness is printed.
    """
    sheet = pd.read_excel(xlsx_file, index_col=0)
    sheet.index = sheet.index.astype(str)
    if sex is not None:
        sheet['Sex'] = sex
    sheet['Group'] = 'New 2025'
    sheet['Group detailed'] = group_detailed
    print(f"{label} index unique: {sheet.index.is_unique}")
    return sheet


df_samples_mirny_1 = _load_new_samples(
    "E:/YandexDisk/Work/bbd/mirny/select_20_samples/selected.xlsx",
    "Mirny subset 1",
    'New 2025 Mirny',
)
df_samples_mirny_2 = _load_new_samples(
    "E:/YandexDisk/Work/bbd/mirny/select_20_samples/selected_new_20.xlsx",
    "Mirny subset 2",
    'New 2025 Mirny',
)

df_samples_mriya_epi = _load_new_samples(
    "E:/YandexDisk/Work/bbd/mriya/select_samples/Эпи.xlsx",
    "Mriya ЭПИ",
    'New 2025 UNN',
)
df_samples_mriya_80 = _load_new_samples(
    "E:/YandexDisk/Work/bbd/mriya/select_samples/80.xlsx",
    "Mriya 80",
    'New 2025 UNN',
)

# The FMBA selection sheet gets 'Sex' set to 'M' for every row.
df_samples_fmba_20 = _load_new_samples(
    "E:/YandexDisk/Work/bbd/fmba/04_select_samples_20/selected.xlsx",
    "FMBA 20",
    'New 2025 FMBA',
    sex='M',
)

df_samples_fmba_epi = _load_new_samples(
    "E:/YandexDisk/Work/bbd/fmba/dnam/processed/pheno.xlsx",
    "FMBA Epi",
    'New 2025 FMBA',
)

# Report how many ids of each new sample sheet are already present in the old table.
old_ids = imm_old.index
print()
print(f"Mirny subset 1 (20): {len(old_ids.intersection(df_samples_mirny_1.index))}")
print(f"Mirny subset 2 (20): {len(old_ids.intersection(df_samples_mirny_2.index))}")
print(f"Mirny subset 1 and 2 intersection: {len(df_samples_mirny_1.index.intersection(df_samples_mirny_2.index))}")
print(f"FMBA subset (20 or 13?): {len(old_ids.intersection(df_samples_fmba_20.index))}")
print(f"Missed FMBA samples: {df_samples_fmba_20.index.difference(old_ids).to_list()}")
print(f"Mriya ЭПИ intersection with old UNN data: {len(old_ids.intersection(df_samples_mriya_epi.index))}")
print(f"Mriya 80 intersection with old UNN data: {len(old_ids.intersection(df_samples_mriya_80.index))}")

## Load and process duplicates in immuno data

In [None]:
processed_dir = f"{path_unn}/data/immuno/files/processed/07-May-2025"
imm_new = pd.read_excel(f"{processed_dir}/NN-18.04.2025.xlsx", index_col=0)
imm_new.index = imm_new.index.astype(str)
print(f"imm_new index unique: {imm_new.index.is_unique}")
print(f"num duplicated indexes: {len(imm_new.index[imm_new.index.duplicated()].unique())}")

# Prepend the metadata columns (positions 0..5, in this order).
meta_defaults = [
    ('Subject ID', imm_new.index.values),
    ('Has technical duplicates?', False),
    ('Age', None),
    ('Sex', None),
    ('Group', None),
    ('Group detailed', None),
]
for position, (column, default) in enumerate(meta_defaults):
    imm_new.insert(position, column, default)

# Fill Age, Sex and the group labels from the sample sheets (aligned on sample id).
# Sheets are applied in this order; a later sheet only fills cells that are still missing.
meta_columns = ['Age', 'Sex', 'Group', 'Group detailed']
sample_sheets = (
    df_samples_mirny_1,
    df_samples_mirny_2,
    df_samples_mriya_epi,
    df_samples_mriya_80,
    df_samples_fmba_20,
    df_samples_fmba_epi,
)
for sample_sheet in sample_sheets:
    imm_new.fillna(sample_sheet[meta_columns], inplace=True)

# Process duplicates: flag every row whose id occurs more than once ...
repeated_ids = imm_new.index[imm_new.index.duplicated()]
imm_new.loc[repeated_ids, 'Has technical duplicates?'] = True
# ... then make the ids unique by appending an occurrence counter to the duplicated ones.
is_unique_id = ~imm_new.index.duplicated(keep=False)  # mask of ids that occur only once
occurrence = imm_new.groupby(imm_new.index).cumcount().astype(str)
suffixed_ids = imm_new.index + '_technical_duplicate_' + occurrence  # suffixes for duplicates
imm_new.index = imm_new.index.where(is_unique_id, suffixed_ids)
# The first occurrence keeps the original id.
imm_new.index = imm_new.index.str.replace('_technical_duplicate_0', '')
print(f"imm_new index unique: {imm_new.index.is_unique}")
imm_new.to_excel(f"{processed_dir}/NN-18.04.2025_processed_duplicates.xlsx")

## Load duplicates-processed immuno data and impute values

In [None]:
# Reload the duplicate-processed table and rename marker columns using feats_imm_rename.
imm_new = pd.read_excel(f"{path_unn}/data/immuno/files/processed/07-May-2025/NN-18.04.2025_processed_duplicates.xlsx", index_col=0)
imm_new.index = imm_new.index.astype(str)
imm_new.rename(columns=feats_imm_rename, inplace=True)
print(f"imm_new index unique: {imm_new.index.is_unique}")

# Coverage report: how many ids of each sample sheet are present in the new measurements,
# and which ids of the sheet are missing from them.
print(f"")
print(f"Mirny subset 1 (20): {len(imm_new.index.intersection(df_samples_mirny_1.index).to_list())}")
print(f"Mirny subset 2 (20): {len(imm_new.index.intersection(df_samples_mirny_2.index).to_list())}")
print(f"FMBA subset (20 or 13?): {len(imm_new.index.intersection(df_samples_fmba_20.index).to_list())}")
print(f"Missed FMBA samples: {df_samples_fmba_20.index.difference(imm_new.index).to_list()}")
print(f"Mriya ЭПИ (96): {len(imm_new.index.intersection(df_samples_mriya_epi.index).to_list())}")
print(f"Missed Mriya ЭПИ (96): {df_samples_mriya_epi.index.difference(imm_new.index).to_list()}")
print(f"Mriya 80 (80): {len(imm_new.index.intersection(df_samples_mriya_80.index).to_list())}")
# This line reports the Mriya 80 sheet; the label previously repeated the ЭПИ one.
print(f"Missed Mriya 80 (80): {df_samples_mriya_80.index.difference(imm_new.index).to_list()}")

# Data with NaNs: readings reported as '<...' or '>...' are turned into missing values
# and the feature columns are converted to numbers; only the feature columns are saved.
imm_w_nans = imm_new.copy()
for threshold_pattern in (r'^([\<].*)$', r'^([\>].*)$'):
    imm_w_nans = imm_w_nans.replace(threshold_pattern, 'NaN', regex=True)
feats_numeric = imm_w_nans[feats_imm].apply(pd.to_numeric, errors='coerce')
imm_w_nans[feats_imm] = feats_numeric
imm_w_nans[feats_imm].to_excel(f"{path}/data_052025_with_nans.xlsx")

# Impute max thresholds
# Step 1: locate the cells to fill. Readings starting with '>' are replaced by None, the table is
# stacked into a (sample, feature) Series, and every missing cell is collected as a [sample, feature] pair.
# NOTE(review): isna() also matches cells that were already empty in the sheet, not only '>' readings —
# confirm that imputing those as well is intended.
imm_max_thld_nans = imm_new.loc[:, feats_imm].copy()
imm_max_thld_nans.replace(r'^([\>].*)$', None, inplace=True, regex=True)
imm_max_thld_nans = imm_max_thld_nans.stack(dropna=False)
max_thld_nans = [list(x) for x in imm_max_thld_nans.index[imm_max_thld_nans.isna()]]
print(f'\nNumber of max_thld_nans: {len(max_thld_nans)}')
# Step 2: build a numeric copy in which both '<...' and '>...' readings are missing,
# then fill all missing cells with a KNN imputer fitted on the new data only.
imm_max_thld_imp = imm_new.loc[:, feats_imm].copy()
imm_max_thld_imp.replace(r'^([\<].*)$', 'NaN', inplace=True, regex=True)
imm_max_thld_imp.replace(r'^([\>].*)$', 'NaN', inplace=True, regex=True)
imm_max_thld_imp = imm_max_thld_imp.apply(pd.to_numeric, errors='coerce')
# n_neighbors is reused by the min-threshold imputation further down in this cell.
n_neighbors = 3
X = imm_max_thld_imp.loc[:, feats_imm].values
imputer = KNNImputer(n_neighbors=n_neighbors)
X_imptd = imputer.fit_transform(X)
imm_max_thld_imp.loc[:, feats_imm] = X_imptd

# Fill with imputed max thresholds
# Threshold readings ('<...' / '>...') become missing values and the features become numeric.
# The result is assigned back explicitly: an in-place replace on `imm_new[feats_imm]` would only
# modify a temporary copy and leave imm_new untouched.
imm_new[feats_imm] = (
    imm_new[feats_imm]
    .replace(r'^([\<].*)$', 'NaN', regex=True)
    .replace(r'^([\>].*)$', 'NaN', regex=True)
    .apply(pd.to_numeric, errors='coerce')
)
print(f'Missing before max thresholds imputation: {imm_new[feats_imm].isna().sum().sum()}')
# Copy the KNN-imputed value into every cell collected in max_thld_nans ([sample, feature] pairs).
for sample_id, feat_name in max_thld_nans:
    imm_new.at[sample_id, feat_name] = imm_max_thld_imp.at[sample_id, feat_name]
print(f'Missing after max thresholds imputation: {imm_new[feats_imm].isna().sum().sum()}')
# Impute min thresholds and replace imputed values with the closest threshold values in Central
def find_nearest(array, value):
    """Return the element of `array` with the smallest absolute distance to `value`.

    `array` is converted with `np.asarray`; on ties the first closest element is returned.
    """
    candidates = np.asarray(array)
    distances = np.abs(candidates - value)
    nearest_pos = np.argmin(distances)
    return candidates[nearest_pos]

# Reference rows for the imputation are the 'Old Central' samples; targets are all new samples.
ids_imp_trn = ids_groups['Old Central']
ids_imp_tst = imm_new.index.values
# Stack the feature values of the reference rows (from imm_old) on top of the new rows
# and KNN-impute the remaining missing cells on the combined table.
# NOTE(review): the label-based lookups below assume the ids of the new rows do not collide
# with the 'Old Central' ids — confirm.
df_imp = pd.concat([
    imm_old.loc[ids_imp_trn, feats_imm],
    imm_new.loc[:, feats_imm]
])
X = df_imp.loc[:, feats_imm].values
imputer = KNNImputer(n_neighbors=n_neighbors)
X_imptd = imputer.fit_transform(X)
df_imp.loc[:, feats_imm] = X_imptd
for feat in feats_imm:
    # Reference samples whose value for this feature is missing in the with-NaNs table.
    srs_feat_base = imm_old_w_nans.loc[ids_imp_trn, feat].isna()
    ids_feat_base = srs_feat_base.index[srs_feat_base == True].values
    if len(ids_feat_base) > 0:
        # Values that imm_old holds for those samples
        # (presumably the substitutes used when the old data was imputed — verify).
        feat_base_vals = imm_old.loc[ids_feat_base, feat].unique()
        # New samples that are still missing this feature.
        srs_feat_trgt = imm_new.loc[ids_imp_tst, feat].isna()
        ids_feat_trgt = srs_feat_trgt.index[srs_feat_trgt == True].values
        for id_trgt in ids_feat_trgt:
            # Snap the KNN-imputed value to the closest of the reference values.
            df_imp.at[id_trgt, feat] = find_nearest(feat_base_vals, df_imp.at[id_trgt, feat])
# Features without missing reference values keep the plain KNN-imputed numbers.
imm_new.loc[ids_imp_tst, feats_imm] = df_imp.loc[ids_imp_tst, feats_imm]
imm_new.to_excel(f"{path}/data_052025.xlsx")

## Merge all data, calculate SImAge, calculate logs, save all data

In [None]:
# Merge the old and new tables, flag repeated measures, compute SImAge and log-features, save everything.
print(f"imm_new index unique: {imm_new.index.is_unique}")
print(f"imm_old index unique: {imm_old.index.is_unique}")

# Ids present in both tables are treated as repeated measures of the same sample.
ids_rep_mes = imm_new.index.intersection(imm_old.index).values
print(f"ids_rep_mes before index correction: {len(ids_rep_mes)}")

# Flag repeated measures in both tables: time 1 for the new measurement, time 0 for the old one.
imm_new.insert(0, '2025 Repeated Measures?', False)
imm_new.insert(1, '2025 Repeated Measures Time', None)
imm_new.loc[ids_rep_mes, '2025 Repeated Measures?'] = True
imm_new.loc[ids_rep_mes, '2025 Repeated Measures Time'] = 1
imm_old.insert(0, '2025 Repeated Measures?', False)
imm_old.insert(1, '2025 Repeated Measures Time', None)
imm_old.loc[ids_rep_mes, '2025 Repeated Measures?'] = True
imm_old.loc[ids_rep_mes, '2025 Repeated Measures Time'] = 0

# Put the with-NaNs table in the same row order as imm_new before both get the new index.
imm_w_nans = imm_w_nans.loc[imm_new.index, :]

# Rename the repeated new rows so that they do not clash with the old rows after concatenation.
imm_new['new_index'] = imm_new.index.values
imm_new.loc[ids_rep_mes, 'new_index'] += '_repeated_measure_1'
imm_new.set_index('new_index', inplace=True)

imm_w_nans.set_index(imm_new.index.values, inplace=True)

# Sanity check: after the renaming the overlap with the old ids should be empty.
ids_rep_mes = imm_new.index.intersection(imm_old.index)
print(f"ids_rep_mes after index correction: {len(ids_rep_mes)}")

imm_all = pd.concat([imm_old, imm_new])
imm_all_with_nans = pd.concat([imm_old_w_nans, imm_w_nans])

# Load the SImAge checkpoint, switch it to frozen evaluation mode on the CPU and predict for all rows.
path_simage = f"E:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN/data/immuno/models/SImAge/best_fold_0002.ckpt"
model_simage = WDFTTransformerModel.load_from_checkpoint(checkpoint_path=path_simage)
model_simage.eval()
model_simage.freeze()
model_simage.to('cpu')
# NOTE(review): the tensor keeps the dtype of the DataFrame values; presumably the model accepts it — confirm.
imm_all['SImAge'] = model_simage(torch.from_numpy(imm_all.loc[:, feats_imm_fimmu].values)).cpu().detach().numpy().ravel()
# Age acceleration is the difference between predicted and chronological age.
imm_all['SImAge acceleration'] = imm_all['SImAge'] - imm_all['Age']
imm_all['|SImAge acceleration|'] = imm_all['SImAge acceleration'].abs()

# Natural log of every feature, stored in an extra "log(<feature>)" column.
for f in feats_imm:
    imm_all[f"log({f})"] = np.log(imm_all[f"{f}"])

imm_all.to_excel(f"{path}/data_GSEUNN_fmba_052025.xlsx")
imm_all_with_nans[feats_imm].to_excel(f"{path}/data_GSEUNN_fmba_052025_with_nans.xlsx")

# Load preprocessed data

In [None]:
path = "E:/YandexDisk/Work/bbd/immunology/004_data_processing"
path_unn = "E:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN"

# Merged table produced by the preprocessing cells (string sample ids as index).
imm = pd.read_excel(f"{path}/data_GSEUNN_fmba_052025.xlsx", index_col=0)
imm.index = imm.index.astype(str)
print(f"Is index unique: {imm.index.is_unique}")

# Same samples with missing values kept.
imm_w_nans = pd.read_excel(f"{path}/data_GSEUNN_fmba_052025_with_nans.xlsx", index_col=0)
imm_w_nans.index = imm_w_nans.index.astype(str)
print(f"Is index unique: {imm_w_nans.index.is_unique}")

# Feature lists are stored as the index of one-column Excel sheets.
feats_dir = f"{path_unn}/data/immuno"
feats_imm = pd.read_excel(f"{feats_dir}/feats_con.xlsx", index_col=0).index.values
feats_imm_fimmu = pd.read_excel(f"{feats_dir}/models/SImAge/feats_con_top10.xlsx", index_col=0).index.values
feats_imm_slctd = pd.read_excel(f"{path_unn}/special/059_imm_data_selection/feats_selected.xlsx", index_col=0).index.values
feats_imm_genes = pd.read_excel(f"{feats_dir}/immuno_markers_genes.xlsx", index_col=0)
feats_imm_rename = dict(zip(feats_imm_genes.index, feats_imm_genes['gene']))

# One visually distinct colour per feature; black and white are passed as colours to avoid.
black_rgb = mcolors.hex2color(mcolors.CSS4_COLORS['black'])
white_rgb = mcolors.hex2color(mcolors.CSS4_COLORS['white'])
feats_colors_raw = distinctipy.get_colors(len(feats_imm), [black_rgb, white_rgb], rng=1899, pastel_factor=0.0)
feats_palette = dict(zip(feats_imm, feats_colors_raw))

# NaNs

In [None]:
# Summarise missing values per group: share of NaNs per feature (bar plots), number of
# features with NaNs per sample (histograms) and the overall NaN percentage per group.
pathlib.Path(f"{path}/01_new_data_05_2025/nans").mkdir(parents=True, exist_ok=True)

groups = ['New 2025', 'Old Central', 'Old Yakutia', 'Old Mirny', 'Old FMBA']

# Sample ids per group, taken from the 'Group' column of the merged table.
ids_groups = {x: imm.index[imm['Group'] == x].values for x in groups}

colors_groups = {
    'Old Central': 'gold',
    'Old Yakutia': 'lightslategray',
    'Old Mirny': 'crimson',
    'Old FMBA': 'dodgerblue',
    'New 2025': 'fuchsia'
}

# Five groups are laid out on a 2 x 3 grid; the unused last panel is switched off below.
n_cols = 3
n_rows = 2
fig_width = 23
fig_height = 9

sns.set_theme(style='ticks')
# NOTE(review): both figures are created with layout="constrained" but tight_layout() is
# called on them before saving — confirm which layout is intended.
fig_bar, axs_bar = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={}, sharey=False, layout="constrained")
fig_hist, axs_hist = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={}, sharex=False, layout="constrained")

dfs_nan_feats = {}
df_nan_feats_by_group = pd.DataFrame(index=groups)
for group_id, group in enumerate(groups):
    row_id, col_id = divmod(group_id, n_cols)
    
    # Per-feature NaN statistics of this group, sorted by NaN share (highest first).
    df_nan_feats = imm_w_nans.loc[ids_groups[group], feats_imm].isna().sum(axis=0).to_frame(name="Number of NaNs")
    df_nan_feats["% of NaNs"] = df_nan_feats["Number of NaNs"] / len(ids_groups[group]) * 100
    df_nan_feats["Number of not-NaNs"] = imm_w_nans.loc[ids_groups[group], feats_imm].notna().sum(axis=0)
    df_nan_feats.sort_values(["% of NaNs"], ascending=[False], inplace=True)
    dfs_nan_feats[group] = df_nan_feats
    # Overall share of missing cells (samples x features) in this group.
    df_nan_feats_by_group.at[group, "% of NaNs"] = df_nan_feats["Number of NaNs"].sum(axis=0) / imm_w_nans.loc[ids_groups[group], feats_imm].size * 100
    
    barplot = sns.barplot(
        data=df_nan_feats,
        x=df_nan_feats.index,
        y=f"% of NaNs",
        edgecolor='black',
        color=colors_groups[group],
        dodge=False,
        ax=axs_bar[row_id, col_id],
    )
    axs_bar[row_id, col_id].set(xlim=(-0.7, len(feats_imm)-0.3))
    axs_bar[row_id, col_id].set_title(f"{group} ({len(ids_groups[group])})")
    axs_bar[row_id, col_id].set_xticklabels(axs_bar[row_id, col_id].get_xticklabels(), rotation=90)
    axs_bar[row_id, col_id].set_xlabel(f"")

    # Number of features with a missing value, per sample.
    df_nan_smpls = imm_w_nans.loc[ids_groups[group], feats_imm].isna().sum(axis=1).to_frame(name="Features with NaNs")
    
    # NOTE(review): hist_bins is computed but never passed to histplot (discrete=True is used instead).
    hist_bins = np.linspace(0, len(feats_imm), len(feats_imm) + 1)
    histplot = sns.histplot(
        data=df_nan_smpls,
        discrete=True,
        edgecolor='k',
        linewidth=1,
        x="Features with NaNs",
        color=colors_groups[group],
        ax=axs_hist[row_id, col_id],
    )
    axs_hist[row_id, col_id].set(xlim=(-0.6, len(feats_imm)+0.6))
    axs_hist[row_id, col_id].set_title(f"{group} ({len(ids_groups[group])})")
    axs_hist[row_id, col_id].set_ylabel(f"Number of samples")
    axs_hist[row_id, col_id].set_xlabel(f"")

# Hide the sixth (empty) panel of both grids.
axs_hist[n_rows - 1, n_cols - 1].axis('off')
axs_bar[n_rows - 1, n_cols - 1].axis('off')

fig_bar.tight_layout() 
fig_bar.savefig(f"{path}/01_new_data_05_2025/nans/feats.png", bbox_inches='tight', dpi=200)
fig_bar.savefig(f"{path}/01_new_data_05_2025/nans/feats.pdf", bbox_inches='tight')
plt.close(fig_bar)

# One Excel sheet per group with the per-feature NaN statistics.
with pd.ExcelWriter(f'{path}/01_new_data_05_2025/nans/feats.xlsx', engine='xlsxwriter') as writer:
    for group_id, group in enumerate(groups):
        dfs_nan_feats[group].to_excel(writer, sheet_name=group)

fig_hist.tight_layout()    
fig_hist.savefig(f"{path}/01_new_data_05_2025/nans/samples.png", bbox_inches='tight', dpi=200)
fig_hist.savefig(f"{path}/01_new_data_05_2025/nans/samples.pdf", bbox_inches='tight')
plt.close(fig_hist)

# Horizontal bar plot of the overall NaN percentage per group.
# NOTE(review): set_theme() is called after the figure has been created — confirm the
# 'whitegrid' style is actually applied to this figure.
fig, ax = plt.subplots(figsize=(3, 2))
sns.set_theme(style='whitegrid')
barplot = sns.barplot(
    data=df_nan_feats_by_group,
    y=df_nan_feats_by_group.index,
    x=f"% of NaNs",
    edgecolor='black',
    palette=colors_groups,
    dodge=False,
    orient='h',
    ax=ax
)
ax.set_ylabel(f"")
# Print the percentage next to every bar.
for x in barplot.containers:
    barplot.bar_label(x, fmt="%.1f", padding=2.0)
plt.savefig(f"{path}/01_new_data_05_2025/nans/global.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path}/01_new_data_05_2025/nans/global.pdf", bbox_inches='tight')
plt.close()

# Technical duplicates

In [None]:
pathlib.Path(f"{path}/01_new_data_05_2025/technical_duplicates").mkdir(parents=True, exist_ok=True)

# Rows flagged as technical duplicates and the subjects they belong to.
has_tech_dupl = imm['Has technical duplicates?'] == True
subjects_tech_dupl = imm.loc[has_tech_dupl, 'Subject ID']

# Number of measurements per duplicated subject, largest first.
tech_dupl_samples_count = subjects_tech_dupl.value_counts().to_frame(name='Count')
tech_dupl_samples_count = tech_dupl_samples_count.sort_values(by=['Count'], ascending=[False])
tech_dupl_samples_count.to_excel(f"{path}/01_new_data_05_2025/technical_duplicates/tech_dupl_samples_count.xlsx")

# Coefficient of variation (ddof=1) of every selected feature within each duplicated subject,
# plus the mean CV per feature, which is used to order the features in the plot.
samples_techdupl = subjects_tech_dupl.unique()
df_tech_dupl = pd.DataFrame(index=samples_techdupl, columns=feats_imm_slctd)
df_tech_dupl_mean = pd.DataFrame(index=feats_imm_slctd, columns=['Mean'])
for f in feats_imm_slctd:
    for sample_td in samples_techdupl:
        replicates = imm.loc[has_tech_dupl & (imm['Subject ID'] == sample_td), f"{f}"]
        df_tech_dupl.at[sample_td, f] = stats.variation(replicates, ddof=1)
    df_tech_dupl_mean.at[f, 'Mean'] = np.mean(df_tech_dupl.loc[:, f].values)
df_tech_dupl_mean.sort_values(by=['Mean'], ascending=[False], inplace=True)
df_tech_dupl['Subject ID'] = df_tech_dupl.index
df_tech_dupl_melt = df_tech_dupl.melt(
    id_vars='Subject ID',
    value_vars=df_tech_dupl_mean.index.values,
    var_name='Immunomarkers',
    value_name='Coefficient of variation (CV)',
)

# Bar plot of the CV per feature, coloured with the per-feature palette.
sns.set_theme(style='ticks')
fig, ax = plt.subplots(figsize=(4, 10), layout='constrained')
barplot = sns.barplot(
    df_tech_dupl_melt,
    y="Immunomarkers",
    x="Coefficient of variation (CV)",
    hue="Immunomarkers",
    palette=feats_palette,
    ax=ax
)
ax.set_ylabel('')
for extension, save_kwargs in (('pdf', {}), ('png', {'dpi': 200})):
    plt.savefig(f"{path}/01_new_data_05_2025/technical_duplicates/barplot_cv.{extension}", bbox_inches='tight', **save_kwargs)
plt.close(fig)

# Repeated measures

In [None]:
pathlib.Path(f"{path}/01_new_data_05_2025/repeated_measures").mkdir(parents=True, exist_ok=True)

# Rows flagged as repeated measures. An explicit copy is taken and the recoded column is assigned
# back: an in-place replace on `df_repmes[...]` works on a slice of `imm`, triggers chained-assignment
# warnings and is silently lost under pandas Copy-on-Write, which would break the 'Old'/'New' pivot later.
df_repmes = imm.loc[imm['2025 Repeated Measures?'] == True, :].copy()
df_repmes['2025 Repeated Measures Time'] = df_repmes['2025 Repeated Measures Time'].replace({0: 'Old', 1: 'New'})
# Subject IDs of the repeated-measure rows labelled 'Old Mirny' / 'Old FMBA', keyed by a short cohort name.
samples_repmes = {
    'Mirny': df_repmes.loc[df_repmes['Group'] == 'Old Mirny', 'Subject ID'].values,
    'FMBA': df_repmes.loc[df_repmes['Group'] == 'Old FMBA', 'Subject ID'].values,
}

# For every cohort with repeated measures: save the data, test each selected feature for an
# Old-vs-New shift (paired Wilcoxon, FDR-corrected) and plot the p-values and per-subject trajectories.
for subset_name, subset_samples in samples_repmes.items():
    df_repmes_subset = df_repmes.loc[df_repmes['Subject ID'].isin(subset_samples), :]
    df_repmes_subset.to_excel(f"{path}/01_new_data_05_2025/repeated_measures/data_{subset_name}.xlsx")
    df_stat = pd.DataFrame(index=list(feats_imm_slctd))
    
    for feat in feats_imm_slctd:
        # One row per subject with the 'Old' and 'New' value of the feature side by side.
        # NOTE(review): pivot needs a single row per (Subject ID, time) pair — confirm that
        # technical duplicates cannot occur among the repeated measures.
        df_pivot = df_repmes_subset.pivot(index='Subject ID', columns='2025 Repeated Measures Time', values=feat)
        diff = df_pivot.loc[:, 'New'].values - df_pivot.loc[:, 'Old'].values
        # The test is only run when the paired differences are not all zero; otherwise p = 1.0
        # (presumably to avoid calling wilcoxon on all-zero differences — confirm).
        if np.linalg.norm(diff) > 0:
            res = stats.wilcoxon(
                x=df_pivot.loc[:, 'Old'].values,
                y=df_pivot.loc[:, 'New'].values,
                alternative='two-sided'
            )
            df_stat.at[feat, "wlxn_pval"] =  res.pvalue
        else:
            df_stat.at[feat, "wlxn_pval"] = 1.0
    
    # Benjamini-Hochberg correction across the selected features (second return value = adjusted p-values).
    _, df_stat.loc[feats_imm_slctd, "wlxn_pval_fdr_bh"], _, _ = multipletests(df_stat.loc[feats_imm_slctd, "wlxn_pval"], 0.05, method='fdr_bh')
    df_stat.to_excel(f"{path}/01_new_data_05_2025/repeated_measures/{subset_name}.xlsx", index_label='Features')
    
    # Bar plot of -log10(adjusted p-value); bars with adjusted p < 0.05 are red, the rest pink.
    df_fig = df_stat.loc[feats_imm_slctd, :]
    df_fig.sort_values([f"wlxn_pval"], ascending=[True], inplace=True)
    df_fig['wlxn_pval_fdr_bh_log'] = -np.log10(df_fig['wlxn_pval_fdr_bh'])
    df_fig['color'] = 'pink'
    df_fig.loc[df_fig['wlxn_pval_fdr_bh'] < 0.05, 'color'] = 'red'

    # NOTE(review): set_theme() is called after the figure has been created — confirm the
    # 'whitegrid' style is actually applied to this figure.
    fig, ax = plt.subplots(figsize=(3, 16))
    sns.set_theme(style='whitegrid')
    barplot = sns.barplot(
        data=df_fig,
        y=df_fig.index.values,
        x='wlxn_pval_fdr_bh_log',
        edgecolor='black',
        palette=df_fig['color'].values,
        ax=ax
    )
    ax.set_xlabel(r"$-\log_{10}(\mathrm{p-value})$", fontsize=18)
    ax.set_ylabel('', fontsize=20)
    # Tick labels are rewritten as integers (each tick value is truncated with int()).
    ax.set_xticklabels([f"{int(tick):d}" for tick in ax.get_xticks()], fontsize=16)
    ax.set_yticklabels(ax.get_yticklabels(), fontsize = 16)
    plt.savefig(f"{path}/01_new_data_05_2025/repeated_measures/{subset_name}_barplot_wlxn.png", bbox_inches='tight', dpi=200)
    plt.savefig(f"{path}/01_new_data_05_2025/repeated_measures/{subset_name}_barplot_wlxn.pdf", bbox_inches='tight')
    plt.close(fig)
    
    # Grid of per-feature panels (n_rows x n_cols = 32 axes).
    # NOTE(review): more than 32 selected features would index past the grid — confirm the feature count.
    n_cols = 4
    n_rows = 8
    fig_width = 12
    fig_height = 16

    colors_reps = {
        "Old": 'crimson',
        "New": 'dodgerblue',
    }
    
    sns.set_theme(style='ticks')
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={})

    # Panels are ordered by raw Wilcoxon p-value (smallest first).
    df_stat.sort_values([f"wlxn_pval"], ascending=[True], inplace=True)
    feats_sorted = df_stat.index.values
    
    # One distinct colour per subject; black and white are passed as colours to avoid.
    samples_colors_raw = distinctipy.get_colors(len(subset_samples), [mcolors.hex2color(mcolors.CSS4_COLORS['black']), mcolors.hex2color(mcolors.CSS4_COLORS['white'])], rng=1899, pastel_factor=0.0)
    samples_palette = {x: samples_colors_raw[x_id] for x_id, x in enumerate(subset_samples)}

    for f_id, f in enumerate(feats_sorted):
        row_id, col_id = divmod(f_id, n_cols)
        
        # Points for the Old/New values of every subject ...
        sns.scatterplot(
            data=df_repmes_subset,
            x='2025 Repeated Measures Time',
            y=f,
            hue='Subject ID',
            edgecolor="k",
            linewidth=0.001,
            palette=samples_palette,
            hue_order=list(samples_palette.keys()),
            alpha=0.75,
            s=100,
            legend=False,
            ax=axs[row_id, col_id]
        )
        # ... connected by one line per subject.
        sns.lineplot(
            data=df_repmes_subset,
            x='2025 Repeated Measures Time',
            y=f,
            hue='Subject ID',
            palette=samples_palette,
            hue_order=list(samples_palette.keys()),
            legend=False,
            ax=axs[row_id, col_id]
        )
        
        axs[row_id, col_id].set_xlabel('')
        # The panel title shows the FDR-adjusted p-value of the feature.
        pval = df_stat.at[f, "wlxn_pval_fdr_bh"]
        axs[row_id, col_id].set_title(f'{pval:.2e}')

    fig.tight_layout()    
    plt.savefig(f"{path}/01_new_data_05_2025/repeated_measures/{subset_name}_feats.png", bbox_inches='tight', dpi=200)
    plt.savefig(f"{path}/01_new_data_05_2025/repeated_measures/{subset_name}_feats.pdf", bbox_inches='tight')
    plt.close(fig)

# Old vs New by groups

In [None]:
pathlib.Path(f"{path}/01_new_data_05_2025/old_vs_new_by_groups").mkdir(parents=True, exist_ok=True)

# 'Group detailed' labels that make up the Old and the New part of every cohort.
group_detailed_old_vs_new = {
    'Central': {'Old': 'Old Central', 'New': 'New 2025 UNN'},
    'Mirny': {'Old': 'Old Mirny', 'New': 'New 2025 Mirny'},
    'FMBA': {'Old': 'Old FMBA', 'New': 'New 2025 FMBA'},
}

# Sample ids per cohort and period, looked up in the merged table.
samples_old_vs_new = {
    cohort: {
        period: imm.index[imm['Group detailed'] == detailed_label].values
        for period, detailed_label in period_labels.items()
    }
    for cohort, period_labels in group_detailed_old_vs_new.items()
}

# One colour per cohort.
colors_samples_groups = dict(Central='gold', Mirny='crimson', FMBA='dodgerblue')

# Two shades per cohort to tell Old from New samples apart.
colors_samples_old_vs_new = {
    'Central': dict(Old='goldenrod', New='yellow'),
    'Mirny': dict(Old='firebrick', New='tomato'),
    'FMBA': dict(Old='mediumblue', New='deepskyblue'),
}

for group_name, group_samples_dict in samples_old_vs_new.items():
    df_imm_curr = imm.loc[list(set.union(*(set(group_samples) for group, group_samples in group_samples_dict.items()))), ['Age', 'SImAge', '|SImAge acceleration|', 'SImAge acceleration'] + list(feats_imm_slctd)]
    for group, group_samples in group_samples_dict.items():
        df_imm_curr.loc[group_samples, group_name] = group
    df_imm_curr.to_excel(f"{path}/01_new_data_05_2025/old_vs_new_by_groups/{group_name}_data.xlsx")
    
    hist_bins = np.linspace(5, 115, 23)
    fig, ax = plt.subplots(figsize=(6, 4))
    histplot = sns.histplot(
        data=df_imm_curr,
        bins=hist_bins,
        edgecolor='k',
        linewidth=1,
        x="Age",
        hue=group_name,
        palette=colors_samples_old_vs_new[group_name],
        ax=ax
    )
    histplot.set(xlim=(0, 120))
    plt.savefig(f"{path}/01_new_data_05_2025/old_vs_new_by_groups/{group_name}_hist_age.png", bbox_inches='tight', dpi=200)
    plt.savefig(f"{path}/01_new_data_05_2025/old_vs_new_by_groups/{group_name}_hist_age.pdf", bbox_inches='tight')
    plt.close(fig)
    
    df_stat = pd.DataFrame(index=feats_imm_slctd)
    for f_if, f in enumerate(feats_imm_slctd):
        for group, group_samples in group_samples_dict.items():
            df_stat.at[f, f"Spearman (Age) for {group}"] = stats.spearmanr(
                df_imm_curr.loc[group_samples, 'Age'].values,
                df_imm_curr.loc[group_samples, f].values
            ).statistic
            df_stat.at[f, f"Pearson (Age) for {group}"] = stats.pearsonr(
                df_imm_curr.loc[group_samples, 'Age'].values,
                df_imm_curr.loc[group_samples, f].values
            ).statistic
        _, df_stat.at[f, "mannwhitneyu_pval"] = mannwhitneyu(df_imm_curr.loc[df_imm_curr[group_name] == 'New', f].values, df_imm_curr.loc[df_imm_curr[group_name] == 'Old', f].values, alternative='two-sided')
        _, df_stat.at[f, "levene_pval"] = levene(df_imm_curr.loc[df_imm_curr[group_name] == 'New', f].values, df_imm_curr.loc[df_imm_curr[group_name] == 'Old', f].values)
        regcov = smf.ols(formula=f"Q('{f}') ~ Q('{group_name}') + Age", data=df_imm_curr).fit()
        reg_sum = regcov.summary2().tables[1]
        pvals_cols = reg_sum.index[reg_sum.index.str.contains(group_name)].values
        for pval_col_id, pval_col in enumerate(pvals_cols):
            df_stat.at[f, f"ancova_{pval_col}_pval"] = reg_sum.at[pval_col, 'P>|t|']
    _, df_stat.loc[feats_imm_slctd, "mannwhitneyu_pval_fdr_bh"], _, _ = multipletests(df_stat.loc[feats_imm_slctd, "mannwhitneyu_pval"].values, 0.05, method='fdr_bh')
    _, df_stat.loc[feats_imm_slctd, "levene_pval_fdr_bh"], _, _ = multipletests(df_stat.loc[feats_imm_slctd, "levene_pval"].values, 0.05, method='fdr_bh')
    pvals_cols_ancova = df_stat.columns[df_stat.columns.str.contains(group_name)].values
    for pval_col in pvals_cols_ancova:
        _, df_stat.loc[feats_imm_slctd, f"{pval_col}_fdr_bh"], _, _ = multipletests(df_stat.loc[feats_imm_slctd, pval_col].values, 0.05, method='fdr_bh')
    df_stat.sort_values([f"mannwhitneyu_pval"], ascending=[True], inplace=True)
    df_stat.to_excel(f"{path}/01_new_data_05_2025/old_vs_new_by_groups/{group_name}_stat.xlsx")
    
    for stat_test in [x.replace('_pval', '') for x in pvals_cols_ancova] + ['mannwhitneyu', 'levene']:
        df_fig = df_stat.copy()
        df_fig.sort_values([f"{stat_test}_pval"], ascending=[True], inplace=True)
        df_fig['Features'] = df_fig.index
        df_fig[f'{stat_test}_pval_fdr_bh_log'] = -np.log10(df_fig[f'{stat_test}_pval_fdr_bh'])
        df_fig['color'] = 'white'
        df_fig.loc[df_fig[f'{stat_test}_pval_fdr_bh'] < 0.05, 'color'] = colors_samples_groups[group_name]
        sns.set_theme(style='ticks')
        fig, ax = plt.subplots(figsize=(3, df_fig.shape[0] * 0.5))
        barplot = sns.barplot(
            data=df_fig,
            y='Features',
            x=f'{stat_test}_pval_fdr_bh_log',
            edgecolor='black',
            palette=df_fig['color'].values,
            ax=ax,
        )
        ax.set_xlabel(r"$-\log_{10}(\mathrm{p-value})$")
        ax.xaxis.tick_top()
        ax.xaxis.set_label_position('top')
        ax.set_ylabel('')
        plt.savefig(f"{path}/01_new_data_05_2025/old_vs_new_by_groups/{group_name}_{stat_test}.png", bbox_inches='tight', dpi=200)
        plt.savefig(f"{path}/01_new_data_05_2025/old_vs_new_by_groups/{group_name}_{stat_test}.pdf", bbox_inches='tight')
        plt.close(fig)
        
    # Grouped horizontal bar charts of every feature's correlation with Age,
    # one bar per cohort (Old / New), drawn once for Spearman and once for Pearson.
    for corr_type in ['Spearman', 'Pearson']:
        df_fig = df_stat.copy()
        df_fig['Features'] = df_fig.index
        # Long format: one row per (feature, cohort) with the correlation value.
        df_fig = df_fig.melt(id_vars='Features', value_vars=[f"{corr_type} (Age) for {group}" for group in ['Old', 'New']], var_name='Group', value_name=fr"{corr_type} $\rho$")
        # Shorten the melted column names to the bare cohort labels expected by
        # hue_order. The result is assigned back rather than using
        # replace(inplace=True) on the column selection: that chained in-place
        # call raises a FutureWarning in pandas 2.2 and leaves df_fig unchanged
        # once Copy-on-Write is active.
        df_fig['Group'] = df_fig['Group'].replace({f"{corr_type} (Age) for {group}": f"{group}" for group in ['Old', 'New']})
        df_fig.sort_values([fr"{corr_type} $\rho$"], ascending=[False], inplace=True)
        sns.set_theme(style='ticks')
        # Figure height scales with the number of (feature, cohort) rows.
        fig, ax = plt.subplots(figsize=(3, df_fig.shape[0] * 0.15))
        barplot = sns.barplot(
            data=df_fig,
            y='Features',
            x=fr"{corr_type} $\rho$",
            edgecolor='black',
            palette=colors_samples_old_vs_new[group_name],
            hue='Group',
            hue_order=['Old', 'New'],
            ax=ax,
        )
        ax.xaxis.tick_top()
        ax.xaxis.set_label_position('top')
        ax.set_ylabel('')
        plt.savefig(f"{path}/01_new_data_05_2025/old_vs_new_by_groups/{group_name}_{corr_type}.png", bbox_inches='tight', dpi=200)
        plt.savefig(f"{path}/01_new_data_05_2025/old_vs_new_by_groups/{group_name}_{corr_type}.pdf", bbox_inches='tight')
        plt.close(fig)
        
    sns.set_theme(style='ticks')
    fig = plt.figure(
        figsize=(24, 20),
        layout="constrained"
    )
    subfigs = fig.subfigures(
        nrows=8,
        ncols=4,
        wspace=0.01,
        hspace=0.01,
    )
    feats_imm_slctd_sorted = df_stat.sort_values([f'Pearson (Age) for Old'], ascending=[False]).index.values
    for feat_id, feat in enumerate(feats_imm_slctd_sorted):
        row_id, col_id = divmod(feat_id, 4)

        axs = subfigs[row_id, col_id].subplot_mosaic(
            [
                ['11', '12'],
                ['21', '22'],
            ],
            height_ratios=[1, 4],
            width_ratios=[3, 1.5],
            gridspec_kw={
                # "bottom": 0.14,
                # "top": 0.95,
                # "left": 0.1,
                # "right": 0.5,
                "wspace": 0.01,
                "hspace": 0.01,
            },
        )
        
        ds_table_age = pd.DataFrame(index=[fr"Pearson $\rho$", fr"Spearman $\rho$"], columns=['Old', 'New'])
        for corr_type in ['Spearman', 'Pearson']:
            for group in ['Old', 'New']:
                cell_value = df_stat.at[feat, f'{corr_type} (Age) for {group}']
                ds_table_age.at[fr"{corr_type} $\rho$", group] = f"{cell_value:0.2f}"
        col_defs = [
            ColumnDefinition(
                name="index",
                title='Correlation with Age',
                textprops={"ha": "left"},
                width=4.5,
            ),
            ColumnDefinition(
                name='Old',
                title='Old',
                textprops={"ha": "center"},
                width=2.0,
            ),
            ColumnDefinition(
                name='New',
                title='New',
                textprops={"ha": "center"},
                width=2.0,
            ),
        ]
        table = Table(
            ds_table_age,
            column_definitions=col_defs,
            row_dividers=True,
            footer_divider=False,
            ax=axs['11'],
            textprops={"fontsize": 6},
            row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
            col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
            column_border_kw={"linewidth": 1, "linestyle": "-"},
        ).autoset_fontcolors(colnames=['Old', 'New'])
        
        # Per-feature table of FDR-adjusted p-values for the Old-vs-New comparison.
        ds_table_groups = pd.DataFrame(index=["Mann-Whitney", "Levene", 'ANCOVA'], columns=['p-values (FDR)'])
        mannwhitneyu_pval = df_stat.loc[feat, 'mannwhitneyu_pval_fdr_bh']
        levene_pval = df_stat.loc[feat, 'levene_pval_fdr_bh']
        pvals_cols_ancova = df_stat.columns[df_stat.columns.str.contains(group_name)].values
        # df_stat holds both the raw ANCOVA p-value column for the group term and
        # its "_fdr_bh" counterpart, and the raw one was created first. Pick the
        # adjusted column explicitly so the value matches the "p-values (FDR)"
        # label, like the Mann-Whitney and Levene rows.
        ancova_fdr_cols = [col for col in pvals_cols_ancova if col.endswith('_fdr_bh')]
        ancova_pval = df_stat.loc[feat, ancova_fdr_cols[0]]
        ds_table_groups.at["Mann-Whitney", "p-values (FDR)"] = f"{mannwhitneyu_pval:0.2e}"
        ds_table_groups.at["Levene", "p-values (FDR)"] = f"{levene_pval:0.2e}"
        ds_table_groups.at["ANCOVA", "p-values (FDR)"] = f"{ancova_pval:0.2e}"
        col_defs = [
            ColumnDefinition(
                name="index",
                title='Test',
                textprops={"ha": "left"},
                width=4.5,
            ),
            ColumnDefinition(
                name='p-values (FDR)',
                title='p-values (FDR)',
                textprops={"ha": "center"},
                width=2.0,
            ),
        ]
        table = Table(
            ds_table_groups,
            column_definitions=col_defs,
            row_dividers=True,
            footer_divider=False,
            ax=axs['12'],
            textprops={"fontsize": 5},
            row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
            col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
            column_border_kw={"linewidth": 1, "linestyle": "-"},
        ).autoset_fontcolors(colnames=['p-values (FDR)'])
        
        for group in group_samples_dict.keys():    
            regplot = sns.regplot(
                data=df_imm_curr.loc[df_imm_curr[group_name] == group, :],
                x='Age',
                y=feat,
                label=group,
                color=colors_samples_old_vs_new[group_name][group],
                scatter_kws=dict(
                    linewidth=0.5,
                    alpha=0.75,
                    edgecolor="k",
                    s=16,
                ),
                ax=axs['21']
            )
        
        sns.violinplot(
            data=df_imm_curr,
            x=group_name,
            y=feat,
            hue=group_name,
            palette=colors_samples_old_vs_new[group_name],
            density_norm='width',
            order=['Old', 'New'],
            saturation=0.75,
            linewidth=1.0,
            ax=axs['22'],
            legend=False,
            cut=0,
        )
        axs['22'].set_ylabel(feat)
        axs['22'].set_xlabel("")

    fig.savefig(f"{path}/01_new_data_05_2025/old_vs_new_by_groups/{group_name}_feats.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{path}/01_new_data_05_2025/old_vs_new_by_groups/{group_name}_feats.pdf", bbox_inches='tight')
    plt.close(fig)

# Different groups in new data

In [None]:
# Output folder for the comparison of the three 'New 2025 ...' groups.
pathlib.Path(f"{path}/01_new_data_05_2025/different_groups_in_new_data").mkdir(parents=True, exist_ok=True)

# Sample labels of each group, taken from the 'Group detailed' column of imm.
group_samples_dict = {
    'Central': imm.index[imm['Group detailed'] == 'New 2025 UNN'].values,
    'Mirny': imm.index[imm['Group detailed'] == 'New 2025 Mirny'].values,
    'FMBA': imm.index[imm['Group detailed'] == 'New 2025 FMBA'].values
}

# Plot colour of each group.
colors_samples_groups = {
    'Central': 'gold',
    'Mirny': 'crimson',
    'FMBA': 'dodgerblue',
}

# Select the rows with a boolean mask so the table keeps the row order of imm.
# Building the row list from a set gives an arbitrary order (for string labels
# it can differ between interpreter sessions), which made the exported
# data.xlsx non-reproducible. The .copy() makes the new 'Group' column below an
# assignment on an independent frame.
samples_selected = np.concatenate(list(group_samples_dict.values()))
df_imm_curr = imm.loc[
    imm.index.isin(samples_selected),
    ['Age', 'SImAge', '|SImAge acceleration|', 'SImAge acceleration'] + list(feats_imm_slctd),
].copy()
for group, group_samples in group_samples_dict.items():
    df_imm_curr.loc[group_samples, 'Group'] = group
df_imm_curr.to_excel(f"{path}/01_new_data_05_2025/different_groups_in_new_data/data.xlsx")

# Age histogram coloured by group: 22 equal-width bins (5 units each) spanning
# 5-115, with the x axis shown over 0-120.
hist_bins = np.linspace(5, 115, 23)
fig, ax = plt.subplots(figsize=(6, 4))
histplot = sns.histplot(
    data=df_imm_curr,
    x="Age",
    hue='Group',
    bins=hist_bins,
    palette=colors_samples_groups,
    edgecolor='k',
    linewidth=1,
    ax=ax
)
histplot.set_xlim(0, 120)
# Raster copy at 200 dpi plus a vector copy.
fig.savefig(f"{path}/01_new_data_05_2025/different_groups_in_new_data/hist_age.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path}/01_new_data_05_2025/different_groups_in_new_data/hist_age.pdf", bbox_inches='tight')
plt.close(fig)

# Per-feature statistics table: correlation with Age inside each group, then
# pairwise group comparisons (Mann-Whitney U, Levene, and an OLS model with Age
# as covariate), followed by an FDR (fdr_bh) adjustment of every p-value column.
df_stat = pd.DataFrame(index=feats_imm_slctd)
group_pairs = [['Central', 'Mirny'], ['Central', 'FMBA'], ['Mirny', 'FMBA']]
for feat_name in feats_imm_slctd:

    # Spearman and Pearson correlation of the feature with Age within each group.
    for group, group_samples in group_samples_dict.items():
        ages = df_imm_curr.loc[group_samples, 'Age'].values
        feat_values = df_imm_curr.loc[group_samples, feat_name].values
        df_stat.at[feat_name, f"Spearman (Age) for {group}"] = stats.spearmanr(ages, feat_values).statistic
        df_stat.at[feat_name, f"Pearson (Age) for {group}"] = stats.pearsonr(ages, feat_values).statistic

    for first, second in group_pairs:
        values_first = df_imm_curr.loc[df_imm_curr['Group'] == first, feat_name].values
        values_second = df_imm_curr.loc[df_imm_curr['Group'] == second, feat_name].values
        # Only the p-value (second element of each test result) is stored.
        df_stat.at[feat_name, f"mw_{first}_vs_{second}"] = mannwhitneyu(values_first, values_second, alternative='two-sided')[1]
        df_stat.at[feat_name, f"lv_{first}_vs_{second}"] = levene(values_first, values_second)[1]
        # OLS of the feature on Group and Age, restricted to the two groups of
        # the pair; the p-value of the Group term is kept.
        pair_rows = df_imm_curr.loc[df_imm_curr['Group'].isin([first, second]), :]
        regcov = smf.ols(formula=f"Q('{feat_name}') ~ Group + Age", data=pair_rows).fit()
        reg_sum = regcov.summary2().tables[1]
        for group_term in reg_sum.index[reg_sum.index.str.contains('Group')].values:
            df_stat.at[feat_name, f"ancova_{first}_vs_{second}"] = reg_sum.at[group_term, 'P>|t|']

# Adjust each p-value column across features; the adjusted values (second
# element returned by multipletests) go into a "<column>_fdr_bh" column.
for first, second in group_pairs:
    for test_prefix in ('mw', 'lv', 'ancova'):
        raw_pvals = df_stat.loc[feats_imm_slctd, f"{test_prefix}_{first}_vs_{second}"].values
        df_stat.loc[feats_imm_slctd, f"{test_prefix}_{first}_vs_{second}_fdr_bh"] = multipletests(raw_pvals, 0.05, method='fdr_bh')[1]
df_stat.to_excel(f"{path}/01_new_data_05_2025/different_groups_in_new_data/stat.xlsx")

# One horizontal bar chart per (test, group pair): -log10 of the FDR-adjusted
# p-value of every feature, with features ordered by the raw p-value. Bars are
# red where the adjusted p-value is below 0.05 and white otherwise.
for stat_test in ['mw', 'lv', 'ancova']:
    for first, second in [['Central', 'Mirny'], ['Central', 'FMBA'], ['Mirny', 'FMBA']]:
        df_fig = df_stat.sort_values(f"{stat_test}_{first}_vs_{second}", ascending=True)
        df_fig['Features'] = df_fig.index
        adjusted_pvals = df_fig[f"{stat_test}_{first}_vs_{second}_fdr_bh"]
        df_fig[f"{stat_test}_{first}_vs_{second}_fdr_bh_log"] = -np.log10(adjusted_pvals)
        df_fig['color'] = np.where(adjusted_pvals < 0.05, 'red', 'white')
        sns.set_theme(style='ticks')
        # Figure height scales with the number of features.
        fig, ax = plt.subplots(figsize=(3, df_fig.shape[0] * 0.5))
        barplot = sns.barplot(
            data=df_fig,
            x=f"{stat_test}_{first}_vs_{second}_fdr_bh_log",
            y='Features',
            palette=df_fig['color'].values,
            edgecolor='black',
            ax=ax,
        )
        # Axis label and ticks go on top of the plot.
        ax.set_xlabel(r"$-\log_{10}(\mathrm{p-value})$")
        ax.xaxis.tick_top()
        ax.xaxis.set_label_position('top')
        ax.set_ylabel('')
        fig.savefig(f"{path}/01_new_data_05_2025/different_groups_in_new_data/{first}_vs_{second}_{stat_test}.png", bbox_inches='tight', dpi=200)
        fig.savefig(f"{path}/01_new_data_05_2025/different_groups_in_new_data/{first}_vs_{second}_{stat_test}.pdf", bbox_inches='tight')
        plt.close(fig)
        
# Grouped horizontal bar charts of every feature's correlation with Age, one bar
# per group (Central / Mirny / FMBA), drawn once for Spearman and once for Pearson.
for corr_type in ['Spearman', 'Pearson']:
    df_fig = df_stat.copy()
    df_fig['Features'] = df_fig.index
    # Long format: one row per (feature, group) with the correlation value.
    df_fig = df_fig.melt(id_vars='Features', value_vars=[f"{corr_type} (Age) for {group}" for group in ['Central', 'Mirny', 'FMBA']], var_name='Group', value_name=fr"{corr_type} $\rho$")
    # Shorten the melted column names to the bare group labels expected by
    # hue_order and the palette. The result is assigned back rather than using
    # replace(inplace=True) on the column selection: that chained in-place call
    # raises a FutureWarning in pandas 2.2 and leaves df_fig unchanged once
    # Copy-on-Write is active.
    df_fig['Group'] = df_fig['Group'].replace({f"{corr_type} (Age) for {group}": f"{group}" for group in ['Central', 'Mirny', 'FMBA']})
    df_fig.sort_values([fr"{corr_type} $\rho$"], ascending=[False], inplace=True)
    sns.set_theme(style='ticks')
    # Figure height scales with the number of (feature, group) rows.
    fig, ax = plt.subplots(figsize=(3, df_fig.shape[0] * 0.15))
    barplot = sns.barplot(
        data=df_fig,
        y='Features',
        x=fr"{corr_type} $\rho$",
        edgecolor='black',
        palette=colors_samples_groups,
        hue='Group',
        hue_order=['Central', 'Mirny', 'FMBA'],
        ax=ax,
    )
    ax.xaxis.tick_top()
    ax.xaxis.set_label_position('top')
    ax.set_ylabel('')
    plt.savefig(f"{path}/01_new_data_05_2025/different_groups_in_new_data/{corr_type}.png", bbox_inches='tight', dpi=200)
    plt.savefig(f"{path}/01_new_data_05_2025/different_groups_in_new_data/{corr_type}.pdf", bbox_inches='tight')
    plt.close(fig)
      
# Features are laid out in descending order of their Pearson correlation with
# Age in the Central group.
feats_imm_slctd_sorted = df_stat.sort_values('Pearson (Age) for Central', ascending=False).index.values

# Overview figure: an 8 x 4 grid of subfigures, one cell per feature.
sns.set_theme(style='ticks')
fig = plt.figure(figsize=(24, 20), layout="constrained")
subfigs = fig.subfigures(nrows=8, ncols=4, wspace=0.01, hspace=0.01)
# Fill the subfigure grid row by row (4 columns). Each feature gets a 2 x 2
# mosaic: correlation table (top left), p-value table (top right),
# Age-vs-feature regression plot (bottom left), violin plot (bottom right).
groups_ordered = ['Central', 'Mirny', 'FMBA']
groups_paired = [['Central', 'Mirny'], ['Central', 'FMBA'], ['Mirny', 'FMBA']]
for feat_id, feat in enumerate(feats_imm_slctd_sorted):
    row_id, col_id = divmod(feat_id, 4)

    axs = subfigs[row_id, col_id].subplot_mosaic(
        [['11', '12'], ['21', '22']],
        height_ratios=[1, 4],
        width_ratios=[3, 1.5],
        gridspec_kw={"wspace": 0.01, "hspace": 0.01},
    )

    # Table of correlations with Age, formatted to two decimals.
    ds_table_age = pd.DataFrame(index=[fr"Pearson $\rho$", fr"Spearman $\rho$"], columns=groups_ordered)
    for corr_type, group in itertools.product(['Spearman', 'Pearson'], groups_ordered):
        cell_value = df_stat.at[feat, f'{corr_type} (Age) for {group}']
        ds_table_age.at[fr"{corr_type} $\rho$", group] = f"{cell_value:0.2f}"
    col_defs = [
        ColumnDefinition(name="index", title='Correlation with Age', textprops={"ha": "left"}, width=4.5),
    ] + [
        ColumnDefinition(name=group, title=group, textprops={"ha": "center"}, width=2.0)
        for group in groups_ordered
    ]
    table = Table(
        ds_table_age,
        column_definitions=col_defs,
        row_dividers=True,
        footer_divider=False,
        ax=axs['11'],
        textprops={"fontsize": 6},
        row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
        col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
        column_border_kw={"linewidth": 1, "linestyle": "-"},
    ).autoset_fontcolors(colnames=groups_ordered)

    # Table of FDR-adjusted p-values, one column per group pair, in scientific
    # notation with two decimals.
    pair_columns = [f"{first}\nvs\n{second}" for first, second in groups_paired]
    ds_table_groups = pd.DataFrame(index=["Mann-Whitney", "Levene", 'ANCOVA'], columns=pair_columns)
    for (first, second), pair_column in zip(groups_paired, pair_columns):
        mannwhitneyu_pval = df_stat.loc[feat, f"mw_{first}_vs_{second}_fdr_bh"]
        levene_pval = df_stat.loc[feat, f"lv_{first}_vs_{second}_fdr_bh"]
        ancova_pval = df_stat.loc[feat, f"ancova_{first}_vs_{second}_fdr_bh"]
        ds_table_groups.at["Mann-Whitney", pair_column] = f"{mannwhitneyu_pval:0.2e}"
        ds_table_groups.at["Levene", pair_column] = f"{levene_pval:0.2e}"
        ds_table_groups.at["ANCOVA", pair_column] = f"{ancova_pval:0.2e}"
    col_defs = [
        ColumnDefinition(name="index", title='Test', textprops={"ha": "left"}, width=2.5),
    ] + [
        ColumnDefinition(name=pair_column, title=f"{first}-{second}", textprops={"ha": "center"}, width=2.7)
        for (first, second), pair_column in zip(groups_paired, pair_columns)
    ]
    table = Table(
        ds_table_groups,
        column_definitions=col_defs,
        row_dividers=True,
        footer_divider=False,
        ax=axs['12'],
        textprops={"fontsize": 4},
        row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
        col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
        column_border_kw={"linewidth": 1, "linestyle": "-"},
    ).autoset_fontcolors(colnames=pair_columns)

    # Scatter with regression line of the feature against Age, one overlay per group.
    scatter_style = dict(linewidth=0.5, alpha=0.75, edgecolor="k", s=16)
    for group in group_samples_dict:
        sns.regplot(
            data=df_imm_curr.loc[df_imm_curr['Group'] == group, :],
            x='Age',
            y=feat,
            label=group,
            color=colors_samples_groups[group],
            scatter_kws=scatter_style,
            ax=axs['21']
        )

    # Distribution of the feature in each group.
    sns.violinplot(
        data=df_imm_curr,
        x='Group',
        y=feat,
        hue='Group',
        order=groups_ordered,
        palette=colors_samples_groups,
        density_norm='width',
        saturation=0.75,
        linewidth=1.0,
        cut=0,
        legend=False,
        ax=axs['22'],
    )
    axs['22'].set_ylabel(feat)
    axs['22'].set_xlabel("")

# Raster copy at 200 dpi plus a vector copy of the whole grid.
fig.savefig(f"{path}/01_new_data_05_2025/different_groups_in_new_data/feats.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path}/01_new_data_05_2025/different_groups_in_new_data/feats.pdf", bbox_inches='tight')
plt.close(fig)