# Debugging autoreload

In [None]:
# IPython magics: enable the autoreload extension in mode 2, so imported
# modules are re-imported before each cell runs (handy while editing the
# project's own `src` package alongside this notebook).
%load_ext autoreload
%autoreload 2

# Load packages

In [None]:
from pytorch_tabular.utils import load_covertype_dataset
from rich.pretty import pprint
from plotly.subplots import make_subplots
from pytorch_tabular import TabularModel
import torch
import plotly.graph_objects as go
from scipy import stats
import shap
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.impute import KNNImputer
from glob import glob
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import itertools
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from pytorch_tabular import model_sweep
from src.pt.model_sweep import model_sweep_custom
import warnings
import pathlib
from tqdm import tqdm
import distinctipy
import matplotlib.patheffects as pe
import matplotlib.colors as mcolors
from statannotations.Annotator import Annotator
from scipy.stats import mannwhitneyu
from plottable import ColumnDefinition, Table
from plottable.plots import bar
from plottable.cmap import normed_cmap, centered_cmap
import optuna
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.cm
import matplotlib as mpl
from statsmodels.stats.multitest import multipletests
import re
import datetime
from collections import Counter
from matplotlib.ticker import MaxNLocator
from itertools import chain
from sklearn.metrics import mean_absolute_error
import pyaging as pya
import matplotlib.lines as mlines
from src.models.simage.tabular.widedeep.ft_transformer import WDFTTransformerModel
import statsmodels.formula.api as smf
from itertools import chain
from pingouin import ancova
from sklearn.preprocessing import LabelEncoder 
import upsetplot


def make_rgb_transparent(rgb, bg_rgb, alpha):
    """Blend a foreground colour over a background colour.

    Each channel of the result is ``alpha * fg + (1 - alpha) * bg``, i.e. the
    opaque colour you would see if ``rgb`` were drawn with opacity ``alpha``
    on top of ``bg_rgb``.

    Parameters
    ----------
    rgb : iterable of numbers
        Foreground colour channels.
    bg_rgb : iterable of numbers
        Background colour channels, paired position-wise with ``rgb``.
    alpha : number
        Weight of the foreground colour.

    Returns
    -------
    list
        Blended channels; as long as the shorter of the two inputs.
    """
    blended = []
    for fg_channel, bg_channel in zip(rgb, bg_rgb):
        blended.append(alpha * fg_channel + (1 - alpha) * bg_channel)
    return blended


def form_bar(base):
    """Build a formatter that renders a fraction as ``"<count>/<base>"``.

    The returned callable takes a fraction ``x``, multiplies it by ``base``,
    rounds to the nearest integer and returns e.g. ``"5/10"``.
    """
    def formatter(x):
        count = int(round(x * base))
        return f'{count}/{base}'
    return formatter


def get_sections(sets):
    """
    Split a collection of sets into the mutually exclusive regions of their
    Venn diagram.

    For N input sets there are (2**N)-1 regions: every non-empty choice of
    "included" sets, where a region holds the elements that belong to all
    included sets and to none of the remaining ones.

    Parameters
    ----------
    sets : sequence of iterables
        The input sets. Each item is converted with ``set(...)``, so lists,
        tuples and frozensets are accepted as well as plain sets.

    Returns
    -------
    combinations : dict
        Maps ``tag`` -> ``set``:

        tag : str
            Binary string of length N; character ``i`` is ``'1'`` when
            ``sets[i]`` is included in the region and ``'0'`` when it is
            excluded.
        set : set
            Elements of that region (may be empty).

        An empty ``sets`` yields an empty dict.
    """
    # Normalise once so set.intersection / difference work for any iterable.
    sets = [set(s) for s in sets]
    n_sets = len(sets)

    combo_sets = {}
    # Walk the bit masks from "all sets included" down to a single set; the
    # all-zero mask (nothing included) is not a region and is skipped.
    for bits in range(2 ** n_sets - 1, 0, -1):
        membership = [bool(bits & (1 << i)) for i in range(n_sets)]
        include_sets = [s for s, member in zip(sets, membership) if member]
        exclude_sets = [s for s, member in zip(sets, membership) if not member]
        combo = set.intersection(*include_sets).difference(*exclude_sets)
        tag = ''.join('1' if member else '0' for member in membership)
        combo_sets[tag] = combo
    return combo_sets


# Adding status column

In [None]:
# Build the phenotype table: add a Case/Control `Status` column and one 0/1
# indicator column per (doctor, disease code) pair found in the specialists'
# code columns, then save the table and the list of indicator columns.
path = "E:/YandexDisk/Work/bbd/fmba"

data = pd.read_excel(f"{path}/ПриложениеГ (КОНФИДЕНЦИАЛЬНО).xlsx", index_col='работник_ID')
# Only used by the optional override that is commented out below.
data_map = pd.read_excel(f"{path}/origin/Список Лесной 19.11.2024 (3).xlsx", index_col='работник_ID')
data['Status'] = None
data.loc[data['признак 500 здоровые больные'] == '536_б', 'Status'] = 'Case'
data.loc[data['признак 500 здоровые больные'] == '536_з', 'Status'] = 'Control'
# Optional override: re-label the controls flagged in `data_map` as cases.
# data.loc[data_map.index[data_map['Controls which are Cases'] == 1], 'Status'] = 'Case'

# Source column with ';'-separated disease codes -> doctor label used as the
# prefix of the generated indicator columns.
cols_diseases = {
    'невропатолог - код_заболевания': 'Невропатолог',
    'отоларинголог - код_заболевания': 'Отоларинголог',
    'офтальмолог - код_заболевания': 'Офтальмолог',
    'дерматолог - код_заболевания': 'Дерматолог',
    'хирург - код_заболевания': 'Хирург',
    'терапевт - код_заболевания': 'Терапевт',
}

# One row per generated indicator column, remembering which doctor it belongs to.
df_diseases = pd.DataFrame()

for col_disease, doctor in cols_diseases.items():
    print(col_disease)
    # Cells equal to 'нет' are treated as missing; every remaining cell is
    # split into a list of codes. Computed once and reused below.
    srs_statuses = data[col_disease].replace({'нет': np.nan}).dropna().str.split(';')

    # Count every non-empty code so the indicator columns are created in
    # order of decreasing frequency.
    statuses = np.concatenate(srs_statuses.values)
    statuses = statuses[statuses != '']
    statuses_counter = Counter(statuses)
    df_statuses_counter = pd.DataFrame.from_dict(statuses_counter, orient='index', columns=['Count'])
    df_statuses_counter.sort_values(['Count'], ascending=[False], inplace=True)
    for icd_code in df_statuses_counter.index:
        data[f"{doctor}, {icd_code}"] = 0
        df_diseases.at[f"{doctor}, {icd_code}", 'Doctor'] = doctor

    # Flag each sample for every code it carries; empty fragments (e.g. from a
    # trailing ';') are skipped here, element by element.
    for sample_id, sample_diseases in srs_statuses.items():
        for sample_disease in sample_diseases:
            if sample_disease != '':
                data.at[sample_id, f"{doctor}, {sample_disease}"] = 1

df_diseases.to_excel(f"{path}/diseases.xlsx")
data.to_excel(f"{path}/pheno.xlsx")

# DNAm

## Prepare samples.csv

In [None]:
# Compute each participant's age (in years, relative to a fixed reference
# date) and check which DNAm samples have no matching participant record.
path = "E:/YandexDisk/Work/bbd/fmba"

data = pd.read_excel(f"{path}/ПриложениеГ (КОНФИДЕНЦИАЛЬНО).xlsx", index_col='работник_ID')
has_group_label = data['признак 500 здоровые больные'].notna()
data = data[has_group_label]
data['дата рождения'] = pd.to_datetime(data['дата рождения'])
data['date_now'] = pd.to_datetime("2024-11-11")
age_in_days = (data['date_now'] - data['дата рождения']) / np.timedelta64(1, 'D')
data['Age'] = age_in_days / 365.25
has_age = data['Age'].notna()
data = data[has_age]

dnam = pd.read_csv(f"{path}/dnam/raw/samples.csv", index_col=0)
# Cell output: DNAm sample IDs that are absent from the filtered `data`.
dnam.index.difference(data.index).tolist()

In [None]:
# Copy the ages computed in the previous cell into the DNAm sample sheet
# (rows are matched by sample ID) and save the sheet next to the raw one.
# `path`, `data` and `dnam` come from the previous cell.
# NOTE(review): the previous cell's output lists DNAm IDs missing from `data`;
# confirm that list is empty before running this lookup.
dnam.loc[dnam.index, ['Age']] = data.loc[dnam.index, ['Age']]
dnam.to_csv(f"{path}/dnam/samples.csv")

## Betas to pkl

In [None]:
# Convert the CSV of beta values into a pickle whose rows are samples, in the
# same order as the phenotype table.
path = "E:/YandexDisk/bbd/fmba/dnam/processed/special_63/GSE220622/gen"

pheno = pd.read_csv(f"{path}/pheno.csv", index_col=0)
pheno.index = pheno.index.astype(str)

# The CSV is transposed after loading so that samples become rows, then the
# rows are reordered to follow `pheno`.
betas = (
    pd.read_csv(f"{path}/betas_funnorm.csv", index_col=0)
    .T
    .loc[pheno.index.values, :]
)
betas.to_pickle(f"{path}/betas_funnorm.pkl")

## Calculate epigenetic ages

### Load DNAm data

In [None]:
# Load phenotype and beta values and assemble the single table that the
# epigenetic-age calculations below take as input.
path = "E:/YandexDisk/bbd/fmba/dnam/processed/special_63/GSE220622/gen"

pheno = pd.read_csv(f"{path}/pheno.csv", index_col=0)
pheno.index = pheno.index.astype(str)
betas = pd.read_pickle(f"{path}/betas_funnorm.pkl")

# Keep only the phenotype columns used downstream.
feats_pheno = ['Age', 'Sex', 'Tissue']
pheno = pheno[feats_pheno]

# Join phenotype and betas on the sample ID index.
df_for_ages = pheno.merge(betas, left_index=True, right_index=True)

# Numeric sex indicator: 1 where Sex == 'F', 0 otherwise.
df_for_ages['Female'] = df_for_ages['Sex'].eq('F').astype(int)
df_for_ages = pya.pp.epicv2_probe_aggregation(df_for_ages, verbose=True)

### Calculate EpImAge

In [None]:
# Predict inflammatory markers from DNAm and, from those, EpInflammAge.
# Relies on `df_for_ages` and `path` defined by the "Load DNAm data" cell.
path_epim = "D:/EpInflammAge"

imms_epim = pd.read_excel(f"{path_epim}/models/InflammatoryMarkers/InflammatoryMarkers.xlsx", index_col='feature').index.values
imms_epim_log = [f"{f}_log" for f in imms_epim]
cpgs_epim = pd.read_excel(f"{path_epim}/data/CpGs.xlsx", index_col=0).index.to_list()
# CpGs required by the models that are absent from / present in the data.
cpgs_epim_missed = list(set(cpgs_epim) - set(df_for_ages.columns.values))
cpgs_epim_present = list(set.intersection(set(cpgs_epim), set(df_for_ages.columns.values)))

# Control samples of the reference dataset; only used by the optional
# background restriction that is commented out below.
unn_samples = pd.read_excel(f"{path_epim}/data/cytokines-regression/data.xlsx", index_col=0)
unn_samples = unn_samples.index[unn_samples['Status'] == 'Control'].values

# Explicit copy so the edits below act on an independent frame rather than on
# a selection of `df_for_ages`.
df_for_epim = df_for_ages.loc[:, ['Age'] + cpgs_epim_present].copy()
# Missing CpGs are added as empty columns and imputed further down.
df_for_epim.loc[:, cpgs_epim_missed] = None

models_imms = {}
for imm in (pbar := tqdm(imms_epim)):
    pbar.set_description(f"Loading model for {imm}")
    models_imms[imm] = TabularModel.load_model(f"{path_epim}/models/InflammatoryMarkers/{imm}")

model_age = TabularModel.load_model(f"{path_epim}/models/EpInflammAge")

bkgrd_imp = pd.read_pickle(f"{path_epim}/models/background-imputation.pkl")
# bkgrd_imp = bkgrd_imp.loc[bkgrd_imp.index.intersection(set(unn_samples)), :]

imp_method = 'KNN'
n_nans = df_for_epim.isna().sum().sum()
if n_nans > 0:
    # Fail early with a clear message instead of hitting a NameError on
    # `imputer` further down when an unknown method is configured.
    if imp_method == "KNN":
        imputer = KNNImputer(n_neighbors=5)
    else:
        raise ValueError(f"Unsupported imputation method: {imp_method!r}")
    # Suffix the background IDs so they cannot collide with the sample IDs
    # (`verify_integrity=True` raises if they still do).
    bkgrd_imp.set_index(bkgrd_imp.index.astype(str) + f'_imputation_{imp_method}', inplace=True)
    data_epim_all = pd.concat([df_for_epim, bkgrd_imp], axis=0, verify_integrity=True)
    # Impute on samples + background together, then copy back only the rows
    # belonging to the samples.
    data_epim_all.loc[:, cpgs_epim] = imputer.fit_transform(data_epim_all.loc[:, cpgs_epim].values)
    df_for_epim.loc[df_for_epim.index, cpgs_epim] = data_epim_all.loc[df_for_epim.index, cpgs_epim]

# One model per marker predicts the marker's "<marker>_log" value; the age
# model then takes those predictions as its input.
for imm in imms_epim:
    df_for_epim[f"{imm}_log"] = models_imms[imm].predict(df_for_epim)
df_for_epim['EpInflammAge'] = model_age.predict(df_for_epim.loc[:, imms_epim_log])
df_for_epim['Age Acceleration'] = df_for_epim['EpInflammAge'] - df_for_epim['Age']
df_for_epim[['Age', 'EpInflammAge', 'Age Acceleration'] + imms_epim_log].to_excel(f'{path}/EpInflammAge.xlsx')

### Plot EpImAge

In [None]:
# Plot EpImAge against chronological age for the saved predictions:
# a metrics table (top-left), a scatter with identity and regression lines
# (bottom-left) and a violin of the age acceleration (bottom-right).
path = "E:/YandexDisk/Work/bbd/fmba/dnam"
path_epim = "E:/Git/EpImAge"

imms_epim = pd.read_excel(f"{path_epim}/models/Immunomarkers/Immunomarkers.xlsx", index_col='feature').index.values
imms_epim_log = [f"{f}_log" for f in imms_epim]

df_for_epim = pd.read_excel(f'{path}/processed/EpImAge.xlsx', index_col=0)
# Undo the log transform to get marker values on the original scale.
for imm in imms_epim:
    df_for_epim[imm] = np.exp(df_for_epim[f"{imm}_log"])

# Common axis range: 1st-99th percentile of Age and EpImAge pooled together.
xy_min, xy_max = np.quantile(df_for_epim[['Age', 'EpImAge']].values.flatten(), [0.01, 0.99])
xy_ptp = xy_max - xy_min

n_rows = 2
n_cols = 2
fig_height = 5
fig_width = 7
sns.set_theme(style='ticks')
fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), height_ratios=[2, 8],  width_ratios=[4, 2], gridspec_kw={'wspace':0.10, 'hspace': 0.05}, layout='constrained')

# Summary metrics shown in the table; the single column is labelled with the
# number of samples.
ds_table = pd.DataFrame(index=['MAE', r"Pearson $\rho$", 'Bias'], columns=[f'{df_for_epim.shape[0]} FMBA samples'])
mae = mean_absolute_error(df_for_epim['Age'].values, df_for_epim['EpImAge'].values)
rho, _ = stats.pearsonr(df_for_epim['Age'].values, df_for_epim['EpImAge'].values)
# Bias = mean signed difference (predicted minus chronological age).
bias = np.mean(df_for_epim['EpImAge'] - df_for_epim['Age'])
ds_table.at['MAE', f'{df_for_epim.shape[0]} FMBA samples'] = f"{mae:0.3f}"
ds_table.at[ r"Pearson $\rho$", f'{df_for_epim.shape[0]} FMBA samples'] = f"{rho:0.3f}"
ds_table.at['Bias', f'{df_for_epim.shape[0]} FMBA samples'] = f"{bias:0.3f}"

col_defs = [
    ColumnDefinition(
        name="index",
        title='Metrics',
        textprops={"ha": "left"},
        width=4.5,
    ),
    ColumnDefinition(
        name=f'{df_for_epim.shape[0]} FMBA samples',
        textprops={"ha": "center"},
        width=2.0,
    ),
]
# Top-left panel: metrics table.
table = Table(
    ds_table,
    column_definitions=col_defs,
    row_dividers=True,
    footer_divider=False,
    ax=axs[0, 0],
    textprops={"fontsize": 8},
    row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
    col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
    column_border_kw={"linewidth": 1, "linestyle": "-"},
).autoset_fontcolors(colnames=[f'{df_for_epim.shape[0]} FMBA samples'])

# Bottom-left panel: samples, then the y = x line, then the regression fit.
scatter = sns.scatterplot(
    data=df_for_epim,
    x='Age',
    y="EpImAge",
    linewidth=0.5,
    alpha=0.8,
    edgecolor="k",
    s=25,
    color='crimson',
    ax=axs[1, 0],
)
bisect = sns.lineplot(
    x=[xy_min - 0.15 * xy_ptp, xy_max + 0.15 * xy_ptp],
    y=[xy_min - 0.15 * xy_ptp, xy_max + 0.15 * xy_ptp],
    linestyle='--',
    color='black',
    linewidth=1.0,
    ax=axs[1, 0]
)
regplot = sns.regplot(
    data=df_for_epim,
    x='Age',
    y='EpImAge',
    color='red',
    scatter=False,
    truncate=False,
    ax=axs[1, 0]
)
# Same limits on both axes (quantile range widened by 15% on each side).
axs[1, 0].set_xlim(xy_min - 0.15 * xy_ptp, xy_max + 0.15 * xy_ptp)
axs[1, 0].set_ylim(xy_min - 0.15 * xy_ptp, xy_max + 0.15 * xy_ptp)
axs[1, 0].set_ylabel("EpImAge")
axs[1, 0].set_xlabel("Age")

# Top-right panel is intentionally left empty.
axs[0, 1].axis('off')

# Bottom-right panel: distribution of age acceleration as a single violin
# (a constant x puts all samples into one category).
violin = sns.violinplot(
    data=df_for_epim,
    x=[0] * df_for_epim.shape[0],
    y='Age Acceleration',
    color=make_rgb_transparent(mcolors.to_rgb('crimson'), (1, 1, 1), 0.5),
    density_norm='width',
    saturation=0.75,
    linewidth=1.0,
    ax=axs[1, 1],
    legend=False,
)
axs[1, 1].set_ylabel('Age Acceleration')
axs[1, 1].set_xlabel('')
axs[1, 1].set(xticklabels=[])
axs[1, 1].set(xticks=[])
fig.savefig(f"{path}/processed/EpImAge.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path}/processed/EpImAge.pdf", bbox_inches='tight')
plt.close(fig)

### Calculate pyaging

In [None]:
# Run the pyaging clocks on the DNAm table and save the predictions together
# with the phenotype columns.
# Relies on `df_for_ages`, `pheno`, `feats_pheno` and `path` from the
# "Load DNAm data" cell.
path_clocks = "E:/YandexDisk/pydnameth/datasets/pyaging"

# Clock identifiers passed to pyaging; the order also fixes the column order
# of the saved table.
clocks = [
    "altumage", "dunedinpace", "han", "knight",
    "leecontrol", "leerefinedrobust", "leerobust",
    "dnamfitage", "dnamphenoage", "dnamtl",
    "encen100", "encen40",
    "grimage", "grimage2",
    "hannum", "horvath2013", "hrsinchphenoage", "lin",
    "pcdnamtl", "pcgrimage", "pchannum", "pchorvath2013",
    "pcphenoage", "pcskinandblood",
    "pedbe", "replitali", "skinandblood",
    "stemtoc", "stoch", "stocp", "stocz",
    "yingadaptage", "yingcausage", "yingdamage",
    "zhangblup", "zhangen", "zhangmortality",
    "epitoc1", "retroelementagev1", "retroelementagev2",
    "intrinclock",
    "abec", "cabec", "eabec",
    "pipekelasticnet", "pipekfilteredh", "pipekretrainedh",
    "dnamic",
]

adata = pya.pp.df_to_adata(
    df_for_ages,
    metadata_cols=['Sex', 'Tissue'],
    imputer_strategy='knn',
    verbose=True,
)
# The predictions are read back from `adata.obs` below.
pya.pred.predict_age(
    adata=adata,
    dir=path_clocks,
    clock_names=clocks,
    verbose=True,
)

# Phenotype columns and clock outputs side by side, matched on sample ID.
results = pheno.loc[:, feats_pheno].merge(
    adata.obs[clocks],
    left_index=True,
    right_index=True,
)
results.to_excel(f"{path}/pheno_funnorm.xlsx")

## Plot epigenetic ages

In [None]:
# Load clock metadata and predictions, rename prediction columns to readable
# clock names, assign one colour per clock and compute age accelerations.
path_clocks = "D:/YandexDisk/Work/pydnameth/datasets/pyaging"
path = "D:/YandexDisk/Work/bbd/fmba/dnam"

clocks_meta = pd.read_excel(f"{path_clocks}/clocks_meta.xlsx", index_col='Clock Name')
clocks_meta['Clock Name'] = clocks_meta.index

pheno = pd.read_excel(f"{path}/processed/pheno.xlsx", index_col=0)
# Prediction columns are named by model ID; switch them to the clock names.
model_id_to_name = dict(zip(clocks_meta['Model ID'].values, clocks_meta['Clock Name'].values))
pheno = pheno.rename(columns=model_id_to_name)

# Clocks excluded from the figures.
clocks_excluded = ['Knight', 'LeeControl', 'LeeRefinedRobust', 'LeeRobust', 'PedBE', 'RepliTali', 'ENCen100']
clocks_meta = clocks_meta.drop(index=clocks_excluded)

# Clocks whose 'Type' is 'Age' versus every other type of output.
is_age_clock = clocks_meta['Type'] == 'Age'
clocks_ages = clocks_meta[is_age_clock]
clocks_metrics = clocks_meta[~is_age_clock]

# One distinct colour per remaining clock; black and white are excluded.
colors = distinctipy.get_colors(
    clocks_meta.shape[0],
    [mcolors.hex2color(mcolors.CSS4_COLORS['black']), mcolors.hex2color(mcolors.CSS4_COLORS['white'])],
    rng=1337,
    pastel_factor=0.5,
)
colors_clocks = dict(zip(clocks_meta.index, colors))

# Acceleration = predicted age minus chronological age.
for ea in clocks_ages.index:
    pheno[f"{ea} Acceleration"] = pheno[ea] - pheno['Age']

In [None]:
# Axis range shared by the age-vs-clock scatter plots in the next cell:
# 1st-99th percentile of chronological age and all age-clock predictions
# pooled together.
xy_min, xy_max = np.quantile(pheno[['Age'] + clocks_ages.index.to_list()].values.flatten(), [0.01, 0.99])
xy_ptp = xy_max - xy_min

# One violin per non-age clock, each on its own axes with its own y-scale.
# Only one figure is created here: a separate `plt.figure(...)` call before
# `plt.subplots` would leave an extra, never-closed figure behind.
sns.set_theme(style='ticks')
fig, axs = plt.subplots(
    nrows=1,
    ncols=6,  # the grid has six panels, so at most six metrics can be shown
    figsize=(12, 3),
    gridspec_kw={
        'wspace':0.05,
        'hspace': 0.15
    },
    sharey=False,
    sharex=False,
    layout="constrained"
)

for em_id, em in enumerate(clocks_metrics.index):
    # A constant x puts all samples into a single violin.
    violin = sns.violinplot(
        data=pheno,
        x=[0] * pheno.shape[0],
        y=em,
        color=make_rgb_transparent(mcolors.to_rgb(colors_clocks[em]), (1, 1, 1), 0.75),
        density_norm='width',
        saturation=0.75,
        linewidth=1.0,
        ax=axs[em_id],
        legend=False,
    )
    axs[em_id].set_ylabel(em)
    axs[em_id].set_xlabel('')
    axs[em_id].set(xticklabels=[])
    axs[em_id].set(xticks=[])
fig.savefig(f"{path}/metrics.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path}/metrics.pdf", bbox_inches='tight')
plt.close(fig)

In [None]:
# Grid of per-clock panels: for every age clock a metrics table ('A'), a
# scatter of predicted vs chronological age ('B') and a violin of the age
# acceleration ('C'). Uses `pheno`, `clocks_ages`, `colors_clocks`, `xy_min`,
# `xy_max` and `xy_ptp` from the previous cells.
sns.set_theme(style='ticks')
fig = plt.figure(
    figsize=(32, 14),
    layout="constrained"
)
# 4 x 7 grid of subfigures: room for at most 28 clocks.
subfigs = fig.subfigures(
    nrows=4,
    ncols=7,
    # wspace=0.001,
    # hspace=0.001,
)

for ea_id, ea in enumerate(clocks_ages.index):
    # Fill the grid row by row.
    row_id, col_id = divmod(ea_id, 7)

    axs = subfigs[row_id, col_id].subplot_mosaic(
        [
            ['A', 'A'],
            ['B', 'C'],
        ],
        height_ratios=[1, 4],
        width_ratios=[3, 1],
        gridspec_kw={
            "bottom": 0.14,
            "top": 0.95,
            # "left": 0.1,
            # "right": 0.5,
            "wspace": 0.63,
            "hspace": 0.01,
        },
    )

    # Panel 'A': MAE, Pearson correlation and bias (mean of predicted minus
    # chronological age) for this clock.
    ds_table = pd.DataFrame(index=['MAE', fr"Pearson $\mathbf{{\rho}}$", "Bias"], columns=[ea])
    mae = mean_absolute_error(pheno['Age'].values, pheno[ea].values)
    rho, _ = stats.pearsonr(pheno['Age'].values, pheno[ea].values)
    bias = np.mean(pheno[ea] - pheno['Age'])
    ds_table.at['MAE', ea] = f"{mae:0.2f}"
    ds_table.at[fr"Pearson $\mathbf{{\rho}}$", ea] = f"{rho:0.2f}"
    ds_table.at["Bias", ea] = f"{bias:0.2f}"
    col_defs = [
        ColumnDefinition(
            name="index",
            title=ea,
            textprops={"ha": "left"},
            width=4.5,
        ),
        ColumnDefinition(
            name=ea,
            title='',
            textprops={"ha": "center"},
            width=2.0,
        ),
    ]
    table = Table(
        ds_table,
        column_definitions=col_defs,
        row_dividers=True,
        footer_divider=False,
        ax=axs['A'],
        textprops={"fontsize": 7},
        row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
        col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
        column_border_kw={"linewidth": 1, "linestyle": "-"},
    ).autoset_fontcolors(colnames=[ea])

    # Panel 'B': samples, then the y = x line, then the regression fit.
    scatter = sns.scatterplot(
        data=pheno,
        x='Age',
        y=ea,
        linewidth=0.5,
        alpha=0.75,
        edgecolor="k",
        s=25,
        color=colors_clocks[ea],
        ax=axs['B'],
    )
    bisect = sns.lineplot(
        x=[xy_min - 0.15 * xy_ptp, xy_max + 0.15 * xy_ptp],
        y=[xy_min - 0.15 * xy_ptp, xy_max + 0.15 * xy_ptp],
        linestyle='--',
        color='black',
        linewidth=1.0,
        ax=axs['B']
    )
    regplot = sns.regplot(
        data=pheno,
        x='Age',
        y=ea,
        color=colors_clocks[ea],
        scatter=False,
        truncate=False,
        ax=axs['B']
    )
    # The same limits are used for every clock, so the panels are comparable.
    axs['B'].set_xlim(xy_min, xy_max)
    axs['B'].set_ylim(xy_min, xy_max)
    axs['B'].set_ylabel(ea)
    axs['B'].set_xlabel("Age")

    # Panel 'C': distribution of this clock's age acceleration as a single
    # violin (a constant x puts all samples into one category).
    violin = sns.violinplot(
        data=pheno,
        x=[0] * pheno.shape[0],
        y=f"{ea} Acceleration",
        color=make_rgb_transparent(mcolors.to_rgb(colors_clocks[ea]), (1, 1, 1), 0.75),
        density_norm='width',
        saturation=0.75,
        linewidth=1.0,
        ax=axs['C'],
        legend=False,
    )
    axs['C'].set_ylabel(f"{ea} Acceleration")
    axs['C'].set_xlabel('')
    axs['C'].set(xticklabels=[])
    axs['C'].set(xticks=[])

fig.savefig(f"{path}/ages.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path}/ages.pdf", bbox_inches='tight')
plt.close(fig)

## Select 20 samples

In [None]:
# Draw 100 candidate selections of 20 samples (one per random seed), weighted
# by age bin, and save for each seed an age histogram and the selected IDs.
path = "E:/YandexDisk/Work/bbd/fmba"
path_pyaging = "E:/YandexDisk/Work/pydnameth/datasets/pyaging"

df_pyaging = pd.read_excel(f"{path}/dnam/processed/pheno.xlsx", index_col=0)
df_pyaging.index = df_pyaging.index.astype(str)

path_save = f"{path}/04_select_samples_20"
# Make sure the output folder exists before the loop starts writing files.
pathlib.Path(path_save).mkdir(parents=True, exist_ok=True)

ids_groups = {
    'All': df_pyaging.index.values,
}
colors_groups = {
    'All': 'crimson',
    'Selected': 'dodgerblue',
}

# 22 five-year age bins covering 5..115 years, each with a sampling weight.
age_bin_edges = np.linspace(5, 115, 23)
age_prob = np.asarray([1/22] * 22)
# age_prob = np.asarray([10.0]*10 + [1.0]*8 +  [10.0]*10)
age_prob /= np.sum(age_prob)
bin_diff = 5

print(df_pyaging.shape[0])
# Index of the bin [edge_i, edge_i + bin_diff) each age falls into. `floor`
# matches the histogram bins (rounding would shift the bins by half a width),
# and clipping keeps ages outside 5..115 in the first/last bin instead of
# indexing out of range or wrapping around through a negative index.
age_bin_ids = np.floor((df_pyaging.loc[:, 'Age'].values - age_bin_edges[0]) / bin_diff).astype(int)
age_bin_ids = np.clip(age_bin_ids, 0, len(age_prob) - 1)
df_pyaging.loc[:, 'Prob Age'] = age_prob[age_bin_ids]

n_same_age = 20
print(n_same_age)

# Loop-invariant plot settings.
hist_bins = np.linspace(5, 115, 23)
sns.set_theme(style='whitegrid')

for seed in range(100):

    index_selected = df_pyaging.sample(n=n_same_age, replace=False, weights='Prob Age', random_state=seed+100).index
    if not index_selected.is_unique:
        # Skip this seed: plotting and saving here would reuse the previous
        # seed's selection (or fail on the first seed).
        print("Not unique index")
        continue
    ids_selected = index_selected.to_list()
    ids_groups['Selected'] = ids_selected

    # Plot age histogram
    df_fig_1 = df_pyaging.loc[ids_groups['All'], ['Age']].copy()
    df_fig_1['Group'] = 'All'
    df_fig_2 = df_pyaging.loc[ids_groups['Selected'], ['Age']].copy()
    df_fig_2['Group'] = 'Selected'
    df_fig = pd.concat([df_fig_1, df_fig_2], ignore_index=True)
    fig, ax = plt.subplots(figsize=(6, 4))
    histplot = sns.histplot(
        data=df_fig,
        bins=hist_bins,
        edgecolor='k',
        linewidth=1,
        x="Age",
        hue='Group',
        palette=colors_groups,
        ax=ax
    )
    histplot.set(xlim=(0, 120))
    fig.savefig(f"{path_save}/hist_age_{seed}.png", bbox_inches='tight', dpi=200)
    # fig.savefig(f"{path_save}/hist_age.pdf", bbox_inches='tight')
    plt.close(fig)

    df_pyaging.loc[ids_groups['Selected'], ['Age']].to_excel(f"{path_save}/selected_{seed}.xlsx")

In [None]:
# Age histogram of all samples versus the final selection stored in
# `selected.xlsx`.
path = "E:/YandexDisk/Work/bbd/fmba"
path_pyaging = "E:/YandexDisk/Work/pydnameth/datasets/pyaging"

df_pyaging = pd.read_excel(f"{path}/dnam/processed/pheno.xlsx", index_col=0)
df_pyaging.index = df_pyaging.index.astype(str)

path_save = f"{path}/04_select_samples_20"

ids_selected_final = pd.read_excel(f"{path_save}/selected.xlsx", index_col=0).index.astype(str).values
ids_groups = {
    'All': df_pyaging.index.values,
    'Selected': ids_selected_final,
}
colors_groups = {
    'All': 'crimson',
    'Selected': 'dodgerblue',
}

# Long-format table for the histogram: one row per (sample, group) pair, so
# selected samples appear in both groups.
df_fig_1 = df_pyaging.loc[ids_groups['All'], ['Age']].assign(Group='All')
df_fig_2 = df_pyaging.loc[ids_groups['Selected'], ['Age']].assign(Group='Selected')
df_fig = pd.concat([df_fig_1, df_fig_2], ignore_index=True)

# Five-year bins from 5 to 115 years.
hist_bins = np.linspace(5, 115, 23)
sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(figsize=(6, 4))
histplot = sns.histplot(
    data=df_fig,
    x="Age",
    hue='Group',
    bins=hist_bins,
    palette=colors_groups,
    edgecolor='k',
    linewidth=1,
    ax=ax,
)
histplot.set(xlim=(0, 120))
fig.savefig(f"{path_save}/hist_age.png", bbox_inches='tight', dpi=200)
# fig.savefig(f"{path_save}/hist_age.pdf", bbox_inches='tight')
plt.close(fig)

# Immuno

## Get data with and without NaNs

In [None]:
# Prepare the immunology table: attach ages, save a version with out-of-range
# readings as NaN, and fill the '>'-type readings with KNN-imputed values.
path = f"E:/YandexDisk/Work/bbd/fmba"
path_old = f"E:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN"

# Feature lists and the marker -> gene renaming taken from the older dataset.
feats_imm = pd.read_excel(f"{path_old}/data/immuno/feats_con.xlsx", index_col=0).index.values
feats_imm_fimmu = pd.read_excel(f"{path_old}/data/immuno/models/SImAge/feats_con_top10.xlsx", index_col=0).index.values
feats_imm_slctd = pd.read_excel(f"{path_old}/special/059_imm_data_selection/feats_selected.xlsx", index_col=0).index.values
feats_imm_genes = pd.read_excel(f"{path_old}/data/immuno/immuno_markers_genes.xlsx", index_col=0)
feats_imm_rename = dict(zip(feats_imm_genes.index, feats_imm_genes['gene']))

# Older dataset: final values, the same data with NaNs kept, and a selected subset.
imm_old = pd.read_excel(f"{path_old}/data/immuno/data.xlsx", index_col=0)
imm_old_w_nans = pd.read_excel(f"{path_old}/data/immuno/data_with_nans.xlsx", index_col=0)
imm_old_selected = pd.read_excel(f"{path_old}/special/059_imm_data_selection/df_imm.xlsx", index_col=0)

# Sample IDs of the older dataset grouped by region.
ids_groups = {
    'Central': imm_old_selected.index[imm_old_selected['Region'] == 'Central'].values,
    'Yakutia': imm_old_selected.index[imm_old_selected['Region'] == 'Yakutia'].values,
    'Mirny': imm_old.index[imm_old['Region'] == 'Mirny'].values,
}

# Participant table; age in years relative to a fixed reference date.
data = pd.read_excel(f"{path}/ПриложениеГ (КОНФИДЕНЦИАЛЬНО).xlsx", index_col='работник_ID')
data['дата рождения'] = pd.to_datetime(data['дата рождения'])
data['date_now'] = pd.to_datetime("2024-11-11")
data['Age'] = (data['date_now'] - data['дата рождения']) / np.timedelta64(1, 'D') / 365.25

# Raw immunology readings: add an empty 'Age' column, rename markers to gene
# names, keep 'Age' + the features, and fill ages for IDs present in `data`.
imm_raw = pd.read_excel(f"{path}/immuno/raw.xlsx", index_col=0)
imm_raw.insert(0, 'Age', None)
ids_imm_only = imm_raw.index.difference(data.index).to_list()
ids_cmn = imm_raw.index.intersection(data.index).to_list()
print(f"ids_imm_only: {ids_imm_only}")
imm_raw.rename(columns=feats_imm_rename, inplace=True)
imm_raw = imm_raw.loc[:, ['Age'] + list(feats_imm)]
imm_raw.loc[ids_cmn, 'Age'] = data.loc[ids_cmn, 'Age']

# Data with NaNs: text cells starting with '<' or '>' (presumably readings
# outside the assay range - TODO confirm) become NaN, everything else numeric.
imm_w_nans = imm_raw.copy()
imm_w_nans.replace(r'^([\<].*)$', 'NaN', inplace=True, regex=True)
imm_w_nans.replace(r'^([\>].*)$', 'NaN', inplace=True, regex=True)
imm_w_nans = imm_w_nans.apply(pd.to_numeric, errors='coerce')
imm_w_nans.to_excel(f"{path}/immuno/data_w_nans.xlsx")

# Impute max thresholds: first locate the cells to fill. '>' cells are set to
# None and the table is stacked to (sample, feature) pairs; every missing
# pair is recorded.
# NOTE(review): cells that were already empty in raw.xlsx are recorded too -
# confirm this is intended.
imm_max_thld_nans = imm_raw.copy()
imm_max_thld_nans.drop(['Age'], axis=1, inplace=True)
imm_max_thld_nans.replace(r'^([\>].*)$', None, inplace=True, regex=True)
imm_max_thld_nans = imm_max_thld_nans.stack(dropna=False)
max_thld_nans = [list(x) for x in imm_max_thld_nans.index[imm_max_thld_nans.isna()]]
print(f'Number of max_thld_nans: {len(max_thld_nans)}')
# KNN-impute a fully numeric copy in which both '<' and '>' cells are NaN.
imm_max_thld_imp = imm_raw.copy()
imm_max_thld_imp.drop(['Age'], axis=1, inplace=True)
imm_max_thld_imp.replace(r'^([\<].*)$', 'NaN', inplace=True, regex=True)
imm_max_thld_imp.replace(r'^([\>].*)$', 'NaN', inplace=True, regex=True)
imm_max_thld_imp = imm_max_thld_imp.apply(pd.to_numeric, errors='coerce')
n_neighbors = 3
X = imm_max_thld_imp.loc[:, feats_imm].values
imputer = KNNImputer(n_neighbors=n_neighbors)
X_imptd = imputer.fit_transform(X)
imm_max_thld_imp.loc[:, feats_imm] = X_imptd

# Fill with imputed max thresholds: make imm_raw numeric ('<' and '>' cells
# become NaN), then copy the imputed value into each recorded cell. The
# remaining NaNs are handled by the min-threshold step that follows.
imm_raw.replace(r'^([\<].*)$', 'NaN', inplace=True, regex=True)
imm_raw.replace(r'^([\>].*)$', 'NaN', inplace=True, regex=True)
imm_raw = imm_raw.apply(pd.to_numeric, errors='coerce')
print(f'Missing before max thresholds imputation: {imm_raw.isna().sum().sum()}')
for max_imp_nan in max_thld_nans:
    imm_raw.at[max_imp_nan[0], max_imp_nan[1]] = imm_max_thld_imp.at[max_imp_nan[0], max_imp_nan[1]]
print(f'Missing after max thresholds imputation: {imm_raw.isna().sum().sum()}')

# Impute min thresholds and replace imputed values with the closest threshold values in Central
def find_nearest(array, value):
    """Return the element of `array` with the smallest absolute distance to `value`.

    `array` is converted with `np.asarray`; on ties the first closest element
    (lowest index) is returned, as given by `argmin`.
    """
    candidates = np.asarray(array)
    distances = np.abs(candidates - value)
    return candidates[distances.argmin()]

# Reference rows (Central group of the old dataset) and target rows (all of imm_raw)
ids_imp_trn = imm_old.loc[ids_groups['Central'], :].index.values
ids_imp_tst = imm_raw.index.values

# KNN-impute the remaining gaps on the stacked reference + target table
df_imp = pd.concat([imm_old.loc[ids_imp_trn, feats_imm], imm_raw.loc[:, feats_imm]])
X = df_imp.loc[:, feats_imm].values
imputer = KNNImputer(n_neighbors=n_neighbors)
X_imptd = imputer.fit_transform(X)
df_imp.loc[:, feats_imm] = X_imptd

for feat in feats_imm:
    # Reference rows whose value for this feature was NaN in the old data with NaNs
    base_is_nan = imm_old_w_nans.loc[ids_imp_trn, feat].isna()
    ids_feat_base = base_is_nan.index[base_is_nan.values].values
    if len(ids_feat_base) == 0:
        continue
    # Values those reference rows carry in the filled old dataset
    feat_base_vals = imm_old.loc[ids_feat_base, feat].unique()
    trgt_is_nan = imm_raw.loc[ids_imp_tst, feat].isna()
    ids_feat_trgt = trgt_is_nan.index[trgt_is_nan.values].values
    # Snap every imputed target value to the closest of the reference values
    for id_trgt in ids_feat_trgt:
        imputed_value = df_imp.at[id_trgt, feat]
        df_imp.at[id_trgt, feat] = find_nearest(feat_base_vals, imputed_value)

imm_raw.loc[ids_imp_tst, feats_imm] = df_imp.loc[ids_imp_tst, feats_imm]
imm_raw.to_excel(f"{path}/immuno/data.xlsx")

## Compare immuno with synthetic data from EpImAge

In [None]:
path = "E:/YandexDisk/Work/bbd/fmba"
path_epim = "E:/Git/EpImAge"

# The EpImAge workbook is parsed once; df_epim starts as an independent copy
# of it instead of re-reading the same file from disk.
df_for_epim = pd.read_excel(f'{path}/dnam/processed/EpImAge.xlsx', index_col=0)

# Immunomarker names are the 'feature' index of the Immunomarkers workbook
imms_epim = pd.read_excel(f"{path_epim}/models/Immunomarkers/Immunomarkers.xlsx", index_col='feature').index.values
imms_epim_log = [f"{f}_log" for f in imms_epim]

# "<marker> synthetic" columns hold the "<marker>_log" values of the EpImAge table
df_epim = df_for_epim.copy()
for imm in imms_epim:
    df_epim[f"{imm} synthetic"] = df_for_epim[f"{imm}_log"]

# Measured immunology values are log-transformed in place
df_imm = pd.read_excel(f"{path}/immuno/data.xlsx", index_col=0)
for imm in imms_epim:
    df_imm[f"{imm}"] = np.log(df_imm[imm])

# Inner join on the index: only samples present in both tables are kept
df_cmn = pd.merge(df_imm[['Age'] + list(imms_epim)], df_epim[[f"{imm} synthetic" for imm in imms_epim]], left_index=True, right_index=True)

In [None]:
# Grid layout: every immunomarker occupies three stacked axes in one column
# (metrics table, scatter plot, empty spacer).
n_rows = 4 * 3
n_cols = 6
fig_height = 20
fig_width = 28

imm_colors = distinctipy.get_colors(len(imms_epim), [mcolors.hex2color(mcolors.CSS4_COLORS['black']), mcolors.hex2color(mcolors.CSS4_COLORS['white'])], rng=1337, pastel_factor=0.2)

sns.set_theme(style='ticks')
# height_ratios is derived from n_rows so both stay in sync
fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), height_ratios=[0.2, 0.8, 0.2] * (n_rows // 3), gridspec_kw={'wspace':0.35, 'hspace': 0.05}, sharey=False, sharex=False)

# Single value column of every metrics table
col_samples = f'{df_cmn.shape[0]} FMBA samples'

# tqdm wraps the array itself (not the enumerate object) so the bar knows its total
for imm_id, imm in enumerate(tqdm(imms_epim)):
    imm_color = imm_colors[imm_id]
    row_id, col_id = divmod(imm_id, n_cols)
    ax_table = axs[row_id * 3, col_id]
    ax_scatter = axs[row_id * 3 + 1, col_id]
    ax_empty = axs[row_id * 3 + 2, col_id]

    # Shared axis limits: 1%-99% quantile range of measured and synthetic values, padded by 15%
    xy_min, xy_max = np.quantile(df_cmn[[imm, f"{imm} synthetic"]].values.flatten(), [0.01, 0.99])
    xy_ptp = xy_max - xy_min
    xy_lim = (xy_min - 0.15 * xy_ptp, xy_max + 0.15 * xy_ptp)

    # Agreement metrics between measured and synthetic values
    ds_table = pd.DataFrame(index=['MAE', r"Pearson $\rho$", 'Bias'], columns=[col_samples])
    mae = mean_absolute_error(df_cmn[imm].values, df_cmn[f"{imm} synthetic"].values)
    rho, _ = stats.pearsonr(df_cmn[imm].values, df_cmn[f"{imm} synthetic"].values)
    bias = np.mean(df_cmn[f"{imm} synthetic"] - df_cmn[imm])
    ds_table.at['MAE', col_samples] = f"{mae:0.3f}"
    ds_table.at[r"Pearson $\rho$", col_samples] = f"{rho:0.3f}"
    ds_table.at['Bias', col_samples] = f"{bias:0.3f}"

    col_defs = [
        ColumnDefinition(
            name="index",
            title='Metrics',
            textprops={"ha": "left"},
            width=4.5,
        ),
        ColumnDefinition(
            name=col_samples,
            textprops={"ha": "center"},
            width=2.0,
        ),
    ]
    table = Table(
        ds_table,
        column_definitions=col_defs,
        row_dividers=True,
        footer_divider=False,
        ax=ax_table,
        textprops={"fontsize": 8},
        row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
        col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
        column_border_kw={"linewidth": 1, "linestyle": "-"},
    ).autoset_fontcolors(colnames=[col_samples])

    sns.scatterplot(
        data=df_cmn,
        x=imm,
        y=f"{imm} synthetic",
        linewidth=0.5,
        alpha=0.8,
        edgecolor="k",
        s=25,
        color=imm_color,
        ax=ax_scatter,
    )
    # Identity line (synthetic == measured)
    sns.lineplot(
        x=list(xy_lim),
        y=list(xy_lim),
        linestyle='--',
        color='black',
        linewidth=1.0,
        ax=ax_scatter,
    )
    # Linear fit of synthetic on measured values
    sns.regplot(
        data=df_cmn,
        x=imm,
        y=f"{imm} synthetic",
        color='black',
        scatter=False,
        truncate=False,
        ax=ax_scatter,
    )
    ax_scatter.set_xlim(*xy_lim)
    ax_scatter.set_ylim(*xy_lim)
    ax_scatter.set_ylabel(f"{imm} synthetic", color=imm_color, path_effects=[pe.withStroke(linewidth=1.0, foreground="black")])
    ax_scatter.set_xlabel(imm, color=imm_color, path_effects=[pe.withStroke(linewidth=1.0, foreground="black")])

    ax_empty.axis('off')

fig.tight_layout()
fig.savefig(f"{path}/dnam/processed/EpImAge_immunomarkers.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path}/dnam/processed/EpImAge_immunomarkers.pdf", bbox_inches='tight')
plt.close(fig)

## Compare FMBA to Central

### Generate common data

In [None]:
# Locations of the FMBA data, the previous dataset and the two age models
path = "E:/YandexDisk/Work/bbd/fmba"
path_old = "E:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN"
path_simage = "E:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN/data/immuno/models/SImAge/best_fold_0002.ckpt"
path_epimage = "E:/Git/EpImAge/models/EpImAge"
dir_imm_old = f"{path_old}/data/immuno"
dir_imm_slctd = f"{path_old}/special/059_imm_data_selection"

# EpImAge: pytorch_tabular model; SImAge: checkpointed model used frozen, in eval mode, on CPU
model_epimage = TabularModel.load_model(path_epimage)

model_simage = WDFTTransformerModel.load_from_checkpoint(checkpoint_path=path_simage)
model_simage.eval()
model_simage.freeze()
model_simage.to('cpu')

# Feature lists (taken from the index of each workbook) and the marker -> gene renaming map
feats_imm = pd.read_excel(f"{dir_imm_old}/feats_con.xlsx", index_col=0).index.values
feats_imm_fimmu = pd.read_excel(f"{dir_imm_old}/models/SImAge/feats_con_top10.xlsx", index_col=0).index.values
feats_imm_slctd = pd.read_excel(f"{dir_imm_slctd}/feats_selected.xlsx", index_col=0).index.values
feats_imm_genes = pd.read_excel(f"{dir_imm_old}/immuno_markers_genes.xlsx", index_col=0)
feats_imm_rename = dict(zip(feats_imm_genes.index, feats_imm_genes['gene']))

# Previous dataset: filled table, table with NaNs, and the selected subset
imm_old = pd.read_excel(f"{dir_imm_old}/data.xlsx", index_col=0)
imm_old_w_nans = pd.read_excel(f"{dir_imm_old}/data_with_nans.xlsx", index_col=0)
imm_old_selected = pd.read_excel(f"{dir_imm_slctd}/df_imm.xlsx", index_col=0)

# FMBA dataset: only rows with a known Age are kept
imm_fmba = pd.read_excel(f"{path}/immuno/data.xlsx", index_col=0)
imm_fmba = imm_fmba.loc[imm_fmba['Age'].notna()]
imm_fmba_w_nans = pd.read_excel(f"{path}/immuno/data_w_nans.xlsx", index_col=0)
imm_fmba_w_nans = imm_fmba_w_nans.loc[imm_fmba_w_nans['Age'].notna()]

# Sample IDs per group; Central and Yakutia come from the selected subset,
# Mirny from the full previous dataset
region_slctd = imm_old_selected['Region']
ids_groups = {
    'Central': imm_old_selected.index[region_slctd == 'Central'].values,
    'Yakutia': imm_old_selected.index[region_slctd == 'Yakutia'].values,
    'Mirny': imm_old.index[imm_old['Region'] == 'Mirny'].values,
    'FMBA': imm_fmba.index.values,
}

colors_groups = dict(
    zip(
        ['Central', 'Yakutia', 'Mirny', 'FMBA'],
        ['gold', 'lightslategray', 'crimson', 'dodgerblue'],
    )
)

# Combined table with both age estimates and their accelerations (estimate minus Age)
imm = pd.concat([imm_old, imm_fmba])
simage_input = torch.from_numpy(imm.loc[:, feats_imm_fimmu].values)
simage_output = model_simage(simage_input)
imm['SImAge'] = simage_output.cpu().detach().numpy().ravel()
imm['SImAge acceleration'] = imm['SImAge'] - imm['Age']
imm['|SImAge acceleration|'] = imm['SImAge acceleration'].abs()
# Log-transformed copies of the selected features are added before the EpImAge prediction
for feat_slctd in feats_imm_slctd:
    imm[f"{feat_slctd}_log"] = np.log(imm[f"{feat_slctd}"])
imm['EpImAge'] = model_epimage.predict(imm)
imm['EpImAge acceleration'] = imm['EpImAge'] - imm['Age']
imm['|EpImAge acceleration|'] = imm['EpImAge acceleration'].abs()
imm.to_excel(f"{path}/immuno/data_full.xlsx")

# Same concatenation for the tables that still contain NaNs
imm_w_nans = pd.concat([imm_old_w_nans, imm_fmba_w_nans])
imm_w_nans.to_excel(f"{path}/immuno/data_full_w_nans.xlsx")

### Load synthetic

In [None]:
# Synthetic immunomarkers: the EpImAge table stores "<marker>_log" columns;
# exponentiating them adds a "<marker>" column on the original scale
path_epim = "E:/Git/EpImAge"
imms_epim_table = pd.read_excel(f"{path_epim}/models/Immunomarkers/Immunomarkers.xlsx", index_col='feature')
imms_epim = imms_epim_table.index.values
imm_synt = pd.read_excel(f"{path}/dnam/processed/EpImAge.xlsx", index_col=0)
for marker in imms_epim:
    log_values = imm_synt[f"{marker}_log"].values
    imm_synt[marker] = np.exp(log_values)

### NaNs

In [None]:
pathlib.Path(f"{path}/immuno/nans").mkdir(parents=True, exist_ok=True)

groups = ['FMBA', 'Central', 'Yakutia', 'Mirny']

n_rows = 2
n_cols = 2
fig_width = 15
fig_height = 9

sns.set_theme(style='ticks')
# One panel per group; the grid is n_rows x n_cols (the column count was
# previously passed as n_rows, which only worked because both equal 2)
fig_bar, axs_bar = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={}, sharey=False, layout="constrained")
fig_hist, axs_hist = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={}, sharex=True, layout="constrained")

dfs_nan_feats = {}
df_nan_feats_by_group = pd.DataFrame(index=groups)
for group_id, group in enumerate(groups):
    row_id, col_id = divmod(group_id, n_cols)

    # Per-feature NaN counts and percentages within the group
    df_nan_feats = imm_w_nans.loc[ids_groups[group], feats_imm].isna().sum(axis=0).to_frame(name="Number of NaNs")
    df_nan_feats["% of NaNs"] = df_nan_feats["Number of NaNs"] / len(ids_groups[group]) * 100
    df_nan_feats["Number of not-NaNs"] = imm_w_nans.loc[ids_groups[group], feats_imm].notna().sum(axis=0)
    df_nan_feats.sort_values(["% of NaNs"], ascending=[False], inplace=True)
    dfs_nan_feats[group] = df_nan_feats
    # Share of NaN cells over all (sample, feature) cells of the group
    df_nan_feats_by_group.at[group, "% of NaNs"] = df_nan_feats["Number of NaNs"].sum(axis=0) / imm_w_nans.loc[ids_groups[group], feats_imm].size * 100

    barplot = sns.barplot(
        data=df_nan_feats,
        x=df_nan_feats.index,
        y="% of NaNs",
        edgecolor='black',
        color=colors_groups[group],
        dodge=False,
        ax=axs_bar[row_id, col_id],
    )
    axs_bar[row_id, col_id].set(xlim=(-0.7, len(feats_imm)-0.3))
    axs_bar[row_id, col_id].set_title(f"{group} ({len(ids_groups[group])})")
    axs_bar[row_id, col_id].set_xticklabels(axs_bar[row_id, col_id].get_xticklabels(), rotation=90)

    # Per-sample number of features with NaN
    df_nan_smpls = imm_w_nans.loc[ids_groups[group], feats_imm].isna().sum(axis=1).to_frame(name="Features with NaNs")

    histplot = sns.histplot(
        data=df_nan_smpls,
        discrete=True,
        edgecolor='k',
        linewidth=1,
        x="Features with NaNs",
        color=colors_groups[group],
        ax=axs_hist[row_id, col_id],
    )
    axs_hist[row_id, col_id].set(xlim=(-0.6, len(feats_imm)+0.6))
    axs_hist[row_id, col_id].set_title(f"{group} ({len(ids_groups[group])})")
    axs_hist[row_id, col_id].set_ylabel("Number of samples")

fig_bar.tight_layout()
fig_bar.savefig(f"{path}/immuno/nans/feats.png", bbox_inches='tight', dpi=200)
fig_bar.savefig(f"{path}/immuno/nans/feats.pdf", bbox_inches='tight')
plt.close(fig_bar)

# One sheet per group with the per-feature NaN table
with pd.ExcelWriter(f'{path}/immuno/nans/feats.xlsx', engine='xlsxwriter') as writer:
    for group in groups:
        dfs_nan_feats[group].to_excel(writer, sheet_name=group)

fig_hist.tight_layout()
fig_hist.savefig(f"{path}/immuno/nans/samples.png", bbox_inches='tight', dpi=200)
fig_hist.savefig(f"{path}/immuno/nans/samples.pdf", bbox_inches='tight')
plt.close(fig_hist)

# Overall NaN percentage per group
plt.figure(figsize=(3, 2))
sns.set_theme(style='whitegrid')
barplot = sns.barplot(
    data=df_nan_feats_by_group,
    y=df_nan_feats_by_group.index,
    x="% of NaNs",
    edgecolor='black',
    palette=colors_groups,
    dodge=False,
    orient='h'
)
for x in barplot.containers:
    barplot.bar_label(x, fmt="%.1f", padding=2.0)
plt.savefig(f"{path}/immuno/nans/global.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path}/immuno/nans/global.pdf", bbox_inches='tight')
plt.close()

### Regression with covariates

In [None]:
pathlib.Path(f"{path}/immuno/vs_central/reg_w_covs").mkdir(parents=True, exist_ok=True)

groups = ['Central', 'FMBA']

# Explicit copy so the column assignments below never write into a view of imm
imm_vs = imm.loc[list(chain.from_iterable([ids_groups[g] for g in groups])), :].copy()
for g in groups:
    imm_vs.loc[ids_groups[g], 'Group'] = g
# Sex is set to 'M' for the FMBA samples only. The group is named explicitly
# instead of relying on the loop variable left over from the loop above.
# NOTE(review): presumably the FMBA table carries no Sex information - confirm.
imm_vs.loc[ids_groups['FMBA'], 'Sex'] = 'M'
imm_vs = imm_vs.loc[:, list(feats_imm_slctd) + ['Age', 'Sex', 'Group']]

# OLS per feature on the log scale; the p-value of the Group term is collected
df_stat = pd.DataFrame(index=list(feats_imm_slctd))
for feat in list(feats_imm_slctd):
    reg = smf.ols(formula=f"np.log({feat}) ~ Group + Age + Sex", data=imm_vs).fit()
    # reg = smf.ols(formula=f"{feat} ~ Group + Age + Sex", data=imm_vs).fit()
    reg_sum = reg.summary2().tables[1]
    df_stat.at[feat, "pval"] = reg_sum.at['Group[T.FMBA]', 'P>|t|']
# Benjamini-Hochberg FDR correction over all features
_, df_stat.loc[feats_imm_slctd, "pval_fdr_bh"], _, _ = multipletests(df_stat.loc[feats_imm_slctd, "pval"], 0.05, method='fdr_bh')
df_stat.sort_values(["pval_fdr_bh"], ascending=[True], inplace=True)
df_stat.to_excel(f"{path}/immuno/vs_central/reg_w_covs/stat.xlsx", index_label='Features')

# Plot barplot for p-values
df_stat['pval_fdr_bh_log'] = -np.log10(df_stat['pval_fdr_bh'])
df_stat['color'] = 'pink'
df_stat.loc[df_stat['pval_fdr_bh'] < 0.05, 'color'] = 'red'
sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(figsize=(4, 8))
barplot = sns.barplot(
    data=df_stat,
    y=df_stat.index.values,
    x='pval_fdr_bh_log',
    edgecolor='black',
    palette=df_stat['color'].values,
    dodge=False,
    ax=ax
)
ax.set_xlabel(r"$-\log_{10}(\mathrm{p-value})$", fontsize=18)
ax.set_ylabel('', fontsize=20)
# Integer tick positions instead of relabelling arbitrary ticks with int(tick),
# which printed truncated (wrong) values for non-integer tick positions
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
ax.tick_params(axis='x', labelsize=16)
ax.tick_params(axis='y', labelsize=16)
fig.savefig(f"{path}/immuno/vs_central/reg_w_covs/barplot_pvals.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path}/immuno/vs_central/reg_w_covs/barplot_pvals.pdf", bbox_inches='tight')
plt.close(fig)

### ANCOVA

In [None]:
pathlib.Path(f"{path}/immuno/vs_central/ancova").mkdir(parents=True, exist_ok=True)

groups = ['Central', 'FMBA']

# Explicit copy so the column assignments below never write into a view of imm
imm_vs = imm.loc[list(chain.from_iterable([ids_groups[g] for g in groups])), :].copy()
for g in groups:
    imm_vs.loc[ids_groups[g], 'Group'] = g
# Sex is set to 'M' for the FMBA samples only. The group is named explicitly
# instead of relying on the loop variable left over from the loop above.
# NOTE(review): presumably the FMBA table carries no Sex information - confirm.
imm_vs.loc[ids_groups['FMBA'], 'Sex'] = 'M'
# Sex is label-encoded to integers before being passed as a covariate
imm_vs['Sex'] = LabelEncoder().fit_transform(imm_vs['Sex'])
imm_vs = imm_vs.loc[:, list(feats_imm_slctd) + ['Age', 'Sex', 'Group']]

# ANCOVA per feature; the uncorrected p-value of the Group term is collected
df_stat = pd.DataFrame(index=list(feats_imm_slctd))
for feat in list(feats_imm_slctd):
    res = ancova(data=imm_vs, dv=feat, covar=['Age', 'Sex'], between='Group').set_index('Source')
    df_stat.at[feat, "pval"] = res.at['Group', 'p-unc']
# Benjamini-Hochberg FDR correction over all features
_, df_stat.loc[feats_imm_slctd, "pval_fdr_bh"], _, _ = multipletests(df_stat.loc[feats_imm_slctd, "pval"], 0.05, method='fdr_bh')
df_stat.sort_values(["pval_fdr_bh"], ascending=[True], inplace=True)
df_stat.to_excel(f"{path}/immuno/vs_central/ancova/stat.xlsx", index_label='Features')

# Plot barplot for p-values
df_stat['pval_fdr_bh_log'] = -np.log10(df_stat['pval_fdr_bh'])
df_stat['color'] = 'pink'
df_stat.loc[df_stat['pval_fdr_bh'] < 0.05, 'color'] = 'red'
sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(figsize=(4, 8))
barplot = sns.barplot(
    data=df_stat,
    y=df_stat.index.values,
    x='pval_fdr_bh_log',
    edgecolor='black',
    palette=df_stat['color'].values,
    dodge=False,
    ax=ax
)
ax.set_xlabel(r"$-\log_{10}(\mathrm{p-value})$", fontsize=18)
ax.set_ylabel('', fontsize=20)
# Integer tick positions instead of relabelling arbitrary ticks with int(tick),
# which printed truncated (wrong) values for non-integer tick positions
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
ax.tick_params(axis='x', labelsize=16)
ax.tick_params(axis='y', labelsize=16)
fig.savefig(f"{path}/immuno/vs_central/ancova/barplot_pvals.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path}/immuno/vs_central/ancova/barplot_pvals.pdf", bbox_inches='tight')
plt.close(fig)

### Mann-Whitney with the same age distribution

In [None]:
pathlib.Path(f"{path}/immuno/vs_central/mw_same_age").mkdir(parents=True, exist_ok=True)

# Select samples with same age
# Central samples are drawn with weights equal to the share of FMBA samples
# falling into the same age bin.
n_same_age = 200
hist_bins = np.linspace(5, 115, 23)
ages_fmba = imm.loc[ids_groups['FMBA'], 'Age'].values
age_counts, age_bin_edges = np.histogram(ages_fmba, bins=hist_bins)
age_prob = age_counts / len(ages_fmba)

# Bin lookup that follows np.histogram's convention: bins are [left, right),
# except the last one, which also contains its right edge. The previous
# rounding-based index sent ages in the upper half of a bin to the next bin.
ages_central = imm.loc[ids_groups['Central'], 'Age'].values
age_bin_ids = np.searchsorted(age_bin_edges, ages_central, side='right') - 1
age_bin_ids[ages_central == age_bin_edges[-1]] = len(age_counts) - 1
# Ages outside the histogram range (or NaN) get zero sampling weight instead
# of wrapping around / overflowing the probability array
age_in_range = (age_bin_ids >= 0) & (age_bin_ids < len(age_counts))
age_weights = np.zeros(len(ages_central), dtype=float)
age_weights[age_in_range] = age_prob[age_bin_ids[age_in_range]]
imm.loc[ids_groups['Central'], 'Prob Age (FMBA)'] = age_weights

index_central_same_age = imm.loc[ids_groups['Central'], :].sample(n=n_same_age, replace=False, weights='Prob Age (FMBA)', random_state=1337).index
# Fail loudly: continuing would leave ids_central_same_age undefined (or stale
# from an earlier run of the notebook)
if not index_central_same_age.is_unique:
    raise ValueError("Not unique index")
ids_central_same_age = index_central_same_age.values
ids_groups['Central (Age as FMBA)'] = ids_central_same_age
colors_groups['Central (Age as FMBA)'] = 'green'

# Plot age histogram
# Long-format table with one row per (sample, group membership); samples that
# belong to two groups appear twice, hence ignore_index
age_frames = []
for group_name in ['Central', 'Central (Age as FMBA)', 'FMBA']:
    df_group_age = imm.loc[ids_groups[group_name], ['Age']].copy()
    df_group_age['Group'] = group_name
    age_frames.append(df_group_age)
df_fig = pd.concat(age_frames, ignore_index=True)

hist_bins = np.linspace(5, 115, 23)
sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(figsize=(6, 4))
histplot = sns.histplot(
    data=df_fig,
    x="Age",
    hue='Group',
    bins=hist_bins,
    palette=colors_groups,
    edgecolor='k',
    linewidth=1,
    ax=ax,
)
histplot.set(xlim=(0, 120))
for extension, save_kwargs in (("png", {'dpi': 200}), ("pdf", {})):
    plt.savefig(f"{path}/immuno/vs_central/mw_same_age/hist_age.{extension}", bbox_inches='tight', **save_kwargs)
plt.close(fig)

# Calc statistics
# Per feature: descriptive statistics for both groups and a two-sided
# Mann-Whitney U test between them, followed by Benjamini-Hochberg correction
groups = ['Central (Age as FMBA)', 'FMBA']
df_stat = pd.DataFrame(index=list(feats_imm_slctd))
for feat in list(feats_imm_slctd):
    vals = {group: imm.loc[ids_groups[group], feat].values for group in groups}
    for group, group_vals in vals.items():
        q25, q75 = np.percentile(group_vals, [25, 75])
        df_stat.at[feat, f"Mean {group}"] = np.mean(group_vals)
        df_stat.at[feat, f"Median {group}"] = np.median(group_vals)
        df_stat.at[feat, f"q75 {group}"] = q75
        df_stat.at[feat, f"q25 {group}"] = q25
        df_stat.at[feat, f"iqr {group}"] = df_stat.at[feat, f"q75 {group}"] - df_stat.at[feat, f"q25 {group}"]
    mw_result = mannwhitneyu(vals[groups[0]], vals[groups[1]], alternative='two-sided')
    df_stat.at[feat, "mw_pval"] = mw_result[1]
fdr_result = multipletests(df_stat.loc[feats_imm_slctd, "mw_pval"], 0.05, method='fdr_bh')
df_stat.loc[feats_imm_slctd, "mw_pval_fdr_bh"] = fdr_result[1]
df_stat.sort_values(["mw_pval_fdr_bh"], ascending=[True], inplace=True)
df_stat.to_excel(f"{path}/immuno/vs_central/mw_same_age/stat.xlsx", index_label='Features')

# Plot barplot for p-values
# Bars show -log10 of the FDR-corrected Mann-Whitney p-value; red marks
# features below 0.05, pink the rest
df_stat['mw_pval_fdr_bh_log'] = -np.log10(df_stat['mw_pval_fdr_bh'])
df_stat['color'] = 'pink'
df_stat.loc[df_stat['mw_pval_fdr_bh'] < 0.05, 'color'] = 'red'
sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(figsize=(4, 8))
barplot = sns.barplot(
    data=df_stat,
    y=df_stat.index.values,
    x='mw_pval_fdr_bh_log',
    edgecolor='black',
    palette=df_stat['color'].values,
    dodge=False,
    ax=ax
)
ax.set_xlabel(r"$-\log_{10}(\mathrm{p-value})$", fontsize=18)
ax.set_ylabel('', fontsize=20)
# Integer tick positions instead of relabelling arbitrary ticks with int(tick),
# which printed truncated (wrong) values for non-integer tick positions
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
ax.tick_params(axis='x', labelsize=16)
ax.tick_params(axis='y', labelsize=16)
fig.savefig(f"{path}/immuno/vs_central/mw_same_age/barplot_pvals.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path}/immuno/vs_central/mw_same_age/barplot_pvals.pdf", bbox_inches='tight')
plt.close(fig)

# Plot violins
# One panel per feature (ordered as in df_stat), two violins per panel, with the
# FDR-corrected Mann-Whitney p-value annotated above the pair
n_rows = 4
n_cols = 8
fig_width = 24
fig_height = 12
ids_fig = set.union(set(ids_groups[groups[0]]), set(ids_groups[groups[1]]))
df_fig = imm.loc[list(ids_fig), :]
for group_name in ['Central (Age as FMBA)', 'FMBA']:
    df_fig.loc[ids_groups[group_name], 'Group'] = group_name
sns.set_theme(style='ticks')
fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={}, sharex=True, layout="constrained")
feats_sorted = df_stat.index.values
for f_id, f in enumerate(feats_sorted):
    row_id, col_id = divmod(f_id, n_cols)
    ax_feat = axs[row_id, col_id]
    # Violins are drawn on values strictly between the 1% and 99% quantiles
    q01 = df_fig[f].quantile(0.01)
    q99 = df_fig[f].quantile(0.99)
    inside_quantiles = (df_fig[f] > q01) & (df_fig[f] < q99)
    violin = sns.violinplot(
        data=df_fig.loc[inside_quantiles, :],
        x='Group',
        y=f,
        order=groups,
        palette=colors_groups,
        scale='width',
        saturation=0.75,
        legend=False,
        ax=ax_feat,
    )
    ax_feat.set_ylabel(f)
    ax_feat.set_xlabel('')
    ax_feat.set(xticklabels=[])
    mw_pval = df_stat.at[f, "mw_pval_fdr_bh"]
    pval_formatted = [f'{mw_pval:.2e}']
    # The annotator receives the unfiltered table
    annotator = Annotator(
        ax_feat,
        pairs=[(groups[0], groups[1])],
        data=df_fig,
        x='Group',
        y=f,
        order=groups,
    )
    annotator.set_custom_annotations(pval_formatted)
    annotator.configure(loc='outside')
    annotator.annotate()
# Shared legend: one round marker per group, in the group's colour
legend_handles = [
    mlines.Line2D([], [], marker='o', linestyle='None', markeredgecolor='k', markerfacecolor=colors_groups[group_name], markersize=10, label=group_name)
    for group_name in ['Central (Age as FMBA)', 'FMBA']
]
fig.legend(handles=legend_handles, bbox_to_anchor=(0.5, 1.0), loc="lower center", ncol=2, frameon=False, fontsize='large')
for extension, save_kwargs in (("png", {'dpi': 200}), ("pdf", {})):
    plt.savefig(f"{path}/immuno/vs_central/mw_same_age/violins.{extension}", bbox_inches='tight', **save_kwargs)
plt.close(fig)

# Plot violins with synthetic
# Same violins as above, extended with a third group built from the synthetic
# immunomarker table; panels follow imms_epim
n_rows = 4
n_cols = 6
fig_width = 20
fig_height = 12
ids_fig = set.union(set(ids_groups[groups[0]]), set(ids_groups[groups[1]]))
df_fig = imm.loc[list(ids_fig), :]
# Synthetic rows get a '_synt' suffix on their index labels
df_synt = imm_synt.copy()
df_synt.set_index(df_synt.index.astype(str) + '_synt', inplace=True)
df_synt['Group'] = 'FMBA synthetic'
for group_name in ['Central (Age as FMBA)', 'FMBA']:
    df_fig.loc[ids_groups[group_name], 'Group'] = group_name
df_fig = pd.concat([df_fig, df_synt])
colors_groups['FMBA synthetic'] = 'crimson'
groups = ['Central (Age as FMBA)', 'FMBA', 'FMBA synthetic']
sns.set_theme(style='ticks')
fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={}, sharex=True, layout="constrained")
for f_id, f in enumerate(imms_epim):
    row_id, col_id = divmod(f_id, n_cols)
    ax_feat = axs[row_id, col_id]
    # Violins are drawn on values strictly between the 1% and 99% quantiles
    q01 = df_fig[f].quantile(0.01)
    q99 = df_fig[f].quantile(0.99)
    inside_quantiles = (df_fig[f] > q01) & (df_fig[f] < q99)
    violin = sns.violinplot(
        data=df_fig.loc[inside_quantiles, :],
        x='Group',
        y=f,
        order=groups,
        palette=colors_groups,
        scale='width',
        saturation=0.75,
        legend=False,
        cut=0,
        ax=ax_feat,
    )
    ax_feat.set_ylabel(f)
    ax_feat.set_xlabel('')
    ax_feat.set(xticklabels=[])
# Shared legend: one round marker per group, in the group's colour
legend_handles = [
    mlines.Line2D([], [], marker='o', linestyle='None', markeredgecolor='k', markerfacecolor=colors_groups[group_name], markersize=10, label=group_name)
    for group_name in ['Central (Age as FMBA)', 'FMBA', 'FMBA synthetic']
]
fig.legend(handles=legend_handles, bbox_to_anchor=(0.5, 1.0), loc="lower center", ncol=3, frameon=False, fontsize='large')
for extension, save_kwargs in (("png", {'dpi': 200}), ("pdf", {})):
    plt.savefig(f"{path}/immuno/vs_central/mw_same_age/violins_with_syntetic.{extension}", bbox_inches='tight', **save_kwargs)
plt.close(fig)

### SImAge and EpImAge

In [None]:
groups = ['Central', 'FMBA']

# Make sure the output folder exists even when this cell runs on its own
pathlib.Path(f"{path}/immuno/vs_central").mkdir(parents=True, exist_ok=True)

# The plotted table does not depend on the age type, so it is built once
# (explicit copy: the Group column is written into it)
df_fig = imm.loc[list(set.union(set(ids_groups[groups[0]]), set(ids_groups[groups[1]]))), :].copy()
df_fig.loc[ids_groups['Central'], 'Group'] = 'Central'
df_fig.loc[ids_groups['FMBA'], 'Group'] = 'FMBA'

for age_type in ['SImAge', 'EpImAge']:
    # Scatter of estimated vs chronological age with the identity line
    sns.set_theme(style='whitegrid')
    fig, ax = plt.subplots(figsize=(4, 4), layout="constrained")
    scatter = sns.scatterplot(
        data=df_fig,
        x="Age",
        y=age_type,
        hue="Group",
        palette=colors_groups,
        linewidth=0.2,
        alpha=0.75,
        edgecolor="k",
        s=40,
        ax=ax
    )
    bisect = sns.lineplot(
        x=[0, 120],
        y=[0, 120],
        linestyle='--',
        color='black',
        linewidth=1.0,
        ax=ax
    )

    mae_central = mean_absolute_error(df_fig.loc[ids_groups['Central'], 'Age'].values, df_fig.loc[ids_groups['Central'], age_type].values)
    mae_fmba = mean_absolute_error(df_fig.loc[ids_groups['FMBA'], 'Age'].values, df_fig.loc[ids_groups['FMBA'], age_type].values)
    ax.set_title(f"MAE Central: {mae_central:0.1f}\nMAE FMBA: {mae_fmba:0.1f}")
    ax.set_xlim(0, 120)
    ax.set_ylim(0, 120)
    ax.set_aspect('equal', adjustable='box')
    fig.savefig(f"{path}/immuno/vs_central/{age_type}_scatter.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{path}/immuno/vs_central/{age_type}_scatter.pdf", bbox_inches='tight')
    plt.close(fig)

    # Violins of age acceleration with a two-sided Mann-Whitney p-value
    sns.set_theme(style='whitegrid')
    fig, ax = plt.subplots(figsize=(4, 4), layout="constrained")
    violin = sns.violinplot(
        data=df_fig,
        x='Group',
        y=f'{age_type} acceleration',
        palette=colors_groups,
        scale='width',
        order=groups,
        saturation=0.75,
        ax=ax,  # draw on the axes created above rather than the implicit current axes
    )
    violin.set_xlabel("")
    mw_pval = mannwhitneyu(
        df_fig.loc[df_fig['Group'] == groups[0], f'{age_type} acceleration'].values,
        df_fig.loc[df_fig['Group'] == groups[1], f'{age_type} acceleration'].values,
        alternative='two-sided').pvalue
    pval_formatted = [f'{mw_pval:.2e}']
    annotator = Annotator(
        violin,
        pairs=[(groups[0], groups[1])],
        data=df_fig,
        x='Group',
        y=f'{age_type} acceleration',
        order=groups
    )
    annotator.set_custom_annotations(pval_formatted)
    annotator.configure(loc='outside')
    annotator.annotate()
    fig.savefig(f"{path}/immuno/vs_central/{age_type}_acceleration.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{path}/immuno/vs_central/{age_type}_acceleration.pdf", bbox_inches='tight')
    plt.close(fig)

## CXCL9 special

In [None]:
path = "E:/YandexDisk/Work/bbd/fmba"
path_old = "E:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN"

# Every figure of this section is written here; create the folder up front so
# the first savefig does not fail on a missing directory
pathlib.Path(f"{path}/immuno/CXCL9_special").mkdir(parents=True, exist_ok=True)

imm = pd.read_excel(f"{path}/immuno/data_full.xlsx", index_col=0)

imm_old_selected = pd.read_excel(f"{path_old}/special/059_imm_data_selection/df_imm.xlsx", index_col=0)
imm_fmba = pd.read_excel(f"{path}/immuno/data.xlsx", index_col=0)
imm_fmba = imm_fmba[imm_fmba['Age'].notna()]
ids_groups = {
    'Central': imm_old_selected.index[imm_old_selected['Region'] == 'Central'].values,
    'FMBA': imm_fmba.index.values,
}
colors_groups = {
    'Central': 'crimson',
    'FMBA': 'dodgerblue',
}

# Select samples with same age
# Central samples are drawn with weights equal to the share of FMBA samples
# falling into the same age bin.
n_same_age = 200
hist_bins = np.linspace(5, 115, 23)
ages_fmba = imm.loc[ids_groups['FMBA'], 'Age'].values
age_counts, age_bin_edges = np.histogram(ages_fmba, bins=hist_bins)
age_prob = age_counts / len(ages_fmba)

# Bin lookup that follows np.histogram's convention: bins are [left, right),
# except the last one, which also contains its right edge. The previous
# rounding-based index sent ages in the upper half of a bin to the next bin.
ages_central = imm.loc[ids_groups['Central'], 'Age'].values
age_bin_ids = np.searchsorted(age_bin_edges, ages_central, side='right') - 1
age_bin_ids[ages_central == age_bin_edges[-1]] = len(age_counts) - 1
# Ages outside the histogram range (or NaN) get zero sampling weight instead
# of wrapping around / overflowing the probability array
age_in_range = (age_bin_ids >= 0) & (age_bin_ids < len(age_counts))
age_weights = np.zeros(len(ages_central), dtype=float)
age_weights[age_in_range] = age_prob[age_bin_ids[age_in_range]]
imm.loc[ids_groups['Central'], 'Prob Age (FMBA)'] = age_weights

index_central_same_age = imm.loc[ids_groups['Central'], :].sample(n=n_same_age, replace=False, weights='Prob Age (FMBA)', random_state=1337).index
# Fail loudly: continuing would leave ids_central_same_age undefined (or stale
# from an earlier run of the notebook)
if not index_central_same_age.is_unique:
    raise ValueError("Not unique index")
ids_central_same_age = index_central_same_age.values
ids_groups['Central (Age as FMBA)'] = ids_central_same_age
colors_groups['Central (Age as FMBA)'] = 'green'

# Plot age histogram
# Long-format table with one row per (sample, group membership); samples that
# belong to two groups appear twice, hence ignore_index
age_frames = []
for group_name in ['Central', 'Central (Age as FMBA)', 'FMBA']:
    df_group_age = imm.loc[ids_groups[group_name], ['Age']].copy()
    df_group_age['Group'] = group_name
    age_frames.append(df_group_age)
df_fig = pd.concat(age_frames, ignore_index=True)

hist_bins = np.linspace(5, 115, 23)
sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(figsize=(6, 4))
histplot = sns.histplot(
    data=df_fig,
    x="Age",
    hue='Group',
    bins=hist_bins,
    palette=colors_groups,
    edgecolor='k',
    linewidth=1,
    ax=ax,
)
histplot.set(xlim=(0, 120))
for extension, save_kwargs in (("png", {'dpi': 200}), ("pdf", {})):
    plt.savefig(f"{path}/immuno/CXCL9_special/hist_age.{extension}", bbox_inches='tight', **save_kwargs)
plt.close(fig)


for feat_trgt in ['CXCL9', 'CXCL9_log']:

    # Two stacked axes: metrics table on top, regression plot below
    n_rows = 2
    n_cols = 1
    fig_height = 5
    fig_width = 5
    sns.set_theme(style='ticks')
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), height_ratios=[2, 5], gridspec_kw={'wspace':0.02, 'hspace': 0.02}, layout='constrained')
    ax_table, ax_reg = axs[0], axs[1]

    # Per group: Pearson correlation with Age and the OLS fit "<feature> ~ Age"
    metric_names = [r"Pearson $\rho$", r"$R^2$", 'Slope', 'Intercept']
    ds_table = pd.DataFrame(index=metric_names, columns=['Central', 'Central (Age as FMBA)', 'FMBA'])
    for group in ids_groups:
        ids_group = ids_groups[group]
        rho, _ = stats.pearsonr(imm.loc[ids_group, 'Age'].values, imm.loc[ids_group, feat_trgt].values)
        linreg = smf.ols(formula=f"{feat_trgt} ~ Age", data=imm.loc[ids_group, :]).fit()
        ds_table.at[r"Pearson $\rho$", group] = f"{rho:0.2f}"
        ds_table.at[r"$R^2$", group] = f"{linreg.rsquared:0.2f}"
        ds_table.at['Slope', group] = f"{linreg.params['Age']:0.2e}"
        ds_table.at['Intercept', group] = f"{linreg.params['Intercept']:0.2e}"

    # Column layout: metric names first, then one centred column per group
    col_defs = [
        ColumnDefinition(
            name="index",
            title='Metrics',
            textprops={"ha": "left"},
            width=4.5,
        )
    ] + [
        ColumnDefinition(
            name=group,
            textprops={"ha": "center"},
            width=4.0,
        )
        for group in ids_groups
    ]
    table = Table(
        ds_table,
        column_definitions=col_defs,
        row_dividers=True,
        footer_divider=False,
        ax=ax_table,
        textprops={"fontsize": 8},
        row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
        col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
        column_border_kw={"linewidth": 1, "linestyle": "-"},
    )

    # Scatter plus regression line per group, all on the lower axes
    scatter_style = dict(
        linewidth=0.2,
        alpha=0.75,
        edgecolor="k",
        s=10,
    )
    for group in ids_groups:
        regplot = sns.regplot(
            data=imm.loc[ids_groups[group], :],
            x='Age',
            y=feat_trgt,
            label=group,
            color=colors_groups[group],
            scatter_kws=dict(scatter_style),
            ax=ax_reg
        )
    for extension, save_kwargs in (("png", {'dpi': 200}), ("pdf", {})):
        fig.savefig(f"{path}/immuno/CXCL9_special/regplot_{feat_trgt}.{extension}", bbox_inches='tight', **save_kwargs)
    plt.close(fig)

# PhenoAge

In [None]:
path = "E:/YandexDisk/Work/bbd/fmba"

data = pd.read_excel(f"{path}/ПриложениеГ (КОНФИДЕНЦИАЛЬНО).xlsx", index_col='работник_ID')
# data = data[data['признак 500 здоровые больные'].notna()]
# Age in years at the fixed reference date 2024-11-11
data['дата рождения'] = pd.to_datetime(data['дата рождения'])
data['date_now'] = pd.to_datetime("2024-11-11")
data['Age'] = (data['date_now'] - data['дата рождения']) / np.timedelta64(1, 'D') / 365.25
data = data[data['Age'].notna()]

# `phenoage` maps each model term (index) to its FMBA column name ('FMBA')
# and its coefficient ('Coeff')
phenoage = pd.read_excel("E:/YandexDisk/Work/bbd/phenoage/phenoage.xlsx", index_col=0)
# Keep only the model inputs plus the status column; rows with any missing
# value (including the 'нет' placeholder) are dropped
data = data.loc[:, list(phenoage['FMBA'].values) + ['признак 500 здоровые больные']].dropna(axis=0, how='any')
data.replace({'нет': np.nan}, inplace=True)
data = data.dropna(axis=0, how='any')
for col in phenoage['FMBA'].values:
    # data[col] = pd.to_numeric(data[col].astype(str).str.replace(',', '.'), errors='coerce')
    data[col] = pd.to_numeric(data[col])
# The CRP column is replaced by log(value / 10); the untransformed value is kept aside
data['NonLog С-реакт белок мг л'] = data['С-реакт белок мг л'].values
data['С-реакт белок мг л'] = np.log(data['С-реакт белок мг л'].values / 10)
# Linear predictor: intercept plus coefficient-weighted inputs
data['LinearComb'] = -19.9067
gamma = 0.0077
for f in phenoage.index.values:
    data['LinearComb'] += phenoage.at[f, 'Coeff'] * data[phenoage.at[f, 'FMBA']].values
data['MortalityScore'] = 1 - np.exp(-np.exp(data['LinearComb'].values) * (np.exp(120 * gamma) - 1) / gamma)

data['PhenoAge'] = 141.50225 + np.log(-0.00553 * np.log(1 - data['MortalityScore'].values)) / 0.090165
data['PhenoAge acceleration'] = data['PhenoAge'] - data['Age']
# Infinite results of the log transforms are treated as missing and dropped
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data = data.dropna(axis=0, how='any')

# Corrected acceleration: residual of the linear fit PhenoAge ~ Age
linreg = smf.ols(formula="PhenoAge ~ Age", data=data).fit()
data["PhenoAge Linear Pred"] = linreg.predict(data)
data["PhenoAge acceleration corrected"] = data['PhenoAge'] - data["PhenoAge Linear Pred"]
data["PhenoAge corrected"] = data["Age"] + data["PhenoAge acceleration corrected"]

# Messages no longer start with a stray apostrophe
print(f"PhenoAge acceleration: {np.mean(data['PhenoAge acceleration'].values)}")
print(f"PhenoAge acceleration corrected: {np.mean(data['PhenoAge acceleration corrected'].values)}")
# Create the output folder first so to_excel cannot fail on a missing directory
pathlib.Path(f"{path}/03_pheno_age").mkdir(parents=True, exist_ok=True)
data.to_excel(f"{path}/03_pheno_age/data_PhenoAge.xlsx")

In [None]:
# Two-panel figure: Age vs PhenoAge scatter with the identity line (left) and
# a violin of PhenoAge acceleration (right). Shown inline and saved as PNG/PDF.
fig = make_subplots(rows=1, cols=2, shared_yaxes=False, shared_xaxes=False, column_widths=[5, 3], horizontal_spacing=0.15)

# Common axis range for the scatter panel, padded by 10% of the span per side.
min_plot_age = data[["Age", "PhenoAge"]].min().min()
max_plot_age = data[["Age", "PhenoAge"]].max().max()
shift_plot_age = max_plot_age - min_plot_age
min_plot_age -= 0.1 * shift_plot_age
max_plot_age += 0.1 * shift_plot_age

# Styling shared by every labelled axis. `title_font` replaces the deprecated
# `titlefont` attribute, which recent plotly releases no longer accept.
axis_frame = dict(
    automargin=True,
    showgrid=False,
    linecolor='black',
    showline=True,
    gridcolor='gainsboro',
    gridwidth=0.05,
    mirror=True,
    ticks='outside',
    title_font=dict(color='black', size=20),
    showticklabels=True,
    tickangle=0,
    tickfont=dict(color='black', size=16),
    exponentformat='e',
    showexponent='all',
)

# Identity line (PhenoAge == Age).
fig.add_trace(
    go.Scatter(
        x=[min_plot_age, max_plot_age],
        y=[min_plot_age, max_plot_age],
        showlegend=False,
        mode='lines',
        line=dict(color='black', width=2, dash='dot'),
    ),
    row=1,
    col=1,
)
# One marker per row of `data`; the index value is attached as hover text.
fig.add_trace(
    go.Scatter(
        name='Scatter',
        x=data.loc[:, 'Age'].values,
        y=data.loc[:, 'PhenoAge'].values,
        text=data.index.values,
        hovertext=data.index.values,
        showlegend=False,
        mode='markers',
        marker=dict(
            size=10,
            opacity=0.75,
            line=dict(width=1, color='black'),
            color='crimson',
        ),
    ),
    row=1,
    col=1,
)
fig.update_xaxes(
    row=1,
    col=1,
    title_text="Age",
    autorange=False,
    range=[min_plot_age, max_plot_age],
    zeroline=False,
    **axis_frame,
)
fig.update_yaxes(
    row=1,
    col=1,
    title_text="PhenoAge",
    # scaleanchor="x",
    # scaleratio=1,
    autorange=False,
    range=[min_plot_age, max_plot_age],
    zeroline=False,
    **axis_frame,
)

fig.add_trace(
    go.Violin(
        y=data.loc[:, 'PhenoAge acceleration'].values,
        hovertext=data.index.values,
        name="Violin",
        box_visible=True,
        meanline_visible=True,
        showlegend=False,
        line_color='black',
        fillcolor='crimson',
        marker=dict(color='crimson', line=dict(color='black', width=0.5), opacity=0.75),
        points='all',
        # Kernel bandwidth set to 1/32 of the observed value range.
        bandwidth=np.ptp(data.loc[:, 'PhenoAge acceleration'].values) / 32,
        opacity=0.75,
    ),
    row=1,
    col=2,
)
fig.update_yaxes(
    row=1,
    col=2,
    title_text="PhenoAge acceleration",
    autorange=True,
    zeroline=True,
    **axis_frame,
)
# The violin's categorical x axis carries no information: hide ticks and labels.
fig.update_xaxes(
    row=1,
    col=2,
    automargin=True,
    autorange=False,
    range=[-0.5, 0.3],
    showgrid=False,
    showline=True,
    zeroline=False,
    showticklabels=False,
    mirror=True,
    ticks='outside',
    tickvals=[],
)
fig.update_layout(
    template="simple_white",
    width=800,
    height=450,
    margin=go.layout.Margin(l=100, r=20, b=50, t=50, pad=0),
)
fig.show()
fig.write_image(f"{path}/03_pheno_age/PhenoAge.png")
fig.write_image(f"{path}/03_pheno_age/PhenoAge.pdf", format="pdf")

# Check samples intersection: Selected-200 vs Immunology vs DNAm

In [None]:
# Compare the sample IDs of the selected-200 list, the immunology table and
# the DNAm phenotype table: print the extras, export every intersection
# section and draw an UpSet plot.
path = "E:/YandexDisk/Work/bbd/fmba"

df_epi = pd.read_excel(f"{path}/dnam/processed/pheno.xlsx", index_col=0)
df_imm = pd.read_excel(f"{path}/immuno/data.xlsx", index_col=0)
df_sel = pd.read_excel(f"{path}/02_select_samples/data_selected.xlsx", index_col=0)

# IDs present in the immunology / DNAm tables but absent from the selection.
imm_only = df_imm.index.difference(df_sel.index).to_list()
print(f'imm_only: {imm_only}')

epi_only = df_epi.index.difference(df_sel.index).to_list()
print(f'epi_only: {epi_only}')

# One Excel file per section returned by get_sections.
sections = get_sections([set(df_sel.index), set(df_imm.index), set(df_epi.index)])
for sec in sections:
    df_sec = pd.DataFrame(index=list(sections[sec]))
    # NOTE(review): the index holds sample IDs, yet the column is labelled
    # 'gene' — looks like a leftover from another script; confirm before renaming.
    df_sec.to_excel(f"{path}/02_select_samples/intersection/{sec}.xlsx", index_label='gene')

dict_upset_lists = {
    "Selected-200": df_sel.index.values,
    "Immuno": df_imm.index.values,
    "DNAm": df_epi.index.values,
}
# Boolean membership table over the union of all IDs, re-indexed by the
# membership columns as upsetplot expects.
upset_all = list(set().union(*dict_upset_lists.values()))
df_upset = pd.DataFrame(index=upset_all)
for k, v in dict_upset_lists.items():
    df_upset[k] = df_upset.index.isin(v)
df_upset = df_upset.set_index(list(dict_upset_lists.keys()))
tmp = plt.figure(figsize=(7, 4))
upset_fig = upsetplot.UpSet(
    df_upset,
    sort_categories_by='input',
    subset_size='count',
    show_counts=True,
    min_degree=0,
    element_size=None,
    totals_plot_elements=3,
    include_empty_subsets=False
)
upset_fig.plot(tmp)
plt.savefig(f"{path}/02_select_samples/intersection/upset.png", bbox_inches='tight')
plt.savefig(f"{path}/02_select_samples/intersection/upset.pdf", bbox_inches='tight')
plt.close()

# Legacy

## Comparing tables

In [None]:
# Compare two versions of the source table: print the IDs and columns unique to
# each, then write one sheet of cell-level differences per shared column.
path = "E:/YandexDisk/Work/bbd/fmba"

data_1 = pd.read_excel(f"{path}/Таблица_НИР (1).xlsx", index_col='работник_ID')
data_2 = pd.read_excel(f"{path}/ПриложениеГ (КОНФИДЕНЦИАЛЬНО).xlsx", index_col='работник_ID')

print(data_1.index.difference(data_2.index).tolist())
print(data_2.index.difference(data_1.index).tolist())
print(data_1.columns.difference(data_2.columns).tolist())
print(data_2.columns.difference(data_1.columns).tolist())

cmn_cols = data_1.columns.intersection(data_2.columns).tolist()
cmn_ids = data_1.index.intersection(data_2.index).tolist()
with pd.ExcelWriter(f"{path}/difference.xlsx", engine='xlsxwriter') as writer:
    for col_id, col in enumerate(cmn_cols):
        data_diff = data_1.loc[cmn_ids, col].compare(data_2.loc[cmn_ids, col])
        if data_diff.shape[0] > 0:
            # Excel sheet names must be unique, at most 31 characters long and
            # free of []:*?/\ . Prefixing the column position keeps truncated
            # names distinct when several columns share a long common prefix.
            sheet_name = re.sub(r"[\[\]:*?/\\]", "_", f"{col_id}_{col}")[:31].rstrip("'")
            data_diff.to_excel(writer, sheet_name=sheet_name)

## Load data

In [None]:
# Legacy loader: read the shipment list, keep rows that have a health-status
# flag and a computable age, and give every disease code of every specialist
# column its own colour.
path = f"D:/YandexDisk/Work/bbd/fmba"

data = pd.read_excel(f"{path}/Список_отправка.xlsx", index_col='работник_ID')
has_status = data['признак 500 здоровые больные'].notna()
data = data[has_status]
data['дата рождения'] = pd.to_datetime(data['дата рождения'])
data['date_now'] = pd.to_datetime("2024-10-10")
# Age in years at the fixed reference date.
time_since_birth = data['date_now'] - data['дата рождения']
data['Age'] = time_since_birth / np.timedelta64(1, 'D') / 365.25
has_age = data['Age'].notna()
data = data[has_age]

cols_diseases = [
    'невропатолог - код_заболевания',
    'отоларинголог - код_заболевания',
    'офтальмолог - код_заболевания',
    'дерматолог - код_заболевания',
    'хирург - код_заболевания',
    'терапевт - код_заболевания',
]

cols_diseases_colors = {}
for col_disease in cols_diseases:
    # Each cell holds ';'-separated codes; flatten them and drop the 'нет' marker.
    codes_per_row = data[col_disease].dropna().str.split(';')
    statuses = np.concatenate(codes_per_row.values)
    statuses = statuses[statuses != 'нет']
    statuses_counter = Counter(statuses)
    df_statuses_counter = pd.DataFrame.from_dict(statuses_counter, orient='index', columns=['Count'])
    df_statuses_counter = df_statuses_counter.sort_values('Count', ascending=False)
    # White and black are handed to distinctipy alongside the requested count.
    white_and_black = [
        mcolors.hex2color(mcolors.CSS4_COLORS['white']),
        mcolors.hex2color(mcolors.CSS4_COLORS['black']),
    ]
    colors = distinctipy.get_colors(len(df_statuses_counter), white_and_black, rng=1337)
    # Codes are paired with colours in descending order of frequency.
    cols_diseases_colors[col_disease] = dict(zip(df_statuses_counter.index.values, colors))

## Select samples

In [None]:
# Legacy sample selection: split workers into four Healthy/Sick x Radiation
# groups, always keep the supervisors, then draw age-weighted random samples to
# build the "Selected-100" and "Selected-200" sets. Histograms of each stage and
# the final table are written to `path_save`.
path_save = f"{path}/02_select_samples"

# ID arrays by health flag, supervisor flag and radiation exposure.
ids_hlty = data.index[data['признак 500 здоровые больные'] == '537_з'].values
ids_sick = data.index[data['признак 500 здоровые больные'] == '537_б'].values
ids_spv = data.index[data['признак руководителей'] == 'рук.'].values
ids_rad = data.index[data['Текущая основная вредность - Физические факторы'] == 'Ионизирующие излученияК, радиоактивные веществаК;'].values
ids_norad = data.index[data['Текущая основная вредность - Физические факторы'] != 'Ионизирующие излученияК, радиоактивные веществаК;'].values

# All workers per group. The 'Heathy' spelling is part of the keys and is used
# consistently below, so it must not be "fixed" in one place only.
groups = {
    'Heathy with Radiation': set.intersection(set(ids_hlty), set(ids_rad)),
    'Sick with Radiation': set.intersection(set(ids_sick), set(ids_rad)),
    'Heathy without Radiation': set.intersection(set(ids_hlty), set(ids_norad)),
    'Sick without Radiation': set.intersection(set(ids_sick), set(ids_norad)),
}

# Supervisors only, per group.
groups_spv = {
    'Heathy with Radiation': set.intersection(set(ids_spv), set(ids_hlty), set(ids_rad)),
    'Sick with Radiation': set.intersection(set(ids_spv), set(ids_sick), set(ids_rad)),
    'Heathy without Radiation': set.intersection(set(ids_spv), set(ids_hlty), set(ids_norad)),
    'Sick without Radiation': set.intersection(set(ids_spv), set(ids_sick), set(ids_norad)),
}

groups_colors = {
    'Heathy with Radiation': 'crimson',
    'Sick with Radiation': 'dodgerblue',
    'Heathy without Radiation': 'lawngreen',
    'Sick without Radiation': 'darkorchid',
}

# 2x2 count tables (status x radiation) for all workers and for supervisors.
df_count_all = pd.DataFrame()
df_count_all.at['Healthy', 'Radiation'] = len(groups['Heathy with Radiation'])
df_count_all.at['Sick', 'Radiation'] = len(groups['Sick with Radiation'])
df_count_all.at['Healthy', 'No radiation'] = len(groups['Heathy without Radiation'])
df_count_all.at['Sick', 'No radiation'] = len(groups['Sick without Radiation'])

df_count_spv = pd.DataFrame()
df_count_spv.at['Healthy', 'Radiation'] = len(groups_spv['Heathy with Radiation'])
df_count_spv.at['Sick', 'Radiation'] = len(groups_spv['Sick with Radiation'])
df_count_spv.at['Healthy', 'No radiation'] = len(groups_spv['Heathy without Radiation'])
df_count_spv.at['Sick', 'No radiation'] = len(groups_spv['Sick without Radiation'])

n_rows = 2
n_cols = 2
fig_width = 10
fig_height = 8
# 24 five-year bins covering ages 0..120.
hist_bins = np.linspace(0, 120, 25)

# Age histogram of every full group, one panel per group.
sns.set_theme(style='ticks')
fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={}, sharey=True, sharex=True)
for g_id, (g, g_ids) in enumerate(groups.items()):
    row_id, col_id = divmod(g_id, n_cols)
    
    histplot = sns.histplot(
        data=data.loc[list(g_ids), ],
        bins=hist_bins,
        edgecolor='k',
        linewidth=1,
        x="Age",
        color=groups_colors[g],
        ax=axs[row_id, col_id]
    )
    axs[row_id, col_id].set(xlim=(15, 80))
    axs[row_id, col_id].set_title(f"{g} (Total {len(g_ids)}, Supervisors {len(groups_spv[g])})")
fig.tight_layout()    
fig.savefig(f"{path_save}/hist_age.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path_save}/hist_age.pdf", bbox_inches='tight')
plt.close(fig)

# Number of non-supervisors to add to each group for the "100" selection.
needed_samples = {
    'Heathy with Radiation': 12,
    'Sick with Radiation': 11,
    'Heathy without Radiation': 14,
    'Sick without Radiation': 17,
}

seed = 11

# The "100" selection starts from all supervisors of each group.
groups_selected_100 = {x: list(groups_spv[x]) for x in groups_spv}
groups_selected_200 = {}

# Sampling weights: 24 entries, weight 10 for the first 7 and the last 12
# entries and weight 1 for the 5 entries in between, normalised to sum to 1.
age_bin_edges = np.linspace(0, 120, 25)
age_prob = np.asarray([10.0]*7 + [1.0]*5 +  [10.0]*12)
age_prob /= np.sum(age_prob)
bin_diff = 5
for g in needed_samples:
    
    # Stage 1: candidates are the group members that are not supervisors.
    data_cands_100 = data.loc[list(groups[g] - groups_spv[g]), :]
    # NOTE(review): np.rint rounds to the NEAREST integer, so the weight index
    # changes at bin midpoints (2.5, 7.5, ...) rather than at age_bin_edges, and
    # an age of about 117.5 or more would index past the 24 weights. If the
    # weights are meant per histogram bin, a floor is presumably intended —
    # confirm before changing, since this alters which samples are drawn.
    data_cands_100.loc[:, 'Prob Age'] = age_prob[np.rint((data_cands_100.loc[:, 'Age'].values - age_bin_edges[0]) / (bin_diff + 0.0001)).astype(int)]
    n_same_age = needed_samples[g]
    index_selected = data_cands_100.sample(n=n_same_age, replace=False, weights='Prob Age', random_state=seed).index
    if index_selected.is_unique:
        ids_selected = index_selected.to_list()
        groups_selected_100[g] += ids_selected
        # The "200" selection starts as a copy of the "100" selection.
        groups_selected_200[g] = copy.deepcopy(groups_selected_100[g])
    else:
        # NOTE(review): in this branch groups_selected_200[g] is never created,
        # so the `+=` in stage 2 below would raise KeyError.
        print("Not unique index")
        
    # Stage 2: 25 more workers per group, drawn from those not yet selected.
    data_cands_200 = data.loc[list(groups[g] - set(groups_selected_100[g])), :]
    data_cands_200.loc[:, 'Prob Age'] = age_prob[np.rint((data_cands_200.loc[:, 'Age'].values - age_bin_edges[0]) / (bin_diff + 0.0001)).astype(int)]
    n_same_age = 25
    index_selected = data_cands_200.sample(n=n_same_age, replace=False, weights='Prob Age', random_state=seed).index
    if index_selected.is_unique:
        ids_selected = index_selected.to_list()
        groups_selected_200[g] += ids_selected
    else:
        print("Not unique index")
        
# Age histograms of the "100" selection.
sns.set_theme(style='ticks')
fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={}, sharey=True, sharex=True)
for g_id, (g, g_ids) in enumerate(groups_selected_100.items()):
    row_id, col_id = divmod(g_id, n_cols)
    
    histplot = sns.histplot(
        data=data.loc[g_ids, ],
        bins=hist_bins,
        edgecolor='k',
        linewidth=1,
        x="Age",
        color=groups_colors[g],
        ax=axs[row_id, col_id]
    )
    axs[row_id, col_id].set(xlim=(15, 80))
    axs[row_id, col_id].set_title(f"{g} (Total {len(g_ids)}, Supervisors {len(set(g_ids).intersection(set(ids_spv)))})")
    # Counts are small here: force integer y ticks.
    axs[row_id, col_id].yaxis.set_major_locator(MaxNLocator(integer=True))
fig.tight_layout()    
fig.savefig(f"{path_save}/hist_age_selected_100.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path_save}/hist_age_selected_100.pdf", bbox_inches='tight')
plt.close(fig)

# Age histograms of the "200" selection.
sns.set_theme(style='ticks')
fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={}, sharey=True, sharex=True)
for g_id, (g, g_ids) in enumerate(groups_selected_200.items()):
    row_id, col_id = divmod(g_id, n_cols)
    
    histplot = sns.histplot(
        data=data.loc[g_ids, ],
        bins=hist_bins,
        edgecolor='k',
        linewidth=1,
        x="Age",
        color=groups_colors[g],
        ax=axs[row_id, col_id]
    )
    axs[row_id, col_id].set(xlim=(15, 80))
    axs[row_id, col_id].set_title(f"{g} (Total {len(g_ids)}, Supervisors {len(set(g_ids).intersection(set(ids_spv)))})")
    axs[row_id, col_id].yaxis.set_major_locator(MaxNLocator(integer=True))
fig.tight_layout()    
fig.savefig(f"{path_save}/hist_age_selected_200.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path_save}/hist_age_selected_200.pdf", bbox_inches='tight')
plt.close(fig) 
        
# Assemble the output table: every "200" sample, with flag columns moved to
# the front (Selected-100, Age, Status, Radiation).
data_selected = data.loc[chain.from_iterable(groups_selected_200.values()), :]
print(f"Index unique: {data_selected.index.is_unique}")
print(f"Duplicated indexes: {data_selected.index[data_selected.index.duplicated()].unique().to_list()}")
data_selected.insert(0, 'Selected-100', 0)
data_selected.loc[chain.from_iterable(groups_selected_100.values()), 'Selected-100'] = 1
col = data_selected.pop("Age")
data_selected.insert(1, col.name, col)
# Default is 'Sick'; the two healthy groups are overwritten with 'Healthy'.
data_selected.insert(2, 'Status', 'Sick')
data_selected.loc[groups_selected_200['Heathy with Radiation'] + groups_selected_200['Heathy without Radiation'], 'Status'] = 'Healthy'
# Default is 0; the two radiation groups are overwritten with 1.
data_selected.insert(3, 'Radiation', 0)
data_selected.loc[groups_selected_200['Heathy with Radiation'] + groups_selected_200['Sick with Radiation'], 'Radiation'] = 1

data_selected.to_excel(f"{path_save}/data_selected.xlsx")

## Load data

In [None]:
# Load the 2023 wide-format table together with the healthy/sick, inventory and
# supervisor lists, export how the inventory and supervisor lists overlap with
# the other lists, and assign colours to the disease codes.
path = "D:/YandexDisk/Work/bbd/fmba"

# data_raw = pd.read_excel(f"{path}/2024-08-30 Пример 1000 за 2023 г в формате широких данных.xlsx", index_col=0)
data = pd.read_excel(f"{path}/2024-10-14 1147 за 2023 г в формате широких данных.xlsx", index_col=0)
data['дата рождения'] = pd.to_datetime(data['дата рождения'])
data['date_now'] = pd.to_datetime("2024-10-10")
# Age in years at the fixed reference date; rows without an age are dropped.
data['Age'] = (data['date_now'] - data['дата рождения']) / np.timedelta64(1, 'D') / 365.25
data = data[data['Age'].notna()]

df_hlty = pd.read_excel(f"{path}/здоровые_бпд.xlsx", index_col=0)
df_sick = pd.read_excel(f"{path}/больные_бпд.xlsx", index_col=0)

# IDs listed as both healthy and sick.
ids_hlty_sick = df_hlty.index.intersection(df_sick.index).to_list()
print(f'ids_hlty_sick:\n{ids_hlty_sick}')

# Keep the first row of every non-null ID. Selecting with
# `.loc[index.drop_duplicates(), :]` does NOT deduplicate: on a non-unique
# index `.loc` returns every row matching each label.
df_inventory = pd.read_excel(f"{path}/Опись биоматериала. Отправка 28.10.2024_selected.xlsx", index_col='ID')
df_inventory = df_inventory[df_inventory.index.notnull()]
df_inventory = df_inventory[~df_inventory.index.duplicated(keep='first')]

df_supervisors = pd.read_excel(f"{path}/Руководители.xlsx", index_col='ID')
df_supervisors = df_supervisors[df_supervisors.index.notnull()]
df_supervisors = df_supervisors[~df_supervisors.index.duplicated(keep='first')]

# IDs from the healthy/sick lists that are missing from the main table.
missed_hlty = set(df_hlty.index) - set(data.index)
missed_sick = set(df_sick.index) - set(data.index)

# IDs of the main table with / without the ionising-radiation hazard value.
col_hazard = 'Текущая основная вредность - Физические факторы'
val_radiation = 'Ионизирующие излученияК, радиоактивные веществаК;'
ids_data_rad = data.index[data[col_hazard] == val_radiation]
ids_data_norad = data.index[data[col_hazard] != val_radiation]

# Overlap of the inventory with each list: counts go to one file, 0/1
# indicator columns are added to the inventory itself and saved to another.
ids_inventory_intxn = {
    '1000+ List (with Age)': df_inventory.index.intersection(data.index).values,
    'Healthy': df_inventory.index.intersection(df_hlty.index).values,
    'Sick': df_inventory.index.intersection(df_sick.index).values,
    'Radiation': df_inventory.index.intersection(ids_data_rad).values,
    'No radiation': df_inventory.index.intersection(ids_data_norad).values,
}
df_inventory_dist = pd.DataFrame()
df_inventory_dist.at['Total', 'Count'] = len(df_inventory.index)
for g in ids_inventory_intxn:
    df_inventory_dist.at[g, 'Count'] = len(ids_inventory_intxn[g])
    df_inventory[g] = 0
    df_inventory.loc[ids_inventory_intxn[g], g] = 1
df_inventory_dist.to_excel(f"{path}/distribution_Опись.xlsx", index_label='Опись')
df_inventory.to_excel(f"{path}/Опись_intxn.xlsx")

# Same overlap report for the supervisors, plus their overlap with the inventory.
ids_supervisors_intxn = {
    '1000+ List (with Age)': df_supervisors.index.intersection(data.index).values,
    'Healthy': df_supervisors.index.intersection(df_hlty.index).values,
    'Sick': df_supervisors.index.intersection(df_sick.index).values,
    'Radiation': df_supervisors.index.intersection(ids_data_rad).values,
    'No radiation': df_supervisors.index.intersection(ids_data_norad).values,
    'Опись': df_supervisors.index.intersection(df_inventory.index).values,
}
df_supervisors_dist = pd.DataFrame()
df_supervisors_dist.at['Total', 'Count'] = len(df_supervisors.index)
for g in ids_supervisors_intxn:
    df_supervisors_dist.at[g, 'Count'] = len(ids_supervisors_intxn[g])
    df_supervisors[g] = 0
    df_supervisors.loc[ids_supervisors_intxn[g], g] = 1
df_supervisors_dist.to_excel(f"{path}/distribution_Руководители.xlsx", index_label='Руководители')
df_supervisors.to_excel(f"{path}/Руководители_intxn.xlsx")

# Healthy / sick IDs that are present in the main table. The 'Heathy' key is
# reused later as a file-name suffix, so its spelling is kept.
groups_ids = {
    'Heathy': df_hlty.index.intersection(data.index).values,
    'Sick': df_sick.index.intersection(data.index).values
}

for group, ids in groups_ids.items():
    print(f"{group}: {len(ids)}")

cols_diseases = [
    'невропатолог - код_заболевания',
    'отоларинголог - код_заболевания',
    'офтальмолог - код_заболевания',
    'дерматолог - код_заболевания',
    'хирург - код_заболевания',
    'терапевт - код_заболевания',
]

# One colour per disease code of every specialist column, assigned in
# descending order of code frequency.
cols_diseases_colors = {}
for col_disease in cols_diseases:
    # Cells hold ';'-separated codes; flatten them and drop empty fragments.
    statuses = np.concatenate(data[col_disease].dropna().str.split(';').values)
    statuses = statuses[statuses != '']
    statuses_counter = Counter(statuses)
    df_statuses_counter = pd.DataFrame.from_dict(statuses_counter, orient='index', columns=['Count'])
    df_statuses_counter.sort_values(['Count'], ascending=[False], inplace=True)
    colors = distinctipy.get_colors(df_statuses_counter.shape[0], [mcolors.hex2color(mcolors.CSS4_COLORS['white']), mcolors.hex2color(mcolors.CSS4_COLORS['black'])], rng=1337)
    cols_diseases_colors[col_disease] = {status: colors[status_id] for status_id, status in enumerate(df_statuses_counter.index.values)}

In [None]:
# Display the df_sick rows whose index label repeats an earlier row.
df_sick[df_sick.index.duplicated()]

In [None]:
# Display the df_hlty rows whose index label repeats an earlier row.
df_hlty[df_hlty.index.duplicated()]

## Select samples

In [None]:
# Sample selection from the inventory: build four Healthy/Sick x Radiation
# groups, keep every supervisor of the two radiation groups, top them up to 51
# with an age-weighted random draw, keep the two no-radiation groups whole, and
# export histograms plus the selected table to `path_save`.
path_save = f"{path}/02_select_samples"

# IDs that appear in both the healthy and the sick list are excluded from both.
ids_hlty = df_hlty.drop(ids_hlty_sick).index.values
ids_sick = df_sick.drop(ids_hlty_sick).index.values
ids_inv = df_inventory.index.values
ids_spv = df_supervisors.index.values
ids_rad = data.index[data['Текущая основная вредность - Физические факторы'] == 'Ионизирующие излученияК, радиоактивные веществаК;'].values
ids_norad = data.index[data['Текущая основная вредность - Физические факторы'] != 'Ионизирующие излученияК, радиоактивные веществаК;'].values

# Only IDs present in the inventory are eligible. The 'Heathy' spelling is part
# of the keys and is used consistently below.
groups = {
    'Heathy with Radiation': set.intersection(set(ids_inv), set(ids_hlty), set(ids_rad)),
    'Sick with Radiation': set.intersection(set(ids_inv), set(ids_sick), set(ids_rad)),
    'Heathy without Radiation': set.intersection(set(ids_inv), set(ids_hlty), set(ids_norad)),
    'Sick without Radiation': set.intersection(set(ids_inv), set(ids_sick), set(ids_norad)),
}

groups_colors = {
    'Heathy with Radiation': 'crimson',
    'Sick with Radiation': 'dodgerblue',
    'Heathy without Radiation': 'lawngreen',
    'Sick without Radiation': 'darkorchid',
}

# 2x2 count tables (status x radiation) for all eligible IDs and for supervisors.
df_count_all = pd.DataFrame()
df_count_all.at['Healthy', 'Radiation'] = len(groups['Heathy with Radiation'])
df_count_all.at['Sick', 'Radiation'] = len(groups['Sick with Radiation'])
df_count_all.at['Healthy', 'No radiation'] = len(groups['Heathy without Radiation'])
df_count_all.at['Sick', 'No radiation'] = len(groups['Sick without Radiation'])

df_count_spv = pd.DataFrame()
df_count_spv.at['Healthy', 'Radiation'] = len(groups['Heathy with Radiation'].intersection(set(ids_spv)))
df_count_spv.at['Sick', 'Radiation'] = len(groups['Sick with Radiation'].intersection(set(ids_spv)))
df_count_spv.at['Healthy', 'No radiation'] = len(groups['Heathy without Radiation'].intersection(set(ids_spv)))
df_count_spv.at['Sick', 'No radiation'] = len(groups['Sick without Radiation'].intersection(set(ids_spv)))

n_rows = 2
n_cols = 2
fig_width = 10
fig_height = 8
# 22 five-year bins covering ages 5..115.
hist_bins = np.linspace(5, 115, 23)

# Age histogram of every full group, one panel per group.
sns.set_theme(style='ticks')
fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={}, sharey=True, sharex=True)
for g_id, (g, g_ids) in enumerate(groups.items()):
    row_id, col_id = divmod(g_id, n_cols)
    
    histplot = sns.histplot(
        data=data.loc[list(g_ids), ],
        bins=hist_bins,
        edgecolor='k',
        linewidth=1,
        x="Age",
        color=groups_colors[g],
        ax=axs[row_id, col_id]
    )
    axs[row_id, col_id].set(xlim=(15, 80))
    axs[row_id, col_id].set_title(f"{g} (Total {len(g_ids)}, Supervisors {len(groups[g].intersection(set(ids_spv)))})")
fig.tight_layout()    
fig.savefig(f"{path_save}/hist_age.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path_save}/hist_age.pdf", bbox_inches='tight')
plt.close(fig)

# Radiation groups are filled up to 51: the draw size is 51 minus the number
# of supervisors already in the group. Nothing is drawn for the other groups.
needed_samples = {
    'Heathy with Radiation': 51 - len(groups['Heathy with Radiation'].intersection(set(ids_spv))),
    'Sick with Radiation': 51 - len(groups['Sick with Radiation'].intersection(set(ids_spv))),
    'Heathy without Radiation': 0,
    'Sick without Radiation': 0,
}

# No-radiation groups are taken whole; radiation groups are filled in the loop.
groups_selected = {
    'Heathy with Radiation': [],
    'Sick with Radiation': [],
    'Heathy without Radiation': list(set.intersection(set(ids_inv), set(ids_hlty), set(ids_norad))),
    'Sick without Radiation': list(set.intersection(set(ids_inv), set(ids_sick), set(ids_norad))),
}

# Sampling weights: 22 entries, weight 10 for the first 6 and the last 12
# entries and weight 1 for the 4 entries in between, normalised to sum to 1.
age_bin_edges = np.linspace(5, 115, 23)
# age_prob = np.asarray([1/22] * 22)
age_prob = np.asarray([10.0]*6 + [1.0]*4 +  [10.0]*12)
age_prob /= np.sum(age_prob)
bin_diff = 5
for g in ['Heathy with Radiation', 'Sick with Radiation']:
    # Candidates are the group members that are not supervisors.
    data_cands = data.loc[list(groups[g] - set(ids_spv)), :]
    print(data_cands.shape[0])
    # NOTE(review): np.rint rounds to the NEAREST integer, so the weight index
    # changes at bin midpoints rather than at age_bin_edges. If the weights are
    # meant per histogram bin, a floor is presumably intended — confirm before
    # changing, since this alters which samples are drawn.
    data_cands.loc[:, 'Prob Age'] = age_prob[np.rint((data_cands.loc[:, 'Age'].values - age_bin_edges[0]) / (bin_diff + 0.0001)).astype(int)]
    
    n_same_age = needed_samples[g]
    print(n_same_age)
    index_selected = data_cands.sample(n=n_same_age, replace=False, weights='Prob Age', random_state=36).index
    if index_selected.is_unique:
        ids_selected = index_selected.to_list()
        # Final group = random draw + all supervisors of the group.
        groups_selected[g] = ids_selected + list(groups[g].intersection(set(ids_spv)))
        print(len(groups_selected[g]))
    else:
        print("Not unique index")

# Age histograms of the selected samples.
sns.set_theme(style='ticks')
fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={}, sharey=True, sharex=True)
for g_id, (g, g_ids) in enumerate(groups_selected.items()):
    row_id, col_id = divmod(g_id, n_cols)
    
    histplot = sns.histplot(
        data=data.loc[g_ids, ],
        bins=hist_bins,
        edgecolor='k',
        linewidth=1,
        x="Age",
        color=groups_colors[g],
        ax=axs[row_id, col_id]
    )
    axs[row_id, col_id].set(xlim=(15, 80))
    axs[row_id, col_id].set_title(f"{g} (Total {len(g_ids)}, Supervisors {len(set(groups_selected[g]).intersection(set(ids_spv)))})")
    # Counts are small here: force integer y ticks.
    axs[row_id, col_id].yaxis.set_major_locator(MaxNLocator(integer=True))
fig.tight_layout()    
fig.savefig(f"{path_save}/hist_age_selected.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path_save}/hist_age_selected.pdf", bbox_inches='tight')
plt.close(fig)  
        
# Assemble the output table with Age, Status, Radiation and Supervisor in the
# first four columns, followed by the hazard and disease-code columns.
data_selected = data.loc[chain.from_iterable(groups_selected.values()), :]
col = data_selected.pop("Age")
data_selected.insert(0, col.name, col)
# Default is 'Sick'; the two healthy groups are overwritten with 'Healthy'.
data_selected.insert(1, 'Status', 'Sick')
data_selected.loc[groups_selected['Heathy with Radiation'] + groups_selected['Heathy without Radiation'], 'Status'] = 'Healthy'
# Default is 0; the two radiation groups are overwritten with 1.
data_selected.insert(2, 'Radiation', 0)
data_selected.loc[groups_selected['Heathy with Radiation'] + groups_selected['Sick with Radiation'], 'Radiation'] = 1
data_selected.insert(3, 'Supervisor', 0)
data_selected.loc[data_selected.index.intersection(df_supervisors.index), 'Supervisor'] = 1
cols_to_front = [
    'Текущая основная вредность - Физические факторы',
    'невропатолог - код_заболевания',
    'отоларинголог - код_заболевания',
    'офтальмолог - код_заболевания',
    'дерматолог - код_заболевания',
    'хирург - код_заболевания',
    'терапевт - код_заболевания',
]
# Positions 0-3 hold the flag columns inserted above, hence the offset of 4.
for col_front_id, col_front in enumerate(cols_to_front):
    col = data_selected.pop(col_front)
    data_selected.insert(col_front_id + 4, col.name, col)
print(f"Duplicated indexes: {data_selected.index[data_selected.index.duplicated()].unique().to_list()}")
data_selected.to_excel(f"{path_save}/data_selected.xlsx")

## NaNs analysis

In [None]:
# Missing-value report: overall percentage of empty cells, then a per-feature
# table (count, percentage, non-missing count) sorted by percentage, saved to Excel.
missing_per_feature = data.isna().sum(axis=0)
nan_pct = missing_per_feature.sum() / data.size * 100
print(nan_pct)

nan_feats = pd.DataFrame(
    {
        "Number of NaNs": missing_per_feature,
        "% of NaNs": missing_per_feature / data.shape[0] * 100,
        "Number of not-NaNs": data.notna().sum(axis=0),
    }
)
nan_feats = nan_feats.sort_values("% of NaNs", ascending=False)
nan_feats.to_excel(f"{path_save}/nan_feats.xlsx", index_label="Features")

## Healthy and Sick groups analysis 

In [None]:
# For each of the healthy / sick groups: an age histogram split by sex and a
# row of bar plots with the most frequent disease codes per specialist.
path_save = f"{path}/01_test_data"

# Colour per raw sex code. The data encodes women as 'Ж'; the Latin 'F' key is
# kept so that tables using that code keep working.
sex_colors = {'М': 'dodgerblue', 'Ж': 'crimson', 'F': 'crimson'}
hist_bins = np.linspace(5, 115, 23)

for group, ids in groups_ids.items():

    # Work on an explicit copy: the sex column is relabelled below and that
    # must neither touch `data` nor depend on chained in-place modification.
    df_group = data.loc[ids, :].copy()
    print(df_group.shape[0])

    # Relabel each sex code as "<code> (<count>)" so the legend shows counts.
    hue_counts = df_group['пол'].value_counts()
    hue_replace = {x: f"{x} ({y})" for x, y in hue_counts.items()}
    hue_colors = {hue_replace[x]: sex_colors[x] for x in hue_counts.index}
    df_group['пол'] = df_group['пол'].replace(hue_replace)

    sns.set_theme(style='ticks')
    fig, ax = plt.subplots(figsize=(6, 3.5))
    histplot = sns.histplot(
        data=df_group,
        bins=hist_bins,
        edgecolor='k',
        linewidth=1,
        x="Age",
        hue='пол',
        palette=hue_colors,
        ax=ax
    )
    histplot.set(xlim=(0, 120))
    histplot.set_title(group)
    fig.savefig(f"{path_save}/age_hist_{group}.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{path_save}/age_hist_{group}.pdf", bbox_inches='tight')
    plt.close(fig)

    sns.set_theme(style='ticks')
    fig, axs = plt.subplots(1, len(cols_diseases), figsize=(30, 15), gridspec_kw={'wspace':0.4}, sharey=False, sharex=False)

    for col_disease_id, col_disease in enumerate(cols_diseases):
        # Cells hold ';'-separated codes; flatten them and drop empty fragments.
        statuses = np.concatenate(df_group[col_disease].dropna().str.split(';').values)
        statuses = statuses[statuses != '']
        statuses_counter = Counter(statuses)
        df_statuses_counter = pd.DataFrame.from_dict(statuses_counter, orient='index', columns=['Count'])
        df_statuses_counter.sort_values(['Count'], ascending=[False], inplace=True)

        # Top 50 codes only; copy so that adding a column does not write into
        # a slice of the full table.
        df_statuses_counter = df_statuses_counter.head(50).copy()
        df_statuses_counter['Status'] = df_statuses_counter.index.values
        barplot = sns.barplot(
            data=df_statuses_counter,
            x='Count',
            y='Status',
            hue='Status',
            palette=cols_diseases_colors[col_disease],
            edgecolor='black',
            dodge=False,
            ax=axs[col_disease_id]
        )
        for container in barplot.containers:
            barplot.bar_label(container, label_type='edge', fmt='%.d', fontsize=12, padding=2.0)
        axs[col_disease_id].set_title(col_disease, fontsize='large')
        axs[col_disease_id].set_ylabel('')
        # The legend duplicates the y tick labels; remove it when one was drawn.
        legend = axs[col_disease_id].get_legend()
        if legend is not None:
            legend.remove()
    fig.savefig(f"{path_save}/barplot_icd_{group}.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{path_save}/barplot_icd_{group}.pdf", bbox_inches='tight')
    plt.close(fig)

## Age and sex analysis

In [None]:
# Age at 2024-01-01 (in years, stored in the lower-case 'age' column) and its
# histogram, stacked by sex with one-year bins.
birth_dates = pd.to_datetime(data['дата рождения'])
data['дата рождения'] = birth_dates
data['date_now'] = pd.to_datetime("2024-01-01")
elapsed = data['date_now'] - data['дата рождения']
data['age'] = elapsed / np.timedelta64(1, 'D') / 365.25
hp = sns.histplot(
    data=data,
    x='age',
    hue="пол",
    binwidth=1,
    multiple="stack",
)
figure = hp.get_figure()
figure.savefig(f'{path_save}/age_hist.png')

In [None]:
# Age histogram (stacked by sex) restricted to rows whose physical-hazard
# column equals the ionising-radiation value.
hazard_is_ion = data['Текущая основная вредность - Физические факторы'] == 'Ионизирующие излученияК, радиоактивные веществаК;'
data_vred = data[hazard_is_ion]
ax_ion = sns.histplot(data=data_vred, x='age', hue="пол", binwidth=1, multiple="stack")
# set_title returns the title artist; the figure is fetched through it.
hpv = ax_ion.set_title("Ионизирующие излучения")
figure = hpv.get_figure()
figure.savefig(f'{path_save}/ion_age_hist.png')

In [None]:
# Age histogram (stacked by sex) restricted to rows whose physical-hazard
# column differs from the ionising-radiation value.
hazard_not_ion = data['Текущая основная вредность - Физические факторы'] != 'Ионизирующие излученияК, радиоактивные веществаК;'
data_no_vred = data[hazard_not_ion]
ax_no_ion = sns.histplot(data=data_no_vred, x='age', hue="пол", binwidth=1, multiple="stack")
# set_title returns the title artist; the figure is fetched through it.
hpv = ax_no_ion.set_title("Без ионизирующих излучений")
figure = hpv.get_figure()
figure.savefig(f'{path_save}/no_ion_age_hist.png')

## Diseases statistics

In [None]:
# Mark rows without a therapist diagnosis as 'Healthy' and define the index
# subsets (all / by sex / by radiation exposure / both) used by the plots below.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on `data[col]` acts on an intermediate object and is not guaranteed to update
# `data` itself.
data['терапевт - код_заболевания'] = data['терапевт - код_заболевания'].replace({None: 'Healthy'})

# Row masks shared by the subsets.
is_female = data['пол'] == 'Ж'
is_male = data['пол'] == 'М'
is_rad = data['Текущая основная вредность - Физические факторы'] == 'Ионизирующие излученияК, радиоактивные веществаК;'
is_norad = data['Текущая основная вредность - Физические факторы'] != 'Ионизирующие излученияК, радиоактивные веществаК;'

subsets = {
    'Все данные': data.index.values,
    'Женщины': data.index[is_female].values,
    'Мужчины': data.index[is_male].values,
    'Ионизирующие излучения': data.index[is_rad].values,
    'Ионизирующие излучения\nЖенщины': data.index[is_rad & is_female].values,
    'Ионизирующие излучения\nМужчины': data.index[is_rad & is_male].values,
    'Нет излучения': data.index[is_norad].values,
    'Нет излучения\nЖенщины': data.index[is_norad & is_female].values,
    'Нет излучения\nМужчины': data.index[is_norad & is_male].values,
}

In [None]:
# Pool the ';'-separated diagnosis codes of all rows into one flat array.
codes_per_row = data['терапевт - код_заболевания'].str.split(';').values
statuses = np.concatenate(codes_per_row)
# str.split yields '' for empty fields (e.g. after a trailing ';'); drop them.
statuses = statuses[statuses != '']

# Frequency table of codes, most frequent first.
statuses_counter = Counter(statuses)
df_statuses_counter = pd.DataFrame.from_dict(statuses_counter, orient='index', columns=['Count'])
df_statuses_counter = df_statuses_counter.sort_values(['Count'], ascending=[False])

# One visually distinct colour per code, avoiding pure white and pure black;
# the fixed rng seed keeps the colour assignment reproducible.
excluded_colors = [
    mcolors.hex2color(mcolors.CSS4_COLORS['white']),
    mcolors.hex2color(mcolors.CSS4_COLORS['black']),
]
colors = distinctipy.get_colors(df_statuses_counter.shape[0], excluded_colors, rng=1337)
colors_statuses = dict(zip(df_statuses_counter.index.values, colors))

In [None]:
# Draw, for every subset in `subsets`, a horizontal bar chart of its 50 most
# frequent diagnosis codes (one panel per subset) and save the whole figure
# as PNG and PDF.
sns.set_theme(style='ticks')
# The panel count follows `subsets`; squeeze=False keeps the axes array
# two-dimensional so indexing also works when there is only one subset.
fig, axs = plt.subplots(
    1,
    len(subsets),
    figsize=(30, 20),
    gridspec_kw={'wspace': 0.4},
    sharey=False,
    sharex=False,
    squeeze=False,
)
axs = axs[0]

for subset_id, (subset, subset_ids) in enumerate(subsets.items()):
    ax = axs[subset_id]
    df_data_subset = data.loc[subset_ids, :]
    print(f"{subset}: {len(df_data_subset)}")

    # Flatten the ';'-separated codes of this subset and count them.
    statuses = np.concatenate(df_data_subset['терапевт - код_заболевания'].str.split(';').values)
    statuses = statuses[statuses != '']
    statuses_counter = Counter(statuses)
    df_statuses_counter = pd.DataFrame.from_dict(statuses_counter, orient='index', columns=['Count'])
    df_statuses_counter.sort_values(['Count'], ascending=[False], inplace=True)

    # Copy before adding a column so the write does not target a slice of
    # `df_statuses_counter` (avoids SettingWithCopyWarning).
    df_fig = df_statuses_counter.head(50).copy()
    df_fig['Status'] = df_fig.index.values
    barplot = sns.barplot(
        data=df_fig,
        x='Count',
        y='Status',
        hue='Status',
        palette=colors_statuses,
        edgecolor='black',
        dodge=False,
        ax=ax,
    )
    # Annotate every bar with its count.
    for container in barplot.containers:
        barplot.bar_label(container, label_type='edge', fmt='%.d', fontsize=12, padding=2.0)
    ax.set_title(subset, fontsize='large')
    ax.set_ylabel('')
    # Depending on the seaborn version no legend is created when `hue`
    # duplicates the categorical axis; only remove one if it exists.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()

plt.savefig(f"{path_save}/barplot_icd.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path_save}/barplot_icd.pdf", bbox_inches='tight')
plt.close(fig)