# Debugging autoreload

In [None]:
%load_ext autoreload
%autoreload 2

# Load packages

In [None]:
import torch
import pickle
import numpy as np
from pytorch_tabular import TabularModel
from torchmetrics.functional.regression import mean_absolute_error, pearson_corrcoef
import pandas as pd
import warnings
import pathlib
import os
from tqdm import tqdm
from sklearn.impute import KNNImputer
import pyaging as pya
import matplotlib.pyplot as plt
import seaborn as sns
import distinctipy
import matplotlib.colors as mcolors
import matplotlib.patheffects as pe
from plottable import ColumnDefinition, Table
import ast
from scipy.stats import mannwhitneyu
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.cm
from plottable import ColumnDefinition, Table
from plottable.plots import bar
from plottable.cmap import normed_cmap
import copy
import plotly.graph_objects as go
from statsmodels.stats.multitest import multipletests

import warnings
warnings.filterwarnings("ignore", ".*does not have many workers.*")
warnings.filterwarnings("ignore", ".*exists and is not empty.*")
warnings.filterwarnings("ignore", ".*is smaller than the logging interval Trainer.*")

def make_rgb_transparent(rgb, bg_rgb, alpha):
    """Blend a foreground colour over a background colour.

    Every channel is computed as ``alpha * fg + (1 - alpha) * bg``.

    Args:
        rgb: Foreground colour channels.
        bg_rgb: Background colour channels, paired positionally with ``rgb``.
        alpha: Weight given to the foreground colour.

    Returns:
        List of blended channels (as long as the shorter of the two inputs).
    """
    blended = []
    for fg_channel, bg_channel in zip(rgb, bg_rgb):
        blended.append(alpha * fg_channel + (1 - alpha) * bg_channel)
    return blended


# Setup variables and paths

In [None]:
# Immunomarker names: the index of the selected-features sheet.
feats_imm = pd.read_excel(f"D:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN/special/059_imm_data_selection/feats_selected.xlsx", index_col=0).index.values

# Data variants; both strings are used as path components below.
epi_data_type = 'no_harm'
imm_data_type = 'imp_source(imm)_method(knn)_params(5)' # alternatives: 'origin' 'imp_source(imm)_method(knn)_params(5)' 'imp_source(imm)_method(miceforest)_params(2)'

# Feature-selection settings; together they name the experiment folder.
selection_method = 'mrmr' # alternatives: 'f_regression' 'spearman' 'mrmr'
n_feats = 100

# Folder with the per-immunomarker models, and the output folder of this notebook
# (created here if it does not exist yet).
path_imm = f"D:/YandexDisk/Work/bbd/immunology/003_EpImAge/{imm_data_type}/{epi_data_type}/{selection_method}_{n_feats}"
path_save = f"{path_imm}/EpImAge"
pathlib.Path(path_save).mkdir(parents=True, exist_ok=True)

# Best-model table indexed by immunomarker; later cells read its 'model' and
# 'directory' columns and the train/validation/test metric columns.
df_models = pd.read_excel(f"{path_imm}/best_models_v4.xlsx", index_col=0)

# Root of the epigenetic datasets (scanned as <GPL>/<GSE> sub-folders later on).
path_epi = "D:/YandexDisk/Work/bbd/immunology/003_EpImAge/epi"
# Phenotype columns taken from every dataset's pheno table.
feats_pheno = ['Age', 'Sex', 'Status', 'Tissue']
# Directory handed to pyaging's predict_age as `dir`.
path_clocks = "D:/YandexDisk/Work/pydnameth/datasets/pyaging"
# Clock names handed to pyaging's predict_age as `clock_names`.
clocks = [
    "altumage",
    "dunedinpace",
    "han",
    "knight",
    "leecontrol",
    "leerefinedrobust",
    "leerobust",
    "dnamfitage",
    "dnamphenoage",
    "dnamtl",
    "encen100",
    "encen40",
    "grimage",
    "grimage2",
    "hannum",
    "horvath2013",
    "hrsinchphenoage",
    "lin",
    "pcdnamtl",
    "pcgrimage",
    "pchannum",
    "pchorvath2013",
    "pcphenoage",
    "pcskinandblood",
    "pedbe",
    "replitali",
    "skinandblood",
    "stemtoc",
    "stoch",
    "stocp",
    "stocz",
    "yingadaptage",
    "yingcausage",
    "yingdamage",
    "zhangblup",
    "zhangen",
    "zhangmortality",
]

# Create data

## Load immunomarkers models

In [None]:
# For every immunomarker: read the CpGs its model uses and restore the saved
# pytorch_tabular model.
imm_epi_feats = {}
imm_models = {}
for imm in (pbar := tqdm(feats_imm)):
    pbar.set_description(f"Processing {imm}")
    imm_epi_feats[imm] = pd.read_excel(f"{path_imm}/{imm}/feats_con.xlsx", index_col=0).index.values.tolist()
    # load_model is given the saved-model directory directly; there is no need
    # to build the path of model.ckpt only to split it off again.
    imm_dir_model = f"{path_imm}/{imm}/pytorch_tabular/candidates/{df_models.at[imm, 'model']}/{df_models.at[imm, 'directory']}"
    imm_models[imm] = TabularModel.load_model(imm_dir_model)

# Union of the CpGs over all models.
# - `set().union(...)` also works when there are no immunomarkers at all.
# - Sorting makes the column order identical on every run; iterating a set of
#   strings depends on hash randomisation.
feats_epi_cmn = sorted(set().union(*imm_epi_feats.values()))
print(f"Number of CpGs: {len(feats_epi_cmn)}")

## Load epigenetic data and calculate clocks

In [None]:
# Walk <path_epi>/<GPL>/<GSE>, run the clocks on every dataset and collect
# phenotype + clock values + the CpGs needed by the immunomarker models.
gpls = [entry.name for entry in os.scandir(path_epi) if entry.is_dir()]
gse_missed_cpgs = {}
dfs_gses = []
for gpl in gpls:
    print(gpl)
    gses = [entry.name for entry in os.scandir(f"{path_epi}/{gpl}") if entry.is_dir()]
    for gse in (pbar := tqdm(gses)):
        pbar.set_description(f"Processing {gse}")

        # Dataset-specific layout: data folder, index column of the pheno
        # table and (for two datasets) a status whose samples are discarded.
        path_gse = f"{path_epi}/{gpl}/{gse}"
        status_to_drop = None
        if gse == 'GSEUNN':
            path_gse = f"{path_gse}/{epi_data_type}"
            pheno_index_col = 'index'
        elif gse == 'GSE53740':
            pheno_index_col = 0
            status_to_drop = 'Unknown'
        elif gse == 'GSE87648':
            pheno_index_col = 0
            status_to_drop = 'HS'
        else:
            pheno_index_col = 'gsm'

        df_betas = pd.read_pickle(f"{path_gse}/betas.pkl")
        df_pheno = pd.read_csv(f"{path_gse}/pheno.csv", index_col=pheno_index_col)
        if status_to_drop is not None:
            df_pheno.drop(df_pheno.index[df_pheno['Status'] == status_to_drop], inplace=True)

        # Input table for pyaging: phenotype columns + all betas of the dataset.
        df_for_ages = pd.merge(df_pheno.loc[:, feats_pheno], df_betas, left_index=True, right_index=True)
        n_sexes = df_for_ages['Sex'].value_counts().size
        if n_sexes > 2:
            raise ValueError("More than 2 sexes")
        if n_sexes == 1:
            print(f"{gse} contains only one sex")
        else:
            print(f"{gse} contains 2 sexes")
        df_for_ages['Female'] = (df_for_ages['Sex'] == 'F').astype(int)
        df_for_ages = pya.pp.epicv2_probe_aggregation(df_for_ages, verbose=False)
        adata = pya.pp.df_to_adata(df_for_ages, metadata_cols=['Sex', 'Status', 'Tissue'], imputer_strategy='knn', verbose=False)
        pya.pred.predict_age(adata=adata, dir=path_clocks, clock_names=clocks, verbose=False)

        # Phenotype + clock values, then the model CpGs that this dataset has.
        df_pheno = pd.merge(df_pheno.loc[:, feats_pheno], adata.obs[clocks], left_index=True, right_index=True)
        cpgs_gse = set(df_betas.columns)
        gse_missed_cpgs[gse] = len(set(feats_epi_cmn) - cpgs_gse)
        exist_cpgs = list(cpgs_gse & set(feats_epi_cmn))
        df_gse = pd.merge(df_pheno, df_betas.loc[:, exist_cpgs], left_index=True, right_index=True)
        if len(df_gse) == 0:
            raise ValueError(f"{gse} indexes problem!")
        df_gse.insert(0, 'GPL', gpl)
        df_gse.insert(0, 'GSE', gse)
        dfs_gses.append(df_gse)

# Report how many model CpGs each dataset lacks.
df_gse_missed_cpgs = pd.DataFrame.from_dict(gse_missed_cpgs, orient='index', columns=['Missed CpGs'])
df_gse_missed_cpgs.to_excel(f"{path_save}/gse_missed_cpgs.xlsx", index=True, index_label='GSE')

# verify_integrity makes concat fail on duplicated sample IDs across datasets.
df = pd.concat(dfs_gses, verify_integrity=True)

## Impute missing values

In [None]:
# KNN imputation over the model CpGs together with Age.
n_neighbors = 5
cols_to_impute = feats_epi_cmn + ['Age']
X = df.loc[:, cols_to_impute].values
n_missing_before = np.isnan(X).sum()
print(f'Missing before imputation: {n_missing_before}')
imputer = KNNImputer(n_neighbors=n_neighbors)
X_imptd = imputer.fit_transform(X)
n_missing_after = np.isnan(X_imptd).sum()
print(f'Missing after imputation: {n_missing_after}')

In [None]:
# Write the imputed matrix back into the model-CpG columns and Age.
df.loc[:, feats_epi_cmn + ['Age']] = X_imptd

## Calculate immunomarkers

In [None]:
# Each model predicts the log-level of its immunomarker from its own CpGs;
# the exponent of that prediction is stored under the plain marker name.
for imm in (pbar := tqdm(feats_imm)):
    pbar.set_description(f"Processing {imm}")
    col_log = f"{imm}_log"
    X_imm = df.loc[:, imm_epi_feats[imm]]
    df[col_log] = imm_models[imm].predict(X_imm)
    df[imm] = np.exp(df[col_log])

## Add ICD-11 information

In [None]:
# Copy the ICD-11 annotation of every status onto the samples with that status.
cols_icd = ['ICD-11 chapter', 'ICD-11 chapter and description', 'ICD-11 code', 'ICD-11 code and description']
statuses_icd = pd.read_excel(f"{path_epi}/statuses.xlsx", index_col='Status')
for status, row in statuses_icd.iterrows():
    has_status = df['Status'] == status
    for col_icd in cols_icd:
        df.loc[has_status, col_icd] = row[col_icd]

## Save data

In [None]:
# Compact sheet (no CpG columns) for inspection, plus the full table as a pickle.
cols_meta = ['GPL', 'GSE'] + feats_pheno + cols_icd
cols_imm = list(feats_imm) + [f"{imm}_log" for imm in feats_imm]
df[cols_meta + clocks + cols_imm].to_excel(f"{path_save}/data.xlsx", index_label='ID')
df.to_pickle(f"{path_save}/data_full.pkl")

## Check models on GSEUNN

In [None]:
# Parameters that identify the stored train/validation/test partition.
tst_n_splits = 5
tst_n_repeats = 5
tst_random_state = 1337
tst_split_id = 5

val_n_splits = 4
val_n_repeats = 2
val_random_state = 1337
val_fold_id = 5

fn_samples = f"samples_tst({tst_random_state}_{tst_n_splits}_{tst_n_repeats})_val({val_random_state}_{val_n_splits}_{val_n_repeats})"
with open(f"D:/YandexDisk/Work/bbd/immunology/003_EpImAge/{fn_samples}.pickle", 'rb') as handle:
    samples = pickle.load(handle)

# Sanity check: within every split/fold the three parts must not share samples.
for split_id in range(tst_n_splits * tst_n_repeats):
    for fold_id in range(val_n_splits * val_n_repeats):
        ids_test = set(samples[split_id]['test'])
        ids_train = set(samples[split_id]['trains'][fold_id])
        ids_validation = set(samples[split_id]['validations'][fold_id])
        overlaps = (
            ('train_validation', ids_train & ids_validation),
            ('validation_test', ids_validation & ids_test),
            ('train_test', ids_train & ids_test),
        )
        for intxn_name, intxn_samples in overlaps:
            if intxn_samples:
                print(f"Non-zero {intxn_name} intersection ({len(intxn_samples)}) for {split_id} Split and {fold_id} Fold!")

# The partition used for the comparison.
split_dict = samples[tst_split_id]
parts_ids = {
    'train': split_dict['trains'][val_fold_id],
    'validation': split_dict['validations'][val_fold_id],
    'test': split_dict['test'],
}
# (short name in the output sheet, name in df_models, metric function)
metrics_fns = (
    ('mae', 'mean_absolute_error', mean_absolute_error),
    ('rho', 'pearson_corrcoef', pearson_corrcoef),
)

# "before": metrics stored in df_models; "after": metrics recomputed from the
# predictions in df against the real values in each marker's data.xlsx.
df_models_check = pd.DataFrame(index=feats_imm)
for imm in (pbar := tqdm(feats_imm)):
    pbar.set_description(f"Processing {imm}")
    data_imm = pd.read_excel(f"{path_imm}/{imm}/data.xlsx", index_col=0)
    col_log = f"{imm}_log"

    y_real = {part: torch.from_numpy(data_imm.loc[ids, col_log].values) for part, ids in parts_ids.items()}
    y_pred = {part: torch.from_numpy(df.loc[ids, col_log].values) for part, ids in parts_ids.items()}

    for metric_short, metric_long, metric_fn in metrics_fns:
        for part in parts_ids:
            df_models_check.at[imm, f"{part}_{metric_short}_before"] = df_models.at[imm, f"{part}_{metric_long}"]
        for part in parts_ids:
            df_models_check.at[imm, f"{part}_{metric_short}_after"] = metric_fn(y_pred[part], y_real[part]).numpy()

for metric_short, _, _ in metrics_fns:
    for part in parts_ids:
        col_after = f"{part}_{metric_short}_after"
        col_before = f"{part}_{metric_short}_before"
        df_models_check[f"{part}_{metric_short}_diff"] = df_models_check[col_after] - df_models_check[col_before]

df_models_check.to_excel(f"{path_save}/models_check.xlsx")

# Plot immunomarkers results

In [None]:
# Best models first (by test correlation); this order is reused for plotting.
df_models.sort_values('test_pearson_corrcoef', ascending=False, inplace=True)

# Per-marker result tables; the "<marker>_log" column is renamed to "<marker>".
imm_results = {}
for imm, row in (pbar := tqdm(df_models.iterrows())):
    pbar.set_description(f"Processing {imm}")
    path_candidate = f"{path_imm}/{imm}/pytorch_tabular/candidates/{row['model']}/{row['directory']}"
    imm_result = pd.read_excel(f"{path_candidate}/df.xlsx", index_col=0)
    imm_results[imm] = imm_result.rename(columns={f"{imm}_log": imm})

In [None]:
# Grid layout: every marker takes three stacked axes (metrics table, scatter,
# empty spacer); 8 markers per row of triples, 4 rows of triples.
n_rows = 4 * 3
n_cols = 8
fig_height = 20
fig_width = 35

imm_colors = distinctipy.get_colors(n_colors=df_models.shape[0], exclude_colors=[mcolors.hex2color(mcolors.CSS4_COLORS['gray'])], rng=42)

sns.set_theme(style='ticks')
fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), height_ratios=[0.2, 0.8, 0.2]*4, gridspec_kw={'wspace':0.35, 'hspace': 0.05}, sharey=False, sharex=False)

rho_label = r"Pearson $\mathbf{\rho}$"
# Row label in the metrics table -> metric suffix of the df_models columns.
metric_rows = {'MAE': 'mean_absolute_error', rho_label: 'pearson_corrcoef'}
parts = ['Train', 'Validation', 'Test']

for imm_id, imm in tqdm(enumerate(df_models.index.values)):
    imm_color = imm_colors[imm_id]
    imm_result = imm_results[imm]
    row_id, col_id = divmod(imm_id, n_cols)
    ax_table = axs[row_id * 3, col_id]
    ax_scatter = axs[row_id * 3 + 1, col_id]
    ax_empty = axs[row_id * 3 + 2, col_id]

    # Axis limits: 1st..99th percentile of the real values.
    q01 = imm_result[imm].quantile(0.01)
    q99 = imm_result[imm].quantile(0.99)

    # Metrics table, values pre-formatted as strings.
    df_metrics = pd.DataFrame(index=['MAE', rho_label], columns=parts)
    for metric_row, metric_suffix in metric_rows.items():
        for part in parts:
            metric_value = df_models.at[imm, f"{part.lower()}_{metric_suffix}"]
            df_metrics.at[metric_row, part] = f"{metric_value:0.3f}"

    col_defs = [
        ColumnDefinition(
            name="index",
            title=imm,
            textprops={"ha": "center", "weight": "bold"},
            width=2.5,
        )
    ]
    for part in parts:
        # Only the first value column gets a left border.
        border_kw = {"border": "left"} if part == 'Train' else {}
        col_defs.append(
            ColumnDefinition(
                name=part,
                textprops={"ha": "left"},
                width=1.5,
                **border_kw,
            )
        )

    table = Table(
        df_metrics,
        column_definitions=col_defs,
        row_dividers=True,
        footer_divider=False,
        ax=ax_table,
        textprops={"fontsize": 8},
        row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
        col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
        column_border_kw={"linewidth": 1, "linestyle": "-"},
    ).autoset_fontcolors(colnames=['Train', 'Validation', 'Test'])

    # Non-test samples as a gray density, test samples as coloured points.
    is_test = imm_result['Group'] == 'Test'
    kdeplot = sns.kdeplot(
        data=imm_result.loc[~is_test, :],
        x=imm,
        y='Prediction',
        fill=True,
        cbar=False,
        color='gray',
        cut=0,
        legend=False,
        ax=ax_scatter
    )
    scatter = sns.scatterplot(
        data=imm_result.loc[is_test, :],
        x=imm,
        y="Prediction",
        linewidth=0.5,
        alpha=0.8,
        edgecolor="k",
        s=35,
        color=imm_color,
        ax=ax_scatter,
    )
    ax_scatter.axline((0, 0), slope=1, color="black", linestyle=":")
    ax_scatter.set_xlim(q01, q99)
    ax_scatter.set_ylim(q01, q99)
    ax_scatter.set_xlabel(imm, color=imm_color, path_effects=[pe.withStroke(linewidth=1.0, foreground="black")])

    ax_empty.axis('off')

fig.tight_layout()
for fig_ext, fig_kw in (('png', {'dpi': 200}), ('pdf', {})):
    fig.savefig(f"{path_save}/immuno.{fig_ext}", bbox_inches='tight', **fig_kw)
plt.close(fig)

# Epigenetic clocks statistics

In [None]:
# Per-sample table and the table of group comparisons ("sections").
data = pd.read_excel(f"{path_save}/data_filtered.xlsx", index_col='ID')
df_groups = pd.read_excel(f"{path_epi}/groups.xlsx", index_col=0)

icd_chpts = np.sort(df_groups['ICD-11 chapter'].unique())
icd_codes = np.sort(df_groups['ICD-11 code'].unique())

# Column order: every chapter column is followed by the columns of its codes.
icd_cols = []
for icd_chpt in icd_chpts:
    in_chpt = df_groups['ICD-11 chapter'] == icd_chpt
    codes_in_chpt = np.sort(df_groups.loc[in_chpt, 'ICD-11 code'].unique())
    icd_cols.append(f'Passed\nICD-11\nChapter {icd_chpt}')
    icd_cols.extend(f"Passed\nICD-11\nCode {code_in_chpt}" for code_in_chpt in codes_in_chpt)
icd_cols_max = ["Max\n" + icd_col for icd_col in icd_cols]

# One distinct colour per chapter (black and white excluded), and for each a
# colormap running from a washed-out version of the colour to the colour itself.
color_black = mcolors.hex2color(mcolors.CSS4_COLORS['black'])
color_white = mcolors.hex2color(mcolors.CSS4_COLORS['white'])
colors = distinctipy.get_colors(len(icd_chpts), [color_black, color_white], rng=1337, pastel_factor=0.5)
colors_icd_chpts = dict(zip(icd_chpts, colors))
colormaps_icd_chpts = {}
for icd_chpt, color_chpt in colors_icd_chpts.items():
    colormaps_icd_chpts[icd_chpt] = LinearSegmentedColormap.from_list(
        name=f"ICD-11 Chapter {icd_chpt} cmap",
        colors=[make_rgb_transparent(color_chpt, (1, 1, 1), 0.2), color_chpt],
        N=256,
    )

# Colormap for the "total" column.
colormap_total = LinearSegmentedColormap.from_list(
    name="ICD-11 Total cmap",
    colors=[
        mcolors.hex2color(mcolors.CSS4_COLORS['lavender']),
        mcolors.hex2color(mcolors.CSS4_COLORS['dimgray']),
    ],
    N=256,
)

In [None]:
# Clock meta table; df_clocks collects the summary, df_clocks_tests the raw
# per-comparison p-values and group means.
df_clocks = pd.read_excel(f"{path_epi}/clocks_meta.xlsx", index_col=0)
df_clocks_tests = df_clocks.copy()
new_cols = ['Total Rho', 'Total MAE', 'Passed\nICD-11\nTotal'] + icd_cols + ['Max\nPassed\nICD-11\nTotal'] + icd_cols_max
for col in new_cols:
    df_clocks[col] = None

# Parse the literal-encoded cells of the groups table once, instead of
# re-evaluating them for every clock in every pass.
sections = [
    {
        'id': section_id,
        'gse': section_row['GSE'],
        'chapter': section_row['ICD-11 chapter'],
        'code': section_row['ICD-11 code'],
        'statuses': ast.literal_eval(section_row['Statuses']),
        'groups': ast.literal_eval(section_row['Groups']),
        'directions': ast.literal_eval(section_row['Directions']),
    }
    for section_id, section_row in df_groups.iterrows()
]
sections_by_chpt = {icd_chpt: [s for s in sections if s['chapter'] == icd_chpt] for icd_chpt in icd_chpts}
sections_by_code = {icd_code: [s for s in sections if s['code'] == icd_code] for icd_code in icd_codes}


def _count_passed_clock_tests(clock_name, sections_subset):
    """Count comparisons of one clock that are significant in the expected direction.

    Reads the (corrected) p-values and group means from ``df_clocks_tests``.

    Returns:
        Tuple ``(n_passed, n_total)`` over all group pairs of ``sections_subset``.
    """
    n_passed = 0
    n_total = 0
    for section in sections_subset:
        for section_group_id, section_group in enumerate(section['groups']):
            n_total += 1
            test_key = f"{section['id']}\n{section_group}"
            pval = df_clocks_tests.at[clock_name, f"pval\n{test_key}"]
            bias_0 = df_clocks_tests.at[clock_name, f"bias_0\n{test_key}"]
            bias_1 = df_clocks_tests.at[clock_name, f"bias_1\n{test_key}"]
            group_direction = section['directions'][section_group_id]
            if pval >= 0.05:
                continue
            if group_direction == 'Increasing' and bias_1 > bias_0:
                n_passed += 1
            elif group_direction == 'Decreasing' and bias_1 < bias_0:
                n_passed += 1
    return n_passed, n_total


# Pass 1: Mann-Whitney U test and group means for every clock and group pair.
for clock_name in (pbar := tqdm(df_clocks.index)):
    pbar.set_description(f"Processing {clock_name}")

    # Age clocks are compared through their error (prediction - age),
    # all other clocks through their raw value.
    if df_clocks.at[clock_name, 'Type'] == 'Age':
        data['Error'] = data[clock_name] - data['Age']
    else:
        data['Error'] = data[clock_name]

    for section in sections:
        in_section = (data['GSE'] == section['gse']) & (data['Status'].isin(section['statuses']))
        df_section = data.loc[in_section, ['Status', 'Error']]

        for section_group in section['groups']:
            errors_0 = df_section.loc[df_section['Status'] == section_group[0], 'Error']
            errors_1 = df_section.loc[df_section['Status'] == section_group[1], 'Error']
            _, pval = mannwhitneyu(errors_0.values, errors_1.values, alternative="two-sided")

            test_key = f"{section['id']}\n{section_group}"
            df_clocks_tests.at[clock_name, f"pval\n{test_key}"] = pval
            df_clocks_tests.at[clock_name, f"bias_0\n{test_key}"] = np.mean(errors_0)
            df_clocks_tests.at[clock_name, f"bias_1\n{test_key}"] = np.mean(errors_1)
df_clocks_tests.to_excel(f"{path_epi}/clocks_tests_raw_before_correction.xlsx", index_label='Model ID')

# Benjamini-Hochberg correction, separately for every clock, over all of its
# comparisons. Columns are selected by prefix: a substring test would also pick
# up a bias_* column whose group label happens to contain "pval".
pvals_cols = [col for col in df_clocks_tests.columns if str(col).startswith('pval\n')]
for clock_name in (pbar := tqdm(df_clocks.index)):
    pvals_corrected = multipletests(df_clocks_tests.loc[clock_name, pvals_cols], 0.05, method='fdr_bh')[1]
    df_clocks_tests.loc[clock_name, pvals_cols] = pvals_corrected
df_clocks_tests.to_excel(f"{path_epi}/clocks_tests_raw_after_correction.xlsx", index_label='Model ID')

# Pass 2: summary per clock.
for clock_name in (pbar := tqdm(df_clocks.index)):
    pbar.set_description(f"Processing {clock_name}")

    # Accuracy against chronological age is computed on controls only and only
    # for clocks of type 'Age'.
    if df_clocks.at[clock_name, 'Type'] == 'Age':
        is_control = data['Status'] == 'Control'
        real_all = torch.from_numpy(data.loc[is_control, 'Age'].values)
        pred_all = torch.from_numpy(data.loc[is_control, clock_name].values)
        df_clocks.at[clock_name, 'Total Rho'] = pearson_corrcoef(pred_all, real_all).numpy()
        df_clocks.at[clock_name, 'Total MAE'] = mean_absolute_error(pred_all, real_all).numpy()

    # Passed / maximum number of comparisons per chapter, and their totals.
    passed_total = 0
    max_total = 0
    for icd_chpt in icd_chpts:
        n_passed, n_max = _count_passed_clock_tests(clock_name, sections_by_chpt[icd_chpt])
        df_clocks.at[clock_name, f'Passed\nICD-11\nChapter {icd_chpt}'] = n_passed
        df_clocks.at[clock_name, f'Max\nPassed\nICD-11\nChapter {icd_chpt}'] = n_max
        passed_total += n_passed
        max_total += n_max
    df_clocks.at[clock_name, 'Passed\nICD-11\nTotal'] = passed_total
    df_clocks.at[clock_name, 'Max\nPassed\nICD-11\nTotal'] = max_total

    # The same counts per ICD-11 code.
    for icd_code in icd_codes:
        n_passed, n_max = _count_passed_clock_tests(clock_name, sections_by_code[icd_code])
        df_clocks.at[clock_name, f'Passed\nICD-11\nCode {icd_code}'] = n_passed
        df_clocks.at[clock_name, f'Max\nPassed\nICD-11\nCode {icd_code}'] = n_max

df_clocks.to_excel(f"{path_epi}/clocks_tests.xlsx", index_label='Model ID')

In [None]:
def form_bar(base):
    """Build a formatter that renders a fraction of ``base`` as ``"<count>/<base>"``.

    Args:
        base: Denominator; the fraction is multiplied by it and rounded.

    Returns:
        Function mapping a fraction ``x`` to the string ``"<round(x * base)>/<base>"``.
    """
    def formatter(x):
        count = int(round(x * base))
        return f'{count}/{base}'
    return formatter

df_clocks = pd.read_excel(f"{path_epi}/clocks_tests.xlsx", index_col="Clock Name")

# Turn the "passed" counts into fractions of the maximum; the maxima are read
# from the 'Hannum' row.
ref_clock = 'Hannum'
max_passed_total = df_clocks.at[ref_clock, 'Max\nPassed\nICD-11\nTotal']
df_clocks["Passed\nICD-11\nTotal"] /= max_passed_total
for col in icd_cols:
    df_clocks[col] /= df_clocks.at[ref_clock, f'Max\n{col}']

# Keyword arguments shared by all bar columns.
bar_kw_shared = {"plot_bg_bar": True, "annotate": True, "height": 0.95}

col_names_common = ["Year", "Total Rho", "Total MAE", "Passed\nICD-11\nTotal"]
col_def_name = ColumnDefinition(
    name="Clock Name",
    title="Clocks",
    textprops={"ha": "right", "weight": "bold"},
    width=2.0,
)
col_def_year = ColumnDefinition(
    name="Year",
    title="Year",
    textprops={"ha": "center"},
    width=1.0,
    border="left"
)
col_def_rho = ColumnDefinition(
    name="Total Rho",
    title="Total\nRho",
    textprops={"ha": "center"},
    formatter="{:.3f}",
    cmap=normed_cmap(df_clocks["Total Rho"].dropna(), cmap=matplotlib.cm.Greens, num_stds=2.5),
    width=1.0,
    border="left"
)
col_def_mae = ColumnDefinition(
    name="Total MAE",
    title="Total\nMAE",
    textprops={"ha": "center"},
    formatter="{:.3f}",
    cmap=normed_cmap(df_clocks["Total MAE"].dropna(), cmap=matplotlib.cm.Reds, num_stds=2.5),
    width=1.0,
)
col_def_total = ColumnDefinition(
    name="Passed\nICD-11\nTotal",
    title="Passed\nICD-11",
    width=1.5,
    border="left",
    textprops={"ha": "center"},
    plot_fn=bar,
    plot_kw={
        "cmap": colormap_total,
        **bar_kw_shared,
        "linewidth": 0.5,
        "formatter": form_bar(max_passed_total),
    },
)
col_defs_common = [col_def_name, col_def_year, col_def_rho, col_def_mae, col_def_total]

# One bar column per ICD-11 chapter, appended to copies of the common part.
icd_chpt_col_defs = copy.deepcopy(col_defs_common)
icd_chpt_col_names = copy.deepcopy(col_names_common)
for icd_chpt in icd_chpts:
    col_chpt = f'Passed\nICD-11\nChapter {icd_chpt}'
    max_passed = df_clocks.at[ref_clock, f'Max\n{col_chpt}']
    border = 'left' if icd_chpt == 1 else None
    icd_chpt_col_names.append(col_chpt)
    icd_chpt_col_defs.append(
        ColumnDefinition(
            name=col_chpt,
            title=f'Chapter {icd_chpt}',
            width=1.0,
            plot_fn=bar,
            border=border,
            textprops={"ha": "center"},
            plot_kw={
                "cmap": colormaps_icd_chpts[icd_chpt],
                **bar_kw_shared,
                "lw": 0.5,
                "formatter": form_bar(max_passed),
            },
        )
    )

fig, ax = plt.subplots(figsize=(25, 17))
table = Table(
    df_clocks[icd_chpt_col_names],
    column_definitions=icd_chpt_col_defs,
    row_dividers=True,
    footer_divider=False,
    odd_row_color="#ffffff",
    even_row_color="#f0f0f0",
    ax=ax,
    row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
    col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
    column_border_kw={"linewidth": 1, "linestyle": "-"},
).autoset_fontcolors(colnames=icd_chpt_col_names)
for fig_ext, fig_kw in (('png', {'dpi': 200}), ('pdf', {})):
    fig.savefig(f"{path_epi}/clocks_tests.{fig_ext}", bbox_inches='tight', **fig_kw)
plt.close(fig)

# Immunomarkers tests

In [None]:
# Per-sample table and the table of group comparisons ("sections").
data = pd.read_excel(f"{path_save}/data_filtered.xlsx", index_col='ID')
df_groups = pd.read_excel(f"{path_epi}/groups.xlsx", index_col=0)

icd_chpts = np.sort(df_groups['ICD-11 chapter'].unique())
icd_codes = np.sort(df_groups['ICD-11 code'].unique())

# Column order: every chapter column is followed by the columns of its codes.
icd_cols = []
for icd_chpt in icd_chpts:
    in_chpt = df_groups['ICD-11 chapter'] == icd_chpt
    codes_in_chpt = np.sort(df_groups.loc[in_chpt, 'ICD-11 code'].unique())
    icd_cols.append(f'Passed\nICD-11\nChapter {icd_chpt}')
    icd_cols.extend(f"Passed\nICD-11\nCode {code_in_chpt}" for code_in_chpt in codes_in_chpt)
icd_cols_max = ["Max\n" + icd_col for icd_col in icd_cols]

# One distinct colour per chapter (black and white excluded), and for each a
# colormap running from a washed-out version of the colour to the colour itself.
color_black = mcolors.hex2color(mcolors.CSS4_COLORS['black'])
color_white = mcolors.hex2color(mcolors.CSS4_COLORS['white'])
colors = distinctipy.get_colors(len(icd_chpts), [color_black, color_white], rng=1337, pastel_factor=0.5)
colors_icd_chpts = dict(zip(icd_chpts, colors))
colormaps_icd_chpts = {}
for icd_chpt, color_chpt in colors_icd_chpts.items():
    colormaps_icd_chpts[icd_chpt] = LinearSegmentedColormap.from_list(
        name=f"ICD-11 Chapter {icd_chpt} cmap",
        colors=[make_rgb_transparent(color_chpt, (1, 1, 1), 0.2), color_chpt],
        N=256,
    )

# Colormap for the "total" column.
colormap_total = LinearSegmentedColormap.from_list(
    name="ICD-11 Total cmap",
    colors=[
        mcolors.hex2color(mcolors.CSS4_COLORS['lavender']),
        mcolors.hex2color(mcolors.CSS4_COLORS['dimgray']),
    ],
    N=256,
)

In [None]:
# df_immunomarkers collects the summary, df_immunomarkers_tests the raw
# per-comparison p-values and group means.
feats_selected = pd.read_excel(f"{path_save}/feats.xlsx", index_col=0).index.values.tolist()
df_immunomarkers = pd.DataFrame(index=feats_selected)
df_immunomarkers_tests = df_immunomarkers.copy()
new_cols = ['Passed\nICD-11\nTotal'] + icd_cols + ['Max\nPassed\nICD-11\nTotal'] + icd_cols_max
for col in new_cols:
    df_immunomarkers[col] = None

# Parse the literal-encoded cells of the groups table once, instead of
# re-evaluating them for every immunomarker in every pass.
sections = [
    {
        'id': section_id,
        'gse': section_row['GSE'],
        'chapter': section_row['ICD-11 chapter'],
        'code': section_row['ICD-11 code'],
        'statuses': ast.literal_eval(section_row['Statuses']),
        'groups': ast.literal_eval(section_row['Groups']),
    }
    for section_id, section_row in df_groups.iterrows()
]
sections_by_chpt = {icd_chpt: [s for s in sections if s['chapter'] == icd_chpt] for icd_chpt in icd_chpts}
sections_by_code = {icd_code: [s for s in sections if s['code'] == icd_code] for icd_code in icd_codes}


def _count_significant_tests(feat, sections_subset):
    """Count the significant comparisons of one immunomarker.

    Unlike the clocks, only the (corrected) p-value from
    ``df_immunomarkers_tests`` is checked here; the direction of the
    difference is not.

    Returns:
        Tuple ``(n_passed, n_total)`` over all group pairs of ``sections_subset``.
    """
    n_passed = 0
    n_total = 0
    for section in sections_subset:
        for section_group in section['groups']:
            n_total += 1
            pval = df_immunomarkers_tests.at[feat, f"pval\n{section['id']}\n{section_group}"]
            if pval < 0.05:
                n_passed += 1
    return n_passed, n_total


# Pass 1: Mann-Whitney U test and group means of the log-levels.
for feat in (pbar := tqdm(feats_selected)):
    pbar.set_description(f"Processing {feat}")
    col_log = f"{feat}_log"

    for section in sections:
        in_section = (data['GSE'] == section['gse']) & (data['Status'].isin(section['statuses']))
        df_section = data.loc[in_section, ['Status', col_log]]

        for section_group in section['groups']:
            values_0 = df_section.loc[df_section['Status'] == section_group[0], col_log]
            values_1 = df_section.loc[df_section['Status'] == section_group[1], col_log]
            _, pval = mannwhitneyu(values_0.values, values_1.values, alternative="two-sided")

            test_key = f"{section['id']}\n{section_group}"
            df_immunomarkers_tests.at[feat, f"pval\n{test_key}"] = pval
            df_immunomarkers_tests.at[feat, f"bias_0\n{test_key}"] = np.mean(values_0)
            df_immunomarkers_tests.at[feat, f"bias_1\n{test_key}"] = np.mean(values_1)
df_immunomarkers_tests.to_excel(f"{path_save}/immunomarkers_tests_raw_before_correction.xlsx", index_label='Immunomarker')

# Benjamini-Hochberg correction, separately for every immunomarker, over all of
# its comparisons. Columns are selected by prefix: a substring test would also
# pick up a bias_* column whose group label happens to contain "pval".
pvals_cols = [col for col in df_immunomarkers_tests.columns if str(col).startswith('pval\n')]
for feat in (pbar := tqdm(feats_selected)):
    pvals_corrected = multipletests(df_immunomarkers_tests.loc[feat, pvals_cols], 0.05, method='fdr_bh')[1]
    df_immunomarkers_tests.loc[feat, pvals_cols] = pvals_corrected
df_immunomarkers_tests.to_excel(f"{path_save}/immunomarkers_tests_raw_after_correction.xlsx", index_label='Immunomarker')

# Pass 2: summary per immunomarker.
for feat in (pbar := tqdm(feats_selected)):
    pbar.set_description(f"Processing {feat}")

    # Passed / maximum number of comparisons per chapter, and their totals.
    passed_total = 0
    max_total = 0
    for icd_chpt in icd_chpts:
        n_passed, n_max = _count_significant_tests(feat, sections_by_chpt[icd_chpt])
        df_immunomarkers.at[feat, f'Passed\nICD-11\nChapter {icd_chpt}'] = n_passed
        df_immunomarkers.at[feat, f'Max\nPassed\nICD-11\nChapter {icd_chpt}'] = n_max
        passed_total += n_passed
        max_total += n_max
    df_immunomarkers.at[feat, 'Passed\nICD-11\nTotal'] = passed_total
    df_immunomarkers.at[feat, 'Max\nPassed\nICD-11\nTotal'] = max_total

    # The same counts per ICD-11 code.
    for icd_code in icd_codes:
        n_passed, n_max = _count_significant_tests(feat, sections_by_code[icd_code])
        df_immunomarkers.at[feat, f'Passed\nICD-11\nCode {icd_code}'] = n_passed
        df_immunomarkers.at[feat, f'Max\nPassed\nICD-11\nCode {icd_code}'] = n_max

df_immunomarkers.to_excel(f"{path_save}/immunomarkers_tests.xlsx", index_label='Immunomarker')

In [None]:
def form_bar(base):
    """Build a formatter that renders a fraction of ``base`` as ``'k/base'``.

    Args:
        base: Total count that the fraction refers to; it becomes the
            denominator of the label. Integer-valued floats such as ``12.0``
            are shown as ``12``.

    Returns:
        A callable that maps a fraction ``x`` to the string
        ``'<round(x * base)>/<base>'``.
    """
    # A total read back from a spreadsheet or produced by column arithmetic may
    # arrive as a float; without this the label would read e.g. '5/12.0'.
    # Non-integer (or NaN) totals are left untouched.
    total = int(base) if float(base).is_integer() else base

    def formatter(x):
        return f'{int(round(x * base))}/{total}'
    return formatter

# Reload the saved per-immunomarker pass counts.
df_immunomarkers = pd.read_excel(f"{path_save}/immunomarkers_tests.xlsx", index_col=0)

# Each "Passed ..." column is converted to a fraction by dividing it by the
# matching "Max ..." value taken from the reference row below.
ref_marker = 'CXCL9'
total_col = "Passed\nICD-11\nTotal"
max_total_col = 'Max\nPassed\nICD-11\nTotal'

df_immunomarkers[total_col] /= df_immunomarkers.at[ref_marker, max_total_col]
for col in icd_cols:
    df_immunomarkers[col] /= df_immunomarkers.at[ref_marker, f'Max\n{col}']
df_immunomarkers.index.name = 'Immunomarker'

# Columns shared by every table variant: the row label and the overall score bar.
col_names_common = [total_col]
immunomarker_col_def = ColumnDefinition(
    name="Immunomarker",
    title="Immunomarker",
    textprops={"ha": "right", "weight": "bold"},
    width=1.5,
)
total_bar_kw = {
    "cmap": colormap_total,
    "plot_bg_bar": True,
    "annotate": True,
    "height": 0.95,
    "linewidth": 0.5,
    # Bars hold fractions; the formatter prints them back as "passed/max".
    "formatter": form_bar(df_immunomarkers.at[ref_marker, max_total_col]),
}
total_col_def = ColumnDefinition(
    name=total_col,
    title="Passed\nICD-11",
    width=1.5,
    border="left",
    textprops={"ha": "center"},
    plot_fn=bar,
    plot_kw=total_bar_kw,
)
col_defs_common = [immunomarker_col_def, total_col_def]

# Chapter table: the common columns plus one bar column per ICD-11 chapter.
icd_chpt_col_names = copy.deepcopy(col_names_common)
icd_chpt_col_defs = copy.deepcopy(col_defs_common)
for icd_chpt in icd_chpts:
    # Only chapter 1 gets a left border, separating the chapter columns from the total.
    border = 'left' if icd_chpt == 1 else None
    chpt_col = f'Passed\nICD-11\nChapter {icd_chpt}'
    max_passed = df_immunomarkers.at[ref_marker, f'Max\nPassed\nICD-11\nChapter {icd_chpt}']
    chpt_bar_kw = {
        "cmap": colormaps_icd_chpts[icd_chpt],
        "plot_bg_bar": True,
        "annotate": True,
        "height": 0.95,
        "lw": 0.5,
        "formatter": form_bar(max_passed),
    }
    col_def = ColumnDefinition(
        name=chpt_col,
        title=f'Chapter {icd_chpt}',
        width=1.0,
        plot_fn=bar,
        border=border,
        textprops={"ha": "center"},
        plot_kw=chpt_bar_kw,
    )
    icd_chpt_col_names.append(chpt_col)
    icd_chpt_col_defs.append(col_def)

# Render the table and export it as PNG and PDF.
fig, ax = plt.subplots(figsize=(22, 14))
table = Table(
    df_immunomarkers[icd_chpt_col_names],
    column_definitions=icd_chpt_col_defs,
    row_dividers=True,
    footer_divider=False,
    odd_row_color="#ffffff",
    even_row_color="#f0f0f0",
    ax=ax,
    row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
    col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
    column_border_kw={"linewidth": 1, "linestyle": "-"},
)
table = table.autoset_fontcolors(colnames=icd_chpt_col_names)
fig.savefig(f"{path_save}/immunomarkers_tests.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path_save}/immunomarkers_tests.pdf", bbox_inches='tight')
plt.close(fig)

# Datasets sunburst

In [None]:
# Load the filtered data table (indexed by its 'ID' column) and the groups table.
data = pd.read_excel(f"{path_save}/data_filtered.xlsx", index_col='ID')
df_groups = pd.read_excel(f"{path_epi}/groups.xlsx", index_col=0)

# Sorted unique ICD-11 chapters and codes present in the groups table.
icd_chpts = np.sort(df_groups['ICD-11 chapter'].unique())
icd_codes = np.sort(df_groups['ICD-11 code'].unique())

# Black and white are excluded from the generated palettes.
black_rgb = mcolors.hex2color(mcolors.CSS4_COLORS['black'])
white_rgb = mcolors.hex2color(mcolors.CSS4_COLORS['white'])

# One pastel base color per chapter.
colors = distinctipy.get_colors(len(icd_chpts), [black_rgb, white_rgb], rng=1337, pastel_factor=0.5)
colors_icd_chpts = dict(zip(icd_chpts, colors))

# Per-chapter colormap running between two blends of the chapter color with white
# (blend weights 0.3 and 0.8 for the chapter color).
colormaps_icd_chpts = {}
for chpt in icd_chpts:
    chpt_rgb = colors_icd_chpts[chpt]
    cmap_start = make_rgb_transparent(chpt_rgb, (1, 1, 1), 0.3)
    cmap_end = make_rgb_transparent(chpt_rgb, (1, 1, 1), 0.8)
    colormaps_icd_chpts[chpt] = LinearSegmentedColormap.from_list(
        name=f"ICD-11 Chapter {chpt} cmap",
        colors=[cmap_start, cmap_end],
        N=256
    )

# One color per dataset, keyed by the values of the 'GSE' column
# (ordered as returned by value_counts).
gse_count = data['GSE'].value_counts()
gses = gse_count.index.values
colors = distinctipy.get_colors(len(gses), [white_rgb, black_rgb], rng=1337)
colors_gse = dict(zip(gses, colors))

In [None]:
# Parallel lists describing the sunburst nodes: entry i of each list belongs to node i.
sb_labels = []
sb_texts = []
sb_parents = []
sb_values = []
sb_colors = []


def _add_sb_node(label, text, parent, value, color):
    """Append one node (label, display text, parent label, value, color) to the sunburst lists."""
    sb_labels.append(label)
    sb_texts.append(text)
    sb_parents.append(parent)
    sb_values.append(value)
    sb_colors.append(color)


# Datasets with at least one non-control sample versus datasets with controls only.
gses_controls_cases = data.loc[data['Status'] != 'Control', 'GSE'].value_counts().index.values
# Filter `gses` instead of using a set difference so the sector order is the same
# on every run (set iteration order for strings is not stable between sessions).
gses_with_cases = set(gses_controls_cases)
gses_controls_only = [gse for gse in gses if gse not in gses_with_cases]

_add_sb_node(
    'Controls Only',
    f'Controls Only<br>{len(gses_controls_only)}',
    'Total Datasets',
    len(gses_controls_only),
    'dodgerblue',
)
for gse in gses_controls_only:
    _add_sb_node(gse, gse, 'Controls Only', 1, mcolors.to_hex(colors_gse[gse], keep_alpha=True))

# Hierarchy for datasets with cases: chapter -> ICD-11 code -> dataset.
n_gses_icd_chpts = 0
for icd_chpt in icd_chpts:
    icd_codes_for_chpt = np.sort(df_groups.loc[df_groups['ICD-11 chapter'] == icd_chpt, 'ICD-11 code'].unique())

    n_gses_icd_codes = 0
    for icd_code_id, icd_code in enumerate(icd_codes_for_chpt):
        code_mask = (data['ICD-11 chapter'] == icd_chpt) & (data['ICD-11 code'] == icd_code)
        gses_icd_code = data.loc[code_mask, 'GSE'].value_counts().index.values

        for gse in gses_icd_code:
            # The label is prefixed with the code so that a dataset appearing under
            # several codes still yields distinct node labels.
            _add_sb_node(
                f"{icd_code} {gse}",
                gse,
                icd_code,
                1,
                mcolors.to_hex(colors_gse[gse], keep_alpha=True),
            )
        n_gses_icd_codes += len(gses_icd_code)

        # Codes of one chapter are spread along that chapter's colormap.
        color_icd_code = colormaps_icd_chpts[icd_chpt](icd_code_id / len(icd_codes_for_chpt))
        _add_sb_node(
            icd_code,
            icd_code,
            f'Chapter {icd_chpt}',
            len(gses_icd_code),
            mcolors.to_hex(color_icd_code, keep_alpha=True),
        )

    n_gses_icd_chpts += n_gses_icd_codes

    _add_sb_node(
        f'Chapter {icd_chpt}',
        f'Chapter {icd_chpt}',
        'Controls and Cases',
        n_gses_icd_codes,
        mcolors.to_hex(colors_icd_chpts[icd_chpt], keep_alpha=True),
    )

# NOTE(review): the text shows the number of distinct datasets, while the value is the
# sum over codes (a dataset listed under several codes is counted once per code) -
# confirm this difference is intended.
_add_sb_node(
    'Controls and Cases',
    f'Controls and Cases<br>{len(gses_controls_cases)}',
    'Total Datasets',
    n_gses_icd_chpts,
    'firebrick',
)

# Root node. The color name was misspelled as 'gainsbro', which is not a CSS color.
_add_sb_node(
    'Total Datasets',
    f'Total Datasets<br>{len(gses_controls_cases) + len(gses_controls_only)}',
    '',
    n_gses_icd_chpts + len(gses_controls_only),
    'gainsboro',
)

# Draw all collected nodes as a single sunburst trace. The text shown in each
# sector comes from sb_texts rather than from the node labels.
sunburst_trace = go.Sunburst(
    labels=sb_labels,
    parents=sb_parents,
    values=sb_values,
    text=sb_texts,
    branchvalues="total",
    textinfo="text",
    marker={"colors": sb_colors},
)
fig = go.Figure()
fig.add_trace(sunburst_trace)

# Show the figure, then export it as PNG and PDF.
fig.show()
fig.write_image(f"{path_epi}/sunburst.png")
fig.write_image(f"{path_epi}/sunburst.pdf", format="pdf")