# Debugging autoreload

In [None]:
%load_ext autoreload
%autoreload 2

# Load packages

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import plotly.express as px
import statsmodels.formula.api as smf
import plotly.graph_objects as go
from scripts.python.routines.manifest import get_manifest
from scripts.python.routines.plot.save import save_figure
from scripts.python.routines.plot.layout import add_layout
from statsmodels.stats.multitest import multipletests
import plotly.io as pio
pio.kaleido.scope.mathjax = None
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=False)
from scipy.stats import mannwhitneyu, median_test, kruskal, wilcoxon, friedmanchisquare
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.patheffects as path_effects
import random
import pathlib
from tqdm import tqdm
from src.utils.plot.bioinfokit import mhat, volcano
import gseapy as gp
import mygene
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA, TruncatedSVD
from sklearn.decomposition import MiniBatchDictionaryLearning, FastICA
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.manifold import MDS, Isomap, TSNE, LocallyLinearEmbedding
import upsetplot
from matplotlib_venn import venn2, venn2_circles
from itertools import chain
from sklearn.metrics import mean_absolute_error
from scripts.python.routines.plot.colorscales import get_continuous_color
from impyute.imputation.cs import fast_knn
import plotly
import torch
from scripts.python.routines.plot.p_value import add_p_value_annotation
from scripts.python.routines.sections import get_sections
from src.models.tabular.widedeep.ft_transformer import WDFTTransformerModel
from statannotations.Annotator import Annotator
import functools
import matplotlib.lines as mlines
import patchworklib as pw
import warnings
warnings.filterwarnings("ignore", ".*will save all targets and predictions in the buffer. For large datasets, this may lead to large memory footprint.*")


def conjunction(conditions):
    """Fold `conditions` left to right with an element-wise logical AND.

    A single condition is returned unchanged; an empty iterable raises
    ``TypeError`` (from ``functools.reduce``).
    """
    return functools.reduce(lambda combined, cond: np.logical_and(combined, cond), conditions)


def disjunction(conditions):
    """Fold `conditions` left to right with an element-wise logical OR.

    A single condition is returned unchanged; an empty iterable raises
    ``TypeError`` (from ``functools.reduce``).
    """
    return functools.reduce(lambda combined, cond: np.logical_or(combined, cond), conditions)

# Update original data with new data from Mirny

## Load original data

In [None]:
# Dataset root and the output folder of this notebook.
path = "D:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN"
path_save = f"{path}/special/061_new_imm_data"
pathlib.Path(path_save).mkdir(parents=True, exist_ok=True)

# Papers
df_imm_fimmu = pd.read_excel(f"{path}/data/immuno/models/SImAge/data.xlsx", index_col="sample_id")
df_imm_geroscience = pd.read_excel(f"{path}/data/immuno/models/IPAge/11357_2022_540_MOESM12_ESM.xlsx", index_col=0, skiprows=1)
df_epi_clinepi = pd.read_excel(f"{path}/data/GSE234461/samples.xlsx", index_col=0)

df_ori = pd.read_excel(f"{path}/data/immuno/df_samples(all_1052_121222)_proc(raw)_imp(fast_knn)_replace(quarter).xlsx", index_col=0)
# Number of samples per subject. The column is named explicitly because
# `value_counts().to_frame()` calls it "count" in pandas >= 2.0, which would
# break the 'Subject ID' lookup below.
df_ld_imm_ori = df_ori['Subject ID'].value_counts().to_frame(name='Subject ID')
# A subject is longitudinal when it contributes more than one sample.
df_ori['Is longitudinal?'] = False
df_ori.loc[df_ori['Subject ID'].isin(df_ld_imm_ori.index[df_ld_imm_ori['Subject ID'] > 1].values), 'Is longitudinal?'] = True
# Assign the mapped column directly: `inplace=True` on a column selection is
# chained assignment and is not guaranteed to write back into df_ori.
df_ori['Time'] = df_ori['Sample_Chronology'].replace({0: 'T0', 1: 'T1', 2: 'T2', 3: 'T3'})
# For samples present in each paper's table, copy that table's identifier column.
df_ori.loc[df_imm_fimmu.index.values, 'PMC10485620 ID'] = df_imm_fimmu.loc[df_imm_fimmu.index.values, 'index']
df_ori.loc[df_imm_geroscience.index.values, 'PMC9135940 ID'] = df_imm_geroscience.loc[df_imm_geroscience.index.values, 'ID_Origin']
df_ori.loc[df_epi_clinepi.index.values, 'PMC10699032 ID'] = df_epi_clinepi.loc[df_epi_clinepi.index.values, 'GSM']

# Feature lists are stored as the index of their workbooks.
feats = pd.read_excel(f"{path}/data/immuno/feats_con.xlsx", index_col=0).index.values
feats_fimmu = pd.read_excel(f"{path}/data/immuno/models/SImAge/feats_con_top10.xlsx", index_col=0).index.values
feats_slctd = pd.read_excel(f"{path}/special/059_imm_data_selection/feats_selected.xlsx", index_col=0).index.values

## Load SImAge

In [None]:
# Restore the SImAge model from its checkpoint, then call eval() and freeze() on it.
simage_ckpt_path = "D:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN/data/immuno/models/SImAge/best_fold_0002.ckpt"
model_simage = WDFTTransformerModel.load_from_checkpoint(checkpoint_path=simage_ckpt_path)
model_simage.eval()
model_simage.freeze()

## Get original dataframe with nans

In [None]:
# Workbook names (without the .xlsx extension) read from data/immuno/files/processed.
files = [
    "Aging L, Q, H, I",
    "Aging-Covid_05.01.2022",
    "Aging-Covid-05.05.22",
    "Covid_results_02_2021",
    "Covid-25.11.20",
    "MULTIPLEX_20_11_2020_ AGING",
    "Yakutiya + TR",
    "Мультиплекс_Agind&Covid",
]
# Lookup from the 'immuno_marker' column to the 'gene' column of the marker table.
df_imm_genes = pd.read_excel(f"{path}/data/immuno/immuno_markers_genes.xlsx")
dict_imm_genes = {
    marker: gene
    for marker, gene in zip(df_imm_genes['immuno_marker'], df_imm_genes['gene'])
}

dfs_files = []
# Per feature: the set of raw cell values that start with '<' or '>'.
nans_by_features = {}
for file in files:
    df_file = pd.read_excel(f"{path}/data/immuno/files/processed/{file}.xlsx", index_col="Sample")
    df_file.rename(columns=dict_imm_genes, inplace=True)
    df_file = df_file.loc[:, feats]

    # duplicates processing
    if file == "MULTIPLEX_20_11_2020_ AGING":
        # Keep the rows whose index occurs once, plus the rows flagged by
        # duplicated(keep='last') for the repeated indices.
        df_file_doubled_unique = df_file.loc[~df_file.index.duplicated(keep=False), :]
        df_file_doubled_2 = df_file.loc[df_file.index.duplicated(keep='last'), :]
        df_file_duplicates_final = pd.concat([df_file_doubled_2, df_file_doubled_unique], axis=0)
        df_file = df_file_duplicates_final
    # Report any index labels that are still duplicated in this workbook.
    df_file_duplicates = df_file.loc[df_file.index.duplicated(keep=False), :]
    if df_file_duplicates.shape[0] > 0:
        print(df_file_duplicates.index)

    for feat in df_file:
        # Non-capturing group: same matches as '^([<>].*)$' without the pandas
        # "pattern has match groups" warning.
        is_censored = df_file[feat].astype(str).str.contains(r'^(?:[<>].*)$', regex=True)
        nan_vals = set(df_file.loc[is_censored, feat].values)
        if nan_vals:
            nans_by_features.setdefault(feat, set()).update(nan_vals)

    dfs_files.append(df_file)

# Stack all workbooks and align the rows with df_ori.
df_ori_w_nans = pd.concat(dfs_files, verify_integrity=False)
df_ori_w_nans.index = df_ori_w_nans.index.map(str)
df_ori_w_nans = df_ori_w_nans.loc[df_ori.index.values, :]
# Cells starting with '<' become the string 'NaN' (turned into NaN by to_numeric below).
df_ori_w_nans.replace(r'^([\<].*)$', 'NaN', inplace=True, regex=True)
for feat in feats:
    # Cells containing '>' take the value stored in df_ori for the same sample.
    ids_imputed_above = df_ori_w_nans.index[df_ori_w_nans[feat].astype(str).str.contains('>')]
    df_ori_w_nans.loc[ids_imputed_above, feat] = df_ori.loc[ids_imputed_above, feat]
df_ori_w_nans = df_ori_w_nans.apply(pd.to_numeric, errors='coerce')

## Impute thresholds in xponent data

In [None]:
# Read the 2024 xPONENT workbook and keep the rows whose 'Sample ID' starts with 'M',
# restricted to the feature columns.
df_xpnt_w_nans = pd.read_excel(f"{path}/data/immuno/files/processed/10-March-2024/48-plex-human-_xPONENT_2024.xlsx", index_col="Sample ID")
df_xpnt_w_nans = df_xpnt_w_nans.loc[df_xpnt_w_nans.index.str.startswith('M', na=False), feats]
# Cells starting with '<' become the string 'NaN'; to_numeric(errors='coerce') then turns
# them, and any other non-numeric cell, into NaN.
df_xpnt_w_nans.replace(r'^([\<].*)$', 'NaN', inplace=True, regex=True)
df_xpnt_w_nans = df_xpnt_w_nans.apply(pd.to_numeric, errors='coerce')
# Attach Age / Sex / Nationality from the Mirny phenotype sheet, matched on the sample index.
df_mirny_pheno = pd.read_excel(f"{path}/data/immuno/files/processed/10-March-2024/Список Мирный.xlsx", index_col=0)
df_xpnt_w_nans.loc[df_xpnt_w_nans.index.values, ['Age', 'Sex', 'Nationality']] = df_mirny_pheno.loc[df_xpnt_w_nans.index.values, ['Age', 'Sex', 'Nationality']]
# Imputation table: Central controls from df_ori stacked above the new samples.
ids_imp_trn = df_ori.index[(df_ori['Region'] == 'Central') & (df_ori['Status'] == 'Control')]
ids_imp_tst = df_xpnt_w_nans.index.values
df_imp = pd.concat([
    df_ori.loc[ids_imp_trn, feats],
    df_xpnt_w_nans.loc[:, feats]
])
df_imp.loc[:, feats] = df_imp.loc[:, feats].astype('float')
# fast_knn comes from impyute; presumably it returns the matrix with NaNs filled in — TODO confirm.
imp_vals = fast_knn(df_imp.loc[:, feats].values)
df_imp.loc[:, feats] = imp_vals

def find_nearest(array, value):
    """Return the element of `array` with the smallest absolute difference to `value`.

    On ties the first such element is returned.
    """
    candidates = np.asarray(array)
    return candidates[np.abs(candidates - value).argmin()]

# For every feature that is NaN in df_ori_w_nans for at least one reference sample,
# replace the imputed value of each new sample that was NaN with the closest value
# df_ori holds for those reference samples.
for feat in feats:
    srs_feat_base = df_ori_w_nans.loc[ids_imp_trn, feat].isna()
    ids_feat_base = srs_feat_base.index[srs_feat_base].values
    if len(ids_feat_base) == 0:
        continue
    feat_base_vals = df_ori.loc[ids_feat_base, feat].unique()
    srs_feat_trgt = df_xpnt_w_nans.loc[ids_imp_tst, feat].isna()
    ids_feat_trgt = srs_feat_trgt.index[srs_feat_trgt].values
    for id_trgt in ids_feat_trgt:
        df_imp.at[id_trgt, feat] = find_nearest(feat_base_vals, df_imp.at[id_trgt, feat])

# Final xPONENT table: imputed features plus constant metadata columns.
df_xpnt = df_xpnt_w_nans.copy()
df_xpnt.loc[ids_imp_tst, feats] = df_imp.loc[ids_imp_tst, feats]
for column, value in {
    'Region': 'Mirny',
    'Time': 'T0',
    'Sample_Chronology': 0,
    'Is longitudinal?': False,
    'Status': 'Control',
    'file': '48-plex-human-_xPONENT_2024.xlsx',
}.items():
    df_xpnt[column] = value
df_xpnt['Subject ID'] = df_xpnt.index.values
# Run the SImAge model on its input features and flatten the output to one value per sample.
simage_inputs = torch.from_numpy(df_xpnt.loc[:, feats_fimmu].values)
df_xpnt['SImAge'] = model_simage(simage_inputs).cpu().detach().numpy().ravel()
df_xpnt['SImAge acceleration'] = df_xpnt['SImAge'] - df_xpnt['Age']
df_xpnt['|SImAge acceleration|'] = df_xpnt['SImAge acceleration'].abs()
# Natural-log copy of every feature column.
for f in feats:
    df_xpnt[f"{f}_log"] = np.log(df_xpnt[f"{f}"])

## Merge xponent data with the original data

In [None]:
# Stack the original and xPONENT tables; verify_integrity=True makes concat raise
# on duplicated index labels. The merged table is written to the output folder.
frames_to_merge = [df_ori, df_xpnt]
df_all = pd.concat(frames_to_merge, verify_integrity=True)
df_all.to_excel(f"{path_save}/df_all.xlsx")

# Process analyst data

In [None]:
# Read the three analyst plate workbooks, keeping rows whose 'Sample ID' starts with 'M'
# and only the feature columns.
files_nlst = ['plate_1_analyst_2024', 'plate_2_analyst_2024', 'plate_3_analyst_2024']
dfs_nlst = []
for file in files_nlst:
    df_nlst_file = pd.read_excel(f"{path}/data/immuno/files/processed/10-March-2024/{file}.xlsx", index_col="Sample ID")
    df_nlst_file = df_nlst_file.loc[df_nlst_file.index.str.startswith('M', na=False), feats]
    dfs_nlst.append(df_nlst_file) 
# verify_integrity=True makes concat raise if a sample id appears on more than one plate.
df_nlst_w_nans = pd.concat(dfs_nlst, verify_integrity=True)
# Cells starting with '<' become the string 'NaN'; to_numeric(errors='coerce') then turns
# them, and any other non-numeric cell, into NaN.
df_nlst_w_nans.replace(r'^([\<].*)$', 'NaN', inplace=True, regex=True)
df_nlst_w_nans = df_nlst_w_nans.apply(pd.to_numeric, errors='coerce')
# Attach Age / Sex / Nationality from the Mirny phenotype sheet, matched on the sample index.
df_mirny_pheno = pd.read_excel(f"{path}/data/immuno/files/processed/10-March-2024/Список Мирный.xlsx", index_col=0)
df_nlst_w_nans.loc[df_nlst_w_nans.index.values, ['Age', 'Sex', 'Nationality']] = df_mirny_pheno.loc[df_nlst_w_nans.index.values, ['Age', 'Sex', 'Nationality']]
# Imputation table: Central controls from df_ori stacked above the analyst samples.
ids_imp_trn = df_ori.index[(df_ori['Region'] == 'Central') & (df_ori['Status'] == 'Control')]
ids_imp_tst = df_nlst_w_nans.index.values
df_imp = pd.concat([
    df_ori.loc[ids_imp_trn, feats],
    df_nlst_w_nans.loc[:, feats]
])
df_imp.loc[:, feats] = df_imp.loc[:, feats].astype('float')
# fast_knn comes from impyute; presumably it returns the matrix with NaNs filled in — TODO confirm.
imp_vals = fast_knn(df_imp.loc[:, feats].values)
df_imp.loc[:, feats] = imp_vals

def find_nearest(array, value):
    """Return the element of `array` with the smallest absolute difference to `value`.

    On ties the first such element is returned.
    """
    candidates = np.asarray(array)
    nearest_pos = np.abs(candidates - value).argmin()
    return candidates[nearest_pos]

# For every feature that is NaN in df_ori_w_nans for at least one reference sample,
# replace the imputed value of each analyst sample that was NaN with the closest value
# df_ori holds for those reference samples.
for feat in feats:
    srs_feat_base = df_ori_w_nans.loc[ids_imp_trn, feat].isna()
    ids_feat_base = srs_feat_base.index[srs_feat_base].values
    if len(ids_feat_base) > 0:
        feat_base_vals = df_ori.loc[ids_feat_base, feat].unique()
        srs_feat_trgt = df_nlst_w_nans.loc[ids_imp_tst, feat].isna()
        ids_feat_trgt = srs_feat_trgt.index[srs_feat_trgt].values
        for id_trgt in ids_feat_trgt:
            df_imp.at[id_trgt, feat] = find_nearest(feat_base_vals, df_imp.at[id_trgt, feat])

# Final analyst table: imputed features plus metadata columns.
df_nlst = df_nlst_w_nans.copy()
df_nlst.loc[ids_imp_tst, feats] = df_imp.loc[ids_imp_tst, feats]
df_nlst['Region'] = 'Mirny'
df_nlst['Time'] = 'T0'
df_nlst['Sample_Chronology'] = 0
df_nlst['Is longitudinal?'] = False
df_nlst['Status'] = 'Control'
# Record the plate workbook each sample was read from. Every row used to be
# labelled '48-plex-human-_xPONENT_2024.xlsx', a leftover from the xPONENT cell;
# the analyst samples come from the plate_N_analyst_2024 workbooks.
for file, df_nlst_file in zip(files_nlst, dfs_nlst):
    df_nlst.loc[df_nlst_file.index, 'file'] = f"{file}.xlsx"
df_nlst['Subject ID'] = df_nlst.index.values
# Run the SImAge model on its input features and flatten the output to one value per sample.
df_nlst['SImAge'] = model_simage(torch.from_numpy(df_nlst.loc[:, feats_fimmu].values)).cpu().detach().numpy().ravel()
df_nlst['SImAge acceleration'] = df_nlst['SImAge'] - df_nlst['Age']
df_nlst['|SImAge acceleration|'] = df_nlst['SImAge acceleration'].abs()
# Natural-log copy of every feature column.
for f in feats:
    df_nlst[f"{f}_log"] = np.log(df_nlst[f"{f}"])

df_nlst.to_excel(f"{path_save}/df_nlst.xlsx")

# Data differences

## Reload all data 

In [None]:
# Dataset root and the output folder of this notebook.
path = "D:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN"
path_save = f"{path}/special/061_new_imm_data"
pathlib.Path(path_save).mkdir(parents=True, exist_ok=True)

# Feature lists are stored as the index of their workbooks.
feats = pd.read_excel(f"{path}/data/immuno/feats_con.xlsx", index_col=0).index.values
feats_fimmu = pd.read_excel(f"{path}/data/immuno/models/SImAge/feats_con_top10.xlsx", index_col=0).index.values
feats_slctd = pd.read_excel(f"{path}/special/059_imm_data_selection/feats_selected.xlsx", index_col=0).index.values

# Reload the merged table this notebook saved as df_all.xlsx (original samples plus the
# Mirny xPONENT samples). The cells below select Region == 'Mirny' with
# Program == 'xponent' and read 'PMC10485620 ID'; those rows and that column are
# produced earlier in this notebook and persisted only in df_all.xlsx.
# NOTE(review): this used to re-read the raw df_samples(...) workbook — confirm that
# workbook was not meant to contain the Mirny samples already.
df_ori = pd.read_excel(f"{path_save}/df_all.xlsx", index_col=0)
df_ori['Program'] = 'xponent'
# Analyst rows get a '_nlst' suffix so their index does not collide with the xPONENT
# rows of the same samples; 'Subject ID' keeps the unsuffixed id.
df_nlst = pd.read_excel(f"{path_save}/df_nlst.xlsx", index_col=0)
df_nlst.index += '_nlst'
df_nlst['Program'] = 'analyst'

df = pd.concat([df_ori, df_nlst])

# 2024 workbooks; their sample ids are in the first column rather than in "Sample".
file_xponent_2024 = "10-March-2024/48-plex-human-_xPONENT_2024"
files_analyst_2024 = [
    "10-March-2024/plate_1_analyst_2024",
    "10-March-2024/plate_2_analyst_2024",
    "10-March-2024/plate_3_analyst_2024",
]
# Workbook names (without the .xlsx extension) read from data/immuno/files/processed.
files = [
    "Aging L, Q, H, I",
    "Aging-Covid_05.01.2022",
    "Aging-Covid-05.05.22",
    "Covid_results_02_2021",
    "Covid-25.11.20",
    "MULTIPLEX_20_11_2020_ AGING",
    "Yakutiya + TR",
    "Мультиплекс_Agind&Covid",
    file_xponent_2024,
    *files_analyst_2024,
]
# Lookup from the 'immuno_marker' column to the 'gene' column of the marker table.
df_imm_genes = pd.read_excel(f"{path}/data/immuno/immuno_markers_genes.xlsx")
dict_imm_genes = dict(zip(df_imm_genes['immuno_marker'], df_imm_genes['gene']))

dfs_files = []
# Per feature: the set of raw cell values that start with '<' or '>'.
nans_by_features = {}
for file in files:
    if file == file_xponent_2024 or file in files_analyst_2024:
        df_file = pd.read_excel(f"{path}/data/immuno/files/processed/{file}.xlsx", index_col=0)
    else:
        df_file = pd.read_excel(f"{path}/data/immuno/files/processed/{file}.xlsx", index_col="Sample")
    df_file.rename(columns=dict_imm_genes, inplace=True)
    df_file = df_file.loc[:, feats]

    # duplicates processing
    if file == "MULTIPLEX_20_11_2020_ AGING":
        # Keep the rows whose index occurs once, plus the rows flagged by
        # duplicated(keep='last') for the repeated indices.
        df_file_doubled_unique = df_file.loc[~df_file.index.duplicated(keep=False), :]
        df_file_doubled_2 = df_file.loc[df_file.index.duplicated(keep='last'), :]
        df_file_duplicates_final = pd.concat([df_file_doubled_2, df_file_doubled_unique], axis=0)
        df_file = df_file_duplicates_final
    elif file == file_xponent_2024:
        df_file = df_file.loc[df_file.index.str.startswith('M', na=False), :]
    elif file in files_analyst_2024:
        df_file = df_file.loc[df_file.index.str.startswith('M', na=False), :]
        # Same suffix as the analyst rows of `df`, so the labels line up below.
        df_file.index += '_nlst'
    # Report any index labels that are still duplicated in this workbook.
    df_file_duplicates = df_file.loc[df_file.index.duplicated(keep=False), :]
    if df_file_duplicates.shape[0] > 0:
        print(df_file_duplicates.index)

    for feat in df_file:
        # Non-capturing group: same matches as '^([<>].*)$' without the pandas
        # "pattern has match groups" warning.
        is_censored = df_file[feat].astype(str).str.contains(r'^(?:[<>].*)$', regex=True)
        nan_vals = set(df_file.loc[is_censored, feat].values)
        if nan_vals:
            nans_by_features.setdefault(feat, set()).update(nan_vals)

    dfs_files.append(df_file)

# Stack all workbooks and align the rows with df.
df_w_nans = pd.concat(dfs_files, verify_integrity=False)
df_w_nans.index = df_w_nans.index.map(str)
df_w_nans = df_w_nans.loc[df.index.values, :]
# Cells starting with '<' become the string 'NaN' (turned into NaN by to_numeric below).
df_w_nans.replace(r'^([\<].*)$', 'NaN', inplace=True, regex=True)
for feat in feats:
    # Cells containing '>' take the value stored in df for the same sample.
    ids_imputed_above = df_w_nans.index[df_w_nans[feat].astype(str).str.contains('>')]
    df_w_nans.loc[ids_imputed_above, feat] = df.loc[ids_imputed_above, feat]
df_w_nans = df_w_nans.apply(pd.to_numeric, errors='coerce')

In [None]:
# Sample ids of the four control groups compared below, and the colour used for each.
is_control = df['Status'] == 'Control'
is_mirny = df['Region'] == 'Mirny'
groups_ids = {
    'Central': df.index[is_control & (df['PMC10485620 ID'].notna())].values,
    'Yakutia': df.index[is_control & (df['Region'] == 'Yakutia')].values,
    'Mirny xPONENT': df.index[is_control & is_mirny & (df['Program'] == 'xponent')].values,
    'Mirny Analyst': df.index[is_control & is_mirny & (df['Program'] == 'analyst')].values,
}

groups_colors = dict(zip(
    ['Central', 'Yakutia', 'Mirny xPONENT', 'Mirny Analyst'],
    ['gold', 'lightslategray', 'darkblue', 'darkgreen'],
))

## Plot distribution

In [None]:
# Age histogram of the Mirny xPONENT group, split by sex: 23 edges from 5 to 115.
hist_bins = np.linspace(start=5, stop=115, num=23)

fig, ax = plt.subplots(figsize=(4, 3))
sns.set_theme(style='whitegrid')
df_mirny_xpnt = df.loc[groups_ids['Mirny xPONENT'], :]
sex_palette = {'F': 'crimson', 'M': 'dodgerblue'}
histplot = sns.histplot(
    data=df_mirny_xpnt,
    x="Age",
    hue='Sex',
    hue_order=['F', 'M'],
    palette=sex_palette,
    bins=hist_bins,
    edgecolor='k',
    linewidth=1,
    ax=ax
)
histplot.set_xlim(0, 120)
plt.savefig(f"{path_save}/histplot.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path_save}/histplot.pdf", bbox_inches='tight')
plt.close(fig)

## NaN statistics

In [None]:
# NaN statistics per group: one bar chart (per-feature % of NaNs) and one histogram
# (number of NaN features per sample) for each group, on two separate 2x2 grids.
pathlib.Path(f"{path_save}/nan").mkdir(parents=True, exist_ok=True)

n_rows = 2
n_cols = 2
fig_width = 16
fig_height = 10

# Two figures are built in parallel inside the loop: bars share the y axis, histograms the x axis.
fig_bar, axs_bar = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={}, sharey=True)
fig_hist, axs_hist = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={}, sharex=True)
sns.set_theme(style='whitegrid')

dfs_nan_feats = {}  # group name -> per-feature NaN table
df_nan_feats_by_group = pd.DataFrame(index=list(groups_ids.keys()))  # overall % of NaN cells per group
for group_id, group in enumerate(groups_ids.keys()):
    # Position of this group's panel in the grid.
    row_id, col_id = divmod(group_id, n_cols)
    
    # Per-feature NaN counts and percentages, sorted from most to least affected.
    df_nan_feats = df_w_nans.loc[groups_ids[group], feats].isna().sum(axis=0).to_frame(name="Number of NaNs")
    df_nan_feats["% of NaNs"] = df_nan_feats["Number of NaNs"] / len(groups_ids[group]) * 100
    df_nan_feats["Number of not-NaNs"] = df_w_nans.loc[groups_ids[group], feats].notna().sum(axis=0)
    df_nan_feats.sort_values(["% of NaNs"], ascending=[False], inplace=True)
    dfs_nan_feats[group] = df_nan_feats
    # Share of NaN cells among all (samples x features) cells of the group.
    df_nan_feats_by_group.at[group, "% of NaNs"] = df_nan_feats["Number of NaNs"].sum(axis=0) / df_w_nans.loc[groups_ids[group], feats].size * 100
    
    barplot = sns.barplot(
        data=df_nan_feats,
        x=df_nan_feats.index,
        y=f"% of NaNs",
        edgecolor='black',
        color=groups_colors[group],
        dodge=False,
        ax=axs_bar[row_id, col_id],
    )
    axs_bar[row_id, col_id].set_title(f"{group}")
    axs_bar[row_id, col_id].set_xticklabels(axs_bar[row_id, col_id].get_xticklabels(), rotation=90)

    # Number of features that are NaN for each sample.
    df_nan_smpls = df_w_nans.loc[groups_ids[group], feats].isna().sum(axis=1).to_frame(name="Features with NaNs")
    
    # NOTE(review): hist_bins is not passed to histplot below (discrete=True is used instead).
    hist_bins = np.linspace(0, len(feats), len(feats) + 1)
    histplot = sns.histplot(
        data=df_nan_smpls,
        discrete=True,
        edgecolor='k',
        linewidth=1,
        x="Features with NaNs",
        color=groups_colors[group],
        ax=axs_hist[row_id, col_id],
    )
    axs_hist[row_id, col_id].set(xlim=(-0.6, len(feats)+0.6))
    axs_hist[row_id, col_id].set_title(f"{group}")
    axs_hist[row_id, col_id].set_ylabel(f"Number of samples")

# Save the per-feature bar charts.
fig_bar.tight_layout()    
fig_bar.savefig(f"{path_save}/nan/feats.png", bbox_inches='tight', dpi=200)
fig_bar.savefig(f"{path_save}/nan/feats.pdf", bbox_inches='tight')
plt.close(fig_bar)

# One worksheet per group with the per-feature NaN table.
with pd.ExcelWriter(f'{path_save}/nan/feats.xlsx', engine='xlsxwriter') as writer:
    for group_id, group in enumerate(groups_ids.keys()):
        dfs_nan_feats[group].to_excel(writer, sheet_name=group)

# Save the per-sample histograms.
fig_hist.tight_layout()    
fig_hist.savefig(f"{path_save}/nan/samples.png", bbox_inches='tight', dpi=200)
fig_hist.savefig(f"{path_save}/nan/samples.pdf", bbox_inches='tight')
plt.close(fig_hist)

# Horizontal bar chart of the overall NaN percentage of each group, with value labels.
plt.figure(figsize=(4, 6))
sns.set_theme(style='whitegrid')
barplot = sns.barplot(
    data=df_nan_feats_by_group,
    x=f"% of NaNs",
    y=df_nan_feats_by_group.index,
    orient='h',
    palette=groups_colors,
    edgecolor='black',
    dodge=False
)
# Print each bar's value with one decimal next to it.
for x in barplot.containers:
    barplot.bar_label(x, fmt="%.1f")
plt.savefig(f"{path_save}/nan/global.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path_save}/nan/global.pdf", bbox_inches='tight')
plt.close()

## Comparing two programs for Mirny data

In [None]:
pathlib.Path(f"{path_save}/xponent_vs_analyst").mkdir(parents=True, exist_ok=True)

df_stat = pd.DataFrame(index=list(feats))
# Labels of all Mirny rows. Kept as a list: pandas >= 2.0 rejects a set as a .loc
# indexer, and a set has no stable order. The two groups differ in 'Program', so
# concatenating them equals their union.
ids_mirny = list(groups_ids['Mirny xPONENT']) + list(groups_ids['Mirny Analyst'])
# The row selection does not depend on the feature, so it is done once.
df_mirny = df.loc[ids_mirny]
for feat in list(feats):
    # Unpaired comparison of the two programs.
    vals = {}
    for group in ['Mirny xPONENT', 'Mirny Analyst']:
        vals[group] = df.loc[groups_ids[group], feat].values
    _, df_stat.at[feat, "mw_pval"] = mannwhitneyu(vals['Mirny xPONENT'], vals['Mirny Analyst'], alternative='two-sided')

    # Paired comparison: one row per subject, one column per program; subjects
    # missing either measurement are dropped.
    df_pivot = df_mirny.pivot(index='Subject ID', columns='Program', values=feat).dropna()
    diff = df_pivot.loc[:, 'xponent'].values - df_pivot.loc[:, 'analyst'].values
    # When all paired differences are zero the p-value is set to 1.0 instead of calling wilcoxon.
    if np.linalg.norm(diff) > 0:
        res = wilcoxon(
            x=df_pivot.loc[:, 'xponent'].values,
            y=df_pivot.loc[:, 'analyst'].values,
            alternative='two-sided'
        )
        df_stat.at[feat, "wlxn_pval"] = res.pvalue
    else:
        df_stat.at[feat, "wlxn_pval"] = 1.0

# Benjamini-Hochberg correction across features for both tests.
_, df_stat.loc[feats, "mw_pval_fdr_bh"], _, _ = multipletests(df_stat.loc[feats, "mw_pval"], 0.05, method='fdr_bh')
_, df_stat.loc[feats, "wlxn_pval_fdr_bh"], _, _ = multipletests(df_stat.loc[feats, "wlxn_pval"], 0.05, method='fdr_bh')
df_stat.to_excel(f"{path_save}/xponent_vs_analyst/stat.xlsx", index_label='Features')

### Plots for Mann-Whitney

In [None]:
# Bar chart of -log10(FDR-adjusted Mann-Whitney p-value) per feature, most significant first.
# Explicit copy: the in-place sort and the column assignments below must not act on
# a slice of df_stat (avoids SettingWithCopyWarning and keeps df_stat untouched).
df_fig = df_stat.loc[feats, :].copy()
df_fig.sort_values(["mw_pval_fdr_bh"], ascending=[True], inplace=True)
df_fig['mw_pval_fdr_bh_log'] = -np.log10(df_fig['mw_pval_fdr_bh'])
# Significant features (adjusted p < 0.05) are red, the rest pink.
df_fig['color'] = 'pink'
df_fig.loc[df_fig['mw_pval_fdr_bh'] < 0.05, 'color'] = 'red'

fig, ax = plt.subplots(figsize=(3, 16))
sns.set_theme(style='whitegrid')
barplot = sns.barplot(
    data=df_fig,
    y=df_fig.index.values,
    x='mw_pval_fdr_bh_log',
    edgecolor='black',
    palette=df_fig['color'].values,
    dodge=True,
    ax=ax
)
ax.set_xlabel(r"$-\log_{10}(\mathrm{p-value})$", fontsize=18)
ax.set_ylabel('', fontsize=20)
# Put ticks on integers instead of relabelling existing ticks with int(tick),
# which printed e.g. 0.5 as "0" and so mislabelled the axis.
ax.xaxis.set_major_locator(matplotlib.ticker.MaxNLocator(integer=True))
ax.tick_params(axis='x', labelsize=16)
ax.tick_params(axis='y', labelsize=16)
plt.savefig(f"{path_save}/xponent_vs_analyst/barplot_mw.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path_save}/xponent_vs_analyst/barplot_mw.pdf", bbox_inches='tight')
plt.close(fig)

In [None]:
# Grid of violin plots (one panel per feature, ordered by Mann-Whitney p-value)
# comparing the two programs on the Mirny samples.
n_rows = 6
n_cols = 8
fig_width = 20
fig_height = 16

colors_mirny = {
    'xponent': 'darkblue',
    'analyst': 'darkgreen',
}

fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={})
sns.set_theme(style='whitegrid')

df_stat.sort_values(["mw_pval"], ascending=[True], inplace=True)
feats_sorted = df_stat.index[df_stat.index.isin(feats)].values

# Mirny rows in df order. A boolean mask works whether ids_mirny is a set or a list
# (pandas >= 2.0 rejects a set as a .loc indexer); the selection is the same for
# every panel, so it is done once.
df_fig = df.loc[df.index.isin(ids_mirny), :]

for f_id, f in enumerate(feats_sorted):
    row_id, col_id = divmod(f_id, n_cols)

    sns.violinplot(
        data=df_fig,
        x='Program',
        y=f,
        palette=colors_mirny,
        scale='width',
        order=list(colors_mirny.keys()),
        saturation=0.75,
        cut=0,
        linewidth=1.0,
        ax=axs[row_id, col_id],
        legend=False,
    )
    axs[row_id, col_id].set_ylabel(f)
    axs[row_id, col_id].set_xlabel('')
    axs[row_id, col_id].set(xticklabels=[])
    # Annotate the panel with the FDR-adjusted p-value computed earlier.
    mw_pval = df_stat.at[f, "mw_pval_fdr_bh"]
    pval_formatted = [f'{mw_pval:.2e}']
    annotator = Annotator(
        axs[row_id, col_id],
        pairs=[('xponent', 'analyst')],
        data=df_fig,
        x='Program',
        y=f,
        order=list(colors_mirny.keys()),
    )
    annotator.set_custom_annotations(pval_formatted)
    annotator.configure(loc='outside')
    annotator.annotate()
# Hide every panel that did not receive a feature (previously hard-coded to the last two).
for ax_unused in axs.ravel()[len(feats_sorted):]:
    ax_unused.axis('off')
legend_handles = [
    mlines.Line2D([], [], marker='o', linestyle='None', markeredgecolor='k', markerfacecolor=colors_mirny['xponent'], markersize=10, label='xponent'),
    mlines.Line2D([], [], marker='o', linestyle='None', markeredgecolor='k', markerfacecolor=colors_mirny['analyst'], markersize=10, label='analyst')
]
fig.legend(handles=legend_handles, bbox_to_anchor=(0.5, 1.0), loc="lower center", ncol=2, frameon=False, fontsize='large')
fig.tight_layout()
plt.savefig(f"{path_save}/xponent_vs_analyst/feats_mw.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path_save}/xponent_vs_analyst/feats_mw.pdf", bbox_inches='tight')
plt.close(fig)
    

### Plots for Wilcoxon


In [None]:
# Bar chart of -log10(FDR-adjusted Wilcoxon p-value) per feature, most significant first.
# Explicit copy: the in-place sort and the column assignments below must not act on
# a slice of df_stat (avoids SettingWithCopyWarning and keeps df_stat untouched).
df_fig = df_stat.loc[feats, :].copy()
df_fig.sort_values(["wlxn_pval_fdr_bh"], ascending=[True], inplace=True)
df_fig['wlxn_pval_fdr_bh_log'] = -np.log10(df_fig['wlxn_pval_fdr_bh'])
# Significant features (adjusted p < 0.05) are red, the rest pink.
df_fig['color'] = 'pink'
df_fig.loc[df_fig['wlxn_pval_fdr_bh'] < 0.05, 'color'] = 'red'

fig, ax = plt.subplots(figsize=(3, 16))
sns.set_theme(style='whitegrid')
barplot = sns.barplot(
    data=df_fig,
    y=df_fig.index.values,
    x='wlxn_pval_fdr_bh_log',
    edgecolor='black',
    palette=df_fig['color'].values,
    dodge=True,
    ax=ax
)
ax.set_xlabel(r"$-\log_{10}(\mathrm{p-value})$", fontsize=18)
ax.set_ylabel('', fontsize=20)
# Put ticks on integers instead of relabelling existing ticks with int(tick),
# which printed e.g. 0.5 as "0" and so mislabelled the axis.
ax.xaxis.set_major_locator(matplotlib.ticker.MaxNLocator(integer=True))
ax.tick_params(axis='x', labelsize=16)
ax.tick_params(axis='y', labelsize=16)
plt.savefig(f"{path_save}/xponent_vs_analyst/barplot_wlxn.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path_save}/xponent_vs_analyst/barplot_wlxn.pdf", bbox_inches='tight')
plt.close(fig)

In [None]:
# Grid of paired plots (one panel per feature, ordered by Wilcoxon p-value): each
# subject is a coloured point per program, joined by a line.
n_rows = 6
n_cols = 8
fig_width = 20
fig_height = 16

# Mirny rows in df order. A boolean mask works whether ids_mirny is a set or a list
# (pandas >= 2.0 rejects a set as a .loc indexer) and gives a stable row order, so
# the x-axis category order is the same in every panel.
df_fig = df.loc[df.index.isin(ids_mirny), :]

# One XKCD colour per subject, assigned in order of first appearance.
samples_mirny = df_fig['Subject ID'].unique()
colors_xkcd = list(matplotlib.colors.XKCD_COLORS.values())
colors_samples = {x: colors_xkcd[x_id] for x_id, x in enumerate(samples_mirny)}

fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={})
sns.set_theme(style='whitegrid')

df_stat.sort_values(["wlxn_pval"], ascending=[True], inplace=True)
feats_sorted = df_stat.index[df_stat.index.isin(feats)].values

# Wrapping the sequence (not the enumerate object) lets tqdm know the total.
for f_id, f in enumerate(tqdm(feats_sorted)):
    row_id, col_id = divmod(f_id, n_cols)

    sns.scatterplot(
        data=df_fig,
        x='Program',
        y=f,
        hue='Subject ID',
        edgecolor="k",
        linewidth=0.001,
        palette=colors_samples,
        hue_order=list(colors_samples.keys()),
        alpha=0.75,
        s=100,
        legend=False,
        ax=axs[row_id, col_id]
    )
    sns.lineplot(
        data=df_fig,
        x='Program',
        y=f,
        hue='Subject ID',
        palette=colors_samples,
        hue_order=list(colors_samples.keys()),
        legend=False,
        ax=axs[row_id, col_id]
    )
    axs[row_id, col_id].set_xlabel('')
    # Panel title: FDR-adjusted Wilcoxon p-value of the feature.
    pval = df_stat.at[f, "wlxn_pval_fdr_bh"]
    axs[row_id, col_id].set_title(f'{pval:.2e}')
# Hide every panel that did not receive a feature (previously hard-coded to the last two).
for ax_unused in axs.ravel()[len(feats_sorted):]:
    ax_unused.axis('off')
fig.tight_layout()
plt.savefig(f"{path_save}/xponent_vs_analyst/feats_wlxn.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path_save}/xponent_vs_analyst/feats_wlxn.pdf", bbox_inches='tight')
plt.close(fig)

### SImAge acceleration

In [None]:
# Fit SImAge ~ Age on the 'Central' group, predict for every row of df, and store
# the difference between SImAge and that prediction as the residual.
formula = "SImAge ~ Age"
df_central = df.loc[groups_ids['Central'], :]
model = smf.ols(formula=formula, data=df_central).fit()
df["SImAge_Central_linreg"] = model.predict(df)
df["SImAge residuals"] = df['SImAge'].sub(df["SImAge_Central_linreg"])

# Age vs SImAge scatter of the Mirny samples for both programs, with the identity
# line and the regression line fitted on the Central group.
# Boolean mask instead of df.loc[ids_mirny]: works whether ids_mirny is a set or a
# list (pandas >= 2.0 rejects a set as a .loc indexer) and keeps df's row order.
df_fig = df.loc[df.index.isin(ids_mirny), :]

colors_mirny = {
    'xponent': 'darkblue',
    'analyst': 'darkgreen',
}

fig, ax = plt.subplots(figsize=(4.5, 4))
sns.set_theme(style='whitegrid')
scatter = sns.scatterplot(
    data=df_fig,
    x="Age",
    y="SImAge",
    hue="Program",
    palette=colors_mirny,
    linewidth=0.2,
    alpha=0.75,
    edgecolor="k",
    s=20,
    hue_order=list(colors_mirny.keys()),
    ax=ax
)
# Identity line SImAge == Age.
bisect = sns.lineplot(
    x=[0, 120],
    y=[0, 120],
    linestyle='--',
    color='black',
    linewidth=1.0,
    ax=ax
)
# Regression line evaluated well beyond the axis limits so it spans the whole plot.
df_line = pd.DataFrame({'Age': [-100, 200]})
df_line["SImAge_Central_linreg"] = model.predict(df_line)
# Drawn twice: a thicker black line underneath gives the coloured line an outline.
central_linreg_back = sns.lineplot(
    x=df_line['Age'].values,
    y=df_line['SImAge_Central_linreg'].values,
    color='black',
    linewidth=3.0,
    ax=ax
)
central_linreg_front = sns.lineplot(
    x=df_line['Age'].values,
    y=df_line['SImAge_Central_linreg'].values,
    color=groups_colors['Central'],
    linewidth=2.0,
    ax=ax
)
# Mean absolute error between Age and SImAge for each program, shown in the title.
mae_xponent = mean_absolute_error(df_fig.loc[groups_ids['Mirny xPONENT'], 'Age'].values, df_fig.loc[groups_ids['Mirny xPONENT'], 'SImAge'].values)
mae_analyst = mean_absolute_error(df_fig.loc[groups_ids['Mirny Analyst'], 'Age'].values, df_fig.loc[groups_ids['Mirny Analyst'], 'SImAge'].values)
ax.set_title(f"MAE xponent: {mae_xponent:0.1f}\nMAE analyst: {mae_analyst:0.1f}")
ax.set_xlabel("Age")
ax.set_ylabel("SImAge")
ax.set_xlim(0, 120)
ax.set_ylim(0, 120)
ax.set_aspect('equal', adjustable='box')
fig.savefig(f"{path_save}/xponent_vs_analyst/SImAge_scatter.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path_save}/xponent_vs_analyst/SImAge_scatter.pdf", bbox_inches='tight')
# Close this figure explicitly rather than whichever figure happens to be current.
plt.close(fig)

# Violin plot of 'SImAge acceleration' per program, annotated with the two-sided
# Mann-Whitney p-value between the programs.
fig, ax = plt.subplots(figsize=(4, 4))
sns.set_theme(style='whitegrid')
violin = sns.violinplot(
    data=df_fig,
    x='Program',
    y='SImAge acceleration',
    order=list(colors_mirny.keys()),
    palette=colors_mirny,
    saturation=0.75,
    scale='width',
)
violin.set_xlabel("")
accel_xponent = df_fig.loc[df_fig['Program'] == 'xponent', 'SImAge acceleration'].values
accel_analyst = df_fig.loc[df_fig['Program'] == 'analyst', 'SImAge acceleration'].values
mw_pval = mannwhitneyu(accel_xponent, accel_analyst, alternative='two-sided').pvalue
pval_formatted = [f'{mw_pval:.2e}']
annotator = Annotator(
    violin,
    pairs=[('xponent', 'analyst')],
    data=df_fig,
    x='Program',
    y='SImAge acceleration',
    order=list(colors_mirny.keys())
)
annotator.set_custom_annotations(pval_formatted)
annotator.configure(loc='outside')
annotator.annotate()
fig.savefig(f"{path_save}/xponent_vs_analyst/SImAge_acceleration.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path_save}/xponent_vs_analyst/SImAge_acceleration.pdf", bbox_inches='tight')
plt.close()

# Violin plot of 'SImAge residuals' per program, annotated with the two-sided
# Mann-Whitney p-value between the programs.
fig, ax = plt.subplots(figsize=(4, 4))
sns.set_theme(style='whitegrid')
violin = sns.violinplot(
    data=df_fig,
    x='Program',
    y='SImAge residuals',
    order=list(colors_mirny.keys()),
    palette=colors_mirny,
    saturation=0.75,
    scale='width',
)
violin.set_xlabel("")
resid_xponent = df_fig.loc[df_fig['Program'] == 'xponent', 'SImAge residuals'].values
resid_analyst = df_fig.loc[df_fig['Program'] == 'analyst', 'SImAge residuals'].values
mw_pval = mannwhitneyu(resid_xponent, resid_analyst, alternative='two-sided').pvalue
pval_formatted = [f'{mw_pval:.2e}']
annotator = Annotator(
    violin,
    pairs=[('xponent', 'analyst')],
    data=df_fig,
    x='Program',
    y='SImAge residuals',
    order=list(colors_mirny.keys())
)
annotator.set_custom_annotations(pval_formatted)
annotator.configure(loc='outside')
annotator.annotate()
fig.savefig(f"{path_save}/xponent_vs_analyst/SImAge_residuals.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path_save}/xponent_vs_analyst/SImAge_residuals.pdf", bbox_inches='tight')
plt.close()


## Mirny vs Central (same age)

In [None]:
# Draw a subset of Central controls whose age distribution follows that of the
# Mirny xPONENT samples: each Central sample is weighted by the share of Mirny
# samples falling into the same age bin.
pathlib.Path(f"{path_save}/central_vs_mirny").mkdir(parents=True, exist_ok=True)

ids_mirny = df.index[(df['Status'] == 'Control') & (df['Region'] == 'Mirny') & (df['Program'] == 'xponent')].values
ids_central = df.index[(df['Status'] == 'Control') & (df['PMC10485620 ID'].notna())].values

hist_bins = np.linspace(5, 115, 23)
age_counts, age_bin_edges = np.histogram(df.loc[ids_mirny, 'Age'].values, bins=hist_bins)
age_prob = age_counts / len(df.loc[ids_mirny, 'Age'].values)

# Bin of each Central age, using the same convention as np.histogram: bins are
# half-open [left, right) and the last bin also includes its right edge.
# The previous np.rint(...) rounded to the NEAREST bin index, so e.g. age 8 in
# bin [5, 10) received the probability of bin [10, 15); ages outside the range
# could wrap around (negative index) or raise IndexError.
ages_central = df.loc[ids_central, 'Age'].values
age_bin_ids = np.digitize(ages_central, age_bin_edges) - 1
age_bin_ids[ages_central == age_bin_edges[-1]] = len(age_counts) - 1
is_in_range = (age_bin_ids >= 0) & (age_bin_ids < len(age_counts))
# Ages outside the histogram range (and NaN ages) get zero sampling weight.
prob_age = np.zeros(len(ages_central))
prob_age[is_in_range] = age_prob[age_bin_ids[is_in_range]]
df.loc[ids_central, 'Prob Age (Mirny)'] = prob_age

n_same_age = 150
index_central_same_age = df.loc[ids_central, :].sample(n=n_same_age, replace=False, weights='Prob Age (Mirny)', random_state=1337).index
# Fail loudly: merely printing left ids_central_same_age undefined (or stale from
# an earlier run) for the cells that follow.
if not index_central_same_age.is_unique:
    raise ValueError("Not unique index")
ids_central_same_age = index_central_same_age.values

In [None]:
# Overlayed age histograms of all Central controls, the age-matched Central
# subset and the Mirny samples.
df_fig_1 = df.loc[ids_central, ['Age']].assign(Group='Central')
df_fig_2 = df.loc[ids_central_same_age, ['Age']].assign(Group='Central (Age as Mirny)')
df_fig_3 = df.loc[ids_mirny, ['Age']].assign(Group='Mirny')
df_fig = pd.concat([df_fig_1, df_fig_2, df_fig_3], ignore_index=True)

hist_bins = np.linspace(start=5, stop=115, num=23)

sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(figsize=(6, 4))
group_palette = {'Central': 'gold', 'Central (Age as Mirny)': 'crimson', 'Mirny': 'dodgerblue'}
histplot = sns.histplot(
    data=df_fig,
    x="Age",
    hue='Group',
    palette=group_palette,
    bins=hist_bins,
    edgecolor='k',
    linewidth=1,
    ax=ax
)
histplot.set_xlim(0, 120)
plt.savefig(f"{path_save}/central_vs_mirny/Hist_DataSelection.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path_save}/central_vs_mirny/Hist_DataSelection.pdf", bbox_inches='tight')
plt.close(fig)

In [None]:
# Sample ids of the two compared groups; the Central group is the age-matched subsample.
groups_ids = dict(Central=ids_central_same_age, Mirny=ids_mirny)

# Store the group label of every selected sample in df['Group'].
for label, member_ids in groups_ids.items():
    df.loc[member_ids, 'Group'] = label

# Plot colour of each group.
groups_colors = dict(Central='crimson', Mirny='dodgerblue')

### Calculate statistics

In [None]:
# Per-feature descriptive statistics for each group plus a two-sided Mann-Whitney U test
# between the Central and Mirny groups.
feat_names = list(feats_fimmu)
df_stat = pd.DataFrame(index=feat_names)
for feat in feat_names:
    vals = {label: df.loc[member_ids, feat].values for label, member_ids in groups_ids.items()}
    for label, sample in vals.items():
        upper_quartile, lower_quartile = np.percentile(sample, [75, 25])
        df_stat.at[feat, f"mean_{label}"] = np.mean(sample)
        df_stat.at[feat, f"median_{label}"] = np.median(sample)
        df_stat.at[feat, f"q75_{label}"] = upper_quartile
        df_stat.at[feat, f"q25_{label}"] = lower_quartile
        df_stat.at[feat, f"iqr_{label}"] = df_stat.at[feat, f"q75_{label}"] - df_stat.at[feat, f"q25_{label}"]
    df_stat.at[feat, "mw_pval"] = mannwhitneyu(vals['Central'], vals['Mirny'], alternative='two-sided').pvalue

# Benjamini-Hochberg correction over all tested features (second item of the result tuple).
df_stat.loc[feats_fimmu, "mw_pval_fdr_bh"] = multipletests(df_stat.loc[feats_fimmu, "mw_pval"], 0.05, method='fdr_bh')[1]
df_stat.sort_values(by="mw_pval_fdr_bh", ascending=True, inplace=True)
df_stat.to_excel(f"{path_save}/central_vs_mirny/Stat_Feats.xlsx", index_label='Features')

### Plot features p-values

In [None]:
from matplotlib.ticker import MaxNLocator

# Bar height is -log10 of the BH-adjusted p-value; bars below the 0.05 threshold are red,
# the rest pink.
df_stat['mw_pval_fdr_bh_log'] = -np.log10(df_stat['mw_pval_fdr_bh'])
df_stat['color'] = 'pink'
df_stat.loc[df_stat['mw_pval_fdr_bh'] < 0.05, 'color'] = 'red'
sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(figsize=(4, 8))
barplot = sns.barplot(
    data=df_stat,
    y=df_stat.index.values,
    x='mw_pval_fdr_bh_log',
    edgecolor='black',
    palette=df_stat['color'].values,
    dodge=True,
    ax=ax
)
ax.set_xlabel(r"$-\log_{10}(\mathrm{p-value})$", fontsize=18)
ax.set_ylabel('', fontsize=20)
# Put ticks on integer positions instead of relabelling the automatic ticks with int(tick):
# truncation would label e.g. 2.5 as "2" and makes the labels disagree with the bars.
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
ax.tick_params(axis='x', labelsize=16)
ax.tick_params(axis='y', labelsize=16)
plt.savefig(f"{path_save}/central_vs_mirny/Barplot_Stat_Feats.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path_save}/central_vs_mirny/Barplot_Stat_Feats.pdf", bbox_inches='tight')
plt.close(fig)

### Plot features distributions

In [None]:
# Panel grid: one violin plot per feature listed in df_stat.
n_rows = 2
n_cols = 5
fig_width = 12
fig_height = 8

# Rows of both compared groups (Mirny and the age-matched Central subsample).
df_fig = df.loc[list(set.union(set(ids_mirny), set(ids_central_same_age))), :]

sns.set_theme(style='whitegrid')
fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={})
feats_sorted = df_stat.index.values
for f_id, f in enumerate(feats_sorted):
    row_id, col_id = divmod(f_id, n_cols)
    
    # Only values strictly between the 1st and 99th percentile are drawn.
    q01 = df_fig[f].quantile(0.01)
    q99 = df_fig[f].quantile(0.99)
    sns.violinplot(
        data=df_fig.loc[(df_fig[f] > q01) & (df_fig[f] < q99), :],
        x='Group',
        y=f,
        palette=groups_colors,
        scale='width',
        order=list(groups_colors.keys()),
        saturation=0.75,
        cut=0,
        linewidth=1.0,
        ax=axs[row_id, col_id],
        legend=False,
    )
    axs[row_id, col_id].set_ylabel(f)
    axs[row_id, col_id].set_xlabel('')
    axs[row_id, col_id].set(xticklabels=[]) 
    # The annotation shows the BH-adjusted Mann-Whitney p-value computed earlier.
    mw_pval = df_stat.at[f, "mw_pval_fdr_bh"]
    pval_formatted = [f'{mw_pval:.2e}']
    annotator = Annotator(
        axs[row_id, col_id],
        pairs=[('Central', 'Mirny')],
        data=df_fig,
        # Must be the same categorical column the violins are drawn with ('Group'),
        # not 'Region', so the annotated pair refers to the plotted categories.
        x='Group',
        y=f,
        order=list(groups_colors.keys()),
    )
    annotator.set_custom_annotations(pval_formatted)
    annotator.configure(loc='outside')
    annotator.annotate()

# Shared legend above the panels (the per-axes x tick labels are removed above).
legend_handles = [
    mlines.Line2D([], [], marker='o', linestyle='None', markeredgecolor='k', markerfacecolor=groups_colors['Central'], markersize=10, label='Central'),
    mlines.Line2D([], [], marker='o', linestyle='None', markeredgecolor='k', markerfacecolor=groups_colors['Mirny'], markersize=10, label='Mirny')
]
fig.legend(handles=legend_handles, bbox_to_anchor=(0.5, 1.0), loc="lower center", ncol=2, frameon=False, fontsize='large')
fig.tight_layout()    
plt.savefig(f"{path_save}/central_vs_mirny/Violin_Feats.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path_save}/central_vs_mirny/Violin_Feats.pdf", bbox_inches='tight')
plt.close(fig)
    

### SImAge analysis

In [None]:
# Rows of both compared groups (Mirny and the age-matched Central subsample).
selected_ids = list(set.union(set(ids_mirny), set(ids_central_same_age)))
df_fig = df.loc[selected_ids, :]

# --- Figure 1: SImAge against chronological age, with the identity line ---
sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(figsize=(4, 4))
scatter = sns.scatterplot(
    data=df_fig,
    x="Age",
    y="SImAge",
    hue="Group",
    palette=groups_colors,
    s=40,
    alpha=0.75,
    edgecolor="k",
    linewidth=0.2,
    ax=ax,
)
bisect = sns.lineplot(
    x=[0, 120],
    y=[0, 120],
    color='black',
    linestyle='--',
    linewidth=1.0,
    ax=ax,
)
# Mean absolute error between 'Age' and 'SImAge', separately for each group.
mae_central = mean_absolute_error(
    df_fig.loc[ids_central_same_age, 'Age'].values,
    df_fig.loc[ids_central_same_age, 'SImAge'].values,
)
mae_mirny = mean_absolute_error(
    df_fig.loc[ids_mirny, 'Age'].values,
    df_fig.loc[ids_mirny, 'SImAge'].values,
)
ax.set_title(f"MAE Central: {mae_central:0.1f}\nMAE Mirny: {mae_mirny:0.1f}")
ax.set_xlim(0, 120)
ax.set_ylim(0, 120)
ax.set_aspect('equal', adjustable='box')
fig.savefig(f"{path_save}/central_vs_mirny/SImAge_scatter.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path_save}/central_vs_mirny/SImAge_scatter.pdf", bbox_inches='tight')
plt.close(fig)

# --- Figure 2: 'SImAge acceleration' per group with a Mann-Whitney p-value ---
sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(figsize=(4, 4))
group_order = list(groups_colors.keys())
violin = sns.violinplot(
    data=df_fig,
    x='Group',
    y='SImAge acceleration',
    order=group_order,
    palette=groups_colors,
    scale='width',
    saturation=0.75,
    ax=ax,
)
violin.set_xlabel("")
is_central = df_fig['Group'] == 'Central'
is_mirny = df_fig['Group'] == 'Mirny'
mw_pval = mannwhitneyu(
    df_fig.loc[is_central, 'SImAge acceleration'].values,
    df_fig.loc[is_mirny, 'SImAge acceleration'].values,
    alternative='two-sided',
).pvalue
pval_formatted = [f'{mw_pval:.2e}']
annotator = Annotator(
    violin,
    pairs=[('Central', 'Mirny')],
    data=df_fig,
    x='Group',
    y='SImAge acceleration',
    order=group_order,
)
annotator.set_custom_annotations(pval_formatted)
annotator.configure(loc='outside')
annotator.annotate()
fig.savefig(f"{path_save}/central_vs_mirny/SImAge_acceleration.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path_save}/central_vs_mirny/SImAge_acceleration.pdf", bbox_inches='tight')
plt.close(fig)

# No-age data vs Mirny vs Central (same age)

## Load data

In [None]:
# Dataset root and the output folder of this analysis (created if missing).
path = "D:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN"
path_save = f"{path}/special/061_new_imm_data"
pathlib.Path(path_save).mkdir(parents=True, exist_ok=True)

# Feature lists: each one is the index of the corresponding Excel sheet.
feats = pd.read_excel(f"{path}/data/immuno/feats_con.xlsx", index_col=0).index.values
feats_fimmu = pd.read_excel(f"{path}/data/immuno/models/SImAge/feats_con_top10.xlsx", index_col=0).index.values
feats_slctd = pd.read_excel(f"{path}/special/059_imm_data_selection/feats_selected.xlsx", index_col=0).index.values

# Main data table and a second table read from 'data_with_nans.xlsx'.
df = pd.read_excel(f"{path}/data/immuno/data.xlsx", index_col=0)
df_w_nans = pd.read_excel(f"{path}/data/immuno/data_with_nans.xlsx", index_col=0)

## Impute No-age data

In [None]:
# Raw table of the samples without age, indexed by 'Sample ID'; index labels are forced to str.
noage_file = f"{path}/data/immuno/files/processed/10-March-2024/48-plex-human-_xPONENT_2024.xlsx"
df_noage_w_nans = pd.read_excel(noage_file, index_col="Sample ID")
df_noage_w_nans.index = df_noage_w_nans.index.map(str)

In [None]:
# Keep rows whose id starts with 'F' or is purely numeric, restricted to the feature columns,
# then prefix the ids with 'No_age_'.
sample_id_str = df_noage_w_nans.index.str
keep_rows = sample_id_str.startswith('F', na=False) | (sample_id_str.isnumeric())
df_noage_w_nans = df_noage_w_nans.loc[keep_rows, feats]
df_noage_w_nans.index = 'No_age_' + df_noage_w_nans.index

In [None]:
# Cells that start with '<' (presumably below-detection markers - confirm) are replaced by the
# string 'NaN'; everything that cannot be parsed as a number then becomes a missing value.
df_noage_w_nans.replace(to_replace=r'^([\<].*)$', value='NaN', regex=True, inplace=True)
df_noage_w_nans = df_noage_w_nans.apply(pd.to_numeric, errors='coerce')

# Central controls serve as the reference rows; the no-age samples are the rows to impute.
is_central_control = (df['Region'] == 'Central') & (df['Status'] == 'Control')
ids_imp_trn = df.index[is_central_control].values
ids_imp_tst = df_noage_w_nans.index.values

# Stack both sets, cast the features to float and fill the gaps with fast_knn.
df_imp = pd.concat([df.loc[ids_imp_trn, feats], df_noage_w_nans.loc[:, feats]])
df_imp.loc[:, feats] = df_imp.loc[:, feats].astype('float')
imp_vals = fast_knn(df_imp.loc[:, feats].values)
df_imp.loc[:, feats] = imp_vals

def find_nearest(array, value):
    """Return the element of *array* with the smallest absolute difference to *value*.

    *array* is converted with ``np.asarray``; when several elements are equally
    close, the first one (lowest position) is returned.
    """
    candidates = np.asarray(array)
    nearest_pos = np.argmin(np.abs(np.subtract(candidates, value)))
    return candidates[nearest_pos]

In [None]:
# For every feature that has missing entries among the reference rows of df_w_nans, replace each
# imputed no-age value by the closest of the values df holds for those reference rows.
for feat in feats:
    missing_in_base = df_w_nans.loc[ids_imp_trn, feat].isna()
    ids_feat_base = missing_in_base.index[missing_in_base].values
    if len(ids_feat_base) == 0:
        continue  # nothing was missing in the reference rows for this feature
    feat_base_vals = df.loc[ids_feat_base, feat].unique()
    missing_in_trgt = df_noage_w_nans.loc[ids_imp_tst, feat].isna()
    ids_feat_trgt = missing_in_trgt.index[missing_in_trgt].values
    for id_trgt in ids_feat_trgt:
        df_imp.at[id_trgt, feat] = find_nearest(feat_base_vals, df_imp.at[id_trgt, feat])

# Final no-age table: the parsed data with the feature columns taken from the imputed frame.
df_noage = df_noage_w_nans.copy()
df_noage.loc[ids_imp_tst, feats] = df_imp.loc[ids_imp_tst, feats]

## Preparing data for the comparison

In [None]:
# Output folder for the three-way comparison.
pathlib.Path(f"{path_save}/no-age_vs_central_vs_mirny").mkdir(parents=True, exist_ok=True)

# Mirny controls, and controls that have a 'PMC10485620 ID'.
ids_mirny = df.index[(df['Status'] == 'Control') & (df['Region'] == 'Mirny')].values
ids_central = df.index[(df['Status'] == 'Control') & (df['PMC10485620 ID'].notna())].values

# Empirical age distribution of the Mirny group over 22 equal-width bins on [5, 115].
hist_bins = np.linspace(5, 115, 23)
age_counts, age_bin_edges = np.histogram(df.loc[ids_mirny, 'Age'].values, bins=hist_bins)
age_prob = age_counts / len(df.loc[ids_mirny, 'Age'].values)
bin_diff = age_bin_edges[1] - age_bin_edges[0]

# Give every Central sample the Mirny probability of the bin that actually contains its age.
# np.digitize follows the np.histogram convention (left-closed bins); rounding the scaled age
# instead would pick the neighbouring bin for the upper half of each bin and could index past
# the last bin.
ages_central = df.loc[ids_central, 'Age'].values
bin_ids = np.digitize(ages_central, age_bin_edges) - 1
# np.histogram treats the last bin as closed on the right.
bin_ids[ages_central == age_bin_edges[-1]] = len(age_counts) - 1
# Ages outside the histogram range (or NaN) get zero sampling weight.
in_range = (bin_ids >= 0) & (bin_ids < len(age_counts))
prob_central = np.zeros(len(ages_central), dtype=float)
prob_central[in_range] = age_prob[bin_ids[in_range]]
df.loc[ids_central, 'Prob Age (Mirny)'] = prob_central

# Draw a Central subsample whose age distribution follows the Mirny one.
n_same_age = 150
index_central_same_age = df.loc[ids_central, :].sample(n=n_same_age, replace=False, weights='Prob Age (Mirny)', random_state=1337).index
if not index_central_same_age.is_unique:
    # Fail here instead of leaving ids_central_same_age undefined (or stale from an earlier run).
    raise ValueError("Not unique index")
ids_central_same_age = index_central_same_age.values

In [None]:
# Feature values of the three cohorts stacked row-wise, in the order Central, Mirny, No-age.
cohort_frames = [
    df.loc[ids_central_same_age, feats],
    df.loc[ids_mirny, feats],
    df_noage.loc[:, feats],
]
df_fig = pd.concat(cohort_frames)

# Row ids that belong to each cohort inside df_fig.
groups_ids = dict([
    ('Central', ids_central_same_age),
    ('Mirny', ids_mirny),
    ('No-age', df_noage.index.values),
])

## Calculate statistics

In [None]:
# Two-sided Mann-Whitney U tests for every feature and every pair of cohorts.
# Each entry: (p-value column, first cohort, second cohort, FDR column).
comparisons = (
    ("Central vs Mirny", 'Central', 'Mirny', "Central vs Mirny FDR"),
    ("Central vs No-age", 'Central', 'No-age', "Central vs No-age FDR"),
    ("Mirny vs No-age", 'Mirny', 'No-age', "Mirny vs No-age FDR"),
)

feat_names = list(feats)
df_stat = pd.DataFrame(index=feat_names)
for feat in feat_names:
    vals = {label: df_fig.loc[member_ids, feat].values for label, member_ids in groups_ids.items()}
    for pval_column, first, second, _ in comparisons:
        df_stat.at[feat, pval_column] = mannwhitneyu(vals[first], vals[second], alternative='two-sided').pvalue

# Benjamini-Hochberg correction, applied to each comparison separately.
for pval_column, _, _, fdr_column in comparisons:
    df_stat.loc[feats, fdr_column] = multipletests(df_stat.loc[feats, pval_column], 0.05, method='fdr_bh')[1]
df_stat.to_excel(f"{path_save}/no-age_vs_central_vs_mirny/stat.xlsx", index_label='Features')

## Plot features p-values

In [None]:
from matplotlib.ticker import MaxNLocator

# One panel per pairwise comparison, side by side.
n_rows = 1
n_cols = 3
fig_width = 12
fig_height = 18

sns.set_theme(style='whitegrid')
fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={'hspace': 0.0})
colors = ['red', 'blue', 'green']
for pair_id, pair in enumerate(["Central vs Mirny FDR", "Central vs No-age FDR", "Mirny vs No-age FDR"]):
    # Features are re-sorted for every panel by that panel's adjusted p-value.
    df_stat.sort_values([pair], ascending=True, inplace=True)
    df_stat[f'{pair} log'] = -np.log10(df_stat[pair])
    # Bars below the 0.05 threshold get the panel colour, the others stay white.
    df_stat['color'] = 'white'
    df_stat.loc[df_stat[pair] < 0.05, 'color'] = colors[pair_id]
    barplot = sns.barplot(
        data=df_stat,
        y=df_stat.index.values,
        x=f'{pair} log',
        edgecolor='black',
        palette=df_stat['color'].values,
        dodge=True,
        ax=axs[pair_id]
    )
    axs[pair_id].set_xlabel(r"$-\log_{10}(\mathrm{p-value})$", fontsize=18)
    axs[pair_id].set_ylabel('', fontsize=20)
    axs[pair_id].set_title(pair, fontsize=20)
    # Put ticks on integer positions instead of relabelling the automatic ticks with int(tick):
    # truncation would label e.g. 2.5 as "2" and makes the labels disagree with the bars.
    axs[pair_id].xaxis.set_major_locator(MaxNLocator(integer=True))
    axs[pair_id].tick_params(axis='x', labelsize=16)
    axs[pair_id].tick_params(axis='y', labelsize=16)
fig.tight_layout()
plt.savefig(f"{path_save}/no-age_vs_central_vs_mirny/barplot.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path_save}/no-age_vs_central_vs_mirny/barplot.pdf", bbox_inches='tight')
plt.close(fig)