# Debugging autoreload

In [ ]:
%load_ext autoreload
%autoreload 2

# Load packages

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import plotly.express as px
import statsmodels.formula.api as smf
import plotly.graph_objects as go
from scripts.python.routines.manifest import get_manifest
from scripts.python.routines.plot.save import save_figure
from scripts.python.routines.plot.layout import add_layout
from statsmodels.stats.multitest import multipletests
import plotly.io as pio
pio.kaleido.scope.mathjax = None
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=False)
import itertools
from scipy.stats import mannwhitneyu, median_test, kruskal, wilcoxon, friedmanchisquare
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.patheffects as path_effects
import random
import pathlib
from tqdm import tqdm
from src.utils.plot.bioinfokit import mhat, volcano
import gseapy as gp
import mygene
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA, TruncatedSVD
from sklearn.decomposition import MiniBatchDictionaryLearning, FastICA
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.manifold import MDS, Isomap, TSNE, LocallyLinearEmbedding
import upsetplot
from matplotlib_venn import venn2, venn2_circles
from itertools import chain
from sklearn.metrics import mean_absolute_error
from scripts.python.routines.plot.colorscales import get_continuous_color
from impyute.imputation.cs import fast_knn
import plotly
from scripts.python.routines.plot.p_value import add_p_value_annotation
from scripts.python.routines.sections import get_sections
from statannotations.Annotator import Annotator
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.impute import KNNImputer
import functools
import matplotlib.lines as mlines
import patchworklib as pw


def conjunction(conditions):
    """Combine boolean conditions with an element-wise logical AND.

    The items are folded left to right with ``np.logical_and``.

    Args:
        conditions: Iterable of booleans or boolean array-likes.

    Returns:
        The accumulated AND of all items; a single item is returned unchanged.

    Raises:
        TypeError: If ``conditions`` is empty.
    """
    pending = iter(conditions)
    try:
        combined = next(pending)
    except StopIteration:
        raise TypeError("reduce() of empty iterable with no initial value") from None
    for condition in pending:
        combined = np.logical_and(combined, condition)
    return combined


def disjunction(conditions):
    """Combine boolean conditions with an element-wise logical OR.

    The items are folded left to right with ``np.logical_or``.

    Args:
        conditions: Iterable of booleans or boolean array-likes.

    Returns:
        The accumulated OR of all items; a single item is returned unchanged.

    Raises:
        TypeError: If ``conditions`` is empty.
    """
    pending = iter(conditions)
    try:
        combined = next(pending)
    except StopIteration:
        raise TypeError("reduce() of empty iterable with no initial value") from None
    for condition in pending:
        combined = np.logical_or(combined, condition)
    return combined

# Load data

In [None]:
# Dataset root and the output folder of this analysis; the folder is created
# up front (including missing parents) so later cells can write into it.
path = "D:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN"
path_save = f"{path}/special/060_EpiSImAge"
pathlib.Path(path_save).mkdir(parents=True, exist_ok=True)

## Immunology

In [None]:
# Main sample table of this analysis.
df = pd.read_excel(f"{path_save}/df.xlsx", index_col=0)

# Three feature lists; each one is the index of its Excel sheet.
feats_imm, feats_imm_fimmu, feats_imm_slctd = (
    pd.read_excel(feats_file, index_col=0).index.values
    for feats_file in (
        f"{path}/data/immuno/feats_con.xlsx",
        f"{path}/data/immuno/models/SImAge/feats_con_top10.xlsx",
        f"{path}/special/059_imm_data_selection/feats_selected.xlsx",
    )
)

# Non-feature columns that are carried along into the exported tables.
feats_global = [
    'Subject ID', 'Time', 'Status', 'Sex', 'Age', 'Region',
    'SImAge', 'SImAge acceleration', '|SImAge acceleration|', 'Dialysis (months)',
    'PMC10485620 ID', 'PMC9135940 ID', 'COVID', 'GSM', 'PMC10699032 ID',
    'Residence', 'Nationality', 'Sample_Name', 'Sentrix_ID', 'Sentrix_Position',
]

# Natural-log copy of every selected immunology feature, stored as "<feature>_log".
for feat_slctd in feats_imm_slctd:
    df[f"{feat_slctd}_log"] = np.log(df[f"{feat_slctd}"])

In [None]:
# Stratification params
random_state = 1337
n_splits = 5
num_bins = 10

# Index labels of the cohorts that are split independently of each other.
is_control = df['Status'] == 'Control'
stratify_cat_parts = {
    'ctrl_central': df.index[is_control & (df['Region'] == 'Central')].values,
    'ctrl_yakutia': df.index[is_control & (df['Region'] == 'Yakutia')].values,
    'esrd': df.index[df['Status'] == 'ESRD'].values,
}

for part, ids in stratify_cat_parts.items():
    print(f"{part}: {len(ids)}")
    trgt = df.loc[ids, 'Age'].values
    ptp = np.ptp(trgt)
    # Equal-width age bins over the observed range padded by 10% on each side;
    # the bin number of each sample is the stratification label.
    bins = np.linspace(np.min(trgt) - 0.1 * ptp, np.max(trgt) + 0.1 * ptp, num_bins + 1)
    binned = np.digitize(trgt, bins) - 1
    # Samples per occupied bin (not used further in this cell).
    unique, counts = np.unique(binned, return_counts=True)
    occ = dict(zip(unique, counts))
    k_fold = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=1, random_state=random_state)
    splits = k_fold.split(X=ids, y=binned, groups=binned)

    # One "Split_<k>" column per fold: the held-out fold is "tst", the rest "trn_val".
    for split_id, (ids_trn, ids_val) in enumerate(splits):
        split_col = f"Split_{split_id}"
        df.loc[ids[ids_trn], split_col] = "trn_val"
        df.loc[ids[ids_val], split_col] = "tst"

### Plotting

In [None]:
# Plot histograms
sns.set_theme(style='whitegrid')
hist_bins = np.linspace(5, 115, 23)
fig, ax = plt.subplots()
# Age distribution, one color per Status value.
histplot = sns.histplot(
    data=df,
    x="Age",
    hue='Status',
    hue_order=['Control', 'ESRD'],
    palette={'ESRD': 'crimson', 'Control': 'dodgerblue'},
    bins=hist_bins,
    edgecolor='k',
    linewidth=1,
    ax=ax
)
histplot.set_xlim(0, 120)
# Same figure saved as a 200 dpi PNG and as a PDF.
fig.savefig(f"{path_save}/hist.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path_save}/hist.pdf", bbox_inches='tight')
plt.close(fig)

## Epigenetics

In [None]:
# epi_data_type selects the sub-folder the GSEUNN betas are loaded from;
# feats_epi collects the CpG column names of each loaded dataset.
epi_data_type = 'harm'
feats_epi = dict()
manifest = get_manifest('GPL13534', path="D:/YandexDisk/Work/pydnameth/datasets")

### Train dataset

#### Correct index and save to binary

In [None]:
# Read the betas, transpose them, and put the rows in the order of df['Sample_Name'].
df_epi = pd.read_csv(f"{path_save}/GSEUNN/betas.csv", index_col=0).T
df_epi = df_epi.loc[df['Sample_Name'].values, :]
# Replace the sample names by the index of df (named 'Index').
df_epi.set_index(pd.Index(df.index.values, name='Index'), inplace=True)
# NOTE(review): this writes GSEUNN/betas.pkl, whereas the loading cell reads
# GSEUNN/{epi_data_type}/betas.pkl - confirm which file is the intended one.
df_epi.to_pickle(f"{path_save}/GSEUNN/betas.pkl")

#### Load corrected from binary

In [None]:
# Load the pickled GSEUNN betas of the selected data type and register their columns.
df_epi = pd.read_pickle(f"{path_save}/GSEUNN/{epi_data_type}/betas.pkl")
feats_epi['GSEUNN'] = df_epi.columns.to_numpy()

In [None]:
# Index-on-index merge (inner join by default) of the sample table with the betas.
df = df.merge(df_epi, left_index=True, right_index=True)

### Test datasets

#### GSE87571

##### Correct index and save to binary

In [None]:
# Read betas (transposed) and phenotypes; both are saved only when their
# indexes are identical, otherwise a message is printed and nothing is written.
df_gse_epi = pd.read_csv(f"{path_save}/GSE87571/betas.csv", index_col=0).T
df_gse_phn = pd.read_csv(f"{path_save}/GSE87571/pheno.csv", index_col=0)
if df_gse_epi.index.equals(df_gse_phn.index):
    df_gse_epi.to_pickle(f"{path_save}/GSE87571/betas.pkl")
    df_gse_phn.to_excel(f"{path_save}/GSE87571/pheno.xlsx")
else:
    print("Indexes are not equal!")

#### GSE40279

##### Correct index and save to binary

In [None]:
df_gse_epi = pd.read_csv(f"{path_save}/GSE40279/betas.csv", index_col=0).T
df_gse_phn = pd.read_csv(f"{path_save}/GSE40279/pheno.csv", index_col=0)
# A mismatch is only reported here; processing continues either way.
if not df_gse_epi.index.equals(df_gse_phn.index):
    print("Indexes are not equal!")
# Keep the original sample names as a column and index the phenotypes by 'gsm'.
df_gse_phn['Sample_Name'] = df_gse_phn.index.values
df_gse_phn = df_gse_phn.set_index('gsm')
# Order the betas like the phenotype rows, then give them the same labels
# under an index named 'Index'.
df_gse_epi = df_gse_epi.loc[df_gse_phn['Sample_Name'].values, :]
df_gse_epi.set_index(pd.Index(df_gse_phn.index.values, name='Index'), inplace=True)
df_gse_epi.to_pickle(f"{path_save}/GSE40279/betas.pkl")
df_gse_phn.to_excel(f"{path_save}/GSE40279/pheno.xlsx")

##### Load corrected from binary

In [None]:
# Load the pickled GSE40279 betas and register their columns.
df_gse_epi = pd.read_pickle(f"{path_save}/GSE40279/betas.pkl")
feats_epi['GSE40279'] = df_gse_epi.columns.to_numpy()

#### GSE179325

##### Correct index and save to binary

In [None]:
# Read betas (transposed) and phenotypes; both are saved only when their
# indexes are identical, otherwise a message is printed and nothing is written.
df_gse_epi = pd.read_csv(f"{path_save}/GSE179325/betas.csv", index_col=0).T
df_gse_phn = pd.read_csv(f"{path_save}/GSE179325/pheno.csv", index_col=0)
if df_gse_epi.index.equals(df_gse_phn.index):
    df_gse_epi.to_pickle(f"{path_save}/GSE179325/betas.pkl")
    df_gse_phn.to_excel(f"{path_save}/GSE179325/pheno.xlsx")
else:
    print("Indexes are not equal!")

#### GSE217633

##### Correct index and save to binary

In [None]:
# Read betas (transposed) and phenotypes; both are saved only when their
# indexes are identical, otherwise a message is printed and nothing is written.
df_gse_epi = pd.read_csv(f"{path_save}/GSE217633/betas.csv", index_col=0).T
df_gse_phn = pd.read_csv(f"{path_save}/GSE217633/pheno.csv", index_col=0)
if df_gse_epi.index.equals(df_gse_phn.index):
    df_gse_epi.to_pickle(f"{path_save}/GSE217633/betas.pkl")
    df_gse_phn.to_excel(f"{path_save}/GSE217633/pheno.xlsx")
else:
    print("Indexes are not equal!")

#### GSE220622

##### Correct index and save to binary

In [None]:
# Read betas (transposed) and phenotypes; both are saved only when their
# indexes are identical, otherwise a message is printed and nothing is written.
df_gse_epi = pd.read_csv(f"{path_save}/GSE220622/betas.csv", index_col=0).T
df_gse_phn = pd.read_csv(f"{path_save}/GSE220622/pheno.csv", index_col=0)
if df_gse_epi.index.equals(df_gse_phn.index):
    df_gse_epi.to_pickle(f"{path_save}/GSE220622/betas.pkl")
    df_gse_phn.to_excel(f"{path_save}/GSE220622/pheno.xlsx")
else:
    print("Indexes are not equal!")

#### GSE219037

##### Correct index and save to binary

In [None]:
# Read betas (transposed) and phenotypes; both are saved only when their
# indexes are identical, otherwise a message is printed and nothing is written.
df_gse_epi = pd.read_csv(f"{path_save}/GSE219037/betas.csv", index_col=0).T
df_gse_phn = pd.read_csv(f"{path_save}/GSE219037/pheno.csv", index_col=0)
if df_gse_epi.index.equals(df_gse_phn.index):
    df_gse_epi.to_pickle(f"{path_save}/GSE219037/betas.pkl")
    df_gse_phn.to_excel(f"{path_save}/GSE219037/pheno.xlsx")
else:
    print("Indexes are not equal!")

#### GSE118144

##### Correct index and save to binary

In [None]:
# Read betas (transposed) and phenotypes; both are saved only when their
# indexes are identical, otherwise a message is printed and nothing is written.
df_gse_epi = pd.read_csv(f"{path_save}/GSE118144/betas.csv", index_col=0).T
df_gse_phn = pd.read_csv(f"{path_save}/GSE118144/pheno.csv", index_col=0)
if df_gse_epi.index.equals(df_gse_phn.index):
    df_gse_epi.to_pickle(f"{path_save}/GSE118144/betas.pkl")
    df_gse_phn.to_excel(f"{path_save}/GSE118144/pheno.xlsx")
else:
    print("Indexes are not equal!")

#### GSE201752

##### Correct index and save to binary

In [None]:
# Read betas (transposed) and phenotypes; both are saved only when their
# indexes are identical, otherwise a message is printed and nothing is written.
df_gse_epi = pd.read_csv(f"{path_save}/GSE201752/betas.csv", index_col=0).T
df_gse_phn = pd.read_csv(f"{path_save}/GSE201752/pheno.csv", index_col=0)
if df_gse_epi.index.equals(df_gse_phn.index):
    df_gse_epi.to_pickle(f"{path_save}/GSE201752/betas.pkl")
    df_gse_phn.to_excel(f"{path_save}/GSE201752/pheno.xlsx")
else:
    print("Indexes are not equal!")

#### GSE42861

##### Correct index and save to binary

In [None]:
# Read betas (transposed) and phenotypes; both are saved only when their
# indexes are identical, otherwise a message is printed and nothing is written.
df_gse_epi = pd.read_csv(f"{path_save}/GSE42861/betas.csv", index_col=0).T
df_gse_phn = pd.read_csv(f"{path_save}/GSE42861/pheno.csv", index_col=0)
if df_gse_epi.index.equals(df_gse_phn.index):
    df_gse_epi.to_pickle(f"{path_save}/GSE42861/betas.pkl")
    df_gse_phn.to_excel(f"{path_save}/GSE42861/pheno.xlsx")
else:
    print("Indexes are not equal!")

#### GSE73103

##### Correct index and save to binary

In [None]:
# Read betas (transposed) and phenotypes; both are saved only when their
# indexes are identical, otherwise a message is printed and nothing is written.
df_gse_epi = pd.read_csv(f"{path_save}/GSE73103/betas.csv", index_col=0).T
df_gse_phn = pd.read_csv(f"{path_save}/GSE73103/pheno.csv", index_col=0)
if df_gse_epi.index.equals(df_gse_phn.index):
    df_gse_epi.to_pickle(f"{path_save}/GSE73103/betas.pkl")
    df_gse_phn.to_excel(f"{path_save}/GSE73103/pheno.xlsx")
else:
    print("Indexes are not equal!")

#### GSE106648

##### Correct index and save to binary

In [None]:
# Read betas (transposed) and phenotypes; both are saved only when their
# indexes are identical, otherwise a message is printed and nothing is written.
df_gse_epi = pd.read_csv(f"{path_save}/GSE106648/betas.csv", index_col=0).T
df_gse_phn = pd.read_csv(f"{path_save}/GSE106648/pheno.csv", index_col=0)
if df_gse_epi.index.equals(df_gse_phn.index):
    df_gse_epi.to_pickle(f"{path_save}/GSE106648/betas.pkl")
    df_gse_phn.to_excel(f"{path_save}/GSE106648/pheno.xlsx")
else:
    print("Indexes are not equal!")

#### GSE67530

##### Correct index and save to binary

In [None]:
df_gse_epi = pd.read_csv(f"{path_save}/GSE67530/betas.csv", index_col=0).T
df_gse_phn = pd.read_csv(f"{path_save}/GSE67530/pheno.csv", index_col=0)
# Fill missing beta values with a 5-nearest-neighbours imputer and write the
# completed matrix back into the frame in place.
imputer = KNNImputer(n_neighbors=5)
X = df_gse_epi.to_numpy()
X_imptd = imputer.fit_transform(X)
# NOTE(review): if the imputer drops any column (e.g. one that is entirely
# missing), the shapes no longer match and this assignment fails - verify.
df_gse_epi.loc[:, :] = X_imptd

In [None]:
# Save the GSE67530 betas and phenotypes only when their indexes are identical;
# otherwise print a message and write nothing.
if df_gse_epi.index.equals(df_gse_phn.index):
    df_gse_epi.to_pickle(f"{path_save}/GSE67530/betas.pkl")
    df_gse_phn.to_excel(f"{path_save}/GSE67530/pheno.xlsx")
else:
    print("Indexes are not equal!")

#### GSE71955

##### Correct index and save to binary

In [None]:
# Read betas (transposed) and phenotypes; both are saved only when their
# indexes are identical, otherwise a message is printed and nothing is written.
df_gse_epi = pd.read_csv(f"{path_save}/GSE71955/betas.csv", index_col=0).T
df_gse_phn = pd.read_csv(f"{path_save}/GSE71955/pheno.csv", index_col=0)
if df_gse_epi.index.equals(df_gse_phn.index):
    df_gse_epi.to_pickle(f"{path_save}/GSE71955/betas.pkl")
    df_gse_phn.to_excel(f"{path_save}/GSE71955/pheno.xlsx")
else:
    print("Indexes are not equal!")

#### GSE77696

##### Correct index and save to binary

In [None]:
df_gse_epi = pd.read_csv(f"{path_save}/GSE77696/betas.csv", index_col=0).T
df_gse_phn = pd.read_csv(f"{path_save}/GSE77696/pheno.csv", index_col=0)
if df_gse_epi.index.equals(df_gse_phn.index):
    # Re-index the phenotypes by a copy of the 'gsm' column (index name 'Index')
    # and keep the previous labels in a 'Samples' column.
    df_gse_phn['Index'] = df_gse_phn['gsm']
    df_gse_phn['Samples'] = df_gse_phn.index.values
    df_gse_phn = df_gse_phn.set_index('Index')
    # The betas get the same labels; rows were verified above to be in the same order.
    df_gse_epi.set_index(df_gse_phn.index.values, inplace=True)
    df_gse_epi.to_pickle(f"{path_save}/GSE77696/betas.pkl")
    df_gse_phn.to_csv(f"{path_save}/GSE77696/pheno_1.csv")
    df_gse_phn.to_excel(f"{path_save}/GSE77696/pheno.xlsx")
else:
    print("Indexes are not equal!")

# Features selection

## Immuno features analysis

In [None]:
# Panel grid: one KDE panel per feature in feats_imm_fimmu, filled row by row.
# The grid has n_rows * n_cols panels, so it holds at most that many features.
n_rows = 2
n_cols = 5
fig_height = 7
fig_width = 16

# savefig() does not create missing folders, and no earlier cell creates this
# one, so make sure it exists before writing the figures.
pathlib.Path(f"{path_save}/fimmu_features").mkdir(parents=True, exist_ok=True)

# The same grid is drawn twice: raw values ('ori', red) and the "<feature>_log"
# columns ('log', blue). The variant name is also the output file stem.
# NOTE(review): the "_log" columns are created for feats_imm_slctd; this assumes
# every feature in feats_imm_fimmu is among them - confirm.
for kde_variant, kde_color in (('ori', 'red'), ('log', 'blue')):
    sns.set_theme(style='whitegrid')
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={}, sharey=False, sharex=False)
    for feat_id, feat in enumerate(feats_imm_fimmu):
        row_id, col_id = divmod(feat_id, n_cols)
        sns.kdeplot(
            data=df,
            x=f"{feat}_log" if kde_variant == 'log' else feat,
            color=kde_color,
            linewidth=2,
            cut=0,
            fill=True,
            ax=axs[row_id, col_id],
        )
        axs[row_id, col_id].ticklabel_format(style='scientific', scilimits=(-1, 1), axis='y', useOffset=True)
        if kde_variant == 'log':
            # Only the log variant replaces the default x label.
            axs[row_id, col_id].set_xlabel(fr"$\log(\mathrm{{{feat}}})$")
    fig.tight_layout()
    fig.savefig(f"{path_save}/fimmu_features/{kde_variant}.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{path_save}/fimmu_features/{kde_variant}.pdf", bbox_inches='tight')
    plt.close(fig)


## Generate table of features

In [None]:
# CpGs present in every dataset registered in feats_epi. Sorted so that the
# order (and with it the row order of the tables built from this list) is the
# same in every session; plain set iteration order is not guaranteed to be.
feats_epi_cmn = sorted(set.intersection(*[set(x) for x in feats_epi.values()]))

In [None]:
# One row per common CpG; for every selected immunology feature three columns:
# Spearman statistic, raw p-value and Benjamini-Hochberg adjusted p-value.
df_feats_spearman = pd.DataFrame(
    index=feats_epi_cmn,
    columns=['Gene'] + list(itertools.chain(*[[f"{x}_stat", f"{x}_pval", f"{x}_pval_fdr"] for x in feats_imm_slctd]))
)

# Look the log-transformed immunology columns up once instead of once per CpG.
imm_log_values = {imm: df[f"{imm}_log"] for imm in feats_imm_slctd}

for cpg in tqdm(feats_epi_cmn, desc='CpG processing', total=len(feats_epi_cmn)):
    df_feats_spearman.at[cpg, 'Gene'] = manifest.at[cpg, 'Gene']
    cpg_values = df[cpg]  # fetched once per CpG, reused for every feature
    for imm in feats_imm_slctd:
        res = stats.spearmanr(imm_log_values[imm], cpg_values, alternative='two-sided')
        df_feats_spearman.at[cpg, f"{imm}_stat"] = res.statistic
        df_feats_spearman.at[cpg, f"{imm}_pval"] = res.pvalue
for imm in feats_imm_slctd:
    # The frame was created empty, so its columns have object dtype; hand the
    # FDR correction a float array instead of an array of Python objects.
    pvals = df_feats_spearman[f"{imm}_pval"].astype(float)
    _, df_feats_spearman[f"{imm}_pval_fdr"], _, _ = multipletests(pvals, 0.05, method='fdr_bh')
df_feats_spearman.to_excel(f"{path_save}/df_feats_spearman.xlsx")

## Load table of features

In [None]:
# Reload the saved correlation table; its index is the list of common CpGs.
df_feats_spearman = pd.read_excel(f"{path_save}/df_feats_spearman.xlsx", index_col=0)
feats_epi_cmn = df_feats_spearman.index.to_numpy()

# Create data for immunology regression

In [None]:
# For every immunology feature, export the n most significant CpGs (smallest
# FDR-adjusted p-value first) and the matching data table, for several n.
n_top_cpgs = [100, 500, 1000]

# NOTE(review): the "<feature>_pval_fdr" and "<feature>_log" columns are built
# for feats_imm_slctd; this assumes feats_imm_fimmu is a subset of it - confirm.
for imm in feats_imm_fimmu:
    # Per-feature work that does not depend on n_feats is done once.
    pathlib.Path(f"{path_save}/fimmu_features/{imm}").mkdir(parents=True, exist_ok=True)
    df_feats_sorted = df_feats_spearman.sort_values(f"{imm}_pval_fdr", ascending=True)[['Gene', f"{imm}_stat", f"{imm}_pval", f"{imm}_pval_fdr"]]
    for n_feats in n_top_cpgs:
        df_feats_imm = df_feats_sorted.head(n_feats)
        df_feats_imm.to_excel(f"{path_save}/fimmu_features/{imm}/feats_con_{n_feats}.xlsx", index_label='CpG')
        df_feats_imm.to_pickle(f"{path_save}/fimmu_features/{imm}/feats_con_{n_feats}.pkl")
        feats_imm_curr = df_feats_imm.index.values
        # range(5) has to match the number of Split_* columns present in df.
        # .copy() makes the selection an independent frame, so adding the
        # 'Index' column below is not an assignment on a slice of df.
        df_data_imm = df.loc[:, feats_global + [f"Split_{split_id}" for split_id in range(5)] + [imm, f'{imm}_log'] + list(feats_imm_curr)].copy()
        df_data_imm['Index'] = df_data_imm.index.values
        df_data_imm.to_excel(f"{path_save}/fimmu_features/{imm}/data_{n_feats}.xlsx", index=False)
        df_data_imm.to_pickle(f"{path_save}/fimmu_features/{imm}/data_{n_feats}.pkl")

# SImAge 2: All controls, logarithmic

In [None]:
# Feature list of the SImAge2 model: the log-transformed SImAge features.
pathlib.Path(f"{path_save}/SImAge2").mkdir(parents=True, exist_ok=True)
df_feats_simage2 = pd.DataFrame(index=[f"{f}_log" for f in feats_imm_fimmu])
df_feats_simage2.to_excel(f"{path_save}/SImAge2/feats_con{len(feats_imm_fimmu)}.xlsx", index_label='Features')
df_feats_simage2.to_pickle(f"{path_save}/SImAge2/feats_con{len(feats_imm_fimmu)}.pkl")

# Papers
df_imm_fimmu = pd.read_excel(f"{path}/data/immuno/models/SImAge/data.xlsx", index_col="sample_id")
df_imm_geroscience = pd.read_excel(f"{path}/data/immuno/models/IPAge/11357_2022_540_MOESM12_ESM.xlsx", index_col=0, skiprows=1)
df_epi_clinepi = pd.read_excel(f"{path}/data/GSE234461/samples.xlsx", index_col=0)

df_imm_all = pd.read_excel(f"{path}/data/immuno/df_samples(all_1052_121222)_proc(raw)_imp(fast_knn)_replace(quarter).xlsx", index_col=0)
# Number of samples per subject. pandas >= 2.0 names the value_counts() result
# 'count' instead of 'Subject ID'; the explicit rename keeps the 'Subject ID'
# column that is read on the next lines available on every pandas version.
df_ld_imm = df_imm_all['Subject ID'].value_counts().rename('Subject ID').to_frame()
# A subject with more than one sample is flagged as longitudinal.
df_imm_all['Is longitudinal?'] = False
df_imm_all.loc[df_imm_all['Subject ID'].isin(df_ld_imm.index[df_ld_imm['Subject ID'] > 1].values), 'Is longitudinal?'] = True
df_imm_all.rename(columns={'Sample_Chronology': 'Time'}, inplace=True)
# Assign the result back: an inplace replace() on the selected column is a
# chained assignment and does not update df_imm_all under copy-on-write.
df_imm_all['Time'] = df_imm_all['Time'].replace({0: 'T0', 1: 'T1', 2: 'T2', 3: 'T3'})
# Attach the sample identifiers used in the three papers.
df_imm_all.loc[df_imm_fimmu.index.values, 'PMC10485620 ID'] = df_imm_fimmu.loc[df_imm_fimmu.index.values, 'index']
df_imm_all.loc[df_imm_geroscience.index.values, 'PMC9135940 ID'] = df_imm_geroscience.loc[df_imm_geroscience.index.values, 'ID_Origin']
df_imm_all.loc[df_epi_clinepi.index.values, 'PMC10699032 ID'] = df_epi_clinepi.loc[df_epi_clinepi.index.values, 'GSM']

# NOTE(review): nothing in this cell computes the "<feature>_log" or SImAge
# columns; they are assumed to be present in the workbook - confirm.
feats_for_simage2 = [
    'Subject ID', 'Time', 'Status', 'Sex', 'Age', 'Region', 'Dialysis (months)',
    'SImAge', 'SImAge acceleration', '|SImAge acceleration|',
    'PMC10485620 ID', 'PMC9135940 ID', 'PMC10699032 ID'
] + [f"{f}_log" for f in feats_imm_fimmu]

# .copy() gives an independent frame, so the Split_* columns added below are
# not assignments on a slice of df_imm_all.
df_imm_simage2 = df_imm_all.loc[(df_imm_all['Status'].isin(['Control', 'ESRD'])) & (df_imm_all['Region'].isin(['Central', 'Yakutia'])), feats_for_simage2].copy()

random_state = 1337
n_splits = 5

# Only the control groups are split into folds; ESRD samples are labelled separately below.
stratify_cat_parts = {
    'ctrl_central': df_imm_simage2.index[(df_imm_simage2['Status'] == 'Control') & (df_imm_simage2['Region'] == 'Central')].values,
    'ctrl_yakutia': df_imm_simage2.index[(df_imm_simage2['Status'] == 'Control') & (df_imm_simage2['Region'] == 'Yakutia')].values,
}

for part, ids in stratify_cat_parts.items():
    print(f"{part}: {len(ids)}")
    trgt = df_imm_simage2.loc[ids, 'Age'].values
    ptp = np.ptp(trgt)
    num_bins = 10
    # Equal-width age bins over the observed range padded by 10% on each side;
    # the bin number of each sample is the stratification label.
    bins = np.linspace(np.min(trgt) - 0.1 * ptp, np.max(trgt) + 0.1 * ptp, num_bins + 1)
    binned = np.digitize(trgt, bins) - 1
    # Samples per occupied bin (not used further in this cell).
    unique, counts = np.unique(binned, return_counts=True)
    occ = dict(zip(unique, counts))
    k_fold = RepeatedStratifiedKFold(
        n_splits=n_splits,
        n_repeats=1,
        random_state=random_state
    )
    splits = k_fold.split(X=ids, y=binned, groups=binned)

    # One "Split_<k>" column per fold: the held-out fold is "tst", the rest "trn_val".
    for split_id, (ids_trn, ids_val) in enumerate(splits):
        df_imm_simage2.loc[ids[ids_trn], f"Split_{split_id}"] = "trn_val"
        df_imm_simage2.loc[ids[ids_val], f"Split_{split_id}"] = "tst"

# ESRD samples get the same label in every split. This is done once after the
# folds exist instead of being repeated for every part and fold.
is_esrd = df_imm_simage2['Status'] == "ESRD"
for split_id in range(n_splits):
    df_imm_simage2.loc[is_esrd, f"Split_{split_id}"] = 'tst_esrd'

# The Excel file is written before the 'Index' column is added; the pickle includes it.
df_imm_simage2.to_excel(f"{path_save}/SImAge2/data.xlsx")
df_imm_simage2['Index'] = df_imm_simage2.index.values
df_imm_simage2.to_pickle(f"{path_save}/SImAge2/data.pkl")

# SImAge logarithmic

In [None]:
# Feature list of the logarithmic SImAge model: the log-transformed SImAge features.
pathlib.Path(f"{path_save}/SImAge_log").mkdir(parents=True, exist_ok=True)
df_feats_simage_log = pd.DataFrame(index=[f"{f}_log" for f in feats_imm_fimmu])
df_feats_simage_log.to_excel(f"{path_save}/SImAge_log/feats_con{len(feats_imm_fimmu)}.xlsx", index_label='Features')
df_feats_simage_log.to_pickle(f"{path_save}/SImAge_log/feats_con{len(feats_imm_fimmu)}.pkl")

# Papers
df_imm_fimmu = pd.read_excel(f"{path}/data/immuno/models/SImAge/data.xlsx", index_col="sample_id")
df_imm_geroscience = pd.read_excel(f"{path}/data/immuno/models/IPAge/11357_2022_540_MOESM12_ESM.xlsx", index_col=0, skiprows=1)
df_epi_clinepi = pd.read_excel(f"{path}/data/GSE234461/samples.xlsx", index_col=0)

df_imm_all = pd.read_excel(f"{path}/data/immuno/df_samples(all_1052_121222)_proc(raw)_imp(fast_knn)_replace(quarter).xlsx", index_col=0)
# Number of samples per subject. pandas >= 2.0 names the value_counts() result
# 'count' instead of 'Subject ID'; the explicit rename keeps the 'Subject ID'
# column that is read on the next lines available on every pandas version.
df_ld_imm = df_imm_all['Subject ID'].value_counts().rename('Subject ID').to_frame()
# A subject with more than one sample is flagged as longitudinal.
df_imm_all['Is longitudinal?'] = False
df_imm_all.loc[df_imm_all['Subject ID'].isin(df_ld_imm.index[df_ld_imm['Subject ID'] > 1].values), 'Is longitudinal?'] = True
df_imm_all.rename(columns={'Sample_Chronology': 'Time'}, inplace=True)
# Assign the result back: an inplace replace() on the selected column is a
# chained assignment and does not update df_imm_all under copy-on-write.
df_imm_all['Time'] = df_imm_all['Time'].replace({0: 'T0', 1: 'T1', 2: 'T2', 3: 'T3'})
# Attach the sample identifiers used in the three papers.
df_imm_all.loc[df_imm_fimmu.index.values, 'PMC10485620 ID'] = df_imm_fimmu.loc[df_imm_fimmu.index.values, 'index']
df_imm_all.loc[df_imm_geroscience.index.values, 'PMC9135940 ID'] = df_imm_geroscience.loc[df_imm_geroscience.index.values, 'ID_Origin']
df_imm_all.loc[df_epi_clinepi.index.values, 'PMC10699032 ID'] = df_epi_clinepi.loc[df_epi_clinepi.index.values, 'GSM']

# NOTE(review): nothing in this cell computes the "<feature>_log" or SImAge
# columns; they are assumed to be present in the workbook - confirm.
feats_for_simage_log = [
    'Subject ID', 'Time', 'Status', 'Sex', 'Age', 'Region', 'Dialysis (months)',
    'SImAge', 'SImAge acceleration', '|SImAge acceleration|',
    'PMC10485620 ID', 'PMC9135940 ID', 'PMC10699032 ID'
] + [f"{f}_log" for f in feats_imm_fimmu]

# Keep only samples that have a PMC10485620 ID. .copy() gives an independent
# frame, so the 'Split' column added below is not an assignment on a slice.
df_imm_simage_log = df_imm_all.loc[df_imm_all['PMC10485620 ID'].notna(), feats_for_simage_log].copy()
# The split label is derived from a substring of the PMC10485620 ID.
df_imm_simage_log.loc[df_imm_simage_log['PMC10485620 ID'].str.contains('trn_val'), 'Split'] = 'trn_val'
df_imm_simage_log.loc[df_imm_simage_log['PMC10485620 ID'].str.contains('tst_ctrl'), 'Split'] = 'tst'
df_imm_simage_log.loc[df_imm_simage_log['PMC10485620 ID'].str.contains('tst_esrd'), 'Split'] = 'tst_esrd'

# The Excel file is written before the 'Index' column is added; the pickle includes it.
df_imm_simage_log.to_excel(f"{path_save}/SImAge_log/data.xlsx")
df_imm_simage_log['Index'] = df_imm_simage_log.index.values
df_imm_simage_log.to_pickle(f"{path_save}/SImAge_log/data.pkl")