# Debugging autoreload

In [None]:
# IPython magics (not plain Python): load the autoreload extension and switch it to mode 2.
%load_ext autoreload
%autoreload 2

# Load packages

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import multipletests
from scipy import stats
from sklearn.feature_selection import f_regression
import itertools
from scipy.stats import mannwhitneyu, median_test, kruskal, wilcoxon, friedmanchisquare
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.patheffects as path_effects
import random
import pathlib
from tqdm import tqdm
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA, TruncatedSVD
from sklearn.decomposition import MiniBatchDictionaryLearning, FastICA
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.manifold import MDS, Isomap, TSNE, LocallyLinearEmbedding
from itertools import chain
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.impute import KNNImputer
import functools
import matplotlib.lines as mlines

# Load data

In [None]:
# Root folder of the GSEUNN dataset; the immunology feature lists are read from below it.
path = "D:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN"
# Folder with the prepared inputs of this notebook (df.xlsx and the betas.pkl files).
path_load = "D:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN/special/060_EpiSImAge"
# Folder that receives every figure and table written by this notebook.
path_save = "D:/YandexDisk/Work/bbd/immunology/003_EpImAge"
# Create the output folder (with missing parents); an existing folder is not an error.
pathlib.Path(path_save).mkdir(parents=True, exist_ok=True)

## Immunology

In [None]:
def _index_of(xlsx_file):
    """Return the index values of an Excel sheet whose first column is the index."""
    return pd.read_excel(xlsx_file, index_col=0).index.values


# Main table; its first column is used as the index.
df = pd.read_excel(f"{path_load}/df.xlsx", index_col=0)

# Feature lists: each file stores the feature names in its index column.
feats_imm = _index_of(f"{path}/data/immuno/feats_con.xlsx")
feats_imm_fimmu = _index_of(f"{path}/data/immuno/models/SImAge/feats_con_top10.xlsx")
feats_imm_slctd = _index_of(f"{path}/special/059_imm_data_selection/feats_selected.xlsx")

# Non-feature columns that are carried along into every exported data table.
feats_global = [
    'Subject ID',
    'Time',
    'Status',
    'Sex',
    'Age',
    'Region',
    'SImAge',
    'SImAge acceleration',
    '|SImAge acceleration|',
    'Dialysis (months)',
    'PMC10485620 ID',
    'PMC9135940 ID',
    'COVID',
    'GSM',
    'PMC10699032 ID',
    'Residence',
    'Nationality',
    'Sample_Name',
    'Sentrix_ID',
    'Sentrix_Position',
]

# Add a natural-log copy "<feature>_log" of every selected immunological feature.
for feat in feats_imm_slctd:
    feat_name = f"{feat}"
    df[feat_name + "_log"] = np.log(df[feat_name])

### Plot histogram for status

In [None]:
# Age histogram split by status, saved as PNG and PDF.
sns.set_theme(style='whitegrid')

hist_bins = np.linspace(5, 115, 23)  # 23 edges between 5 and 115 -> bins 5 units wide
status_palette = {'ESRD': 'crimson', 'Control': 'dodgerblue'}
status_order = ['Control', 'ESRD']

fig, ax = plt.subplots()
histplot = sns.histplot(
    data=df,
    x="Age",
    hue='Status',
    hue_order=status_order,
    palette=status_palette,
    bins=hist_bins,
    edgecolor='k',
    linewidth=1,
    ax=ax,
)
histplot.set_xlim(0, 120)

# `fig` is the figure just created above, i.e. the one the histogram was drawn on.
fig.savefig(f"{path_save}/hist_status.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path_save}/hist_status.pdf", bbox_inches='tight')
plt.close(fig)

### Stratify immunological data by age and groups

In [None]:
# Assign every sample to train/validation vs. test in `n_splits` alternative splits.
# Splitting is done separately inside each cohort and is stratified by binned age.
random_state = 1337
n_splits = 5
num_bins = 10  # number of equal-width age bins used as stratification classes

# Index labels of the samples belonging to each cohort.
stratify_cat_parts = {
    'ctrl_central': df.index[(df['Status'] == 'Control') & (df['Region'] == 'Central')].values,
    'ctrl_yakutia': df.index[(df['Status'] == 'Control') & (df['Region'] == 'Yakutia')].values,
    'esrd': df.index[(df['Status'] == 'ESRD')].values,
}

for part, ids in stratify_cat_parts.items():
    print(f"{part}: {len(ids)}")
    trgt = df.loc[ids, 'Age'].values
    ptp = np.ptp(trgt)
    # Pad the age range by 10% on both sides so every sample falls strictly inside the edges.
    bins = np.linspace(np.min(trgt) - 0.1 * ptp, np.max(trgt) + 0.1 * ptp, num_bins + 1)
    binned = np.digitize(trgt, bins) - 1  # zero-based bin index per sample
    k_fold = RepeatedStratifiedKFold(
        n_splits=n_splits,
        n_repeats=1,
        random_state=random_state
    )
    # `groups` is intentionally not passed: stratified splitters ignore it,
    # and the stratification itself is driven by `y=binned`.
    splits = k_fold.split(X=ids, y=binned)

    # The split yields positions into `ids`; map them back to index labels of `df`.
    for split_id, (ids_trn, ids_val) in enumerate(splits):
        df.loc[ids[ids_trn], f"Split_{split_id}"] = "trn_val"
        df.loc[ids[ids_val], f"Split_{split_id}"] = "tst"

## Epigenetics

In [None]:
# CpG column names per dataset, keyed by dataset name; filled in by the cells below.
feats_epi = {}
# Name of the betas variant: sub-folder of the GSEUNN betas and of all saved result tables.
epi_data_type = 'no_harm'
# Manifest table; later cells read its 'Gene' column by CpG id.
manifest = pd.read_pickle("D:/YandexDisk/Work/pydnameth/datasets/GPL21145/manifest/manifest.pkl")
# Make sure the per-variant output folder exists.
pathlib.Path(f"{path_save}/{epi_data_type}").mkdir(parents=True, exist_ok=True)

### GSEUNN dataset

In [None]:
# GSEUNN betas table; its columns are used as the CpG feature names.
df_epi = pd.read_pickle(f"{path_load}/GSEUNN/{epi_data_type}/betas.pkl")
# Join on the index with the default (inner) join: only rows present in both tables remain.
df = df.merge(df_epi, left_index=True, right_index=True)

#### Features update

In [None]:
# Register the GSEUNN CpG columns so they take part in the common-CpG intersection.
feats_epi['GSEUNN'] = df_epi.columns.values

### Test datasets

In [None]:
# External test datasets whose CpG sets restrict the common feature space.
tst_gses = [
    'GSE87571',
    'GSE40279',
    'GSE179325',
    'GSE217633',
    'GSE118144',
    'GSE42861',
    'GSE106648',
    'GSE67530',
]

# Running intersection of the CpGs of every dataset registered so far.
cpgs_cmn = set.intersection(*[set(x) for x in feats_epi.values()])

for gse in tst_gses:
    print(f"Number of CpGs before {gse}: {len(cpgs_cmn)}")
    # Load the betas of the dataset being processed (previously GSE40279 was
    # hard-coded here, so the other datasets were never taken into account).
    df_gse_epi = pd.read_pickle(f"{path_load}/{gse}/betas.pkl")
    feats_epi[gse] = df_gse_epi.columns.values
    cpgs_cmn &= set(feats_epi[gse])
    print(f"Number of CpGs after {gse}: {len(cpgs_cmn)}")

feats_epi_cmn = list(cpgs_cmn)

### Epigenetic features

In [None]:
# CpGs shared by every dataset registered in `feats_epi`.
feats_epi_cmn = list(set.intersection(*map(set, feats_epi.values())))
print(f"Number of CpGs: {len(feats_epi_cmn)}")

# Feature selection

## Immunological features: original or logarithmic?

In [None]:
# One figure per feature variant (original and logarithmic): a grid of KDE plots,
# one panel per selected immunological feature.
n_rows = 8
n_cols = 4
fig_height = 20
fig_width = 12

# Column-name suffix of each variant -> colour of its density curves.
colors_feats_type = {
    '': 'red',
    '_log': 'blue'
}

for suffix, kde_color in colors_feats_type.items():
    sns.set_theme(style='whitegrid')
    fig, axs = plt.subplots(
        n_rows,
        n_cols,
        figsize=(fig_width, fig_height),
        gridspec_kw={},
        sharey=False,
        sharex=False,
    )
    for feat_id, feat in enumerate(feats_imm_slctd):
        # Fill the grid row by row.
        row_id, col_id = divmod(feat_id, n_cols)
        ax_feat = axs[row_id, col_id]
        sns.kdeplot(
            data=df,
            x=f"{feat}{suffix}",
            color=kde_color,
            linewidth=2,
            cut=0,
            fill=True,
            ax=ax_feat,
        )
        ax_feat.ticklabel_format(style='scientific', scilimits=(-1, 1), axis='y', useOffset=True)
        if suffix == '_log':
            # Show the transformed variant as log(feature) on the x axis.
            ax_feat.set_xlabel(fr"$\log(\mathrm{{{feat}}})$")
    fig.tight_layout()
    fig.savefig(f"{path_save}/immuno_features{suffix}.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{path_save}/immuno_features{suffix}.pdf", bbox_inches='tight')
    plt.close(fig)


## Generate table of features

### Spearman correlation

In [None]:
# For every selected immunological feature: Spearman correlation of its log values
# with each common CpG, BH-FDR correction, and export of the top CpGs by p-value
# (one Excel sheet per feature).
n_feats = 100

with pd.ExcelWriter(f"{path_save}/{epi_data_type}/spearman_{n_feats}.xlsx", engine='xlsxwriter') as writer:
    for imm in feats_imm_slctd:
        stat_col = f"{imm}_stat"
        pval_col = f"{imm}_pval"
        fdr_col = f"{imm}_pval_fdr"
        df_stat = pd.DataFrame(index=feats_epi_cmn, columns=['Gene', stat_col, pval_col, fdr_col])

        cpg_progress = tqdm(feats_epi_cmn, desc=f'{imm} CpG processing', total=len(feats_epi_cmn))
        for cpg in cpg_progress:
            df_stat.at[cpg, 'Gene'] = manifest.at[cpg, 'Gene']
            res = stats.spearmanr(df[f"{imm}_log"], df[cpg], alternative='two-sided')
            df_stat.at[cpg, stat_col] = res.statistic
            df_stat.at[cpg, pval_col] = res.pvalue

        # Second element of the multipletests result holds the corrected p-values.
        pvals_fdr = multipletests(df_stat[pval_col], 0.05, method='fdr_bh')[1]
        df_stat[fdr_col] = pvals_fdr

        # Smallest raw p-values first; only the first n_feats rows are written.
        df_stat = df_stat.sort_values(pval_col, ascending=True)
        df_stat.head(n_feats).to_excel(writer, sheet_name=imm)

### f_regression

In [None]:
# For every selected immunological feature: univariate F-test of each common CpG
# against the feature, BH-FDR correction, and export of the top CpGs by p-value
# (one Excel sheet per feature).
n_feats = 100

with pd.ExcelWriter(f"{path_save}/{epi_data_type}/f_regression_{n_feats}.xlsx", engine='xlsxwriter') as writer:
    for imm in feats_imm_slctd:
        stat_col = f"{imm}_stat"
        pval_col = f"{imm}_pval"
        fdr_col = f"{imm}_pval_fdr"
        df_stat = pd.DataFrame(index=feats_epi_cmn, columns=['Gene', stat_col, pval_col, fdr_col])
        df_stat.loc[feats_epi_cmn, 'Gene'] = manifest.loc[feats_epi_cmn, 'Gene']

        # NOTE(review): the target here is the raw `imm` column, whereas the Spearman
        # table is computed on `{imm}_log` - confirm that this difference is intended.
        cpg_values = df.loc[:, feats_epi_cmn].values
        imm_values = df.loc[:, imm].values
        f_stats, p_values = f_regression(cpg_values, imm_values)
        df_stat[stat_col] = f_stats
        df_stat[pval_col] = p_values

        # Second element of the multipletests result holds the corrected p-values.
        df_stat[fdr_col] = multipletests(df_stat[pval_col], 0.05, method='fdr_bh')[1]

        # Smallest raw p-values first; only the first n_feats rows are written.
        df_stat = df_stat.sort_values(pval_col, ascending=True)
        df_stat.head(n_feats).to_excel(writer, sheet_name=imm)

## Load table of features and create data

In [None]:
# For every selected immunological feature, read its sheet of top CpGs from the
# statistics workbook and export (a) that CpG table and (b) a data table holding the
# global columns, the split labels, the feature (raw and log) and the chosen CpGs.
method = 'spearman'
n_feats = 100

# Split label columns; derived from `n_splits` so it always matches the number of
# splits created in the stratification cell instead of a hard-coded 5.
split_cols = [f"Split_{split_id}" for split_id in range(n_splits)]

for imm in feats_imm_slctd:
    df_stat = pd.read_excel(f"{path_save}/{epi_data_type}/{method}_{n_feats}.xlsx", index_col=0, sheet_name=imm)

    # One output folder per feature.
    imm_dir = f"{path_save}/{epi_data_type}/{method}_{n_feats}/{imm}"
    pathlib.Path(imm_dir).mkdir(parents=True, exist_ok=True)
    df_stat.to_excel(f"{imm_dir}/feats_con.xlsx", index_label='CpG')
    df_stat.to_pickle(f"{imm_dir}/feats_con.pkl")

    data_cols = feats_global + split_cols + [imm, f'{imm}_log'] + list(df_stat.index.values)
    # Explicit copy: the 'Index' column is added to an independent table, not to a slice of `df`.
    df_data_imm = df.loc[:, data_cols].copy()
    df_data_imm['Index'] = df_data_imm.index.values
    df_data_imm.to_excel(f"{imm_dir}/data.xlsx")
    df_data_imm.to_pickle(f"{imm_dir}/data.pkl")