# Debugging autoreload

In [None]:
# Load IPython's autoreload extension (this cell is only needed while debugging).
%load_ext autoreload
# Mode 2: per the IPython documentation, modules are reloaded before each
# execution, so edits to imported project code are picked up without a restart.
%autoreload 2

# Load packages

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import multipletests
from scipy import stats
from sklearn.feature_selection import f_regression
import itertools
from scipy.stats import mannwhitneyu, median_test, kruskal, wilcoxon, friedmanchisquare
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.patheffects as path_effects
import random
import pathlib
from tqdm import tqdm
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA, TruncatedSVD
from sklearn.decomposition import MiniBatchDictionaryLearning, FastICA
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.manifold import MDS, Isomap, TSNE, LocallyLinearEmbedding
from itertools import chain
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.impute import KNNImputer
import functools
import matplotlib.lines as mlines
import miceforest as mf

# Imputation (logarithmic) using all suitable immuno data

## Prepare data

In [None]:
# Root folder of the GSEUNN dataset; the inputs of this notebook live in a
# sub-folder of that root, so the second path is derived from the first.
path = "D:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN"
path_load = f"{path}/special/060_EpiSImAge"

In [None]:
# Input tables; in every workbook the first column is used as the index.
# The files are read in the order in which they are listed.
df, df_imm = (
    pd.read_excel(xlsx, index_col=0)
    for xlsx in (
        f"{path_load}/df.xlsx",
        f"{path_load}/df_imm.xlsx",
    )
)
df_all, df_all_w_nans = (
    pd.read_excel(xlsx, index_col=0)
    for xlsx in (
        f"{path}/data/immuno/data.xlsx",
        f"{path}/data/immuno/data_with_nans.xlsx",
    )
)

# Feature lists: only the index of each workbook (the feature names) is kept.
feats_imm, feats_imm_fimmu, feats_imm_slctd = (
    pd.read_excel(xlsx, index_col=0).index.values
    for xlsx in (
        f"{path}/data/immuno/feats_con.xlsx",
        f"{path}/data/immuno/models/SImAge/feats_con_top10.xlsx",
        f"{path}/special/059_imm_data_selection/feats_selected.xlsx",
    )
)

# Non-immunological (metadata) columns.
feats_global = [
    'Subject ID',
    'Time',
    'Status',
    'Sex',
    'Age',
    'Region',
    'SImAge',
    'SImAge acceleration',
    '|SImAge acceleration|',
    'Dialysis (months)',
    'PMC10485620 ID',
    'PMC9135940 ID',
    'COVID',
    'GSM',
    'PMC10699032 ID',
    'Residence',
    'Nationality',
    'Sample_Name',
    'Sentrix_ID',
    'Sentrix_Position',
]
# Covariates added to the table that is passed to the imputers.
feats_add_for_imp = ['Age', 'Sex', 'Region', 'Status']

In [None]:
# Table to impute: rows are the samples of df_imm, columns are the selected
# immuno features taken from the NaN-preserving data, plus the covariates
# copied over from df_imm.
df_for_imp = df_all_w_nans.loc[df_imm.index.values, feats_imm_slctd]
df_for_imp.loc[df_for_imp.index.values, feats_add_for_imp] = df_imm.loc[
    df_for_imp.index.values, feats_add_for_imp
]
df_for_imp = df_for_imp.astype({name: 'category' for name in ['Sex', 'Region', 'Status']})

# Natural-log copy of every selected feature, stored as "<feature>_log".
# NOTE(review): np.log yields -inf for zeros (and NaN for negatives); -inf would
# not be counted as NaN below - confirm the inputs are strictly positive.
feats_imm_slctd_log = [f"{f}_log" for f in feats_imm_slctd]
for f, f_log in zip(feats_imm_slctd, feats_imm_slctd_log):
    df_for_imp[f_log] = np.log(df_for_imp[f"{f}"])

# Per-feature missingness summary, most incomplete features first.
log_values = df_for_imp.loc[:, feats_imm_slctd_log]
nan_counts = log_values.isna().sum(axis=0)
df_nan_feats = nan_counts.to_frame(name="Number of NaNs")
df_nan_feats = df_nan_feats.assign(
    **{
        "% of NaNs": nan_counts / df_for_imp.shape[0] * 100,
        "Number of not-NaNs": log_values.notna().sum(axis=0),
    }
).sort_values("% of NaNs", ascending=False)

# Bar plot of the percentage of missing values per log-feature.
sns.set_theme(style='whitegrid')
fig = plt.figure(figsize=(14, 4))
plt.xticks(rotation=90)
barplot = sns.barplot(
    data=df_nan_feats,
    x=df_nan_feats.index,
    y="% of NaNs",
    edgecolor='black',
    dodge=False,
)
barplot.set_xlabel("")
plt.show()
plt.close(fig)

## KNN Imputation

In [None]:
# KNN imputation of the log-transformed features; 'Age' is handed to the
# imputer together with them. Results are written to a folder whose name
# encodes the method and the number of neighbours.
n_neighbors = 5
path_save = f"D:/YandexDisk/Work/bbd/immunology/003_EpImAge/imp_source(imm)_method(knn)_params({n_neighbors})"
pathlib.Path(path_save).mkdir(parents=True, exist_ok=True)

feats_knn = feats_imm_slctd_log + ['Age']

# First pass: fill the values that are missing in the prepared table.
X = df_for_imp.loc[:, feats_knn].values
print(f'Missing before imputation: {np.isnan(X).sum()}')
imputer = KNNImputer(n_neighbors=n_neighbors)
X_imptd = imputer.fit_transform(X)
print(f'Missing after imputation: {np.isnan(X_imptd).sum()}')

df_imp = df_for_imp.copy()
df_imp.loc[:, feats_knn] = X_imptd

# Strong outliers imputation: values lying more than 3 IQR below the first or
# above the third quartile are blanked out and then imputed by a second pass.
print('Strong outliers:')
for f in feats_imm_slctd_log:
    q1, q3 = df_imp[f].quantile(0.25), df_imp[f].quantile(0.75)
    fence = 3.0 * (q3 - q1)
    is_strong_outlier = (df_imp[f] < q1 - fence) | (df_imp[f] > q3 + fence)
    df_imp.loc[is_strong_outlier, f] = np.nan
    print(f"{f}: {df_imp[f].isna().sum()}")
X = df_imp.loc[:, feats_knn].values
imputer = KNNImputer(n_neighbors=n_neighbors)
X_imptd = imputer.fit_transform(X)
df_imp.loc[:, feats_knn] = X_imptd

# Back-transform: overwrite every original-scale feature with exp(log value).
for f in feats_imm_slctd:
    log_col = f"{f}_log"
    df_imp[f"{f}"] = np.exp(df_imp[log_col])

# Copy the imputed log and original-scale columns into copies of the two
# input tables (log columns first) and save them.
df_imm_new = df_imm.copy()
ids_imm = df_imm_new.index.values
for feats in (feats_imm_slctd_log, feats_imm_slctd):
    df_imm_new.loc[ids_imm, feats] = df_imp.loc[ids_imm, feats]
df_imm_new.to_excel(f"{path_save}/df_imm.xlsx")

df_new = df.copy()
ids_all = df_new.index.values
for feats in (feats_imm_slctd_log, feats_imm_slctd):
    df_new.loc[ids_all, feats] = df_imp.loc[ids_all, feats]
df_new.to_excel(f"{path_save}/df.xlsx")

## miceforest imputation

In [None]:
# miceforest imputation of the log-transformed features, with Age, Sex, Region
# and Status passed to the kernel as additional columns. Results are written to
# a folder whose name encodes the method and the number of MICE iterations.
iterations = 2
path_save = f"D:/YandexDisk/Work/bbd/immunology/003_EpImAge/imp_source(imm)_method(miceforest)_params({iterations})"
pathlib.Path(path_save).mkdir(parents=True, exist_ok=True)

feats_mice = feats_imm_slctd_log + feats_add_for_imp

# First pass: fill the values that are missing in the prepared table.
n_missing_before = df_for_imp.loc[:, feats_mice].isna().sum().sum()
print(f'Missing before imputation: {n_missing_before}')
kds = mf.ImputationKernel(
    data=df_for_imp.loc[:, feats_mice],
    save_all_iterations=True,
    random_state=42,
)
kds.mice(iterations=iterations)
df_kds = kds.complete_data()
print(f'Missing after imputation: {df_kds.isna().sum().sum()}')

# Only the log-feature columns are taken from the completed data.
df_imp = df_for_imp.copy()
ids_imp = df_imp.index.values
df_imp.loc[ids_imp, feats_imm_slctd_log] = df_kds.loc[ids_imp, feats_imm_slctd_log]

# Strong outliers imputation: values lying more than 3 IQR below the first or
# above the third quartile are blanked out and then imputed by a second pass.
print('Strong outliers:')
for f in feats_imm_slctd_log:
    q1, q3 = df_imp[f].quantile(0.25), df_imp[f].quantile(0.75)
    fence = 3.0 * (q3 - q1)
    is_strong_outlier = (df_imp[f] < q1 - fence) | (df_imp[f] > q3 + fence)
    df_imp.loc[is_strong_outlier, f] = np.nan
    print(f"{f}: {df_imp[f].isna().sum()}")

# Second pass: a fresh kernel (same seed) fills the blanked-out outliers.
kds = mf.ImputationKernel(
    data=df_imp.loc[:, feats_mice],
    save_all_iterations=True,
    random_state=42,
)
kds.mice(iterations=iterations)
df_kds = kds.complete_data()
ids_imp = df_imp.index.values
df_imp.loc[ids_imp, feats_imm_slctd_log] = df_kds.loc[ids_imp, feats_imm_slctd_log]

# Back-transform: overwrite every original-scale feature with exp(log value).
for f in feats_imm_slctd:
    log_col = f"{f}_log"
    df_imp[f"{f}"] = np.exp(df_imp[log_col])

# Copy the imputed log and original-scale columns into copies of the two
# input tables (log columns first) and save them.
df_imm_new = df_imm.copy()
ids_imm = df_imm_new.index.values
for feats in (feats_imm_slctd_log, feats_imm_slctd):
    df_imm_new.loc[ids_imm, feats] = df_imp.loc[ids_imm, feats]
df_imm_new.to_excel(f"{path_save}/df_imm.xlsx")

df_new = df.copy()
ids_all = df_new.index.values
for feats in (feats_imm_slctd_log, feats_imm_slctd):
    df_new.loc[ids_all, feats] = df_imp.loc[ids_all, feats]
df_new.to_excel(f"{path_save}/df.xlsx")