In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
from src.utils.outliers.iqr import add_iqr_outs_to_df, plot_iqr_outs, plot_iqr_outs_regression_error
from src.utils.outliers.pyod import add_pyod_outs_to_df, plot_pyod_outs, plot_pyod_outs_regression_error
from scripts.python.dataset_specific.GSEUNN.tasks.routines_046 import plot_regression_error_distributions
from plotly.subplots import make_subplots
from scipy import stats
import plotly.express as px
from scripts.python.routines.plot.scatter import add_scatter_trace
import plotly.graph_objects as go
from scripts.python.routines.plot.save import save_figure
from scripts.python.routines.plot.layout import add_layout, get_axis
from statsmodels.stats.multitest import multipletests
import plotly.io as pio
import importlib
pio.kaleido.scope.mathjax = None
from plotly.offline import init_notebook_mode, iplot
from scipy.interpolate import interp1d
from src.utils.verbose import NoStdStreams
init_notebook_mode(connected=False)
import matplotlib.pyplot as plt
from matplotlib import colors
from omegaconf import OmegaConf
from tqdm import tqdm
import seaborn as sns
from glob import glob
import pathlib
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.decomposition import MiniBatchDictionaryLearning, FastICA
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.manifold import MDS, Isomap
from openTSNE import TSNE
from sklearn.metrics import mean_absolute_error
from scipy import stats
import patchworklib as pw
import os
import functools
from scipy.stats import iqr
from statannotations.Annotator import Annotator
from scipy.stats import mannwhitneyu
import shap
from slugify import slugify
from src.models.tabular.widedeep.ft_transformer import WDFTTransformerModel
from src.models.tabular.widedeep.tab_net import WDTabNetModel
from art.estimators.regression.pytorch import PyTorchRegressor
from art.estimators.classification import PyTorchClassifier
from art.estimators.regression.blackbox import BlackBoxRegressor
from art.attacks.evasion import ProjectedGradientDescentNumpy, FastGradientMethod, BasicIterativeMethod, MomentumIterativeMethod
from art.attacks.evasion import ZooAttack, CarliniL2Method, ElasticNet, NewtonFool
import torch
from src.tasks.metrics import get_cls_pred_metrics, get_cls_prob_metrics, get_reg_metrics
import matplotlib.lines as mlines

from sdv.metadata import SingleTableMetadata
from sdv.lite import SingleTablePreset
from sdv.single_table import GaussianCopulaSynthesizer, CTGANSynthesizer, TVAESynthesizer, CopulaGANSynthesizer
from sdv.evaluation.single_table import evaluate_quality
from sdv.evaluation.single_table import get_column_plot
from sdv.evaluation.single_table import get_column_pair_plot
from sklearn.preprocessing import StandardScaler
from scripts.python.routines.mvals import expit2

import missingno as msno

import joblib
import pickle

from pyod.models.ecod import ECOD
from pyod.models.abod import ABOD
from pyod.models.copod import COPOD
from pyod.models.sos import SOS
from pyod.models.kde import KDE
from pyod.models.sampling import Sampling
from pyod.models.gmm import GMM

from pyod.models.kpca import KPCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lmdd import LMDD

from pyod.models.lof import LOF
from pyod.models.cof import COF
from pyod.models.cblof import CBLOF
from pyod.models.hbos import HBOS
from pyod.models.knn import KNN
from pyod.models.sod import SOD

from pyod.models.iforest import IForest
from pyod.models.inne import INNE
from pyod.models.loda import LODA
from pyod.models.suod import SUOD

from pyod.models.auto_encoder_torch import AutoEncoder
from pyod.models.vae import VAE
from pyod.models.deep_svdd import DeepSVDD

from pyod.models.lunar import LUNAR

from torchmetrics import BootStrapper


def conjunction(conditions):
    """Combine boolean conditions with an element-wise logical AND.

    Args:
        conditions: Iterable of boolean scalars or array-likes (lists, NumPy
            arrays, pandas Series, ...) that can be broadcast together.

    Returns:
        The left-to-right reduction of ``conditions`` with ``np.logical_and``.
        A single condition is returned unchanged. An empty iterable yields
        ``np.True_`` (the identity of AND) instead of raising ``TypeError``.
    """
    # Materialize once so that generators are supported and emptiness can be checked.
    conditions = list(conditions)
    if not conditions:
        return np.True_
    return functools.reduce(np.logical_and, conditions)


def disjunction(conditions):
    """Combine boolean conditions with an element-wise logical OR.

    Args:
        conditions: Iterable of boolean scalars or array-likes (lists, NumPy
            arrays, pandas Series, ...) that can be broadcast together.

    Returns:
        The left-to-right reduction of ``conditions`` with ``np.logical_or``.
        A single condition is returned unchanged. An empty iterable yields
        ``np.False_`` (the identity of OR) instead of raising ``TypeError``.
    """
    # Materialize once so that generators are supported and emptiness can be checked.
    conditions = list(conditions)
    if not conditions:
        return np.False_
    return functools.reduce(np.logical_or, conditions)

# 1. Adversarial examples for immunology data

## 1.1. Original data, models and functions
### Load data and model, define PyTorchRegressor, set up colors for the different data sources

In [None]:
# Dataset root, directory of the trained SImAge model and output directory of this notebook.
path = "D:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN"
path_model = f"{path}/data/immuno/models/SImAge"
path_save = f"{path}/special/046_adversarial_robustness_toolbox/immunology"
pathlib.Path(path_save).mkdir(parents=True, exist_ok=True)

# Samples table (indexed by sample_id) and the names of the model input features.
df = pd.read_excel(f"{path_model}/data.xlsx", index_col='sample_id')
feats = pd.read_excel(f"{path_model}/feats_con_top10.xlsx", index_col=0).index.values
ids_feat = list(range(len(feats)))
col_trgt, col_pred = 'Age', 'SImAge'

# Split the sample ids by the partition label stored in the 'fold_0002' column.
df_preds = pd.read_excel(f"{path_model}/results/predictions.xlsx", index_col=0)
fold_labels = df_preds['fold_0002']
ids_trn = df_preds.index[fold_labels == 'trn'].values
ids_val = df_preds.index[fold_labels == 'val'].values
ids_tst = df_preds.index[fold_labels == 'tst_ctrl_central'].values
ids_all = df_preds.index[fold_labels.isin(['trn', 'val', 'tst_ctrl_central'])].values
ids_trn_val = df_preds.index[fold_labels.isin(['trn', 'val'])].values
ids_dict = dict(all=ids_all, trn_val=ids_trn_val, tst=ids_tst)

# Keep only the trn/val/tst samples, add the signed and absolute prediction
# error, and tag every row as real, unperturbed data.
df = df.loc[ids_all, :]
df["SImAge Error"] = df["SImAge"] - df["Age"]
df["abs(SImAge Error)"] = df["SImAge Error"].abs()
df['Data'] = 'Real'
df['Eps'] = 'Origin'

# Restore the checkpointed model, switch it to evaluation mode and freeze it.
model = WDFTTransformerModel.load_from_checkpoint(checkpoint_path=f"{path_model}/best_fold_0002.ckpt")
model.eval()
model.freeze()

def predict_func_regression(X):
    """Run the loaded model on a 2-D feature array and return NumPy predictions.

    Args:
        X: Array that supports ``X[:, ids_feat]``; the columns listed in the
            module-level ``ids_feat`` are converted to float32 and fed to the model.

    Returns:
        The model output moved to CPU, detached and converted to a NumPy array.
    """
    def continuous_tensor():
        # A fresh float32 copy per call, so the 'all' and 'continuous' entries
        # never share memory.
        return torch.from_numpy(np.float32(X[:, ids_feat]))

    # The flag is (re)set on every call, before the forward pass.
    # NOTE(review): its effect is defined inside the model class - confirm it is needed for regression.
    model.produce_probabilities = True
    batch = dict(
        all=continuous_tensor(),
        continuous=continuous_tensor(),
        # Empty column selection: no categorical inputs are passed.
        categorical=torch.from_numpy(np.int32(X[:, []])),
    )
    return model(batch).cpu().detach().numpy()

# Wrap the loaded model in ART's PyTorchRegressor estimator.
# NOTE(review): presumably this is the estimator handed to the ART evasion attacks imported at the top - confirm in the attack cells.
art_regressor = PyTorchRegressor(
    model=model,
    loss=model.loss_fn,
    input_shape=[len(feats)],  # one input value per feature in feats
    # Optimizer built from the hyperparameters stored with the loaded model.
    optimizer=torch.optim.Adam(
        params=model.parameters(),
        lr=model.hparams.optimizer_lr,
        weight_decay=model.hparams.optimizer_weight_decay
    ),
    use_amp=False,
    opt_level="O1",
    loss_scale="dynamic",
    channels_first=True,
    clip_values=None,
    preprocessing_defences=None,
    postprocessing_defences=None,
    preprocessing=(0.0, 1.0),  # NOTE(review): presumably (subtract, divide), i.e. identity scaling - confirm against ART docs
    device_type="cpu",
)

# Plot colors: consecutive Light24 entries for the augmentation methods and
# consecutive D3 entries for the attacks (zip stops after the listed names).
colors_augs = dict(zip(
    ['FAST_ML', 'GaussianCopula', 'CTGANSynthesizer', 'TVAESynthesizer', 'CopulaGANSynthesizer'],
    px.colors.qualitative.Light24,
))
colors_atks = dict(zip(
    ["MomentumIterative", "BasicIterative", "ProjectedGradientDescent", "FastGradient"],
    px.colors.qualitative.D3,
))

# Column names of the two components produced by every dimensionality
# reduction method: "<prefix> 1" and "<prefix> 2".
dim_red_labels = {
    method: [f"{prefix} 1", f"{prefix} 2"]
    for method, prefix in (
        ('PCA', 'PC'),
        ('SVD', 'SVD'),
        ('t-SNE', 't-SNE'),
        ('GRP', 'GRP'),
        ('SRP', 'SRP'),
        ('IsoMap', 'IsoMap'),
        ('MBDL', 'MBDL'),
    )
}

# Create Scalers trained on trn_val samples ===================================
# Every feature gets its own StandardScaler. The scaler is fitted on the
# trn_val rows only, so test samples do not influence the scaling statistics
# (the PyOD models that consume the scaled features are trained on trn_val as
# well), and is then applied to all rows of df.
scalers = {}
feats_scaled = []
for f in feats:
    scaler = StandardScaler()
    scaler.fit(df.loc[ids_trn_val, f].values.reshape(-1, 1))
    scalers[f] = scaler
    feats_scaled.append(f"{f}_scaled")
    df[f"{f}_scaled"] = scaler.transform(df.loc[:, f].values.reshape(-1, 1))
# Persist the fitted scalers next to the other outputs of this notebook.
with open(f"{path_save}/scalers.pkl", 'wb') as handle:
    pickle.dump(scalers, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Thresholds handed to the IQR / PyOD regression-error plotting routines.
# NOTE(review): presumably the share of features / detectors that must flag a sample - confirm in the helpers.
thld_outs_iqr = thld_outs_pyod = 1/3

### Create PyOD models, trained on trn_val samples

In [None]:
# Value passed as `contamination` to every detector that is configured with it below.
contamination = 0.1

# Detectors that need individual arguments are listed explicitly first ...
pyod_methods = {
    'ECOD': ECOD(contamination=contamination),
    'LUNAR': LUNAR(),
    'DeepSVDD': DeepSVDD(contamination=contamination, verbose=0),
    'VAE': VAE(encoder_neurons=[32, 16, 8], decoder_neurons=[8, 16, 32], contamination=contamination),
}
# ... the remaining ones only take `contamination` and are appended in the same order as before.
pyod_methods.update(
    (detector_name, detector_cls(contamination=contamination))
    for detector_name, detector_cls in (
        ('LODA', LODA),
        ('INNE', INNE),
        ('IForest', IForest),
        ('SOD', SOD),
        ('KNN', KNN),
        ('CBLOF', CBLOF),
        ('COF', COF),
        ('LOF', LOF),
        ('LMDD', LMDD),
        ('MCD', MCD),
        ('GMM', GMM),
        ('Sampling', Sampling),
        ('SOS', SOS),
        ('COPOD', COPOD),
    )
)

# Fit every detector on the scaled features of the trn_val samples.
for method_name, method in (pbar := tqdm(pyod_methods.items())):
    pbar.set_description(f"Processing {method_name}")
    method.fit(df.loc[ids_trn_val, feats_scaled].values)

### Create dimensionality reduction models, trained on trn_val samples

In [None]:
# Training matrix for the projections: unscaled features of the trn_val samples.
X_dim_red = df.loc[ids_trn_val, feats].values
# Every estimator is configured for two output components and fitted in the
# listed order; the object returned by fit() is what gets stored.
dim_red_models = {
    name: estimator.fit(X_dim_red)
    for name, estimator in (
        ('PCA', PCA(n_components=2, whiten=False)),
        ('SVD', TruncatedSVD(n_components=2, algorithm='randomized', n_iter=5)),
        ('t-SNE', TSNE(n_components=2)),
        ('GRP', GaussianRandomProjection(n_components=2, eps=0.5)),
        ('SRP', SparseRandomProjection(n_components=2, density='auto', eps=0.5, dense_output=False)),
        ('IsoMap', Isomap(n_components=2, n_neighbors=5)),
        ('MBDL', MiniBatchDictionaryLearning(n_components=2, batch_size=100, alpha=1, n_iter=25)),
    )
}

### Apply and save or load processed data

In [None]:
# Load data with dim_red columns ==============================================
# Replaces the in-memory df with the table stored in df_origin.xlsx (the file
# written by the cells below); the first column is used as the index.
df = pd.read_excel(f"{path_save}/df_origin.xlsx", index_col=0)

In [None]:
# Add to df_origin.xlsx IQR outliers columns ==================================
# The return value is ignored, so the helper is expected to add its columns to
# df in place; the trn_val rows are passed as the second argument.
# NOTE(review): presumably the IQR limits are derived from that trn_val subset - confirm in src.utils.outliers.iqr.
add_iqr_outs_to_df(df, df.loc[ids_trn_val, :], feats)
df.to_excel(f"{path_save}/df_origin.xlsx", index_label='sample_id')

In [None]:
# Add to df_origin.xlsx PyOD outliers columns =================================
# The return value is ignored, so the helper is expected to add its columns to
# df in place, using the fitted detectors in pyod_methods and the scaled feature columns.
# NOTE(review): the exact columns that are added are defined in src.utils.outliers.pyod - confirm there.
add_pyod_outs_to_df(df, pyod_methods, feats_scaled)
df.to_excel(f"{path_save}/df_origin.xlsx", index_label='sample_id')

In [None]:
# Add to df_origin.xlsx dimensionality reduction columns ======================
# Project all samples with every fitted model and store the two resulting
# components under the labels defined in dim_red_labels.
for m, drm in dim_red_models.items():
    dim_red_res = drm.transform(df.loc[:, feats].values)
    for comp_id, comp_label in enumerate(dim_red_labels[m]):
        df.loc[:, comp_label] = dim_red_res[:, comp_id]
df.to_excel(f"{path_save}/df_origin.xlsx", index_label='sample_id')

### Outliers analysis

In [None]:
# IQR plots: outlier overview and regression error for the original samples.
path_outs_iqr = f"{path_save}/Origin/outliers_iqr"
pathlib.Path(path_outs_iqr).mkdir(parents=True, exist_ok=True)
plot_iqr_outs(df, feats, 'grey', 'Origin', path_outs_iqr)
plot_iqr_outs_regression_error(df, feats, 'Origin', path_outs_iqr, thld_outs_iqr, 'Age', 'SImAge', 'SImAge Error')

# PyOD plots: the same two views based on the PyOD detectors.
path_outs_pyod = f"{path_save}/Origin/outliers_pyod"
pathlib.Path(path_outs_pyod).mkdir(parents=True, exist_ok=True)
plot_pyod_outs(df, pyod_methods, 'grey', 'Origin', path_outs_pyod)
plot_pyod_outs_regression_error(df, pyod_methods, 'Origin', path_outs_pyod, thld_outs_pyod, 'Age', 'SImAge', 'SImAge Error')

### Regression error analysis

In [None]:
# Distributions of the absolute regression error for the original samples.
path_errors = f"{path_save}/Origin/errors"
pathlib.Path(path_errors).mkdir(parents=True, exist_ok=True)
plot_regression_error_distributions(df, feats, 'grey', 'Origin', path_errors, "abs(SImAge Error)")

### Confidence

In [None]:
# Bootstrap estimates (mean, std, 5% and 95% quantiles) of every regression
# metric on the original samples; the torch seed is fixed before resampling.
torch.manual_seed(42)
quantiles = torch.tensor([0.05, 0.95])

pathlib.Path(f"{path_save}/Origin/confidence").mkdir(parents=True, exist_ok=True)
metrics = get_reg_metrics()
df_metrics = pd.DataFrame(index=list(metrics.keys()), columns=['mean', 'std', 'q0.05', 'q0.95'])
y_real = torch.from_numpy(np.float32(df['Age'].values))
y_pred = torch.from_numpy(np.float32(df['SImAge'].values))
for metric_name, metric_pair in metrics.items():
    # Only the first element of each entry is used as the metric object.
    metric = metric_pair[0]
    bootstrap = BootStrapper(metric, num_bootstraps=200, sampling_strategy="multinomial", quantile=quantiles)
    bootstrap.update(y_pred, y_real)
    bootstrap_output = bootstrap.compute()
    quantile_values = bootstrap_output['quantile'].detach().cpu().numpy()
    row_values = {
        'mean': bootstrap_output['mean'].detach().cpu().numpy(),
        'std': bootstrap_output['std'].detach().cpu().numpy(),
        'q0.05': quantile_values[0],
        'q0.95': quantile_values[1],
    }
    for column, value in row_values.items():
        df_metrics.at[metric_name, column] = value
df_metrics.to_excel(f"{path_save}/Origin/confidence/metrics.xlsx", index_label='Metrics')

## 1.2. Augmented data

### Naive augmented data: each feature is sampled independently from its original distribution

In [None]:
path_curr = f"{path_save}/Augmentation/Naive"
pathlib.Path(path_curr).mkdir(parents=True, exist_ok=True)

n_bins = 100
n_smps = 10000

# Every feature is resampled independently from the histogram of its trn_val
# values. Samples are drawn from the bin centers: drawing from the left bin
# edges would shift the whole synthetic distribution down by half a bin width.
df_aug_naive = pd.DataFrame(columns=feats)
for f in feats:
    f_vals = df.loc[ids_trn_val, f].values
    counts, bin_edges = np.histogram(f_vals, bins=n_bins)
    bin_centers = 0.5 * (bin_edges[:-1] + bin_edges[1:])
    df_aug_naive[f] = np.random.choice(bin_centers, size=n_smps, p=counts / counts.sum())

# Model predictions and 2-D projections of the synthetic samples.
df_aug_naive["SImAge"] = model(torch.from_numpy(np.float32(df_aug_naive.loc[:, feats].values))).cpu().detach().numpy().ravel()
for m, drm in dim_red_models.items():
    dim_red_res = drm.transform(df_aug_naive.loc[:, feats].values)
    df_aug_naive.loc[:, dim_red_labels[m][0]] = dim_red_res[:, 0]
    df_aug_naive.loc[:, dim_red_labels[m][1]] = dim_red_res[:, 1]
df_aug_naive.to_excel(f"{path_curr}/df.xlsx", index_label='sample_id')

### Augmented data with Synthetic Data Vault (SDV)

In [None]:
# Number of synthetic rows drawn from every synthesizer.
n_smps = 10000

# Real table handed to SDV: the model features plus the 'Age' column.
df_aug_sdv_input = df.loc[:, [*feats, 'Age']]

metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=df_aug_sdv_input)

# The preset needs its name; the other synthesizers are built from the metadata alone.
synthesizers = {'FAST_ML': SingleTablePreset(metadata, name='FAST_ML')}
synthesizers.update(
    (synth_name, synth_cls(metadata))
    for synth_name, synth_cls in (
        ('GaussianCopula', GaussianCopulaSynthesizer),
        ('CTGANSynthesizer', CTGANSynthesizer),
        ('TVAESynthesizer', TVAESynthesizer),
        ('CopulaGANSynthesizer', CopulaGANSynthesizer),
    )
)
# For every synthesizer: fit on the real table, sample synthetic rows, save the
# SDV quality report as tables and figures, then add model predictions,
# projections and outlier columns to the synthetic table and write it to disk.
for s_name, s in (pbar := tqdm(synthesizers.items())):
    pbar.set_description(f"Processing {s_name}")
    path_curr = f"{path_save}/Augmentation/{s_name}"
    pathlib.Path(f"{path_curr}").mkdir(parents=True, exist_ok=True)

    s.fit(
        data=df_aug_sdv_input
    )
    s.save(
        filepath=f"{path_curr}/synthesizer.pkl"
    )
    df_aug_sdv = s.sample(
        num_rows=n_smps
    )
    quality_report = evaluate_quality(
        df_aug_sdv_input,
        df_aug_sdv,
        metadata
    )

    q_rep_prop = quality_report.get_properties()
    q_rep_prop.set_index('Property', inplace=True)

    # --- Column Shapes: per-column score as a horizontal bar plot ------------
    df_col_shapes = quality_report.get_details(property_name='Column Shapes')
    df_col_shapes.sort_values(["Score"], ascending=[False], inplace=True)
    df_col_shapes.to_excel(f"{path_curr}/ColumnShapes.xlsx", index=False)
    fig = plt.figure(figsize=(3, 5))
    sns.set_theme(style='whitegrid')
    barplot = sns.barplot(
        data=df_col_shapes,
        x="Score",
        y="Column",
        edgecolor='black',
        color=colors_augs[s_name],
        dodge=False,
        orient='h'
    )
    barplot.set_title(f"{s_name} Average Score: {q_rep_prop.at['Column Shapes', 'Score']:0.2f}")
    barplot.set_xlabel(f"KSComplement")
    barplot.set_ylabel(f"Features")
    plt.savefig(f"{path_curr}/ColumnShapes.png", bbox_inches='tight', dpi=200)
    plt.savefig(f"{path_curr}/ColumnShapes.pdf", bbox_inches='tight')
    plt.close(fig)

    # --- Column Pair Trends: pairwise scores and correlations ----------------
    df_col_pair_trends = quality_report.get_details(property_name='Column Pair Trends')
    df_col_pair_trends.to_excel(f"{path_curr}/ColumnPairTrends.xlsx", index=False)
    feats_plot = np.concatenate((feats, ['Age']))
    # Upper triangle holds the real correlation, lower triangle the synthetic one.
    df_corr_mtx = pd.DataFrame(data=np.zeros(shape=(len(feats_plot), len(feats_plot))), index=feats_plot, columns=feats_plot)
    # Created as float from the start, so the heatmap receives a numeric matrix
    # without relying on fillna() downcasting an object-dtype frame.
    df_pair_mtx = pd.DataFrame(index=feats_plot, columns=feats_plot, dtype=float)
    for _, row in df_col_pair_trends.iterrows():
        df_corr_mtx.at[row['Column 1'], row['Column 2']] = row['Real Correlation']
        df_corr_mtx.at[row['Column 2'], row['Column 1']] = row['Synthetic Correlation']
        df_pair_mtx.at[row['Column 1'], row['Column 2']] = row['Score']
        df_pair_mtx.at[row['Column 2'], row['Column 1']] = row['Score']

    fig = plt.figure()
    sns.set_theme(style='whitegrid')
    heatmap = sns.heatmap(
        data=df_pair_mtx,
        cmap='plasma',
        annot=True,
        fmt="0.2f",
        cbar_kws={'label': "Correlation Similarity"},
        mask=df_pair_mtx.isnull()
    )
    heatmap.set(xlabel="", ylabel="")
    heatmap.tick_params(axis='x', rotation=90)
    heatmap.set_title(f"{s_name} Average Score: {q_rep_prop.at['Column Pair Trends', 'Score']:0.2f}")
    plt.savefig(f"{path_curr}/ColumnPairTrends.png", bbox_inches='tight', dpi=200)
    plt.savefig(f"{path_curr}/ColumnPairTrends.pdf", bbox_inches='tight')
    plt.close(fig)

    # --- Real vs synthetic correlations in one matrix, one colormap per triangle
    sns.set_theme(style='whitegrid')
    mtx_to_plot = df_corr_mtx.to_numpy()
    mtx_triu = np.triu(mtx_to_plot, +1)
    mtx_triu_mask = np.ma.masked_array(mtx_triu, mtx_triu==0)
    cmap_triu = plt.get_cmap("seismic").copy()
    mtx_tril = np.tril(mtx_to_plot, -1)
    mtx_tril_mask = np.ma.masked_array(mtx_tril, mtx_tril==0)
    cmap_tril = plt.get_cmap("PRGn").copy()
    fig, ax = plt.subplots()
    im_triu = ax.imshow(mtx_triu_mask, cmap=cmap_triu, vmin=-1, vmax=1)
    cbar_triu = ax.figure.colorbar(im_triu, ax=ax, location='right', shrink=0.7, pad=0.1)
    cbar_triu.ax.tick_params(labelsize=10)
    cbar_triu.set_label("Real Correlation", horizontalalignment='center', fontsize=12)
    im_tril = ax.imshow(mtx_tril_mask, cmap=cmap_tril, vmin=-1, vmax=1)
    cbar_tril = ax.figure.colorbar(im_tril, ax=ax, location='right', shrink=0.7, pad=0.1)
    cbar_tril.ax.tick_params(labelsize=10)
    cbar_tril.set_label("Synthetic Correlation", horizontalalignment='center', fontsize=12)
    ax.grid(None)
    ax.set_aspect("equal")
    ax.set_xticks(np.arange(df_corr_mtx.shape[1]))
    ax.set_yticks(np.arange(df_corr_mtx.shape[0]))
    ax.set_xticklabels(df_corr_mtx.columns.values)
    ax.set_yticklabels(df_corr_mtx.index.values)
    plt.setp(ax.get_xticklabels(), rotation=90)
    ax.tick_params(axis='both', which='major', labelsize=10)
    ax.tick_params(axis='both', which='minor', labelsize=10)
    for i in range(df_corr_mtx.shape[0]):
        for j in range(df_corr_mtx.shape[1]):
            if i == j:
                continue
            # White text on strongly colored cells of either triangle; the
            # decision uses the value that is actually drawn in the cell.
            color = 'white' if np.abs(mtx_to_plot[i, j]) > 0.5 else 'black'
            ax.text(j, i, f"{mtx_to_plot[i, j]:0.2f}", ha="center", va="center", color=color, fontsize=7)
    fig.tight_layout()
    plt.savefig(f"{path_curr}/Correlations.png", bbox_inches='tight', dpi=200)
    plt.savefig(f"{path_curr}/Correlations.pdf", bbox_inches='tight')
    # Close (not just clear) the figure, so figures do not pile up across synthesizers.
    plt.close(fig)

    # --- Predictions, errors, projections and outlier columns ----------------
    df_aug_sdv["SImAge"] = model(torch.from_numpy(np.float32(df_aug_sdv.loc[:, feats].values))).cpu().detach().numpy().ravel()
    df_aug_sdv["SImAge Error"] = df_aug_sdv["SImAge"] - df_aug_sdv["Age"]
    df_aug_sdv["abs(SImAge Error)"] = df_aug_sdv["SImAge Error"].abs()
    for m, drm in dim_red_models.items():
        dim_red_res = drm.transform(df_aug_sdv.loc[:, feats].values)
        df_aug_sdv.loc[:, dim_red_labels[m][0]] = dim_red_res[:, 0]
        df_aug_sdv.loc[:, dim_red_labels[m][1]] = dim_red_res[:, 1]

    # Scale the synthetic features with the scalers fitted on the real data.
    for f in feats:
        df_aug_sdv[f"{f}_scaled"] = scalers[f].transform(df_aug_sdv.loc[:, f].values.reshape(-1, 1))

    add_iqr_outs_to_df(df_aug_sdv, df.loc[ids_trn_val, :], feats)
    add_pyod_outs_to_df(df_aug_sdv, pyod_methods, feats_scaled)

    df_aug_sdv.to_excel(f"{path_curr}/df.xlsx", index_label='sample_id')

### Plot in reduced dimension

In [None]:
# For every synthesizer: average the SImAge Error of the synthetic samples on
# a regular grid in the reduced space, draw it as a background image and put
# the real samples on top, colored on the same error scale.
for s_name in (pbar := tqdm(colors_augs)):
    pbar.set_description(f"Processing {s_name}")

    path_curr = f"{path_save}/Augmentation/{s_name}"
    df_aug_sdv = pd.read_excel(f"{path_curr}/df.xlsx", index_col='sample_id')
    df_ori_aug = pd.concat([df, df_aug_sdv])
    for m in ['t-SNE']:
        x_label, y_label = dim_red_labels[m]

        # Grid over the range of the synthetic samples, extended by 7.5% on each side.
        n_bins = 100
        x_xtd = (df_aug_sdv[x_label].max() - df_aug_sdv[x_label].min()) * 0.075
        x_min = df_aug_sdv[x_label].min() - x_xtd
        x_max = df_aug_sdv[x_label].max() + x_xtd
        x_shift = (x_max - x_min) / n_bins
        x_bin_centers = np.linspace(
            start=x_min + 0.5 * x_shift,
            stop=x_max - 0.5 * x_shift,
            num=n_bins
        )
        y_xtd = (df_aug_sdv[y_label].max() - df_aug_sdv[y_label].min()) * 0.075
        y_min = df_aug_sdv[y_label].min() - y_xtd
        y_max = df_aug_sdv[y_label].max() + y_xtd
        y_shift = (y_max - y_min) / n_bins
        y_bin_centers = np.linspace(
            start=y_min + 0.5 * y_shift,
            stop=y_max - 0.5 * y_shift,
            num=n_bins
        )

        # Bin index of every synthetic sample; the small epsilon avoids a division by zero.
        xs = df_aug_sdv.loc[:, x_label].values
        xs_ids = np.floor((xs - x_min) / (x_shift + 1e-10)).astype(int)
        ys = df_aug_sdv.loc[:, y_label].values
        ys_ids = np.floor((ys - y_min) / (y_shift + 1e-10)).astype(int)
        zs = df_aug_sdv.loc[:, "SImAge Error"].values

        # Unbuffered accumulation: samples that share a bin are all added up.
        heatmap_sum = np.zeros((n_bins, n_bins))
        heatmap_cnt = np.zeros((n_bins, n_bins))
        np.add.at(heatmap_sum, (xs_ids, ys_ids), zs)
        np.add.at(heatmap_cnt, (xs_ids, ys_ids), 1)
        df_heatmap_sum = pd.DataFrame(index=x_bin_centers, columns=y_bin_centers, data=heatmap_sum)
        df_heatmap_cnt = pd.DataFrame(index=x_bin_centers, columns=y_bin_centers, data=heatmap_cnt)
        # Mean error per bin; bins without samples become NaN (0 / 0).
        df_heatmap = pd.DataFrame(data=df_heatmap_sum.values / df_heatmap_cnt.values, columns=df_heatmap_sum.columns, index=df_heatmap_sum.index)
        df_heatmap.to_excel(f"{path_curr}/{m}_heatmap.xlsx")

        # One color scale for the background image and the real samples.
        norm = plt.Normalize(df_ori_aug["SImAge Error"].min(), df_ori_aug["SImAge Error"].max())
        sm = plt.cm.ScalarMappable(cmap="spring", norm=norm)
        sm.set_array([])
        fig, ax = plt.subplots(figsize=(5, 4))
        sns.set_theme(style='whitegrid')

        ax.imshow(
            # Rows of the image are y bins, flipped so that y grows upwards.
            X=df_heatmap.transpose().iloc[::-1].values,
            extent=[x_min, x_max, y_min, y_max],
            vmin=df_ori_aug["SImAge Error"].min(),
            vmax=df_ori_aug["SImAge Error"].max(),
            aspect=x_shift/y_shift,
            cmap="spring",
            alpha=1.0
        )

        # Individual color for every real sample, taken from the shared scale.
        scatter_colors = dict(zip(
            df.index,
            (colors.rgb2hex(rgba) for rgba in sm.to_rgba(df["SImAge Error"].values))
        ))
        scatter = sns.scatterplot(
            data=df,
            x=x_label,
            y=y_label,
            palette=scatter_colors,
            hue=df.index,
            linewidth=1,
            alpha=0.85,
            edgecolor="k",
            marker='o',
            s=30,
            ax=ax
        )
        scatter.get_legend().remove()
        # The mappable is not attached to any Axes, so the Axes to take space
        # from has to be given explicitly (recent matplotlib raises otherwise).
        fig.colorbar(sm, ax=ax, label="SImAge Error")
        ax.set_title(f'{s_name}', y=1.2, fontsize=14)

        legend_handles = [
            mlines.Line2D([], [], marker='o', linestyle='None', markeredgecolor='k', markerfacecolor='lightgrey', markersize=10, label='Real'),
            mlines.Line2D([], [], marker='s', linestyle='None', markeredgewidth=0, markerfacecolor='lightgrey', markersize=10, label='Synthetic')
        ]
        ax.legend(handles=legend_handles, title="Samples", bbox_to_anchor=(0, 1.02, 1, 0.2), loc="lower left", borderaxespad=0, mode="expand", ncol=2, frameon=False)

        plt.savefig(f"{path_curr}/{m}.png", bbox_inches='tight', dpi=200)
        plt.savefig(f"{path_curr}/{m}.pdf", bbox_inches='tight')
        plt.close(fig)

### Plot distributions

In [None]:
# For every synthesizer: per-feature KDEs (real vs synthetic), per-feature
# feature-vs-Age plots (synthetic 2-D histogram with the real samples on top),
# both arranged on a 3-column grid, plus a KDE of Age.
for s_name in (pbar := tqdm(colors_augs)):
    pbar.set_description(f"Processing {s_name}")

    path_curr = f"{path_save}/Augmentation/{s_name}"
    df_aug_sdv = pd.read_excel(f"{path_curr}/df.xlsx", index_col='sample_id')
    df_aug_sdv['Data'] = s_name
    df_ori_aug = pd.concat([df, df_aug_sdv])

    # Hue settings shared by all plots of this synthesizer.
    palette = {'Real': 'grey', s_name: colors_augs[s_name]}
    hue_order = ['Real', s_name]

    pw_brick_kdes = {}
    pw_brick_scatters = {}
    for f in feats:
        brick_kde = pw.Brick(figsize=(4, 3))
        pw_brick_kdes[f] = brick_kde
        sns.set_theme(style='whitegrid')
        kdeplot = sns.kdeplot(
            data=df_ori_aug,
            x=f,
            hue='Data',
            palette=palette,
            hue_order=hue_order,
            fill=True,
            common_norm=False,
            ax=brick_kde
        )

        brick_scatter = pw.Brick(figsize=(4, 3))
        pw_brick_scatters[f] = brick_scatter
        sns.set_theme(style='whitegrid')
        # Synthetic samples as a 2-D histogram ...
        sns.histplot(
            data=df_aug_sdv,
            x=f,
            y='Age',
            bins=30,
            discrete=(False, False),
            log_scale=(False, False),
            cbar=True,
            color=colors_augs[s_name],
            ax=brick_scatter,
        )
        # ... with the real samples drawn on top as points.
        scatterplot = sns.scatterplot(
            data=df,
            x=f,
            y='Age',
            hue='Data',
            palette=palette,
            hue_order=hue_order,
            linewidth=0.85,
            alpha=0.85,
            edgecolor="k",
            marker='o',
            s=20,
            ax=brick_scatter
        )

    # Arrange the bricks row by row; an incomplete last row is padded with
    # blank bricks (the same blank brick goes into both grids).
    n_cols = 3
    n_rows = int(np.ceil(len(feats)/ n_cols))
    pw_rows_kdes = []
    pw_rows_scatters = []
    for r_id in range(n_rows):
        row_feats = feats[r_id * n_cols:(r_id + 1) * n_cols]
        pw_cols_kdes = [pw_brick_kdes[f] for f in row_feats]
        pw_cols_scatters = [pw_brick_scatters[f] for f in row_feats]
        for _ in range(n_cols - len(row_feats)):
            empty_fig = pw.Brick(figsize=(4.67, 3))
            empty_fig.axis('off')
            pw_cols_kdes.append(empty_fig)
            pw_cols_scatters.append(empty_fig)
        pw_rows_kdes.append(pw.stack(pw_cols_kdes, operator="|"))
        pw_rows_scatters.append(pw.stack(pw_cols_scatters, operator="|"))

    pw_fig_kde = pw.stack(pw_rows_kdes, operator="/")
    for ext, save_kwargs in (('png', {'dpi': 200}), ('pdf', {})):
        pw_fig_kde.savefig(f"{path_curr}/feats_kde.{ext}", bbox_inches='tight', **save_kwargs)
    pw_fig_scatter = pw.stack(pw_rows_scatters, operator="/")
    for ext, save_kwargs in (('png', {'dpi': 200}), ('pdf', {})):
        pw_fig_scatter.savefig(f"{path_curr}/feats_scatter.{ext}", bbox_inches='tight', **save_kwargs)

    # Age distribution of real vs synthetic samples.
    fig = plt.figure(figsize=(6, 4))
    sns.set_theme(style='whitegrid')
    kdeplot = sns.kdeplot(
        data=df_ori_aug,
        x='Age',
        hue='Data',
        palette=palette,
        hue_order=hue_order,
        fill=True,
        common_norm=False,
    )
    plt.savefig(f"{path_curr}/Age_kde.png", bbox_inches='tight', dpi=200)
    plt.savefig(f"{path_curr}/Age_kde.pdf", bbox_inches='tight')
    plt.close(fig)

### Plot Error distributions

In [None]:
# Compare the SImAge Error distribution of the real samples with every
# synthetic data set: MAE per data set, Mann-Whitney U test against the real
# errors (FDR-BH corrected) and one violin per data set.
dfs_fig = [df.loc[:, ['Data', 'SImAge Error']].copy()]
df_stat = pd.DataFrame(index=list(colors_augs.keys()), columns=['mw_pval'])
mae_dict = {'Real': mean_absolute_error(df.loc[:, 'Age'].values, df.loc[:, 'SImAge'].values)}
for s_name in (pbar := tqdm(colors_augs)):
    pbar.set_description(f"Processing {s_name}")

    path_curr = f"{path_save}/Augmentation/{s_name}"
    df_aug_sdv = pd.read_excel(f"{path_curr}/df.xlsx", index_col='sample_id')
    mae_dict[s_name] = mean_absolute_error(df_aug_sdv.loc[:, 'Age'].values, df_aug_sdv.loc[:, 'SImAge'].values)
    df_fig = df_aug_sdv.loc[:, ['SImAge Error']].copy()
    # Suffix the ids with the synthesizer name to keep the index unique after concatenation.
    df_fig.set_index(df_fig.index.astype(str).values + f'_{s_name}', inplace=True)
    df_fig['Data'] = s_name
    dfs_fig.append(df_fig)

    _, df_stat.at[s_name, 'mw_pval'] = mannwhitneyu(df['SImAge Error'].values, df_fig['SImAge Error'].values, alternative='two-sided')

# The p-value column is created without a dtype (object); cast it to float
# before the multiple-testing correction.
_, df_stat.loc[:, "mw_pval_fdr_bh"], _, _ = multipletests(df_stat.loc[:, "mw_pval"].astype(float), 0.05, method='fdr_bh')

df_fig = pd.concat(dfs_fig)

# Tick labels carry the MAE of each data set; colors are re-keyed to the new labels.
rename_dict = {x: f"{x}\nMAE={mae_dict[x]:0.2f}" for x in mae_dict}
colors_dict_old = {'Real': 'grey'} | colors_augs
colors_dict_new = {rename_dict[x]: colors_dict_old[x] for x in rename_dict}
# Assign the result back: an in-place replace on the selected column is a
# chained operation and does not update df_fig under pandas Copy-on-Write.
df_fig['Data'] = df_fig['Data'].replace(rename_dict)
fig = plt.figure(figsize=(12, 8))
sns.set_theme(style='whitegrid')
violin = sns.violinplot(
    data=df_fig,
    x='Data',
    y='SImAge Error',
    palette=colors_dict_new,
    scale='width',
    order=list(colors_dict_new.keys()),
    saturation=0.75,
)
# Annotate every Real-vs-synthetic pair with its corrected p-value.
pval_formatted = [f"{df_stat.at[x, 'mw_pval_fdr_bh']:.2e}" for x in colors_augs]
annotator = Annotator(
    violin,
    pairs=[(rename_dict['Real'], rename_dict[x]) for x in colors_augs],
    data=df_fig,
    x='Data',
    y='SImAge Error',
    order=list(colors_dict_new.keys()),
)
annotator.set_custom_annotations(pval_formatted)
annotator.configure(loc='outside')
annotator.annotate()
plt.savefig(f"{path_save}/Augmentation/SImAgeError.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path_save}/Augmentation/SImAgeError.pdf", bbox_inches='tight')
plt.close(fig)

### Outliers analysis for augmented samples

In [None]:
# Outlier plots (IQR-based and PyOD-based) for the stored synthetic samples of every synthesizer.
for s_name in (pbar := tqdm(colors_augs)):
    pbar.set_description(f"Processing {s_name}")

    path_curr = f"{path_save}/Augmentation/{s_name}"
    df_aug_sdv = pd.read_excel(f"{path_curr}/df.xlsx", index_col='sample_id')

    # IQR outliers
    path_outs_iqr = f"{path_curr}/outliers_iqr"
    pathlib.Path(path_outs_iqr).mkdir(parents=True, exist_ok=True)
    plot_iqr_outs(df_aug_sdv, feats, colors_augs[s_name], s_name, path_outs_iqr)
    plot_iqr_outs_regression_error(df_aug_sdv, feats, s_name, path_outs_iqr, thld_outs_iqr, 'Age', 'SImAge', 'SImAge Error')

    # PyOD plots
    path_outs_pyod = f"{path_curr}/outliers_pyod"
    pathlib.Path(path_outs_pyod).mkdir(parents=True, exist_ok=True)
    plot_pyod_outs(df_aug_sdv, pyod_methods, colors_augs[s_name], s_name, path_outs_pyod)
    plot_pyod_outs_regression_error(df_aug_sdv, pyod_methods, s_name, path_outs_pyod, thld_outs_pyod, 'Age', 'SImAge', 'SImAge Error')


### Regression error analysis

In [None]:
# Distributions of the absolute regression error for the stored synthetic samples of every synthesizer.
for s_name in (pbar := tqdm(colors_augs)):
    pbar.set_description(f"Processing {s_name}")

    path_curr = f"{path_save}/Augmentation/{s_name}"
    df_aug_sdv = pd.read_excel(f"{path_curr}/df.xlsx", index_col='sample_id')

    path_errors = f"{path_curr}/errors"
    pathlib.Path(path_errors).mkdir(parents=True, exist_ok=True)
    plot_regression_error_distributions(df_aug_sdv, feats, colors_augs[s_name], s_name, path_errors, "abs(SImAge Error)")

### Confidence

In [None]:
# Bootstrap estimates (mean, std, 5% and 95% quantiles) of every regression
# metric on the synthetic samples of each synthesizer; the torch seed is fixed
# once, before the loop.
torch.manual_seed(42)
quantiles = torch.tensor([0.05, 0.95])

for s_name in (pbar := tqdm(colors_augs)):
    pbar.set_description(f"Processing {s_name}")

    path_curr = f"{path_save}/Augmentation/{s_name}"
    df_aug_sdv = pd.read_excel(f"{path_curr}/df.xlsx", index_col='sample_id')

    pathlib.Path(f"{path_curr}/confidence").mkdir(parents=True, exist_ok=True)
    metrics = get_reg_metrics()
    df_metrics = pd.DataFrame(index=list(metrics.keys()), columns=['mean', 'std', 'q0.05', 'q0.95'])
    y_real = torch.from_numpy(np.float32(df_aug_sdv['Age'].values))
    y_pred = torch.from_numpy(np.float32(df_aug_sdv['SImAge'].values))
    for metric_name, metric_pair in metrics.items():
        # Only the first element of each entry is used as the metric object.
        metric = metric_pair[0]
        bootstrap = BootStrapper(metric, num_bootstraps=200, sampling_strategy="multinomial", quantile=quantiles)
        bootstrap.update(y_pred, y_real)
        bootstrap_output = bootstrap.compute()
        quantile_values = bootstrap_output['quantile'].detach().cpu().numpy()
        row_values = {
            'mean': bootstrap_output['mean'].detach().cpu().numpy(),
            'std': bootstrap_output['std'].detach().cpu().numpy(),
            'q0.05': quantile_values[0],
            'q0.95': quantile_values[1],
        }
        for column, value in row_values.items():
            df_metrics.at[metric_name, column] = value
    df_metrics.to_excel(f"{path_curr}/confidence/metrics.xlsx", index_label='Metrics')

In [None]:
# Collect the bootstrap quantiles of the selected metrics into one table:
# one row for the real data plus one row per augmentation method,
# one column per (metric, quantile) pair.
metrics_names = {'mean_absolute_error': 'MAE', 'pearson_corr_coef': 'Pearson rho'}
quantiles = [0.05, 0.95]

df_conf = pd.DataFrame(
    index=['Real', *colors_augs],
    columns=[f"{m}_{q}" for m in metrics_names for q in quantiles],
)

# Row "Real": quantiles stored under Origin/confidence.
df_metrics = pd.read_excel(f"{path_save}/Origin/confidence/metrics.xlsx", index_col='Metrics')
for m in metrics_names:
    for q in quantiles:
        df_conf.at["Real", f"{m}_{q}"] = df_metrics.at[m, f"q{q}"]

# Remaining rows: quantiles stored in each method's confidence folder.
pbar = tqdm(colors_augs)
for s_name in pbar:
    pbar.set_description(f"Processing {s_name}")

    df_metrics = pd.read_excel(
        f"{path_save}/Augmentation/{s_name}/confidence/metrics.xlsx",
        index_col='Metrics',
    )
    for m in metrics_names:
        for q in quantiles:
            df_conf.at[s_name, f"{m}_{q}"] = df_metrics.at[m, f"q{q}"]

# One figure per metric: for every data source (real data and each
# augmentation method) the 0.05 and 0.95 bootstrap quantiles are drawn as two
# points joined by a horizontal segment.
colors_dict = {'Real': 'grey'} | colors_augs

# Activate the theme BEFORE creating any axes: seaborn themes are applied
# through matplotlib rcParams, which are read when an axes is created. Setting
# the theme after plt.subplots() left the first figure with whatever theme
# happened to be active before this cell was run.
sns.set_theme(style='ticks')

for m in metrics_names:
    # Long format: one row per (source, quantile) with the metric value.
    df_fig = df_conf.loc[:, [f"{m}_{q}" for q in quantiles]].copy()
    df_fig['Type'] = df_fig.index
    df_fig = df_fig.melt(id_vars=['Type'], value_name=metrics_names[m])
    fig, ax = plt.subplots(figsize=(3, 2))
    # Interval end points.
    scatter = sns.scatterplot(
        data=df_fig,
        x=metrics_names[m],
        y='Type',
        hue='Type',
        palette=colors_dict,
        hue_order=list(colors_dict.keys()),
        linewidth=0.2,
        alpha=0.95,
        edgecolor="black",
        s=16,
        ax=ax
    )
    scatter.get_legend().set_visible(False)
    # Segment joining the two end points of each source.
    line = sns.lineplot(
        data=df_fig,
        x=metrics_names[m],
        y='Type',
        hue='Type',
        palette=colors_dict,
        hue_order=list(colors_dict.keys()),
        linewidth=2,
        ax=ax
    )
    line.get_legend().set_visible(False)
    ax.set_xlabel(f"Confidence Intervals for {metrics_names[m]}")
    fig.savefig(f"{path_save}/Augmentation/confidence_{m}.png", bbox_inches='tight', dpi=400)
    fig.savefig(f"{path_save}/Augmentation/confidence_{m}.pdf", bbox_inches='tight')
    plt.close(fig)

## 1.3. Attacks

### Generate attacks

In [None]:
# Generate adversarial (evasion) samples for every attack and perturbation
# budget, store the attacked table and its metrics under
# {path_save}/Evasion/{attack}/eps_{eps}, and collect MAE per budget in df_eps.

# Budgets: two 10-point linear grids (0.01..0.1 and 0.1..1.0); the set union
# drops exact duplicates.
epsilons = sorted(set.union(
    set(np.linspace(0.1, 1.0, 10)),
    set(np.linspace(0.01, 0.1, 10)),
))
df_eps = pd.DataFrame(index=epsilons)

# The per-feature IQR of the original data does not depend on the budget,
# so it is computed once instead of twice per epsilon.
feats_iqr = np.array([iqr(df.loc[:, feat].values) for feat in feats])

for eps_raw in epsilons:

    # Per-feature budget and step size, both expressed as fractions of the IQR.
    eps = eps_raw * feats_iqr
    eps_step = 0.2 * eps_raw * feats_iqr

    attacks = {
        'MomentumIterative': MomentumIterativeMethod(
            estimator=art_regressor,
            norm=np.inf,
            eps=eps,
            eps_step=eps_step,
            decay=0.1,
            max_iter=100,
            targeted=False,
            batch_size=512,
            verbose=True
        ),
        'BasicIterative': BasicIterativeMethod(
            estimator=art_regressor,
            eps=eps,
            eps_step=eps_step,
            max_iter=100,
            targeted=False,
            batch_size=512,
            verbose=True
        ),
        'ProjectedGradientDescent': ProjectedGradientDescentNumpy(
            estimator=art_regressor,
            norm=np.inf,
            eps=eps,
            eps_step=eps_step,
            decay=None,
            max_iter=100,
            targeted=False,
            num_random_init=0,
            batch_size=512,
            random_eps=False,
            summary_writer=False,
            verbose=True
        ),
        'FastGradient': FastGradientMethod(
            estimator=art_regressor,
            norm=np.inf,
            eps=eps,
            eps_step=eps_step,
            targeted=False,
            num_random_init=0,
            batch_size=512,
            minimal=False,
            summary_writer=False,
        ),
    }

    for attack_name, attack in attacks.items():
        path_curr = f"{path_save}/Evasion/{attack_name}/eps_{eps_raw:0.4f}"
        pathlib.Path(f"{path_curr}").mkdir(parents=True, exist_ok=True)

        X_adv = attack.generate(np.float32(df.loc[:, feats].values))

        # Attacked table: original 'Age', perturbed features, model output on
        # the perturbed features and the derived error columns.
        df_adv = df.loc[:, ['Age']].copy()
        df_adv.loc[:, feats] = X_adv
        df_adv["SImAge"] = model(torch.from_numpy(np.float32(df_adv.loc[:, feats].values))).cpu().detach().numpy().ravel()
        df_adv["SImAge Error"] = df_adv["SImAge"] - df_adv["Age"]
        df_adv["abs(SImAge Error)"] = df_adv["SImAge Error"].abs()
        df_adv.loc[:, "Error Origin"] = df.loc[:, "SImAge"] - df.loc[:, "Age"]
        df_adv.loc[:, "Error Attack"] = df_adv.loc[:, "SImAge"] - df_adv.loc[:, "Age"]
        df_adv['Error Diff'] = df_adv['Error Attack'] - df_adv['Error Origin']
        df_adv['abs(Error Diff)'] = df_adv['Error Diff'].abs()

        # Project the attacked samples with the already available
        # dimensionality-reduction models (first two components are kept).
        for m, drm in dim_red_models.items():
            dim_red_res = drm.transform(df_adv.loc[:, feats].values)
            df_adv.loc[:, dim_red_labels[m][0]] = dim_red_res[:, 0]
            df_adv.loc[:, dim_red_labels[m][1]] = dim_red_res[:, 1]

        # Scaled copies of the features, using the existing per-feature scalers.
        for f in feats:
            df_adv[f"{f}_scaled"] = scalers[f].transform(df_adv.loc[:, f].values.reshape(-1, 1))

        add_iqr_outs_to_df(df_adv, df.loc[ids_trn_val, :], feats)
        add_pyod_outs_to_df(df_adv, pyod_methods, feats_scaled)

        df_adv.to_excel(f"{path_curr}/df.xlsx", index_label='sample_id')

        # Metrics on original vs attacked predictions for every subset in
        # ids_dict; each metric object is reset after every evaluation.
        metrics = get_reg_metrics()
        metrics_cols = [f"{m}_{p}" for m in metrics for p in ids_dict]
        df_metrics = pd.DataFrame(index=metrics_cols)
        for p, ids_part in ids_dict.items():
            for m in metrics:
                m_val = float(metrics[m][0](torch.from_numpy(np.float32(df.loc[ids_part, "SImAge"].values)), torch.from_numpy(np.float32(df.loc[ids_part, "Age"].values))).numpy())
                df_metrics.at[f"{m}_{p}", 'Origin'] = m_val
                metrics[m][0].reset()
                m_val = float(metrics[m][0](torch.from_numpy(np.float32(df_adv.loc[ids_part, "SImAge"].values)), torch.from_numpy(np.float32(df.loc[ids_part, "Age"].values))).numpy())
                df_metrics.at[f"{m}_{p}", 'Attack'] = m_val
                metrics[m][0].reset()
        df_metrics.to_excel(f"{path_curr}/metrics.xlsx", index_label='Metrics')

        for p in ids_dict:
            # The 'Origin' value is computed from `df` only, so it is the same
            # for every attack; writing it unconditionally removes the
            # dependency on one specific attack name being present.
            df_eps.loc[eps_raw, f"Origin_MAE_{p}"] = df_metrics.at[f'mean_absolute_error_{p}', 'Origin']
            df_eps.loc[eps_raw, f"{attack_name}_MAE_{p}"] = df_metrics.at[f'mean_absolute_error_{p}', 'Attack']

df_eps.to_excel(f"{path_save}/Evasion/df_eps.xlsx", index_label='eps')

# For every subset in ids_dict: MAE of each attack as a function of the
# perturbation budget (log x-axis), with the MAE on the original data drawn
# as a dashed horizontal reference.
for p in ids_dict:
    # Keep the attack columns of this subset and label them by attack name.
    df_fig = df_eps.loc[:, [f"{x}_MAE_{p}" for x in colors_atks]].copy()
    df_fig = df_fig.rename(columns={f"{x}_MAE_{p}": x for x in colors_atks})
    df_fig['Eps'] = df_fig.index.values
    df_fig = df_fig.melt(id_vars="Eps", var_name='Method', value_name="MAE")

    fig = plt.figure()
    sns.set_theme(style='whitegrid', font_scale=1)
    lines = sns.lineplot(
        data=df_fig,
        x='Eps',
        y="MAE",
        hue="Method",
        style="Method",
        palette=colors_atks,
        hue_order=list(colors_atks.keys()),
        markers=True,
        dashes=False,
    )
    lines.set_xscale('log')
    lines.set_xlabel(r'$\epsilon$')

    x_min = 0.009
    x_max = 1.05
    # Reference level, read from the row of the 0.01 budget.
    mae_basic = df_eps.at[0.01, f"Origin_MAE_{p}"]
    lines.set_xlim(x_min, x_max)
    lines.plot(
        [x_min, x_max],
        [mae_basic, mae_basic],
        color='k',
        linestyle='dashed',
        linewidth=1,
    )

    fig.savefig(f"{path_save}/Evasion/line_mae_vs_eps_{p}.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{path_save}/Evasion/line_mae_vs_eps_{p}.pdf", bbox_inches='tight')
    plt.close(fig)

### Plot in reduced dimension

In [None]:
# For every attack and highlighted budget: scatter of original ('o') and
# attacked ('X') samples in the reduced space, coloured by 'SImAge Error'.
# After all budgets of an attack: KDE of 'SImAge Error' per budget.
epsilons_hglt = [0.01, 0.02, 0.05, 0.1, 0.2, 0.5]
colors_epsilons = {x: px.colors.qualitative.G10[x_id] for x_id, x in enumerate(['Origin'] + epsilons_hglt)}

# Activate the theme BEFORE creating any axes: seaborn themes are applied
# through matplotlib rcParams, which are read when an axes is created. Calling
# set_theme after plt.subplots() left the first figure of the cell styled by
# whatever theme was active before.
sns.set_theme(style='whitegrid')

for atk in colors_atks:
    for m in ['t-SNE']:
        # Original samples: the reference layer of every scatter plot.
        df_fig_ori = df.loc[:, ['SImAge Error', dim_red_labels[m][0], dim_red_labels[m][1]]].copy()
        df_fig_ori['Symbol'] = 'o'
        df_fig_ori['index_origin'] = df_fig_ori.index
        df_fig_ori['Eps'] = 'Origin'

        dfs_fig_adv = [df_fig_ori]
        for eps in epsilons_hglt:
            path_curr = f"{path_save}/Evasion/{atk}/eps_{eps:0.4f}"
            pathlib.Path(f"{path_curr}/SImAgeError").mkdir(parents=True, exist_ok=True)

            df_adv = pd.read_excel(f"{path_curr}/df.xlsx", index_col='sample_id')
            df_fig_adv = df_adv.loc[:, ['SImAge Error', 'abs(Error Diff)', dim_red_labels[m][0], dim_red_labels[m][1]]].copy()
            df_fig_adv['Eps'] = eps
            df_fig_adv['index_origin'] = df_fig_adv.index
            # Suffix the index so attacked rows do not collide with the
            # original rows after concatenation.
            df_fig_adv.set_index(df_fig_adv.index.values + f'_adv_eps_{eps:0.4f}', inplace=True)
            df_fig_adv['Symbol'] = 'X'
            dfs_fig_adv.append(df_fig_adv)
            df_fig_all = pd.concat([df_fig_ori, df_fig_adv])

            # Stand-alone mappable for the colorbar, spanning the error range
            # of the plotted rows.
            norm = plt.Normalize(df_fig_all['SImAge Error'].min(), df_fig_all['SImAge Error'].max())
            sm = plt.cm.ScalarMappable(cmap="spring", norm=norm)
            sm.set_array([])

            fig, ax = plt.subplots(figsize=(5, 4))

            scatter = sns.scatterplot(
                data=df_fig_all,
                x=dim_red_labels[m][0],
                y=dim_red_labels[m][1],
                palette='spring',
                hue='SImAge Error',
                linewidth=1,
                alpha=0.75,
                edgecolor="k",
                style=df_fig_all.loc[:, 'Symbol'].values,
                s=40,
                ax=ax
            )
            scatter.get_legend().remove()
            # `sm` is not attached to any axes, so the parent axes is passed
            # explicitly; newer Matplotlib releases refuse to guess it.
            scatter.figure.colorbar(sm, ax=ax, label='SImAge Error')
            scatter.set_title(fr'$\epsilon={eps:0.2f}$', loc='left', fontdict={'fontsize': 20})

            # Manual legend that only explains the two marker shapes.
            legend_handles = [
                mlines.Line2D([], [], marker='o', linestyle='None', markeredgecolor='k', markerfacecolor='lightgrey', markersize=10, label='Real'),
                mlines.Line2D([], [], marker='X', linestyle='None', markeredgecolor='k', markerfacecolor='lightgrey', markersize=10, label='Attack')
            ]
            ax.legend(handles=legend_handles, title="Samples", bbox_to_anchor=(0.4, 1.02, 1, 0.2), loc="lower left", borderaxespad=0, ncol=2, frameon=False)

            fig.savefig(f"{path_curr}/SImAgeError/{m}.png", bbox_inches='tight', dpi=200)
            fig.savefig(f"{path_curr}/SImAgeError/{m}.pdf", bbox_inches='tight')
            plt.close(fig)

        # Error distributions of the original data and of every highlighted budget.
        df_fig_adv_eps = pd.concat(dfs_fig_adv)
        fig, ax = plt.subplots(figsize=(6, 4))
        kdeplot = sns.kdeplot(
            data=df_fig_adv_eps,
            x='SImAge Error',
            palette=colors_epsilons,
            hue='Eps',
            linewidth=2,
            fill=False,
            ax=ax
        )
        fig.savefig(f"{path_save}/Evasion/{atk}/SImAgeError_{m}.png", bbox_inches='tight', dpi=200)
        fig.savefig(f"{path_save}/Evasion/{atk}/SImAgeError_{m}.pdf", bbox_inches='tight')
        plt.close(fig)

### Plot distributions

In [None]:
# For every attack and highlighted budget: per-feature KDE plots and
# feature-vs-Age scatter plots comparing original and attacked samples,
# assembled into 3-column grids with patchworklib.
epsilons_hglt = [0.01, 0.02, 0.05, 0.1, 0.2, 0.5]
colors_epsilons = {x: px.colors.qualitative.G10[x_id] for x_id, x in enumerate(['Origin'] + epsilons_hglt)}

# NOTE: these two columns are written to the shared `df` and stay there after
# this cell has run.
df['Eps'] = 'Origin'
df['MarkerSize'] = 40

# Activate the theme BEFORE creating any brick: seaborn themes are applied
# through matplotlib rcParams, which are read when an axes is created. Calling
# set_theme after pw.Brick(...) left the first brick styled by whatever theme
# was active before this cell.
sns.set_theme(style='whitegrid')

for atk in colors_atks:

    for eps in epsilons_hglt:
        path_curr = f"{path_save}/Evasion/{atk}/eps_{eps:0.4f}"
        pathlib.Path(f"{path_curr}/SImAgeError").mkdir(parents=True, exist_ok=True)
        df_adv = pd.read_excel(f"{path_curr}/df.xlsx", index_col='sample_id')
        # Suffix the index so attacked rows do not collide with original rows.
        df_adv.index += f'_eps_{eps:0.4f}'
        df_adv['Eps'] = eps
        df_adv['MarkerSize'] = 30
        df_ori_adv = pd.concat([df, df_adv])

        # One KDE brick and one scatter brick per feature.
        pw_brick_kdes = {}
        pw_brick_scatters = {}
        for f in feats:

            pw_brick_kdes[f] = pw.Brick(figsize=(4, 3))
            kdeplot = sns.kdeplot(
                data=df_ori_adv,
                x=f,
                hue='Eps',
                palette={'Origin': 'grey', eps: colors_epsilons[eps]},
                hue_order=['Origin', eps],
                fill=True,
                common_norm=False,
                ax=pw_brick_kdes[f]
            )

            pw_brick_scatters[f] = pw.Brick(figsize=(4, 3))
            scatterplot = sns.scatterplot(
                data=df_ori_adv,
                x=f,
                y='Age',
                hue='Eps',
                palette={'Origin': 'grey', eps: colors_epsilons[eps]},
                hue_order=['Origin', eps],
                linewidth=0.85,
                alpha=0.75,
                edgecolor="k",
                marker='o',
                s=30,
                ax=pw_brick_scatters[f]
            )

        # Arrange the bricks row by row; cells beyond the last feature are
        # filled with an invisible placeholder brick.
        n_cols = 3
        n_rows = int(np.ceil(len(feats) / n_cols))
        pw_rows_kdes = []
        pw_rows_scatters = []
        for r_id in range(n_rows):
            pw_cols_kdes = []
            pw_cols_scatters = []
            for c_id in range(n_cols):
                rc_id = r_id * n_cols + c_id
                if rc_id < len(feats):
                    f = feats[rc_id]
                    pw_cols_kdes.append(pw_brick_kdes[f])
                    pw_cols_scatters.append(pw_brick_scatters[f])
                else:
                    # NOTE(review): the same placeholder object is placed in
                    # both grids - confirm patchworklib tolerates that.
                    empty_fig = pw.Brick(figsize=(4.67, 3))
                    empty_fig.axis('off')
                    pw_cols_kdes.append(empty_fig)
                    pw_cols_scatters.append(empty_fig)
            pw_rows_kdes.append(pw.stack(pw_cols_kdes, operator="|"))
            pw_rows_scatters.append(pw.stack(pw_cols_scatters, operator="|"))
        pw_fig_kde = pw.stack(pw_rows_kdes, operator="/")
        pw_fig_kde.savefig(f"{path_curr}/feats_kde.png", bbox_inches='tight', dpi=200)
        pw_fig_kde.savefig(f"{path_curr}/feats_kde.pdf", bbox_inches='tight')
        pw_fig_scatter = pw.stack(pw_rows_scatters, operator="/")
        pw_fig_scatter.savefig(f"{path_curr}/feats_scatter.png", bbox_inches='tight', dpi=200)
        pw_fig_scatter.savefig(f"{path_curr}/feats_scatter.pdf", bbox_inches='tight')
        # Drop all bricks before the next attack / budget combination.
        pw.clear()

### Outliers analysis for attacks

In [None]:
# IQR-based and PyOD-based outlier plots for every attack and highlighted budget.
epsilons_hglt = [0.01, 0.02, 0.05, 0.1, 0.2, 0.5]
# 'Origin' and the budgets take the leading colours of the G10 palette, in order.
colors_epsilons = dict(zip(['Origin', *epsilons_hglt], px.colors.qualitative.G10))

for atk in colors_atks:

    for eps in epsilons_hglt:
        path_curr = f"{path_save}/Evasion/{atk}/eps_{eps:0.4f}"
        os.makedirs(f"{path_curr}/SImAgeError", exist_ok=True)
        df_adv = pd.read_excel(f"{path_curr}/df.xlsx", index_col='sample_id')

        # (output sub-folder, plotting routine, its second argument):
        # IQR outliers first, PyOD outliers second.
        for outs_dir, plot_outs, outs_arg in (
            ("outliers_iqr", plot_iqr_outs, feats),
            ("outliers_pyod", plot_pyod_outs, pyod_methods),
        ):
            os.makedirs(f"{path_curr}/{outs_dir}", exist_ok=True)
            plot_outs(
                df_adv,
                outs_arg,
                colors_epsilons[eps],
                f"{atk} Eps({eps})",
                f"{path_curr}/{outs_dir}",
            )

### Regression error analysis

In [None]:
# Distributions of "abs(SImAge Error)" for every attack and highlighted budget.
epsilons_hglt = [0.01, 0.02, 0.05, 0.1, 0.2, 0.5]
# 'Origin' and the budgets take the leading colours of the G10 palette, in order.
colors_epsilons = dict(zip(['Origin', *epsilons_hglt], px.colors.qualitative.G10))

for atk in colors_atks:

    for eps in epsilons_hglt:
        path_curr = f"{path_save}/Evasion/{atk}/eps_{eps:0.4f}"
        os.makedirs(f"{path_curr}/errors", exist_ok=True)

        df_adv = pd.read_excel(f"{path_curr}/df.xlsx", index_col='sample_id')

        plot_regression_error_distributions(
            df_adv,
            feats,
            colors_epsilons[eps],
            f"{atk} Eps({eps})",
            f"{path_curr}/errors",
            "abs(SImAge Error)",
        )

### Confidence

In [None]:
# Bootstrap every regression metric for each attack and budget and store the
# bootstrap mean, std and the 0.05 / 0.95 quantiles in confidence/metrics.xlsx.
torch.manual_seed(42)  # seed torch's global RNG before the bootstrap loop
quantiles = torch.tensor([0.05, 0.95])

# Two 10-point linear grids; the set union drops exact duplicates.
epsilons = sorted(
    set(np.linspace(0.1, 1.0, 10)) | set(np.linspace(0.01, 0.1, 10))
)

for atk in colors_atks:

    for eps in epsilons:
        path_curr = f"{path_save}/Evasion/{atk}/eps_{eps:0.4f}"
        os.makedirs(f"{path_curr}/confidence", exist_ok=True)

        df_adv = pd.read_excel(f"{path_curr}/df.xlsx", index_col='sample_id')

        metrics = get_reg_metrics()
        df_metrics = pd.DataFrame(
            index=list(metrics.keys()),
            columns=['mean', 'std', 'q0.05', 'q0.95'],
        )
        # 'Age' is used as the target and 'SImAge' as the prediction.
        y_real = torch.from_numpy(np.float32(df_adv['Age'].values))
        y_pred = torch.from_numpy(np.float32(df_adv['SImAge'].values))
        for metric_name, metric_pair in metrics.items():
            # Only the first element of each entry is handed to the bootstrapper.
            metric = metric_pair[0]
            bootstrap = BootStrapper(
                metric,
                num_bootstraps=200,
                sampling_strategy="multinomial",
                quantile=quantiles,
            )
            bootstrap.update(y_pred, y_real)
            bootstrap_output = bootstrap.compute()

            for stat_name in ('mean', 'std'):
                df_metrics.at[metric_name, stat_name] = bootstrap_output[stat_name].detach().cpu().numpy()
            # The quantile tensor follows the order of `quantiles`: [0.05, 0.95].
            bootstrap_quantiles = bootstrap_output['quantile'].detach().cpu().numpy()
            df_metrics.at[metric_name, 'q0.05'] = bootstrap_quantiles[0]
            df_metrics.at[metric_name, 'q0.95'] = bootstrap_quantiles[1]
        df_metrics.to_excel(f"{path_curr}/confidence/metrics.xlsx", index_label='Metrics')

In [None]:
# For every attack: collect the 0.05 / 0.95 bootstrap quantiles of the selected
# metrics for each budget and draw them as vertical intervals over a
# logarithmic epsilon axis.
metrics_names = {
    'mean_absolute_error': 'MAE',
    'pearson_corr_coef': 'Pearson rho'
}
quantiles = [0.05, 0.95]
# Two 10-point linear grids; the set union drops exact duplicates.
epsilons = sorted(set.union(
    set(np.linspace(0.1, 1.0, 10)),
    set(np.linspace(0.01, 0.1, 10)),
))

# Activate the theme BEFORE creating any axes: seaborn themes are applied
# through matplotlib rcParams, which are read when an axes is created. Calling
# set_theme after plt.subplots() left the first figure styled by whatever
# theme was active before this cell.
sns.set_theme(style='ticks')

# Epsilon-axis range. These limits used to be assigned but never applied; they
# are the same values the MAE-vs-epsilon plot uses for its x-axis.
x_min = 0.009
x_max = 1.05

for atk in colors_atks:
    # One row per budget, one column per (metric, quantile) pair.
    df_conf = pd.DataFrame(index=epsilons, columns=[f"{m}_{q}" for m in metrics_names for q in quantiles])
    for eps in (pbar := tqdm(epsilons)):
        pbar.set_description(f"Processing Eps: {eps}")
        df_metrics = pd.read_excel(f"{path_save}/Evasion/{atk}/eps_{eps:0.4f}/confidence/metrics.xlsx", index_col='Metrics')
        for m in metrics_names:
            for q in quantiles:
                df_conf.at[eps, f"{m}_{q}"] = df_metrics.at[m, f"q{q}"]

    for m in metrics_names:
        # Long format: one row per (budget, quantile); 'Type' holds the budget.
        df_fig = df_conf.loc[:, [f"{m}_{q}" for q in quantiles]].copy()
        df_fig['Type'] = df_fig.index
        df_fig = df_fig.melt(id_vars=['Type'], value_name=metrics_names[m])
        fig, ax = plt.subplots(figsize=(5, 4))
        # Interval end points; every budget is drawn in the attack's colour.
        scatter = sns.scatterplot(
            data=df_fig,
            y=metrics_names[m],
            x='Type',
            hue='Type',
            palette={x: colors_atks[atk] for x in epsilons},
            hue_order=epsilons,
            linewidth=0.2,
            alpha=0.95,
            edgecolor="black",
            s=16,
            ax=ax
        )
        scatter.get_legend().set_visible(False)
        # Segment joining the two end points of each budget.
        line = sns.lineplot(
            data=df_fig,
            y=metrics_names[m],
            x='Type',
            hue='Type',
            palette={x: colors_atks[atk] for x in epsilons},
            hue_order=epsilons,
            linewidth=3,
            ax=ax
        )
        line.get_legend().set_visible(False)
        ax.set_xscale('log')
        ax.set_xlabel(r'$\epsilon$')
        ax.set_xlim(x_min, x_max)
        ax.set_ylabel(f"Confidence Intervals for {metrics_names[m]}")
        fig.savefig(f"{path_save}/Evasion/{atk}/confidence_{m}.png", bbox_inches='tight', dpi=400)
        fig.savefig(f"{path_save}/Evasion/{atk}/confidence_{m}.pdf", bbox_inches='tight')
        plt.close(fig)