In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
from src.utils.outliers.iqr import add_iqr_outs_to_df, plot_iqr_outs, plot_iqr_outs_cls
from src.utils.outliers.pyod import add_pyod_outs_to_df, plot_pyod_outs, plot_pyod_outs_cls
from scripts.python.dataset_specific.GSEUNN.tasks.routines_046 import plot_regression_error_distributions, plot_cls_dim_red
from plotly.subplots import make_subplots
from scipy import stats
import plotly.express as px
from scripts.python.routines.plot.scatter import add_scatter_trace
import plotly.graph_objects as go
from scripts.python.routines.plot.save import save_figure
from scripts.python.routines.plot.layout import add_layout, get_axis
from statsmodels.stats.multitest import multipletests
import plotly.io as pio
import importlib
pio.kaleido.scope.mathjax = None
from plotly.offline import init_notebook_mode, iplot
from scipy.interpolate import interp1d
from src.utils.verbose import NoStdStreams
init_notebook_mode(connected=False)
import matplotlib.pyplot as plt
from matplotlib import colors
from omegaconf import OmegaConf
from tqdm import tqdm
import seaborn as sns
from glob import glob
import pathlib
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.decomposition import MiniBatchDictionaryLearning, FastICA
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.manifold import MDS, Isomap
from openTSNE import TSNE
from sklearn.metrics import mean_absolute_error
from scipy import stats
import patchworklib as pw
import os
import functools
from scipy.stats import iqr
from statannotations.Annotator import Annotator
from scipy.stats import mannwhitneyu, kruskal
import shap
from slugify import slugify
from src.models.tabular.widedeep.ft_transformer import WDFTTransformerModel
from src.models.tabular.widedeep.tab_net import WDTabNetModel
from art.estimators.regression.pytorch import PyTorchRegressor
from art.estimators.classification import PyTorchClassifier
from art.estimators.regression.blackbox import BlackBoxRegressor
from art.attacks.evasion import ProjectedGradientDescentNumpy, FastGradientMethod, BasicIterativeMethod, MomentumIterativeMethod
from art.attacks.evasion import ZooAttack, CarliniL2Method, ElasticNet, NewtonFool
import torch
from src.tasks.metrics import get_cls_pred_metrics, get_cls_prob_metrics
import matplotlib.lines as mlines

from sdv.metadata import SingleTableMetadata
from sdv.lite import SingleTablePreset
from sdv.single_table import GaussianCopulaSynthesizer, CTGANSynthesizer, TVAESynthesizer, CopulaGANSynthesizer
from sdv.evaluation.single_table import evaluate_quality
from sdv.evaluation.single_table import get_column_plot
from sdv.evaluation.single_table import get_column_pair_plot
from sklearn.preprocessing import StandardScaler
from scripts.python.routines.mvals import expit2

import missingno as msno

import joblib
import pickle

from pyod.models.ecod import ECOD
from pyod.models.abod import ABOD
from pyod.models.copod import COPOD
from pyod.models.sos import SOS
from pyod.models.kde import KDE
from pyod.models.sampling import Sampling
from pyod.models.gmm import GMM

from pyod.models.kpca import KPCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lmdd import LMDD

from pyod.models.lof import LOF
from pyod.models.cof import COF
from pyod.models.cblof import CBLOF
from pyod.models.hbos import HBOS
from pyod.models.knn import KNN
from pyod.models.sod import SOD

from pyod.models.iforest import IForest
from pyod.models.inne import INNE
from pyod.models.loda import LODA
from pyod.models.suod import SUOD

from pyod.models.auto_encoder_torch import AutoEncoder
from pyod.models.vae import VAE
from pyod.models.deep_svdd import DeepSVDD

from pyod.models.lunar import LUNAR

from torchmetrics import BootStrapper


def conjunction(conditions):
    """Combine boolean conditions with an element-wise logical AND.

    The conditions are folded left to right with ``np.logical_and``.
    A single condition is returned unchanged; an empty iterable raises
    ``TypeError``.
    """
    remaining = iter(conditions)
    try:
        combined = next(remaining)
    except StopIteration:
        raise TypeError("reduce() of empty iterable with no initial value") from None
    for condition in remaining:
        combined = np.logical_and(combined, condition)
    return combined


def disjunction(conditions):
    """Combine boolean conditions with an element-wise logical OR.

    The conditions are folded left to right with ``np.logical_or``.
    A single condition is returned unchanged; an empty iterable raises
    ``TypeError``.
    """
    remaining = iter(conditions)
    try:
        combined = next(remaining)
    except StopIteration:
        raise TypeError("reduce() of empty iterable with no initial value") from None
    for condition in remaining:
        combined = np.logical_or(combined, condition)
    return combined

# 2. Adversarial examples for DNAm data

## Data processing

### Prepare data for ML, convert mvals to betas

In [None]:
# Transform the selected CpG feature columns with ``expit2`` and save the table
# used by the following cells as ``betas.xlsx``.
path = "D:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN/special/046_adversarial_robustness_toolbox/dnam"

df_mvals = pd.read_excel(f"{path}/mvals.xlsx", index_col=0)
feats = pd.read_excel(f"{path}/feats_1000.xlsx", index_col=0).index.values

# NOTE(review): expit2 presumably maps M-values to beta values - confirm in scripts.python.routines.mvals.
betas = expit2(df_mvals.loc[:, feats].values)
# Only the feature columns are overwritten; every other column is copied as-is.
df_betas = df_mvals.copy()
df_betas.loc[:, feats] = betas
# Assign the result back instead of calling ``replace(..., inplace=True)`` on the
# selected column: the chained in-place form acts on a temporary object under
# pandas Copy-on-Write and would silently leave ``df_betas`` unchanged.
df_betas['Partition'] = df_betas['Partition'].replace({'Train': 'trn_val', 'Validation': 'tst'})
df_betas.to_excel(f"{path}/betas.xlsx", index_label='subject_id')

### Collect ML results

In [None]:
# Gather the metrics and Hydra overrides of every run of the multirun sweep
# into a single summary table (one row per metrics file).
path = "D:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN/special/046_adversarial_robustness_toolbox/dnam"

model = 'widedeep_tab_net'

path_runs = f"{path}/models/{model}_trn_val_tst/multiruns"

files = glob(f"{path_runs}/*/*/metrics_all_best_*.xlsx")
if not files:
    # Fail with a clear message instead of an IndexError on files[0] below.
    raise FileNotFoundError(f"No metrics files found in {path_runs}")

# Names of the overridden hyper-parameters, taken from the first run.
head, tail = os.path.split(files[0])
cfg = OmegaConf.load(f"{head}/.hydra/overrides.yaml")
params = [param_pair.split('=', 1)[0] for param_pair in cfg]

parts = [
    'trn',
    'val',
    'tst',
    'val_tst'
]

# One record per run; building the frame once avoids growing it cell by cell.
records = []
for file in files:
    head, tail = os.path.split(file)
    record = {}

    # Metrics
    df_metrics = pd.read_excel(file, index_col="metric")
    for metric in df_metrics.index.values:
        for part in parts:
            record[metric + f"_{part}"] = df_metrics.at[metric, part]

    # Params
    cfg = OmegaConf.load(f"{head}/.hydra/overrides.yaml")
    for param_pair in cfg:
        # Split on the first '=' only, so values that contain '=' stay intact.
        param, val = param_pair.split('=', 1)
        record[param] = val

    records.append(record)

df_res = pd.DataFrame(records, index=files)

# Flag runs whose validation accuracy exceeds the training accuracy.
df_res["train_worse_val"] = df_res["accuracy_weighted_val"] > df_res["accuracy_weighted_trn"]

# Index the summary by the path relative to the multirun directory.
df_res["File"] = df_res.index.str.replace(path_runs, '', regex=False)
df_res.set_index("File", inplace=True)

first_columns = [
    'accuracy_weighted_trn',
    'accuracy_weighted_val',
    'accuracy_weighted_tst',
    'accuracy_weighted_val_tst'
]
df_res = df_res[first_columns + [col for col in df_res.columns if col not in first_columns]]
df_res.to_excel(f"{path_runs}/summary.xlsx", index=True, index_label="file")

### Load data, models

In [None]:
# Model selection
model_type = 'widedeep_tab_net'
model_fn = 'best_fold_0000'
model_version = 'v2'

# Data: beta values, feature list and the stored predictions of the chosen model
path = 'D:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN/special/046_adversarial_robustness_toolbox/dnam'
pathlib.Path(f"{path}/Origin").mkdir(parents=True, exist_ok=True)
df = pd.read_excel(f"{path}/betas.xlsx", index_col=0)
feats = pd.read_excel(f"{path}/feats_1000.xlsx", index_col=0).index.values
ids_feat = [*range(len(feats))]

df_pred = pd.read_excel(f"{path}/models/{model_type}/{model_version}/predictions.xlsx", index_col=0)
# Copy label, prediction and class probabilities into the data table, aligned on df's index.
df.loc[df.index, ['Real', 'Pred', 'Prob Control', 'Prob Parkinson']] = df_pred.loc[
    df.index, ['Status', 'pred', 'pred_prob_0', 'pred_prob_1']
].values
df['Data'] = 'Real'
df['Eps'] = 'Origin'

col_real = 'Real'
col_pred = 'Pred'

# Sample ids of every partition
ids_dict = {
    name: df.index[df['Partition'].isin(partitions)].values
    for name, partitions in (
        ('trn_val', ['trn_val']),
        ('tst', ['tst']),
        ('all', ['trn_val', 'tst']),
    )
}
ids_trn_val = ids_dict['trn_val']
ids_tst = ids_dict['tst']
ids_all = ids_dict['all']

# Trained classifier, switched to inference mode
model = WDTabNetModel.load_from_checkpoint(
    checkpoint_path=f"{path}/models/{model_type}/{model_version}/{model_fn}.ckpt"
)
model.produce_probabilities = False
model.eval()
model.freeze()

# Colors of the augmentation methods and of the attack families
colors_augs = dict(zip(
    ['FAST_ML', 'GaussianCopula', 'CTGANSynthesizer', 'TVAESynthesizer', 'CopulaGANSynthesizer'],
    px.colors.qualitative.Light24,
))
colors_atks_eps = dict(zip(
    ["MomentumIterative", "BasicIterative", "ProjectedGradientDescent", "FastGradient"],
    px.colors.qualitative.D3,
))
colors_atks_bss = dict(zip(
    ["ElasticNet", "CarliniL2Method", "ZooAttack"],
    px.colors.qualitative.G10[7:10],
))
colors_atks_eta = {'NewtonFool': px.colors.qualitative.T10[7]}

# Column names of the two components of every dimensionality reduction method
dim_red_labels = {
    method: [f"{prefix} 1", f"{prefix} 2"]
    for method, prefix in (
        ('PCA', 'PC'),
        ('SVD', 'SVD'),
        ('t-SNE', 't-SNE'),
        ('GRP', 'GRP'),
        ('SRP', 'SRP'),
        ('IsoMap', 'IsoMap'),
        ('MBDL', 'MBDL'),
    )
}

# Names of the PyOD detectors, in the order used for plotting
pyod_method_names = (
    'ECOD LUNAR DeepSVDD VAE LODA INNE IForest SOD '
    'KNN CBLOF LOF MCD GMM Sampling SOS COPOD'
).split()

### Create PyOD models, trained on trn_val samples

In [None]:
contamination = 0.1
# Keyword arguments shared by every detector that takes ``contamination``.
shared_kwargs = {'contamination': contamination}

pyod_methods = {
    'ECOD': ECOD(**shared_kwargs),
    'LUNAR': LUNAR(),
    'DeepSVDD': DeepSVDD(verbose=0, **shared_kwargs),
    'VAE': VAE(encoder_neurons=[32, 16, 8], decoder_neurons=[8, 16, 32], **shared_kwargs),
    'LODA': LODA(**shared_kwargs),
    'INNE': INNE(**shared_kwargs),
    'IForest': IForest(**shared_kwargs),
    'SOD': SOD(**shared_kwargs),
    'KNN': KNN(**shared_kwargs),
    'CBLOF': CBLOF(**shared_kwargs),
    'LOF': LOF(**shared_kwargs),
    'MCD': MCD(**shared_kwargs),
    'GMM': GMM(**shared_kwargs),
    'Sampling': Sampling(**shared_kwargs),
    'SOS': SOS(**shared_kwargs),
    'COPOD': COPOD(**shared_kwargs),
}

# Fit every detector on the features of the trn_val samples.
pbar = tqdm(pyod_methods.items())
for method_name, method in pbar:
    pbar.set_description(f"Processing {method_name}")

    method.fit(df.loc[ids_trn_val, feats].values)

### Create dimensionality reduction models, trained on trn_val samples

In [None]:
X_dim_red = df.loc[ids_trn_val, feats].values
random_state = 42

# Two-component estimators, not fitted yet.
dim_red_estimators = {
    'PCA': PCA(n_components=2, whiten=False, random_state=random_state),
    'SVD': TruncatedSVD(n_components=2, algorithm='randomized', n_iter=5, random_state=random_state),
    't-SNE': TSNE(n_components=2, random_state=random_state),
    'GRP': GaussianRandomProjection(n_components=2, eps=0.5, random_state=random_state),
    'SRP': SparseRandomProjection(n_components=2, density='auto', eps=0.5, dense_output=False, random_state=random_state),
    'IsoMap': Isomap(n_components=2, n_neighbors=5),
    'MBDL': MiniBatchDictionaryLearning(n_components=2, batch_size=100, alpha=1, n_iter=25, random_state=random_state),
}

# Keep whatever ``fit`` returns for each method, fitted on the trn_val features.
dim_red_models = {
    name: estimator.fit(X_dim_red)
    for name, estimator in dim_red_estimators.items()
}

### Original data processing

In [None]:
# Outlier annotations: PyOD detectors and IQR limits taken from the trn_val samples.
add_pyod_outs_to_df(df, pyod_methods, feats)
add_iqr_outs_to_df(df, df.loc[ids_trn_val, :], feats)

# Project all samples with every fitted model and store the two components.
pbar = tqdm(dim_red_models.items())
for method_name, method in pbar:
    pbar.set_description(f"Processing {method_name}")
    dim_red_res = method.transform(df.loc[:, feats].values)
    for component in (0, 1):
        df.loc[:, dim_red_labels[method_name][component]] = dim_red_res[:, component]

df.to_excel(f"{path}/Origin/df.xlsx", index_label='sample_id')

### Load original processed data

In [None]:
# Reload the processed table from Origin/df.xlsx (the file written by the
# "Original data processing" cell); the first column is used as the index.
df = pd.read_excel(f"{path}/Origin/df.xlsx", index_col=0)

### Original data plots in reduced dimension

In [None]:
path_dim_red = f"{path}/Origin/dim_red"
pathlib.Path(path_dim_red).mkdir(parents=True, exist_ok=True)

# All component columns plus the label, prediction and Parkinson probability.
cols_components = [col for pair in dim_red_labels.values() for col in pair]
df_fig = df.loc[:, cols_components + ['Real', 'Pred', 'Prob Parkinson']].copy()
for real_value, status in ((0, 'Control'), (1, 'Parkinson')):
    df_fig.loc[df_fig['Real'] == real_value, 'Status'] = status

# One figure per dimensionality reduction method.
for method, cols_dim_red in dim_red_labels.items():
    plot_cls_dim_red(
        df=df_fig,
        col_class='Status',
        cls_names=['Control', 'Parkinson'],
        col_prob='Prob Parkinson',
        cols_dim_red=cols_dim_red,
        title='Original',
        fn=f"{path_dim_red}/{method}"
    )

### Features distributions plots

In [None]:
# Compare every feature between Control and Parkinson samples with a two-sided
# Mann-Whitney U test, correct the p-values (Benjamini-Hochberg) and plot the
# distribution of the corrected -log10 p-values.
pathlib.Path(f"{path}/Origin/feats").mkdir(parents=True, exist_ok=True)

is_control = df['Real'] == 0
is_parkinson = df['Real'] == 1

df_stat = pd.DataFrame(index=feats, columns=['mw_pval', 'mw_pval_fdr_bh'])
for f in feats:
    _, df_stat.at[f, 'mw_pval'] = mannwhitneyu(
        df.loc[is_control, f].values,
        df.loc[is_parkinson, f].values,
        alternative='two-sided'
    )
# The frame was created without data, so its columns are object dtype;
# hand plain floats to the multiple-testing correction.
df_stat['mw_pval'] = df_stat['mw_pval'].astype(float)
df_stat['mw_pval_fdr_bh'] = multipletests(df_stat['mw_pval'].values, 0.05, method='fdr_bh')[1]
df_stat.sort_values(['mw_pval_fdr_bh'], ascending=[True], inplace=True)
df_stat[r'$ -\log_{10}(\mathrm{p-value})$'] = -np.log10(df_stat['mw_pval_fdr_bh'].astype(float))
df_stat.to_excel(f"{path}/Origin/feats/df_stat.xlsx", index_label="Features")

# The theme only affects axes created afterwards, so set it before the figure.
sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(figsize=(10, 4))
kdeplot = sns.kdeplot(
    data=df_stat,
    x=r'$ -\log_{10}(\mathrm{p-value})$',
    color='darkgreen',
    linewidth=2,
    cut=0,
    fill=True,
    ax=ax
)
kdeplot.set_title('Features Distribution Differences')
# Save through the figure object instead of relying on pyplot's current figure.
fig.savefig(f"{path}/Origin/feats/kde_pval.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path}/Origin/feats/kde_pval.pdf", bbox_inches='tight')
plt.close(fig)

In [None]:
# Split violin plots (Control vs Parkinson) of the features with the smallest
# corrected p-values; ``df_stat`` is expected to be sorted by p-value already.
n_top_features = 10
top_features = list(df_stat.index[0:n_top_features])
df_fig = df.loc[:, top_features + ['Real']].copy()
df_fig.loc[df_fig['Real'] == 0, 'Status'] = 'Control'
df_fig.loc[df_fig['Real'] == 1, 'Status'] = 'Parkinson'
df_fig = df_fig.melt(id_vars=['Status'], value_vars=top_features, var_name='CpG', value_name='Methylation')
# Add the corrected p-value to every feature label. The result is assigned back
# because ``df_fig['CpG'].replace(..., inplace=True)`` modifies a temporary
# object under pandas Copy-on-Write and would leave ``df_fig`` unchanged.
cpg_labels = {x: f"{x}\npval: {df_stat.at[x, 'mw_pval_fdr_bh']:0.2e}" for x in top_features}
df_fig['CpG'] = df_fig['CpG'].replace(cpg_labels)

# The theme only affects axes created afterwards, so set it before the figure.
sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(figsize=(10, 1 * n_top_features))
violin = sns.violinplot(
    data=df_fig,
    x='Methylation',
    y='CpG',
    orient='h',
    hue='Status',
    split=True,
    linewidth=1,
    palette={'Control': 'dodgerblue', 'Parkinson': 'crimson'},
    hue_order=['Control', 'Parkinson'],
    cut=0,
    inner="quart",
    ax=ax
)
# Save through the figure object instead of relying on pyplot's current figure.
fig.savefig(f"{path}/Origin/feats/violins.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path}/Origin/feats/violins.pdf", bbox_inches='tight')
plt.close(fig)

### Probability distribution plot

In [None]:
# Distribution of the predicted Parkinson probability for each real class.
# Only the probability and the label are needed; the feature columns selected
# previously were unused and tied this cell to the feature-statistics cells.
df_fig = df.loc[:, ['Prob Parkinson', 'Real']].copy()
df_fig.loc[df_fig['Real'] == 0, 'Status'] = 'Control'
df_fig.loc[df_fig['Real'] == 1, 'Status'] = 'Parkinson'

# The theme only affects axes created afterwards, so set it before the figure.
sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(figsize=(10, 4))
kdeplot = sns.kdeplot(
    data=df_fig,
    x='Prob Parkinson',
    hue='Status',
    palette={'Control': 'dodgerblue', 'Parkinson': 'crimson'},
    hue_order=['Control', 'Parkinson'],
    linewidth=2,
    cut=0,
    fill=True,
    ax=ax
)
# Save through the figure object instead of relying on pyplot's current figure.
fig.savefig(f"{path}/Origin/kde_proba.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{path}/Origin/kde_proba.pdf", bbox_inches='tight')
plt.close(fig)

### Outliers analysis

In [None]:
def _with_status(columns):
    """Return a copy of the given ``df`` columns with a textual 'Status' column added."""
    frame = df.loc[:, columns].copy()
    frame.loc[frame['Real'] == 0, 'Status'] = 'Control'
    frame.loc[frame['Real'] == 1, 'Status'] = 'Parkinson'
    return frame


# IQR plots
path_iqr = f"{path}/Origin/outliers_iqr"
pathlib.Path(path_iqr).mkdir(parents=True, exist_ok=True)
plot_iqr_outs(df, feats, 'grey', 'Origin', path_iqr, is_msno_plots=False)
df_fig = _with_status(['Real', 'Pred', 'n_outs_iqr', 'Prob Control', 'Prob Parkinson'])
plot_iqr_outs_cls(
    df=df_fig,
    path=path_iqr,
    col_class="Status",
    col_pred="Pred",
    col_real="Real",
    cols_prob=['Prob Control', 'Prob Parkinson'],
    palette={'Control': 'dodgerblue', 'Parkinson': 'crimson'}
)

# PyOD plots
path_pyod = f"{path}/Origin/outliers_pyod"
pathlib.Path(path_pyod).mkdir(parents=True, exist_ok=True)
plot_pyod_outs(df, pyod_method_names, 'grey', 'Origin', path_pyod, n_cols=4)
df_fig = _with_status(['Real', 'Pred', 'Detections', 'Prob Control', 'Prob Parkinson'])
plot_pyod_outs_cls(
    df=df_fig,
    path=path_pyod,
    col_class="Status",
    col_pred="Pred",
    col_real="Real",
    cols_prob=['Prob Control', 'Prob Parkinson'],
    palette={'Control': 'dodgerblue', 'Parkinson': 'crimson'}
)

### Confidence

In [None]:
# Bootstrap estimates (mean, std, 5% and 95% quantiles) of the prediction
# metrics on the test partition.
torch.manual_seed(42)
quantiles = torch.tensor([0.05, 0.95])

pathlib.Path(f"{path}/Origin/confidence").mkdir(parents=True, exist_ok=True)

metrics_pred = get_cls_pred_metrics(num_classes=2)
df_metrics = pd.DataFrame(index=list(metrics_pred.keys()))

df_ori = df.loc[ids_tst, :].copy()
for real_value, status in ((0, 'Control'), (1, 'Parkinson')):
    df_ori.loc[df_ori['Real'] == real_value, 'Status'] = status
df_ori['Type'] = 'Real'
y_real_ori = torch.from_numpy(df_ori.loc[:, "Real"].values.astype('int32'))
y_pred_ori = torch.from_numpy(df_ori.loc[:, "Pred"].values.astype('int32'))


def _to_numpy(tensor):
    """Detach a tensor and return it as a NumPy array on the CPU."""
    return tensor.detach().cpu().numpy()


for metric_name, metric_pair in metrics_pred.items():
    # The first element of every entry is the metric object itself.
    metric = metric_pair[0]
    bootstrap = BootStrapper(
        metric,
        num_bootstraps=200,
        sampling_strategy="multinomial",
        quantile=quantiles
    )
    bootstrap.update(y_pred_ori, y_real_ori)
    bootstrap_output = bootstrap.compute()
    q_low, q_high = _to_numpy(bootstrap_output['quantile'])
    df_metrics.at[metric_name, 'mean'] = _to_numpy(bootstrap_output['mean'])
    df_metrics.at[metric_name, 'std'] = _to_numpy(bootstrap_output['std'])
    df_metrics.at[metric_name, 'q0.05'] = q_low
    df_metrics.at[metric_name, 'q0.95'] = q_high
df_metrics.to_excel(f"{path}/Origin/confidence/metrics.xlsx", index_label='Metrics')

## 2.2 Augmented data

In [None]:
# Number of synthetic samples drawn from every synthesizer.
n_smps = 10000

# The synthesizers are trained on the label plus all feature columns.
df_aug_sdv_input = df.loc[:, ['Real', *feats]]
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=df_aug_sdv_input)
metadata.update_column('Real', sdtype='categorical')

synthesizers = {
    'GaussianCopula': GaussianCopulaSynthesizer(metadata),
    'CTGANSynthesizer': CTGANSynthesizer(metadata),
    'TVAESynthesizer': TVAESynthesizer(metadata),
    'CopulaGANSynthesizer': CopulaGANSynthesizer(metadata),
    'FAST_ML': SingleTablePreset(metadata, name='FAST_ML'),
}

pbar = tqdm(synthesizers.items())
for s_name, s in pbar:
    pbar.set_description(f"Processing {s_name}")
    path_curr = f"{path}/Augmentation/{s_name}"
    pathlib.Path(path_curr).mkdir(parents=True, exist_ok=True)

    # Train, store and sample the synthesizer, then score the synthetic table.
    s.fit(data=df_aug_sdv_input)
    s.save(filepath=f"{path_curr}/synthesizer.pkl")
    df_aug_sdv = s.sample(num_rows=n_smps)
    quality_report = evaluate_quality(df_aug_sdv_input, df_aug_sdv, metadata)

    q_rep_prop = quality_report.get_properties()
    q_rep_prop.set_index('Property', inplace=True)

    df_col_shapes = quality_report.get_details(property_name='Column Shapes')
    df_col_shapes.sort_values(["Score"], ascending=[False], inplace=True)
    df_col_shapes.to_excel(f"{path_curr}/ColumnShapes.xlsx", index=False)

    df_col_pair_trends = quality_report.get_details(property_name='Column Pair Trends')
    df_col_pair_trends.to_excel(f"{path_curr}/ColumnPairTrends.xlsx", index=False)

    # Classifier predictions for the synthetic samples.
    model.produce_probabilities = True
    X_aug = torch.from_numpy(np.float32(df_aug_sdv.loc[:, feats].values))
    y_pred_prob = model(X_aug).cpu().detach().numpy()
    y_pred = np.argmax(y_pred_prob, 1)
    df_aug_sdv["Pred"] = y_pred
    df_aug_sdv["Prob Control"] = y_pred_prob[:, 0]
    df_aug_sdv["Prob Parkinson"] = y_pred_prob[:, 1]

    # Two-component projections with the previously fitted models.
    for m, drm in dim_red_models.items():
        dim_red_res = drm.transform(df_aug_sdv.loc[:, feats].values)
        for component in (0, 1):
            df_aug_sdv.loc[:, dim_red_labels[m][component]] = dim_red_res[:, component]

    # Outlier annotations, same helpers as for the original data.
    add_iqr_outs_to_df(df_aug_sdv, df.loc[ids_trn_val, :], feats)
    add_pyod_outs_to_df(df_aug_sdv, pyod_methods, feats)

    df_aug_sdv.to_excel(f"{path_curr}/df.xlsx", index_label='sample_id')


### Calculate metrics

In [None]:
# Evaluate the classifier on every synthetic data set and, for reference, on
# the real test partition; one metrics.xlsx is written per synthesizer.
def _metric_value(metric, preds, target, context, fallback=None):
    """Evaluate ``metric`` on ``(preds, target)`` and always reset it afterwards.

    When ``fallback`` is given, a ``ValueError`` raised during the computation
    is reported through ``tqdm.write`` and ``fallback`` is returned instead of
    being swallowed silently. Without ``fallback`` the error propagates.
    """
    try:
        return float(metric(preds, target).numpy())
    except ValueError as err:
        if fallback is None:
            raise
        tqdm.write(f"{context}: metric could not be computed ({err}); storing {fallback}")
        return fallback
    finally:
        metric.reset()


df_ori = df.loc[ids_tst, :].copy()
df_ori.loc[df_ori['Real'] == 0, 'Status'] = 'Control'
df_ori.loc[df_ori['Real'] == 1, 'Status'] = 'Parkinson'
df_ori['Type'] = 'Real'

# The tensors of the real test samples do not depend on the synthesizer.
y_real_ori = torch.from_numpy(df_ori.loc[:, "Real"].values.astype('int32'))
y_pred_ori = torch.from_numpy(df_ori.loc[:, "Pred"].values.astype('int32'))
y_prob_ori = torch.from_numpy(df_ori.loc[:, ["Prob Control", "Prob Parkinson"]].values)

for s_name in (pbar := tqdm(colors_augs)):
    pbar.set_description(f"Processing {s_name}")
    path_curr = f"{path}/Augmentation/{s_name}"

    df_aug = pd.read_excel(f"{path_curr}/df.xlsx", index_col='sample_id')

    metrics_pred = get_cls_pred_metrics(num_classes=2)
    metrics_prob = get_cls_prob_metrics(num_classes=2)
    df_metrics = pd.DataFrame(index=list(metrics_pred.keys()) + list(metrics_prob.keys()))

    y_real_aug = torch.from_numpy(df_aug.loc[:, "Real"].values.astype('int32'))
    y_pred_aug = torch.from_numpy(df_aug.loc[:, "Pred"].values.astype('int32'))
    y_prob_aug = torch.from_numpy(df_aug.loc[:, ["Prob Control", "Prob Parkinson"]].values)

    # Metrics based on predicted labels: errors are not expected and propagate.
    for m in metrics_pred:
        metric = metrics_pred[m][0]
        df_metrics.at[m, s_name] = _metric_value(metric, y_pred_aug, y_real_aug, f"{s_name}/{m}")
        df_metrics.at[m, 'Origin'] = _metric_value(metric, y_pred_ori, y_real_ori, f"Origin/{m}")

    # Metrics based on probabilities: a ValueError is reported and stored as 0.
    for m in metrics_prob:
        metric = metrics_prob[m][0]
        df_metrics.at[m, s_name] = _metric_value(metric, y_prob_aug, y_real_aug, f"{s_name}/{m}", fallback=0)
        df_metrics.at[m, 'Origin'] = _metric_value(metric, y_prob_ori, y_real_ori, f"Origin/{m}", fallback=0)

    df_metrics.to_excel(f"{path_curr}/metrics.xlsx", index_label='Metrics')

### Plot metrics

In [None]:
# Collect accuracy and AUROC of the real test data and of every synthetic data
# set into one table and draw them as horizontal bar plots.
df_metrics = pd.DataFrame(index=['Real'] + list(colors_augs.keys()), columns=['Accuracy', 'AUROC'])
metrics_dict = {'Accuracy': 'accuracy_weighted', 'AUROC': 'auroc_weighted'}
for s_name in (pbar := tqdm(colors_augs)):
    pbar.set_description(f"Processing {s_name}")
    path_curr = f"{path}/Augmentation/{s_name}"

    df_metrics_curr = pd.read_excel(f"{path_curr}/metrics.xlsx", index_col=0)
    for m, m_key in metrics_dict.items():
        df_metrics.at[s_name, m] = df_metrics_curr.at[m_key, s_name]
        # Every metrics file carries the 'Origin' reference column; take it from
        # the first file that provides a value instead of relying on one
        # particular synthesizer name being present.
        if pd.isna(df_metrics.at['Real', m]):
            df_metrics.at['Real', m] = df_metrics_curr.at[m_key, 'Origin']
df_metrics.to_excel(f"{path}/Augmentation/metrics.xlsx", index_label='Metrics')

# The frame was created without data (object dtype); plot plain floats.
df_metrics = df_metrics.astype(float)
df_metrics['Type'] = df_metrics.index
palette = {'Real': 'grey'} | colors_augs
barplots = {}
for m in metrics_dict:
    # The theme only affects axes created afterwards, so set it before the Brick.
    sns.set_theme(style='whitegrid')
    barplots[m] = pw.Brick(figsize=(3.5, 2.0))
    barplot = sns.barplot(
        data=df_metrics,
        y='Type',
        hue='Type',
        x=m,
        edgecolor='black',
        palette=palette,
        dodge=False,
        ax=barplots[m],
    )
    # Depending on the seaborn version no legend is drawn when ``hue`` repeats
    # the categorical axis, so remove it only when it exists.
    if (legend := barplots[m].get_legend()) is not None:
        legend.remove()
    for container in barplots[m].containers:
        barplots[m].bar_label(container, fmt='%.2f')
pw_fig = barplots['Accuracy'] | barplots['AUROC']
pw_fig.savefig(f"{path}/Augmentation/metrics.png", bbox_inches='tight', dpi=200)
pw_fig.savefig(f"{path}/Augmentation/metrics.pdf", bbox_inches='tight')
pw.clear()

### Plot in reduced dimension

In [None]:
# For every synthesizer and projection: average the predicted Parkinson
# probability of the synthetic samples on a regular grid, draw it as a heatmap
# and overlay the real test samples colored by their own probability.
def _padded_bins(values, n_bins, pad=0.075):
    """Return ``(lower, upper, step, centers, ids)`` of a padded regular binning.

    The value range is extended by ``pad`` times its width on both sides and
    split into ``n_bins`` equal bins; ``ids`` holds the bin index of each value.
    """
    margin = (values.max() - values.min()) * pad
    lower = values.min() - margin
    upper = values.max() + margin
    step = (upper - lower) / n_bins
    centers = np.linspace(start=lower + 0.5 * step, stop=upper - 0.5 * step, num=n_bins)
    # The small epsilon keeps the index valid even for a zero-width range.
    ids = np.floor((values - lower) / (step + 1e-10)).astype(int)
    return lower, upper, step, centers, ids


df_ori = df.loc[ids_tst, :].copy()
df_ori.loc[df_ori['Real'] == 0, 'Status'] = 'Control'
df_ori.loc[df_ori['Real'] == 1, 'Status'] = 'Parkinson'
df_ori['Type'] = 'Real'

n_bins = 20

for s_name in (pbar := tqdm(colors_augs)):
    pbar.set_description(f"Processing {s_name}")
    path_curr = f"{path}/Augmentation/{s_name}"
    pathlib.Path(f"{path_curr}/dim_red").mkdir(parents=True, exist_ok=True)

    df_aug = pd.read_excel(f"{path_curr}/df.xlsx", index_col='sample_id')
    df_ori_aug = pd.concat([df_ori, df_aug])

    # Common color scale for heatmap and scatter: probability range of real + synthetic samples.
    prob_min = df_ori_aug["Prob Parkinson"].min()
    prob_max = df_ori_aug["Prob Parkinson"].max()
    norm = plt.Normalize(prob_min, prob_max)
    sm = plt.cm.ScalarMappable(cmap='seismic', norm=norm)
    sm.set_array([])
    # Color of every plotted (test) sample; independent of the projection method.
    scatter_colors = {
        sample: colors.rgb2hex(sm.to_rgba(prob))
        for sample, prob in df.loc[ids_tst, "Prob Parkinson"].items()
    }
    zs = df_aug.loc[:, "Prob Parkinson"].values

    for m in dim_red_labels:
        col_x, col_y = dim_red_labels[m]
        x_min, x_max, x_shift, x_bin_centers, xs_ids = _padded_bins(df_aug.loc[:, col_x].values, n_bins)
        y_min, y_max, y_shift, y_bin_centers, ys_ids = _padded_bins(df_aug.loc[:, col_y].values, n_bins)

        # Accumulate probability sum and sample count per grid cell in one
        # vectorized pass instead of a Python loop over all samples.
        heat_sum = np.zeros((n_bins, n_bins))
        heat_cnt = np.zeros((n_bins, n_bins))
        np.add.at(heat_sum, (xs_ids, ys_ids), zs)
        np.add.at(heat_cnt, (xs_ids, ys_ids), 1)
        # Empty cells give 0/0 = NaN on purpose (they stay blank in the heatmap).
        with np.errstate(invalid='ignore'):
            heat_mean = heat_sum / heat_cnt
        df_heatmap = pd.DataFrame(data=heat_mean, index=x_bin_centers, columns=y_bin_centers)
        df_heatmap.to_excel(f"{path_curr}/dim_red/{m}_heatmap.xlsx")

        # The theme only affects axes created afterwards, so set it before the figure.
        sns.set_theme(style='whitegrid')
        fig, ax = plt.subplots(figsize=(8, 6))

        # Rows of the image are y bins from top to bottom, columns are x bins.
        ax.imshow(
            X=df_heatmap.transpose().iloc[::-1].values,
            extent=[x_min, x_max, y_min, y_max],
            vmin=prob_min,
            vmax=prob_max,
            aspect=x_shift/y_shift,
            cmap="seismic",
            alpha=0.8
        )

        scatter = sns.scatterplot(
            data=df.loc[ids_tst, :],
            x=col_x,
            y=col_y,
            palette=scatter_colors,
            hue=df.loc[ids_tst, :].index,
            linewidth=0.2,
            alpha=0.75,
            edgecolor="cyan",
            marker='o',
            s=15,
            ax=ax
        )
        if (legend := scatter.get_legend()) is not None:
            legend.remove()
        # ``sm`` is not attached to any axes, so the axes to take space from
        # must be given explicitly (required by recent Matplotlib versions).
        fig.colorbar(sm, ax=ax, label="Prob Parkinson")
        ax.set_title(f'{s_name}', y=1.2, fontsize=14)

        legend_handles = [
            mlines.Line2D([], [], marker='o', linestyle='None', markeredgecolor='k', markerfacecolor='lightgrey', markersize=10, label='Real'),
            mlines.Line2D([], [], marker='s', linestyle='None', markeredgewidth=0, markerfacecolor='lightgrey', markersize=10, label='Synthetic')
        ]
        ax.legend(handles=legend_handles, title="Samples", bbox_to_anchor=(0, 1.02, 1, 0.2), loc="lower left", borderaxespad=0, mode="expand", ncol=2, frameon=False)

        fig.savefig(f"{path_curr}/dim_red/{m}.png", bbox_inches='tight', dpi=200)
        fig.savefig(f"{path_curr}/dim_red/{m}.pdf", bbox_inches='tight')
        plt.close(fig)

### Plot features differences

#### Version 1

In [None]:
# For every synthesizer, compare the real test samples with the synthetic ones:
# class counts, predicted probability distributions, per-feature Mann-Whitney
# p-values and violin plots of the most different features.
df_ori = df.loc[ids_tst, :].copy()
df_ori.loc[df_ori['Real'] == 0, 'Status'] = 'Control'
df_ori.loc[df_ori['Real'] == 1, 'Status'] = 'Parkinson'
df_ori['Type'] = 'Real'
palette = {'Real': 'grey'} | colors_augs
for s_name in (pbar := tqdm(colors_augs)):
    pbar.set_description(f"Processing {s_name}")
    path_curr = f"{path}/Augmentation/{s_name}"
    pathlib.Path(f"{path_curr}/feats").mkdir(parents=True, exist_ok=True)

    df_aug = pd.read_excel(f"{path_curr}/df.xlsx", index_col='sample_id')
    df_aug.loc[df_aug['Real'] == 0, 'Status'] = 'Control'
    df_aug.loc[df_aug['Real'] == 1, 'Status'] = 'Parkinson'
    df_aug['Type'] = s_name
    df_ori_aug = pd.concat([df_ori, df_aug])

    countplots = {}
    kdeplots = {}
    for cls in ['Control', 'Parkinson']:
        df_cls = df_ori_aug.loc[df_ori_aug['Status'] == cls, :]

        # The theme only affects axes created afterwards, so set it before each Brick.
        sns.set_theme(style='whitegrid')
        countplots[cls] = pw.Brick(figsize=(3, 1))
        countplot = sns.countplot(
            data=df_cls,
            y='Type',
            edgecolor='black',
            palette=palette,
            orient='h',
            order=['Real', s_name],
            ax=countplots[cls]
        )
        countplots[cls].bar_label(countplot.containers[0])
        countplots[cls].set_xlabel("Count")
        countplots[cls].set_title(f"{cls} samples")

        sns.set_theme(style='whitegrid')
        kdeplots[cls] = pw.Brick(figsize=(4, 2))
        kde = sns.kdeplot(
            data=df_cls,
            x="Prob Parkinson",
            hue='Type',
            linewidth=2,
            palette=palette,
            hue_order=['Real', s_name],
            fill=True,
            # Normalize every density separately; the previous keyword
            # ``common_true`` was a typo and is not a kdeplot parameter.
            common_norm=False,
            cut=0,
            ax=kdeplots[cls]
        )
        sns.move_legend(kdeplots[cls], "upper center")
        kdeplots[cls].set_title(f"{cls} samples")

    # Per-feature two-sided Mann-Whitney U test: real vs synthetic samples.
    is_real = df_ori_aug['Type'] == 'Real'
    is_synthetic = df_ori_aug['Type'] == s_name
    df_stat = pd.DataFrame(index=feats, columns=['mw_pval', 'mw_pval_fdr_bh'])
    for f in feats:
        _, df_stat.at[f, 'mw_pval'] = mannwhitneyu(
            df_ori_aug.loc[is_real, f].values,
            df_ori_aug.loc[is_synthetic, f].values,
            alternative='two-sided'
        )
    # The frame was created without data, so its columns are object dtype;
    # hand plain floats to the multiple-testing correction.
    df_stat['mw_pval'] = df_stat['mw_pval'].astype(float)
    df_stat['mw_pval_fdr_bh'] = multipletests(df_stat['mw_pval'].values, 0.05, method='fdr_bh')[1]
    df_stat.sort_values(['mw_pval_fdr_bh'], ascending=[True], inplace=True)
    df_stat[r'$ -\log_{10}(\mathrm{p-value})$'] = -np.log10(df_stat['mw_pval_fdr_bh'].astype(float))
    df_stat.to_excel(f"{path_curr}/feats/df_stat.xlsx", index_label="Features")

    sns.set_theme(style='whitegrid')
    brick_kde_pvals = pw.Brick(figsize=(6, 1.75))
    kdeplot = sns.kdeplot(
        data=df_stat,
        x=r'$ -\log_{10}(\mathrm{p-value})$',
        color='darkgreen',
        linewidth=2,
        cut=0,
        fill=True,
        ax=brick_kde_pvals
    )
    brick_kde_pvals.set_title('Features Distribution Differences')

    # Violin plots of the features with the smallest corrected p-values.
    n_top_features = 5
    top_features = list(df_stat.index[0:n_top_features])
    df_fig = df_ori_aug.loc[:, top_features + ['Type']].copy()
    df_fig = df_fig.melt(
        id_vars=['Type'],
        value_vars=top_features,
        var_name='CpG',
        value_name='Methylation')
    # Assign the relabelled column back: ``df_fig['CpG'].replace(..., inplace=True)``
    # modifies a temporary object under pandas Copy-on-Write.
    cpg_labels = {x: f"{x}\npval: {df_stat.at[x, 'mw_pval_fdr_bh']:0.2e}" for x in top_features}
    df_fig['CpG'] = df_fig['CpG'].replace(cpg_labels)

    sns.set_theme(style='whitegrid')
    brick_feats_violins = pw.Brick(figsize=(6, 3))
    violin = sns.violinplot(
        data=df_fig,
        x='Methylation',
        y='CpG',
        orient='h',
        hue='Type',
        split=True,
        linewidth=1,
        palette=palette,
        hue_order=['Real', s_name],
        cut=0,
        inner="quart",
        ax=brick_feats_violins
    )

    # Stack the panels: counts, probability densities, p-value density, violins.
    pw_fig = (countplots['Control'] | countplots['Parkinson']) / (kdeplots['Control'] | kdeplots['Parkinson']) / brick_kde_pvals / brick_feats_violins
    pw_fig.savefig(f"{path_curr}/feats/fig.png", bbox_inches='tight', dpi=200)
    pw_fig.savefig(f"{path_curr}/feats/fig.pdf", bbox_inches='tight')
    pw.clear()

#### Version 2

In [None]:
# Build one summary figure per augmentation method comparing the real test
# samples with the augmented ones: class counts, predicted-probability
# densities, the distribution of KSComplement scores and violin plots for the
# first / last features of the score table.
df_ori = df.loc[ids_tst, :].copy()
df_ori.loc[df_ori['Real'] == 0, 'Status'] = 'Control'
df_ori.loc[df_ori['Real'] == 1, 'Status'] = 'Parkinson'
df_ori['Type'] = 'Real'

# Number of features shown in each violin panel. The original cell defined
# this name but then used `n_top_features` left over from another cell.
n_features = 5

for s_name in (pbar := tqdm(colors_augs)):
    pbar.set_description(f"Processing {s_name}")
    path_curr = f"{path}/Augmentation/{s_name}"
    pathlib.Path(f"{path_curr}/feats").mkdir(parents=True, exist_ok=True)

    # Matplotlib reads rcParams when axes are created, so the theme has to be
    # active before the first Brick of this iteration is instantiated.
    sns.set_theme(style='whitegrid')
    palette = {'Real': 'grey'} | colors_augs
    type_order = ['Real', s_name]

    df_aug = pd.read_excel(f"{path_curr}/df.xlsx", index_col='sample_id')
    df_aug.loc[df_aug['Real'] == 0, 'Status'] = 'Control'
    df_aug.loc[df_aug['Real'] == 1, 'Status'] = 'Parkinson'
    df_aug['Type'] = s_name
    df_ori_aug = pd.concat([df_ori, df_aug])

    countplots = {}
    kdeplots = {}
    for cls in ['Control', 'Parkinson']:
        df_cls = df_ori_aug.loc[df_ori_aug['Status'] == cls, :]

        countplots[cls] = pw.Brick(figsize=(3, 1))
        countplot = sns.countplot(
            data=df_cls,
            y='Type',
            edgecolor='black',
            palette=palette,
            orient='h',
            order=type_order,
            ax=countplots[cls]
        )
        countplots[cls].bar_label(countplot.containers[0])
        countplots[cls].set_xlabel("Count")
        countplots[cls].set_title(f"{cls} samples")

        kdeplots[cls] = pw.Brick(figsize=(4, 2))
        sns.kdeplot(
            data=df_cls,
            x="Prob Parkinson",
            hue='Type',
            linewidth=2,
            palette=palette,
            hue_order=type_order,
            fill=True,
            common_norm=False,
            cut=0,
            ax=kdeplots[cls]
        )
        sns.move_legend(kdeplots[cls], "upper center")
        kdeplots[cls].set_title(f"{cls} samples")

    # Non-inplace rename: the filtered frame is a selection of the loaded one,
    # and renaming it in place is a chained-assignment hazard in pandas.
    df_stat = pd.read_excel(f"{path_curr}/ColumnShapes.xlsx", index_col=0)
    df_stat = (
        df_stat.loc[df_stat['Metric'] == "KSComplement", :]
        .rename(columns={'Score': "KSComplement"})
    )

    brick_scores = pw.Brick(figsize=(7.5, 1.75))
    sns.kdeplot(
        data=df_stat,
        x='KSComplement',
        color='darkgreen',
        linewidth=2,
        cut=0,
        fill=True,
        ax=brick_scores
    )
    brick_scores.set_title('Features Distribution Differences')

    # NOTE(review): the table is used in file order; this assumes
    # ColumnShapes.xlsx is already ordered by score - TODO confirm.
    # The bottom slice previously was [-n-1:-1], which silently dropped the
    # very last row; it now takes the last `n_features` rows (reversed).
    feats_dict = {
        'Top Features': list(df_stat.index[:n_features]),
        'Bottom Features': list(df_stat.index[-n_features:][::-1]),
    }
    brick_feats_violins = {}
    for feats_set, feats_curr in feats_dict.items():
        df_fig = df_ori_aug.loc[:, feats_curr + ['Type']].melt(
            id_vars=['Type'],
            value_vars=feats_curr,
            var_name='CpG',
            value_name='Methylation'
        )
        # Assign the result back instead of an inplace replace on a column
        # selection, which does not reliably update the parent frame.
        df_fig['CpG'] = df_fig['CpG'].replace(
            {x: f"{x}\nScore: {df_stat.at[x, 'KSComplement']:0.2f}" for x in feats_curr}
        )

        brick_feats_violins[feats_set] = pw.Brick(figsize=(2.5, 3))
        sns.violinplot(
            data=df_fig,
            x='Methylation',
            y='CpG',
            orient='h',
            hue='Type',
            split=True,
            linewidth=1,
            palette=palette,
            hue_order=type_order,
            cut=0,
            inner="quart",
            ax=brick_feats_violins[feats_set]
        )
        brick_feats_violins[feats_set].set_title(feats_set)

    pw_fig = ((countplots['Control'] | countplots['Parkinson'])
              / (kdeplots['Control'] | kdeplots['Parkinson'])
              / brick_scores
              / (brick_feats_violins['Top Features'] | brick_feats_violins['Bottom Features']))
    pw_fig.savefig(f"{path_curr}/feats/fig_v2.png", bbox_inches='tight', dpi=200)
    pw_fig.savefig(f"{path_curr}/feats/fig_v2.pdf", bbox_inches='tight')
    pw.clear()

### Outliers analysis

In [None]:
# For every augmentation method, load its saved samples and produce the
# IQR-based and PyOD-based outlier plots (overall and per class).
for s_name in (pbar := tqdm(colors_augs)):
    pbar.set_description(f"Processing {s_name}")
    path_curr = f"{path}/Augmentation/{s_name}"
    path_iqr = f"{path_curr}/outliers_iqr"
    path_pyod = f"{path_curr}/outliers_pyod"

    df_aug = pd.read_excel(f"{path_curr}/df.xlsx", index_col='sample_id')

    # Keyword arguments shared by both per-class plotting routines.
    cls_kwargs = dict(
        col_class="Status",
        col_pred="Pred",
        col_real="Real",
        cols_prob=['Prob Control', 'Prob Parkinson'],
        palette={'Control': 'dodgerblue', 'Parkinson': 'crimson'},
    )

    # IQR outliers
    pathlib.Path(path_iqr).mkdir(parents=True, exist_ok=True)
    plot_iqr_outs(df_aug, feats, 'grey', s_name, path_iqr, is_msno_plots=False)
    df_fig = df_aug[['Real', 'Pred', 'n_outs_iqr', 'Prob Control', 'Prob Parkinson']].copy()
    df_fig.loc[df_fig['Real'] == 0, 'Status'] = 'Control'
    df_fig.loc[df_fig['Real'] == 1, 'Status'] = 'Parkinson'
    plot_iqr_outs_cls(df=df_fig, path=path_iqr, **cls_kwargs)

    # PyOD plots
    # NOTE(review): 'Origin' is passed here while the IQR call above passes
    # s_name in the same position - possibly a copy-paste leftover; confirm
    # against the plot_pyod_outs signature.
    pathlib.Path(path_pyod).mkdir(parents=True, exist_ok=True)
    plot_pyod_outs(df_aug, pyod_method_names, 'grey', 'Origin', path_pyod, n_cols=4)
    df_fig = df_aug[['Real', 'Pred', 'Detections', 'Prob Control', 'Prob Parkinson']].copy()
    df_fig.loc[df_fig['Real'] == 0, 'Status'] = 'Control'
    df_fig.loc[df_fig['Real'] == 1, 'Status'] = 'Parkinson'
    plot_pyod_outs_cls(df=df_fig, path=path_pyod, **cls_kwargs)

### Confidence

In [None]:
# Bootstrap the prediction metrics of every augmented data set and store the
# mean, std and the 5% / 95% quantiles of each metric in an Excel table.
torch.manual_seed(42)
quantiles = torch.tensor([0.05, 0.95])

for s_name in (pbar := tqdm(colors_augs)):
    pbar.set_description(f"Processing {s_name}")
    path_curr = f"{path}/Augmentation/{s_name}"
    path_conf = f"{path_curr}/confidence"

    df_aug = pd.read_excel(f"{path_curr}/df.xlsx", index_col='sample_id')
    df_aug.loc[df_aug['Real'] == 0, 'Status'] = 'Control'
    df_aug.loc[df_aug['Real'] == 1, 'Status'] = 'Parkinson'

    pathlib.Path(path_conf).mkdir(parents=True, exist_ok=True)

    metrics_pred = get_cls_pred_metrics(num_classes=2)
    df_metrics = pd.DataFrame(index=list(metrics_pred.keys()))
    y_real = torch.from_numpy(df_aug["Real"].values.astype('int32'))
    y_pred = torch.from_numpy(df_aug["Pred"].values.astype('int32'))

    for metric_name, metric_pair in metrics_pred.items():
        # Only the first element of each entry is the metric object used here.
        bootstrap = BootStrapper(
            metric_pair[0],
            num_bootstraps=200,
            sampling_strategy="multinomial",
            quantile=quantiles
        )
        bootstrap.update(y_pred, y_real)
        boot_res = bootstrap.compute()

        for stat_name in ('mean', 'std'):
            df_metrics.at[metric_name, stat_name] = boot_res[stat_name].detach().cpu().numpy()
        # Quantile values come back in the order given by `quantiles`.
        q_low, q_high = boot_res['quantile'].detach().cpu().numpy()
        df_metrics.at[metric_name, 'q0.05'] = q_low
        df_metrics.at[metric_name, 'q0.95'] = q_high

    df_metrics.to_excel(f"{path_conf}/metrics.xlsx", index_label='Metrics')

In [None]:
# Collect the bootstrapped quantiles of the selected metrics for the real data
# and for every augmentation method, then draw each confidence interval as a
# horizontal segment (one row per data type).
metrics_names = {
    'accuracy_weighted': 'Accuracy',
}
quantiles = [0.05, 0.95]

# Explicit float dtype: an empty frame built from index/columns only would
# otherwise be object-typed and carry that dtype into the plotted values.
df_conf = pd.DataFrame(
    index=['Real'] + list(colors_augs.keys()),
    columns=[f"{m}_{q}" for m in metrics_names for q in quantiles],
    dtype=float
)

df_metrics = pd.read_excel(f"{path}/Origin/confidence/metrics.xlsx", index_col='Metrics')
for m in metrics_names:
    for q in quantiles:
        df_conf.at["Real", f"{m}_{q}"] = df_metrics.at[m, f"q{q}"]

for s_name in (pbar := tqdm(colors_augs)):
    pbar.set_description(f"Processing {s_name}")

    df_metrics = pd.read_excel(f"{path}/Augmentation/{s_name}/confidence/metrics.xlsx", index_col='Metrics')
    for m in metrics_names:
        for q in quantiles:
            df_conf.at[s_name, f"{m}_{q}"] = df_metrics.at[m, f"q{q}"]

colors_dict = {'Real': 'grey'} | colors_augs

# The theme must be set BEFORE the axes are created: rcParams are read at
# axes-creation time, so setting it after plt.subplots() (as before) left the
# axes in whatever style happened to be active earlier.
sns.set_theme(style='ticks')

for m in metrics_names:
    df_fig = df_conf.loc[:, [f"{m}_{q}" for q in quantiles]].copy()
    df_fig['Type'] = df_fig.index
    df_fig = df_fig.melt(id_vars=['Type'], value_name=metrics_names[m])

    fig, ax = plt.subplots(figsize=(3, 2))
    # Interval end points.
    scatter = sns.scatterplot(
        data=df_fig,
        x=metrics_names[m],
        y='Type',
        hue='Type',
        palette=colors_dict,
        hue_order=list(colors_dict.keys()),
        linewidth=0.2,
        alpha=0.95,
        edgecolor="black",
        s=16,
        ax=ax
    )
    scatter.get_legend().set_visible(False)
    # Segment connecting the two end points of each interval.
    line = sns.lineplot(
        data=df_fig,
        x=metrics_names[m],
        y='Type',
        hue='Type',
        palette=colors_dict,
        hue_order=list(colors_dict.keys()),
        linewidth=2,
        ax=ax
    )
    line.get_legend().set_visible(False)
    ax.set_xlabel(f"Confidence Intervals for {metrics_names[m]}")
    fig.savefig(f"{path}/Augmentation/confidence_{m}.png", bbox_inches='tight', dpi=400)
    fig.savefig(f"{path}/Augmentation/confidence_{m}.pdf", bbox_inches='tight')
    plt.close(fig)

## 2.3. Attacks

In [None]:
# Samples that the evasion attacks are run on.
ids_atk = ids_tst

# Wrap the trained model into an ART PyTorch classifier; the optimizer is
# configured from the hyper-parameters stored on the model.
art_classifier = PyTorchClassifier(
    model=model,
    loss=model.loss_fn,
    optimizer=torch.optim.Adam(
        params=model.parameters(),
        lr=model.hparams.optimizer_lr,
        weight_decay=model.hparams.optimizer_weight_decay,
    ),
    input_shape=(len(feats),),
    nb_classes=2,
    channels_first=True,
    clip_values=(0.0, 1.0),
    preprocessing=(0.0, 1.0),
    preprocessing_defences=None,
    postprocessing_defences=None,
    use_amp=False,
    opt_level="O1",
    loss_scale="dynamic",
    device_type="cpu",
)

### Eps-dependent attacks

In [None]:
# Sweep the perturbation budget for the eps-parameterised evasion attacks.
# For every (attack, eps) pair the adversarial samples and their metrics are
# saved under {path}/Evasion/{attack}/eps_{eps}, and the weighted accuracy is
# collected into df_eps.
epsilons = sorted(set.union(
    set(np.linspace(0.1, 1.0, 10)),
    set(np.linspace(0.01, 0.1, 10)),
    set(np.linspace(0.001, 0.01, 10))
))
df_eps = pd.DataFrame(index=epsilons)

# Loop-invariant quantities, computed once instead of on every iteration:
# per-feature IQR of the attacked samples (scales eps per feature) and the
# predictions of the unattacked model.
feats_iqr = iqr(df.loc[ids_atk, feats].values, axis=0)
y_pred_ori = torch.from_numpy(df.loc[ids_atk, "Pred"].values.astype('int32'))
y_prob_ori = torch.from_numpy(df.loc[ids_atk, ["Prob Control", "Prob Parkinson"]].values)

for eps_raw in epsilons:

    # Per-feature budget and step, both proportional to the feature's IQR.
    eps = eps_raw * feats_iqr
    eps_step = 0.2 * eps_raw * feats_iqr

    attacks = {
        'MomentumIterative': MomentumIterativeMethod(
            estimator=art_classifier,
            norm=np.inf,
            eps=eps,
            eps_step=eps_step,
            decay=0.1,
            max_iter=100,
            targeted=False,
            batch_size=512,
            verbose=True
        ),
        'BasicIterative': BasicIterativeMethod(
            estimator=art_classifier,
            eps=eps,
            eps_step=eps_step,
            max_iter=100,
            targeted=False,
            batch_size=512,
            verbose=True
        ),
        'ProjectedGradientDescent': ProjectedGradientDescentNumpy(
            estimator=art_classifier,
            norm=np.inf,
            eps=eps,
            eps_step=eps_step,
            decay=None,
            max_iter=100,
            targeted=False,
            num_random_init=0,
            batch_size=512,
            random_eps=False,
            summary_writer=False,
            verbose=True
        ),
        'FastGradient': FastGradientMethod(
            estimator=art_classifier,
            norm=np.inf,
            eps=eps,
            eps_step=eps_step,
            targeted=False,
            num_random_init=0,
            batch_size=512,
            minimal=False,
            summary_writer=False,
        ),
    }

    for attack_name, attack in attacks.items():
        path_curr = f"{path}/Evasion/{attack_name}/eps_{eps_raw:0.4f}"
        pathlib.Path(f"{path_curr}").mkdir(parents=True, exist_ok=True)

        X_adv = attack.generate(np.float32(df.loc[ids_atk, feats].values))

        # Adversarial samples with the model's predictions on them.
        df_adv = df.loc[ids_atk, ['Real']].copy()
        df_adv.loc[ids_atk, feats] = X_adv
        model.produce_probabilities = True
        y_pred_prob = model(torch.from_numpy(np.float32(df_adv.loc[ids_atk, feats].values))).cpu().detach().numpy()
        y_pred = np.argmax(y_pred_prob, 1)
        df_adv["Pred"] = y_pred
        df_adv["Prob Control"] = y_pred_prob[:, 0]
        df_adv["Prob Parkinson"] = y_pred_prob[:, 1]
        df_adv["Eps"] = eps_raw
        df_adv["Data"] = attack_name

        # Project the adversarial samples with the already fitted reducers.
        for m, drm in dim_red_models.items():
            dim_red_res = drm.transform(df_adv.loc[:, feats].values)
            df_adv.loc[:, dim_red_labels[m][0]] = dim_red_res[:, 0]
            df_adv.loc[:, dim_red_labels[m][1]] = dim_red_res[:, 1]

        add_iqr_outs_to_df(df_adv, df.loc[ids_trn_val, :], feats)
        add_pyod_outs_to_df(df_adv, pyod_methods, feats)

        df_adv.to_excel(f"{path_curr}/df.xlsx", index_label='sample_id')

        metrics_pred = get_cls_pred_metrics(num_classes=2)
        metrics_prob = get_cls_prob_metrics(num_classes=2)
        df_metrics = pd.DataFrame(index=list(metrics_pred.keys()) + list(metrics_prob.keys()))
        y_real = torch.from_numpy(df_adv.loc[ids_atk, "Real"].values.astype('int32'))
        y_pred_atk = torch.from_numpy(df_adv.loc[ids_atk, "Pred"].values.astype('int32'))
        y_prob_atk = torch.from_numpy(df_adv.loc[ids_atk, ["Prob Control", "Prob Parkinson"]].values)

        for m, metric_pair in metrics_pred.items():
            metric = metric_pair[0]
            for col, y_hat in (('Attack', y_pred_atk), ('Origin', y_pred_ori)):
                df_metrics.at[m, col] = float(metric(y_hat, y_real).numpy())
                metric.reset()
        for m, metric_pair in metrics_prob.items():
            metric = metric_pair[0]
            for col, y_prob in (('Attack', y_prob_atk), ('Origin', y_prob_ori)):
                # Best effort: a metric that raises ValueError is recorded as 0.
                try:
                    m_val = float(metric(y_prob, y_real).numpy())
                except ValueError:
                    m_val = 0
                metric.reset()
                df_metrics.at[m, col] = m_val

        df_metrics.to_excel(f"{path_curr}/metrics.xlsx", index_label='Metrics')

        # The unattacked accuracy is the same for every attack; store it once.
        if attack_name == 'MomentumIterative':
            df_eps.loc[eps_raw, "Origin_Accuracy"] = df_metrics.at['accuracy_weighted', 'Origin']
        df_eps.loc[eps_raw, f"{attack_name}_Accuracy"] = df_metrics.at['accuracy_weighted', 'Attack']

df_eps.to_excel(f"{path}/Evasion/df_eps.xlsx", index_label='eps')


In [None]:
# Plot attacked accuracy versus eps (log scale) for the selected attacks, with
# the unattacked accuracy drawn as a dashed horizontal reference line.
df_eps = pd.read_excel(f"{path}/Evasion/df_eps.xlsx", index_col=0)

atks_trgt = ['MomentumIterative', 'BasicIterative', 'FastGradient']

acc_cols = {f"{x}_Accuracy": x for x in atks_trgt}
df_fig = df_eps[list(acc_cols)].rename(columns=acc_cols)
df_fig['Eps'] = df_fig.index.values
df_fig = df_fig.melt(id_vars="Eps", var_name='Method', value_name="Accuracy")

fig = plt.figure()
sns.set_theme(style='whitegrid', font_scale=1)
lines = sns.lineplot(
    data=df_fig,
    x='Eps',
    y="Accuracy",
    hue="Method",
    style="Method",
    palette=colors_atks_eps,
    hue_order=atks_trgt,
    markers=True,
    dashes=False,
)
plt.xscale('log')
lines.set_xlabel(r'$\epsilon$')

# Reference level: accuracy of the model on the unattacked samples.
basic = df_eps.at[0.01, "Origin_Accuracy"]
x_min, x_max = 0.0009, 1.05
lines.set_xlim(x_min, x_max)
plt.gca().plot(
    [x_min, x_max],
    [basic, basic],
    color='k',
    linestyle='dashed',
    linewidth=1
)
for ext, save_kwargs in (('png', {'dpi': 200}), ('pdf', {})):
    plt.savefig(f"{path}/Evasion/line_accuracy_vs_eps.{ext}", bbox_inches='tight', **save_kwargs)
plt.close(fig)

### Binary Search Steps attacks

In [None]:
# Sweep the number of binary search steps (BSS) for the BSS-parameterised
# evasion attacks. For every (attack, bss) pair the adversarial samples and
# their metrics are saved under {path}/Evasion/{attack}/bss_{bss}, and the
# weighted accuracy is collected into df_bss.
bsss = list(range(1, 11, 1))
df_bss = pd.DataFrame(index=bsss)

# Predictions of the unattacked model do not depend on the attack: build once.
y_pred_ori = torch.from_numpy(df.loc[ids_atk, "Pred"].values.astype('int32'))
y_prob_ori = torch.from_numpy(df.loc[ids_atk, ["Prob Control", "Prob Parkinson"]].values)

for bss in bsss:

    attacks = {
        'ElasticNet': ElasticNet(
            classifier=art_classifier,
            confidence=0.0,
            targeted=False,
            learning_rate=1e-3,
            binary_search_steps=bss,
            max_iter=20,
            beta=1e-3,
            initial_const=1e-4,
            batch_size=1,
            decision_rule="EN",
            verbose=True,
        ),
        'CarliniL2Method': CarliniL2Method(
            classifier=art_classifier,
            confidence=0.0,
            targeted=False,
            learning_rate=0.001,
            binary_search_steps=bss,
            max_iter=20,
            initial_const=1e-4,
            max_halving=5,
            max_doubling=5,
            batch_size=1,
            verbose=True
        ),
        'ZooAttack': ZooAttack(
            classifier=art_classifier,
            confidence=0.0,
            targeted=False,
            learning_rate=0.001,
            max_iter=20,
            binary_search_steps=bss,
            initial_const=1e-4,
            abort_early=True,
            use_resize=False,
            use_importance=True,
            nb_parallel=16,
            batch_size=1,
            variable_h=0.001,
            verbose=True
        ),
    }

    for attack_name, attack in attacks.items():
        path_curr = f"{path}/Evasion/{attack_name}/bss_{bss}"
        pathlib.Path(f"{path_curr}").mkdir(parents=True, exist_ok=True)

        X_adv = attack.generate(np.float32(df.loc[ids_atk, feats].values))

        # Adversarial samples with the model's predictions on them.
        df_adv = df.loc[ids_atk, ['Real']].copy()
        df_adv.loc[ids_atk, feats] = X_adv
        model.produce_probabilities = True
        y_pred_prob = model(torch.from_numpy(np.float32(df_adv.loc[ids_atk, feats].values))).cpu().detach().numpy()
        y_pred = np.argmax(y_pred_prob, 1)
        df_adv["Pred"] = y_pred
        df_adv["Prob Control"] = y_pred_prob[:, 0]
        df_adv["Prob Parkinson"] = y_pred_prob[:, 1]
        df_adv["BSS"] = bss
        df_adv["Data"] = attack_name

        # Project the adversarial samples with the already fitted reducers.
        for m, drm in dim_red_models.items():
            dim_red_res = drm.transform(df_adv.loc[:, feats].values)
            df_adv.loc[:, dim_red_labels[m][0]] = dim_red_res[:, 0]
            df_adv.loc[:, dim_red_labels[m][1]] = dim_red_res[:, 1]

        add_iqr_outs_to_df(df_adv, df.loc[ids_trn_val, :], feats)
        add_pyod_outs_to_df(df_adv, pyod_methods, feats)

        df_adv.to_excel(f"{path_curr}/df.xlsx", index_label='sample_id')

        metrics_pred = get_cls_pred_metrics(num_classes=2)
        metrics_prob = get_cls_prob_metrics(num_classes=2)
        df_metrics = pd.DataFrame(index=list(metrics_pred.keys()) + list(metrics_prob.keys()))
        y_real = torch.from_numpy(df_adv.loc[ids_atk, "Real"].values.astype('int32'))
        y_pred_atk = torch.from_numpy(df_adv.loc[ids_atk, "Pred"].values.astype('int32'))
        y_prob_atk = torch.from_numpy(df_adv.loc[ids_atk, ["Prob Control", "Prob Parkinson"]].values)

        for m, metric_pair in metrics_pred.items():
            metric = metric_pair[0]
            for col, y_hat in (('Attack', y_pred_atk), ('Origin', y_pred_ori)):
                df_metrics.at[m, col] = float(metric(y_hat, y_real).numpy())
                metric.reset()
        for m, metric_pair in metrics_prob.items():
            metric = metric_pair[0]
            for col, y_prob in (('Attack', y_prob_atk), ('Origin', y_prob_ori)):
                # Best effort: a metric that raises ValueError is recorded as 0.
                try:
                    m_val = float(metric(y_prob, y_real).numpy())
                except ValueError:
                    m_val = 0
                metric.reset()
                df_metrics.at[m, col] = m_val

        df_metrics.to_excel(f"{path_curr}/metrics.xlsx", index_label='Metrics')

        # The unattacked accuracy is the same for every attack; store it once.
        if attack_name == 'ElasticNet':
            df_bss.loc[bss, "Origin_Accuracy"] = df_metrics.at['accuracy_weighted', 'Origin']
        df_bss.loc[bss, f"{attack_name}_Accuracy"] = df_metrics.at['accuracy_weighted', 'Attack']

# The index holds BSS values; it was previously mislabelled 'eps' (copy-paste).
# Readers of this file in this notebook use index_col=0, so they are unaffected.
df_bss.to_excel(f"{path}/Evasion/df_bss.xlsx", index_label='bss')

In [None]:
# Plot attacked accuracy versus the number of binary search steps for the
# selected attacks, with the unattacked accuracy as a dashed reference line.
df_bss = pd.read_excel(f"{path}/Evasion/df_bss.xlsx", index_col=0)

atks_trgt = ['ElasticNet', 'CarliniL2Method', 'ZooAttack']

acc_cols = {f"{x}_Accuracy": x for x in atks_trgt}
df_fig = df_bss[list(acc_cols)].rename(columns=acc_cols)
df_fig['BSS'] = df_fig.index.values
df_fig = df_fig.melt(id_vars="BSS", var_name='Method', value_name="Accuracy")

fig = plt.figure()
sns.set_theme(style='whitegrid', font_scale=1)
lines = sns.lineplot(
    data=df_fig,
    x='BSS',
    y="Accuracy",
    hue="Method",
    style="Method",
    palette=colors_atks_bss,
    hue_order=atks_trgt,
    markers=True,
    dashes=False,
)
lines.set_xlabel('BSS')

# Reference level, read from the metrics saved for one of the attack runs.
df_metrics_ref = pd.read_excel(f"{path}/Evasion/ElasticNet/bss_1/metrics.xlsx", index_col=0)
basic = df_metrics_ref.at['accuracy_weighted', 'Origin']
x_min, x_max = 0.5, 10.5
lines.set_xlim(x_min, x_max)
plt.gca().plot(
    [x_min, x_max],
    [basic, basic],
    color='k',
    linestyle='dashed',
    linewidth=1
)
for ext, save_kwargs in (('png', {'dpi': 200}), ('pdf', {})):
    plt.savefig(f"{path}/Evasion/line_accuracy_vs_bss.{ext}", bbox_inches='tight', **save_kwargs)
plt.close(fig)

### Eta-dependent attacks

In [None]:
# Sweep the eta parameter of the NewtonFool attack. For every eta the
# adversarial samples and their metrics are saved under
# {path}/Evasion/{attack}/eta_{eta}, and the weighted accuracy goes to df_etas.
# Alternative grid kept for reference:
#   np.concatenate([np.geomspace(1e-8, 1e-1, 8), np.geomspace(1e-8, 1e-1, 8) * 5])
eta_grids = (
    np.linspace(0.1, 1.0, 10),
    np.linspace(0.01, 0.1, 10),
    np.linspace(0.001, 0.01, 10),
)
etas = sorted(set.union(*(set(grid) for grid in eta_grids)))

df_etas = pd.DataFrame(index=etas)

for eta in etas:

    attacks = {
        'NewtonFool': NewtonFool(
            classifier=art_classifier,
            max_iter=100,
            eta=eta,
            batch_size=100,
            verbose=True,
        ),
    }

    for attack_name, attack in attacks.items():
        path_curr = f"{path}/Evasion/{attack_name}/eta_{eta:0.2e}"
        pathlib.Path(f"{path_curr}").mkdir(parents=True, exist_ok=True)

        X_adv = attack.generate(np.float32(df.loc[ids_atk, feats].values))

        # Adversarial samples with the model's predictions on them.
        df_adv = df.loc[ids_atk, ['Real']].copy()
        df_adv.loc[ids_atk, feats] = X_adv
        model.produce_probabilities = True
        x_adv_tensor = torch.from_numpy(np.float32(df_adv.loc[ids_atk, feats].values))
        y_pred_prob = model(x_adv_tensor).cpu().detach().numpy()
        df_adv["Pred"] = np.argmax(y_pred_prob, 1)
        df_adv["Prob Control"] = y_pred_prob[:, 0]
        df_adv["Prob Parkinson"] = y_pred_prob[:, 1]
        df_adv["Eta"] = f"{eta:0.2e}"
        df_adv["Data"] = attack_name

        # Project the adversarial samples with the already fitted reducers.
        for m, drm in dim_red_models.items():
            dim_red_res = drm.transform(df_adv.loc[:, feats].values)
            for axis_id in (0, 1):
                df_adv.loc[:, dim_red_labels[m][axis_id]] = dim_red_res[:, axis_id]

        add_iqr_outs_to_df(df_adv, df.loc[ids_trn_val, :], feats)
        add_pyod_outs_to_df(df_adv, pyod_methods, feats)

        df_adv.to_excel(f"{path_curr}/df.xlsx", index_label='sample_id')

        metrics_pred = get_cls_pred_metrics(num_classes=2)
        metrics_prob = get_cls_prob_metrics(num_classes=2)
        df_metrics = pd.DataFrame(index=list(metrics_pred.keys()) + list(metrics_prob.keys()))
        prob_cols = ["Prob Control", "Prob Parkinson"]
        y_real = torch.from_numpy(df_adv.loc[ids_atk, "Real"].values.astype('int32'))
        y_pred_atk = torch.from_numpy(df_adv.loc[ids_atk, "Pred"].values.astype('int32'))
        y_pred_ori = torch.from_numpy(df.loc[ids_atk, "Pred"].values.astype('int32'))
        y_prob_atk = torch.from_numpy(df_adv.loc[ids_atk, prob_cols].values)
        y_prob_ori = torch.from_numpy(df.loc[ids_atk, prob_cols].values)

        # Label-based metrics: attacked predictions first, then the originals.
        for m, metric_pair in metrics_pred.items():
            metric = metric_pair[0]
            for col, y_hat in (('Attack', y_pred_atk), ('Origin', y_pred_ori)):
                df_metrics.at[m, col] = float(metric(y_hat, y_real).numpy())
                metric.reset()
        # Probability-based metrics: a ValueError is recorded as 0.
        for m, metric_pair in metrics_prob.items():
            metric = metric_pair[0]
            for col, y_prob in (('Attack', y_prob_atk), ('Origin', y_prob_ori)):
                try:
                    m_val = float(metric(y_prob, y_real).numpy())
                except ValueError:
                    m_val = 0
                metric.reset()
                df_metrics.at[m, col] = m_val

        df_metrics.to_excel(f"{path_curr}/metrics.xlsx", index_label='Metrics')

        if attack_name == 'NewtonFool':
            df_etas.loc[eta, "Origin_Accuracy"] = df_metrics.at['accuracy_weighted', 'Origin']
        df_etas.loc[eta, f"{attack_name}_Accuracy"] = df_metrics.at['accuracy_weighted', 'Attack']

df_etas.to_excel(f"{path}/Evasion/df_etas.xlsx", index_label='eta')

In [None]:
# Plot attacked accuracy versus eta (log scale) for NewtonFool, with the
# unattacked accuracy drawn as a dashed horizontal reference line.
df_etas = pd.read_excel(f"{path}/Evasion/df_etas.xlsx", index_col=0)

atks_trgt = ['NewtonFool']

acc_cols = {f"{x}_Accuracy": x for x in atks_trgt}
df_fig = df_etas[list(acc_cols)].rename(columns=acc_cols)
df_fig['Eta'] = df_fig.index.values
df_fig = df_fig.melt(id_vars="Eta", var_name='Method', value_name="Accuracy")

fig = plt.figure()
sns.set_theme(style='whitegrid', font_scale=1)
lines = sns.lineplot(
    data=df_fig,
    x='Eta',
    y="Accuracy",
    hue="Method",
    style="Method",
    palette=colors_atks_eta,
    hue_order=atks_trgt,
    markers=True,
    dashes=False,
)
plt.xscale('log')
lines.set_xlabel(r'$\eta$')

# Reference level: accuracy of the model on the unattacked samples.
basic = df_etas.at[0.01, "Origin_Accuracy"]
x_min, x_max = 8e-4, 1.1
lines.set_xlim(x_min, x_max)
plt.gca().plot(
    [x_min, x_max],
    [basic, basic],
    color='k',
    linestyle='dashed',
    linewidth=1
)
for ext, save_kwargs in (('png', {'dpi': 200}), ('pdf', {})):
    plt.savefig(f"{path}/Evasion/line_accuracy_vs_eta.{ext}", bbox_inches='tight', **save_kwargs)
plt.close(fig)

### Setup for plots

In [None]:
# Attack families grouped by the parameter that was swept, together with the
# parameter values selected for the plots in the following cells.
atks_options = dict(
    Eps=dict(
        types=['MomentumIterative', 'BasicIterative', 'FastGradient'],
        values=[0.005, 0.02, 0.05, 0.2],
    ),
    BSS=dict(
        types=['ElasticNet', 'CarliniL2Method', 'ZooAttack'],
        values=[2, 4, 6, 8],
    ),
    Eta=dict(
        types=['NewtonFool'],
        values=[1e-3, 2e-3, 3e-3, 1e-2],
    ),
)

### Plot in reduced dimension

In [None]:
# Scatter the real test samples together with the adversarial samples of each
# selected attack setting in the reduced-dimension space, coloured by the
# predicted Parkinson probability.
df_ori = df.loc[ids_tst, :].copy()
df_ori.loc[df_ori['Real'] == 0, 'Status'] = 'Control'
df_ori.loc[df_ori['Real'] == 1, 'Status'] = 'Parkinson'
df_ori['Type'] = 'Real'
df_ori['Symbol'] = 'o'
df_ori['MarkerSize'] = 70

for var_param, opt in atks_options.items():
    # Real samples carry the 'Origin' tag in the swept-parameter column.
    df_ori[var_param] = 'Origin'
    for atk_type in opt['types']:
        for var_val in opt['values']:
            # Folder suffix and title mirror the naming used when the attacks were saved.
            if var_param == 'Eps':
                run_dir, val_str = f"eps_{var_val:0.4f}", f'Eps = {var_val:0.4f}'
            elif var_param == 'BSS':
                run_dir, val_str = f"bss_{var_val}", f'BSS = {var_val}'
            else:
                run_dir, val_str = f"eta_{var_val:0.2e}", f'Eta = {var_val:0.2e}'
            path_curr = f"{path}/Evasion/{atk_type}/{run_dir}"

            print(f"{atk_type} {val_str}")

            pathlib.Path(f"{path_curr}/dim_red").mkdir(parents=True, exist_ok=True)

            df_adv = pd.read_excel(f"{path_curr}/df.xlsx", index_col=0)
            df_adv[var_param] = var_val
            # Suffix the sample ids so they do not collide with the real ones.
            df_adv.set_index(df_adv.index.values + f' {val_str}', inplace=True)
            df_adv['Symbol'] = 'X'
            df_adv['MarkerSize'] = 50
            df_ori_adv = pd.concat([df_ori, df_adv])

            for m in ['t-SNE']:
                x_label, y_label = dim_red_labels[m][0], dim_red_labels[m][1]

                # Colour bar spanning the observed probability range.
                prob = df_ori_adv["Prob Parkinson"]
                sm = plt.cm.ScalarMappable(cmap="seismic", norm=plt.Normalize(prob.min(), prob.max()))
                sm.set_array([])

                fig, ax = plt.subplots(figsize=(8, 6))
                sns.set_theme(style='whitegrid')

                scatter = sns.scatterplot(
                    data=df_ori_adv,
                    x=x_label,
                    y=y_label,
                    palette='seismic',
                    hue="Prob Parkinson",
                    linewidth=0.5,
                    alpha=0.75,
                    edgecolor="cyan",
                    style=df_ori_adv.loc[:, 'Symbol'].values,
                    size='MarkerSize',
                    ax=ax
                )
                scatter.get_legend().remove()
                scatter.figure.colorbar(sm, label="Prob Parkinson")
                scatter.set_title(val_str, loc='left', y=1.05, fontdict={'fontsize': 20})

                # Manual legend: marker shape distinguishes real from attacked samples.
                legend_handles = [
                    mlines.Line2D(
                        [], [],
                        marker=marker,
                        linestyle='None',
                        markeredgecolor='k',
                        markerfacecolor='lightgrey',
                        markersize=markersize,
                        label=label
                    )
                    for marker, markersize, label in (('o', 10, 'Real'), ('X', 7, 'Attack'))
                ]
                plt.legend(
                    handles=legend_handles,
                    title="Samples",
                    bbox_to_anchor=(0.4, 1.02, 1, 0.2),
                    loc="lower left",
                    borderaxespad=0,
                    ncol=2,
                    frameon=False
                )

                plt.savefig(f"{path_curr}/dim_red/{m}.png", bbox_inches='tight', dpi=200)
                plt.savefig(f"{path_curr}/dim_red/{m}.pdf", bbox_inches='tight')
                plt.close(fig)

### Plot features differences

In [None]:
# For every attack type, compare the real test samples with the adversarial
# samples at each selected parameter value: per-class densities of the
# predicted Parkinson probability and per-feature Mann-Whitney U tests
# (raw and FDR-BH corrected p-values), saved under {path}/Evasion.
df_ori = df.loc[ids_tst, :].copy()
df_ori.loc[df_ori['Real'] == 0, 'Status'] = 'Control'
df_ori.loc[df_ori['Real'] == 1, 'Status'] = 'Parkinson'
df_ori['Type'] = 'Real'

for var_param, opt in atks_options.items():
    # Real samples carry the 'Origin' tag in the swept-parameter column.
    df_ori[var_param] = 'Origin'
    for atk_type in opt['types']:
        dfs_ori_advs = [df_ori]

        palette_curr = {'Origin': 'gray'}
        for var_val_id, var_val in enumerate(opt['values']):
            if var_param == 'Eps':
                path_curr = f"{path}/Evasion/{atk_type}/eps_{var_val:0.4f}"
                val_str = f'Eps = {var_val:0.4f}'
            elif var_param == 'BSS':
                path_curr = f"{path}/Evasion/{atk_type}/bss_{var_val}"
                val_str = f'BSS = {var_val}'
            else:
                path_curr = f"{path}/Evasion/{atk_type}/eta_{var_val:0.2e}"
                val_str = f'Eta = {var_val:0.2e}'

            palette_curr[var_val] = px.colors.qualitative.G10[var_val_id]

            df_adv = pd.read_excel(f"{path_curr}/df.xlsx", index_col=0)
            df_adv[var_param] = var_val
            df_adv.loc[df_adv['Real'] == 0, 'Status'] = 'Control'
            df_adv.loc[df_adv['Real'] == 1, 'Status'] = 'Parkinson'
            # Suffix the sample ids so they stay unique after concatenation.
            df_adv.set_index(df_adv.index.values + f' {val_str}', inplace=True)
            dfs_ori_advs.append(df_adv)
        df_ori_adv = pd.concat(dfs_ori_advs)

        kdeplots = {}
        for cls in ['Control', 'Parkinson']:
            kdeplots[cls] = pw.Brick(figsize=(4, 2))
            sns.set_theme(style='whitegrid')
            sns.kdeplot(
                data=df_ori_adv.loc[df_ori_adv['Status'] == cls, :],
                x="Prob Parkinson",
                hue=var_param,
                linewidth=2,
                palette=palette_curr,
                hue_order=list(palette_curr.keys()),
                fill=True,
                cut=0,
                ax=kdeplots[cls]
            )
            sns.move_legend(kdeplots[cls], "upper center")
            kdeplots[cls].set_title(f"{cls} samples")

        # Raw p-values: one Mann-Whitney U test per (feature, parameter value).
        # Row masks are built once instead of once per feature.
        df_stat = pd.DataFrame(
            index=feats,
            columns=[f'{var_val}' for var_val in opt['values']],
            dtype=float
        )
        mask_origin = (df_ori_adv[var_param] == 'Origin').values
        masks_adv = {var_val: (df_ori_adv[var_param] == var_val).values for var_val in opt['values']}
        for f in feats:
            vals_f = df_ori_adv[f].values
            vals_origin = vals_f[mask_origin]
            for var_val in opt['values']:
                df_stat.at[f, f'{var_val}'] = mannwhitneyu(
                    vals_origin,
                    vals_f[masks_adv[var_val]],
                    alternative='two-sided'
                ).pvalue

        # FDR correction needs the complete column of p-values, so it runs once
        # per parameter value after all features are tested. Previously it was
        # invoked inside the feature loop on partially filled (NaN) columns and
        # only the very last call produced the final result.
        for var_val in opt['values']:
            df_stat[f'{var_val}_fdr_bh'] = multipletests(
                df_stat[f'{var_val}'].values, 0.05, method='fdr_bh'
            )[1]

        df_stat.sort_values([f"{opt['values'][-1]}"], ascending=[True], inplace=True)
        df_stat.to_excel(f"{path}/Evasion/df_stat_{var_param}_{atk_type}.xlsx", index_label="Features")

        pw_fig = (kdeplots['Control'] | kdeplots['Parkinson'])
        pw_fig.savefig(f"{path}/Evasion/feats_{var_param}_{atk_type}.png", bbox_inches='tight', dpi=200)
        pw_fig.savefig(f"{path}/Evasion/feats_{var_param}_{atk_type}.pdf", bbox_inches='tight')
        pw.clear()

### Outlier analysis

In [None]:
# Rows selected by ids_tst, annotated with a readable class label.
df_ori = df.loc[ids_tst, :].copy()
for real_code, status_name in ((0, 'Control'), (1, 'Parkinson')):
    df_ori.loc[df_ori['Real'] == real_code, 'Status'] = status_name
df_ori['Type'] = 'Real'

for var_param, opt in atks_options.items():
    for atk_type in opt['types']:
        for var_val_id, var_val in enumerate(opt['values']):
            # Result folder and display label depend on which parameter is varied.
            if var_param == 'Eps':
                val_txt = f"{var_val:0.4f}"
                path_curr = f"{path}/Evasion/{atk_type}/eps_{val_txt}"
                val_str = f'Eps = {val_txt}'
            elif var_param == 'BSS':
                path_curr = f"{path}/Evasion/{atk_type}/bss_{var_val}"
                val_str = f'BSS = {var_val}'
            else:
                val_txt = f"{var_val:0.2e}"
                path_curr = f"{path}/Evasion/{atk_type}/eta_{val_txt}"
                val_str = f'Eta = {val_txt}'

            df_adv = pd.read_excel(f"{path_curr}/df.xlsx", index_col=0)
            df_adv[var_param] = var_val
            for real_code, status_name in ((0, 'Control'), (1, 'Parkinson')):
                df_adv.loc[df_adv['Real'] == real_code, 'Status'] = status_name
            df_adv.set_index(df_adv.index.values + f' {val_str}', inplace=True)

            # --- IQR-based outliers ---
            path_iqr = f"{path_curr}/outliers_iqr"
            pathlib.Path(path_iqr).mkdir(parents=True, exist_ok=True)
            plot_iqr_outs(df_adv, feats, 'grey', atk_type, path_iqr, is_msno_plots=False)
            df_fig = df_adv[['Real', 'Pred', 'n_outs_iqr', 'Prob Control', 'Prob Parkinson']].copy()
            for real_code, status_name in ((0, 'Control'), (1, 'Parkinson')):
                df_fig.loc[df_fig['Real'] == real_code, 'Status'] = status_name
            plot_iqr_outs_cls(
                df=df_fig,
                path=path_iqr,
                col_class="Status",
                col_pred="Pred",
                col_real="Real",
                cols_prob=['Prob Control', 'Prob Parkinson'],
                palette={'Control': 'dodgerblue', 'Parkinson': 'crimson'}
            )

            # --- PyOD-based outliers ---
            path_pyod = f"{path_curr}/outliers_pyod"
            pathlib.Path(path_pyod).mkdir(parents=True, exist_ok=True)
            # NOTE(review): the label 'Origin' is passed here although the data
            # are attacked samples (the IQR call passes atk_type) - confirm intent.
            plot_pyod_outs(df_adv, pyod_method_names, 'grey', 'Origin', path_pyod, n_cols=4)
            df_fig = df_adv[['Real', 'Pred', 'Detections', 'Prob Control', 'Prob Parkinson']].copy()
            for real_code, status_name in ((0, 'Control'), (1, 'Parkinson')):
                df_fig.loc[df_fig['Real'] == real_code, 'Status'] = status_name
            plot_pyod_outs_cls(
                df=df_fig,
                path=path_pyod,
                col_class="Status",
                col_pred="Pred",
                col_real="Real",
                cols_prob=['Prob Control', 'Prob Parkinson'],
                palette={'Control': 'dodgerblue', 'Parkinson': 'crimson'}
            )


### Confidence

In [None]:
# Fixed seed so the bootstrap resampling below is repeatable.
torch.manual_seed(42)
quantiles = torch.tensor([0.05, 0.95])

# Three decades of linearly spaced values, merged and de-duplicated.
epsilons = sorted(
    set(np.linspace(0.1, 1.0, 10))
    | set(np.linspace(0.01, 0.1, 10))
    | set(np.linspace(0.001, 0.01, 10))
)

bsss = list(range(1, 11))

etas = sorted(
    set(np.linspace(0.1, 1.0, 10))
    | set(np.linspace(0.01, 0.1, 10))
    | set(np.linspace(0.001, 0.01, 10))
)

# Attack families grouped by the parameter that is varied for them.
atks_options = dict(
    Eps=dict(
        types=['MomentumIterative', 'BasicIterative', 'FastGradient'],
        values=epsilons,
    ),
    BSS=dict(
        types=['ElasticNet', 'CarliniL2Method', 'ZooAttack'],
        values=bsss,
    ),
    Eta=dict(
        types=['NewtonFool'],
        values=etas,
    ),
)

for var_param, opt in atks_options.items():
    for atk_type in opt['types']:
        for var_val_id, var_val in enumerate(opt['values']):
            if var_param == 'Eps':
                val_txt = f"{var_val:0.4f}"
                path_curr = f"{path}/Evasion/{atk_type}/eps_{val_txt}"
                val_str = f'Eps = {val_txt}'
            elif var_param == 'BSS':
                path_curr = f"{path}/Evasion/{atk_type}/bss_{var_val}"
                val_str = f'BSS = {var_val}'
            else:
                val_txt = f"{var_val:0.2e}"
                path_curr = f"{path}/Evasion/{atk_type}/eta_{val_txt}"
                val_str = f'Eta = {val_txt}'

            df_adv = pd.read_excel(f"{path_curr}/df.xlsx", index_col=0)
            df_adv.loc[df_adv['Real'] == 0, 'Status'] = 'Control'
            df_adv.loc[df_adv['Real'] == 1, 'Status'] = 'Parkinson'

            path_conf = f"{path_curr}/confidence"
            pathlib.Path(path_conf).mkdir(parents=True, exist_ok=True)

            metrics_pred = get_cls_pred_metrics(num_classes=2)
            df_metrics = pd.DataFrame(index=list(metrics_pred))
            y_real = torch.from_numpy(df_adv["Real"].values.astype('int32'))
            y_pred = torch.from_numpy(df_adv["Pred"].values.astype('int32'))

            # Bootstrap every metric and record mean, std and both quantiles.
            for metric_name, metric_pair in metrics_pred.items():
                bootstrap = BootStrapper(
                    metric_pair[0],
                    num_bootstraps=200,
                    sampling_strategy="multinomial",
                    quantile=quantiles
                )
                bootstrap.update(y_pred, y_real)
                bootstrap_output = bootstrap.compute()
                q_bounds = bootstrap_output['quantile'].detach().cpu().numpy()
                df_metrics.at[metric_name, 'mean'] = bootstrap_output['mean'].detach().cpu().numpy()
                df_metrics.at[metric_name, 'std'] = bootstrap_output['std'].detach().cpu().numpy()
                df_metrics.at[metric_name, 'q0.05'] = q_bounds[0]
                df_metrics.at[metric_name, 'q0.95'] = q_bounds[1]
            df_metrics.to_excel(f"{path_conf}/metrics.xlsx", index_label='Metrics')

In [None]:
# Metrics to plot: key is the row label in metrics.xlsx, value is the axis name.
metrics_names = {
    'accuracy_weighted': 'Accuracy',
}
quantiles = [0.05, 0.95]
# One column per (metric, quantile) pair, e.g. 'accuracy_weighted_0.05'.
conf_cols = [f"{m}_{q}" for m in metrics_names for q in quantiles]

epsilons = sorted(
    set(np.linspace(0.1, 1.0, 10))
    | set(np.linspace(0.01, 0.1, 10))
    | set(np.linspace(0.001, 0.01, 10))
)

bsss = list(range(1, 11, 1))

etas = sorted(
    set(np.linspace(0.1, 1.0, 10))
    | set(np.linspace(0.01, 0.1, 10))
    | set(np.linspace(0.001, 0.01, 10))
)

atks_options = {
    'Eps': {
        'types': ['MomentumIterative', 'BasicIterative', 'FastGradient'],
        'values': epsilons
    },
    'BSS': {
        'types': ['ElasticNet', 'CarliniL2Method', 'ZooAttack'],
        'values': bsss
    },
    'Eta': {
        'types': ['NewtonFool'],
        'values': etas
    },
}

# Apply the theme before any figure is created; setting it after
# plt.subplots() would leave the first figure with the previous style.
sns.set_theme(style='ticks')

for var_param, opt in atks_options.items():
    # Color lookup (keyed by attack type) for the parameter being varied.
    if var_param == 'Eps':
        palette = colors_atks_eps
    elif var_param == 'BSS':
        palette = colors_atks_bss
    else:
        palette = colors_atks_eta

    for atk_type in opt['types']:
        # Collect the stored bootstrap quantiles of every parameter value.
        df_conf = pd.DataFrame(index=opt['values'], columns=conf_cols)
        for var_val in opt['values']:
            if var_param == 'Eps':
                path_curr = f"{path}/Evasion/{atk_type}/eps_{var_val:0.4f}"
            elif var_param == 'BSS':
                path_curr = f"{path}/Evasion/{atk_type}/bss_{var_val}"
            else:
                path_curr = f"{path}/Evasion/{atk_type}/eta_{var_val:0.2e}"
            df_metrics = pd.read_excel(f"{path_curr}/confidence/metrics.xlsx", index_col='Metrics')
            for m in metrics_names:
                for q in quantiles:
                    df_conf.at[var_val, f"{m}_{q}"] = df_metrics.at[m, f"q{q}"]

        # Every parameter value is drawn in the color of the current attack.
        palette_vals = {x: palette[atk_type] for x in opt['values']}

        for m in metrics_names:
            # Long format: one row per (parameter value, quantile bound).
            df_fig = df_conf.loc[:, [f"{m}_{q}" for q in quantiles]].copy()
            df_fig['Type'] = df_fig.index
            df_fig = df_fig.melt(id_vars=['Type'], value_name=metrics_names[m])
            fig, ax = plt.subplots(figsize=(5, 4))
            scatter = sns.scatterplot(
                data=df_fig,
                y=metrics_names[m],
                x='Type',
                hue='Type',
                palette=palette_vals,
                hue_order=opt['values'],
                linewidth=0.2,
                alpha=0.95,
                edgecolor="black",
                s=16,
                ax=ax
            )
            scatter.get_legend().set_visible(False)
            line = sns.lineplot(
                data=df_fig,
                y=metrics_names[m],
                x='Type',
                hue='Type',
                palette=palette_vals,
                hue_order=opt['values'],
                linewidth=3,
                ax=ax
            )
            line.get_legend().set_visible(False)

            # Eps and Eta values span several decades, so they get a log x-axis.
            if var_param == 'Eps':
                plt.xscale('log')
                ax.set_xlabel(r'$\epsilon$')
            elif var_param == 'BSS':
                ax.set_xlabel("BSS")
            else:
                plt.xscale('log')
                ax.set_xlabel(r'$\eta$')
            ax.set_ylabel(f"Confidence Intervals for {metrics_names[m]}")
            plt.savefig(f"{path}/Evasion/{atk_type}/confidence_{m}.png", bbox_inches='tight', dpi=400)
            plt.savefig(f"{path}/Evasion/{atk_type}/confidence_{m}.pdf", bbox_inches='tight')
            plt.close(fig)
