In [None]:
import pandas as pd
import numpy as np
import scipy
from scripts.python.routines.betas import betas_drop_na
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scripts.python.routines.betas import betas_drop_na
from plotly.subplots import make_subplots
from scipy import stats
import pickle
import random
import plotly.express as px
import copy
import statsmodels.formula.api as smf
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scripts.python.pheno.datasets.filter import filter_pheno
from scripts.python.pheno.datasets.features import get_column_name, get_status_dict, get_sex_dict
from scripts.python.routines.plot.scatter import add_scatter_trace
import plotly.graph_objects as go
import pathlib
from scripts.python.routines.manifest import get_manifest
from scripts.python.routines.plot.save import save_figure
from scripts.python.routines.plot.layout import add_layout, get_axis
from scripts.python.routines.plot.p_value import add_p_value_annotation
from statsmodels.stats.multitest import multipletests
from sklearn.metrics import mean_absolute_error
from impyute.imputation.cs import fast_knn, mean, median, random, mice, mode, em
import plotly.io as pio
pio.kaleido.scope.mathjax = None
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=False)
from functools import reduce
from scipy.stats import kruskal, mannwhitneyu
from pycombat import Combat
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import upsetplot as upset
import seaborn as sns
import missingno as msno
import plotly.io as pio
pio.kaleido.scope.mathjax = None
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=False)
import matplotlib.pyplot as plt
import pathlib

from pyod.models.ecod import ECOD
from pyod.models.abod import ABOD
from pyod.models.copod import COPOD
from pyod.models.sos import SOS
from pyod.models.kde import KDE
from pyod.models.sampling import Sampling
from pyod.models.gmm import GMM

from pyod.models.kpca import KPCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lmdd import LMDD

from pyod.models.lof import LOF
from pyod.models.cof import COF
from pyod.models.cblof import CBLOF
from pyod.models.hbos import HBOS
from pyod.models.knn import KNN
from pyod.models.sod import SOD

from pyod.models.iforest import IForest
from pyod.models.inne import INNE
from pyod.models.loda import LODA
from pyod.models.suod import SUOD

from pyod.models.auto_encoder_torch import AutoEncoder
from pyod.models.vae import VAE
from pyod.models.deep_svdd import DeepSVDD

from pyod.models.lunar import LUNAR

# Read the data

In [None]:
# Dataset to analyse and the root folder that holds every dataset.
dataset = "GSEUNN"
path = "E:/YandexDisk/Work/pydnameth/datasets"

# Preprocessing variant of the immunology table; these tags are part of the file names.
samples = "all_1052_121222"
proc = "raw"  # other value seen in file names: "minmax_left(0.0)_right(1.0)_combat"
imp = "fast_knn"
replace = "quarter"

# The platform is looked up in the datasets registry and is used in every path below.
datasets_info = pd.read_excel(f"{path}/datasets.xlsx", index_col='dataset')
platform = datasets_info.loc[dataset, 'platform']
manifest = get_manifest(platform, path=path)

# Immunology table (rows indexed by the "index" column) and the list of marker names.
immuno_dir = f"{path}/{platform}/{dataset}/data/immuno"
variant_tag = f"samples({samples})_proc({proc})_imp({imp})_replace({replace})"
df = pd.read_excel(f"{immuno_dir}/df_{variant_tag}.xlsx", index_col="index")
markers = pd.read_excel(f"{immuno_dir}/immuno_markers_genes.xlsx")
feats = markers.loc[:, 'gene'].values.tolist()

# Every artefact produced by this notebook is written below path_save.
path_save = f"{path}/{platform}/{dataset}/special/036_outliers_and_combat/{variant_tag}"
pathlib.Path(path_save).mkdir(parents=True, exist_ok=True)

# Select controls

In [None]:
# Label every sample "tst", then relabel the samples from the "Central" region as "trn_val".
df["Split"] = "tst"
df.loc[df["Region"] == "Central", "Split"] = "trn_val"

# Keep the controls only and store them next to the source table.
df_ctrl = df.loc[df["Status"] == "Control", :]
# The proc tag is written as "proc({proc})" like in every other file name of this notebook
# (the opening parenthesis used to be missing, producing e.g. "_procraw)_").
df_ctrl.to_excel(
    f"{path}/{platform}/{dataset}/data/immuno/df_samples(ctrl_{df_ctrl.shape[0]}_from_{samples})_proc({proc})_imp({imp})_replace({replace}).xlsx",
    index=True
)

# Aux plots

In [None]:
# One fixed colour per status: the first five entries of plotly's Dark24 palette, in this order.
status_palette = dict(zip(
    ["Control", "ESRD", "COVID", "DownSyndrome", "TR"],
    px.colors.qualitative.Dark24,
))

path_local = "aux_plots"
aux_dir = f"{path_save}/{path_local}"
pathlib.Path(aux_dir).mkdir(parents=True, exist_ok=True)

# Bar chart with the number of samples per status, saved as PNG and PDF.
plt.figure()
sns.set_theme(style='whitegrid', font_scale=1)
bar = sns.countplot(data=df, x="Status", palette=status_palette, edgecolor='black')
bar.set_ylabel("Count")
for ext in ("png", "pdf"):
    plt.savefig(f"{aux_dir}/count_Status.{ext}", bbox_inches='tight')
plt.close()

# Outliers

In [None]:
# Settings passed to the outlier detectors further down.
contamination = 0.1
epochs = 500
path_local = f"outliers/contamination_{contamination}"

# Fit one StandardScaler per feature and add the standardised values to df as "<feature>_scaled".
scalers = {}
for f in feats:
    column = df.loc[:, f].values.reshape(-1, 1)  # scikit-learn expects a 2D array
    scaler = StandardScaler().fit(column)
    scalers[f] = scaler
    df[f"{f}_scaled"] = scaler.transform(column)
feats_scaled = [f"{f}_scaled" for f in feats]

# Keep the fitted scalers on disk next to the outlier results.
pathlib.Path(f"{path_save}/{path_local}").mkdir(parents=True, exist_ok=True)
with open(f"{path_save}/{path_local}/scalers.pkl", 'wb') as scalers_file:
    pickle.dump(scalers, scalers_file, protocol=pickle.HIGHEST_PROTOCOL)

# The outlier analysis is run once per feature set.
feats_sets = dict(origin=feats, scaled=feats_scaled)

# Colours and stacking order shared by every plot that is split by region.
region_palette = {
    "Central": "limegreen",
    "Yakutia": "royalblue",
}
region_order = ["Central", "Yakutia"]

# pyod detectors constructed with `contamination` only. Insertion order matters: it fixes the
# column order in df_outliers and the colour assignment in the stacked bar plots.
contamination_detectors = {
    'ECOD': ECOD,
    'ABOD': ABOD,
    'COPOD': COPOD,
    'SOS': SOS,
    'KDE': KDE,
    'Sampling': Sampling,
    'GMM': GMM,
    'KPCA': KPCA,
    'MCD': MCD,
    'OCSVM': OCSVM,
    'LMDD': LMDD,
    'LOF': LOF,
    'COF': COF,
    'CBLOF': CBLOF,
    'HBOS': HBOS,
    'KNN': KNN,
    'SOD': SOD,
    'IForest': IForest,
    'INNE': INNE,
    'LODA': LODA,
    'SUOD': SUOD,
}
# Detectors that additionally receive the number of training `epochs`.
epoch_detectors = {
    'AutoEncoder': AutoEncoder,
    'VAE': VAE,
    'DeepSVDD': DeepSVDD,
}


def _select_controls(frame):
    """Return the rows of `frame` whose "Status" equals "Control"."""
    return frame.loc[frame["Status"] == "Control", :]


def _save_and_close(path_stem):
    """Save the current figure as `<path_stem>.png` and `<path_stem>.pdf`, then close it.

    The figure is closed rather than cleared: a cleared figure stays registered with pyplot,
    so figures would pile up over the many plots drawn for every feature set.
    """
    plt.savefig(f"{path_stem}.png", bbox_inches='tight')
    plt.savefig(f"{path_stem}.pdf", bbox_inches='tight')
    plt.close()


def _plot_sample_barplot(data, hue, palette, path_stem):
    """Horizontal bar plot with one bar per sample showing its "n_boxplot_outliers" value.

    Bars are coloured by the `hue` column using `palette`; the figure height grows with the
    number of rows in `data`.
    """
    plt.figure(figsize=(12, 0.4 * data.shape[0]))
    sns.set_theme(style='whitegrid', font_scale=2)
    bar = sns.barplot(
        data=data,
        hue=hue,
        y=data.index,
        x="n_boxplot_outliers",
        palette=palette,
        edgecolor='black',
        orient="h",
        dodge=False
    )
    bar.set_xlabel("Number of boxplot outliers")
    sns.move_legend(bar, "upper left", bbox_to_anchor=(1, 1))
    _save_and_close(path_stem)


def _plot_count_histplot(data, column, hue, palette, xlabel, path_stem, hue_order=None):
    """Stacked histogram of the integer count `column` of `data`, split by `hue`."""
    min_val = data[column].min()
    max_val = data[column].max()
    val_width = max_val - min_val
    n_bins = max_val
    # Avoid 0 / 0 (NaN bin width) when no sample has a single outlier/detection.
    bin_width = val_width / n_bins if n_bins else 1

    plt.figure()
    sns.set_theme(style='whitegrid')
    hist = sns.histplot(
        data=data,
        hue_order=hue_order,
        x=column,
        hue=hue,
        palette=palette,
        multiple="stack",
        bins=n_bins,
        binrange=(min_val, max_val),
        binwidth=bin_width,
        discrete=True,
        edgecolor='k',
        linewidth=1
    )
    hist.set_xlabel(xlabel)
    _save_and_close(path_stem)


def _plot_count_histplots(frame, column, xlabel, out_dir, file_suffix):
    """Draw the three histograms of `column`: all samples by region, all samples by status,
    and controls only by region.

    Files are written below `out_dir` as `all/region_histplot_<file_suffix>`,
    `all/status_histplot_<file_suffix>` and `ctrl/region_histplot_<file_suffix>`.
    """
    _plot_count_histplot(
        frame, column, "Region", region_palette, xlabel,
        f"{out_dir}/all/region_histplot_{file_suffix}", hue_order=region_order
    )
    _plot_count_histplot(
        frame, column, "Status", status_palette, xlabel,
        f"{out_dir}/all/status_histplot_{file_suffix}"
    )
    _plot_count_histplot(
        _select_controls(frame), column, "Region", region_palette, xlabel,
        f"{out_dir}/ctrl/region_histplot_{file_suffix}", hue_order=region_order
    )


def _plot_detections_barplot(data, method_columns, path_stem):
    """Stacked horizontal bar plot: one bar per sample, one segment per detector column."""
    sns.set_theme(style='whitegrid', font_scale=2)
    bar = data.loc[:, method_columns].plot(
        figsize=(12, 0.4 * data.shape[0]),
        width=1,
        kind='barh',
        stacked=True,
        color=px.colors.qualitative.Alphabet,
        edgecolor='black',
    )
    bar.set_xlabel("Number of detections as outlier in different methods")
    bar.set_ylabel("Samples")
    sns.move_legend(bar, "upper left", bbox_to_anchor=(1, 1))
    _save_and_close(path_stem)


def _plot_single_vs_multiple(data, path_stem):
    """Scatter plot of "detections" (x) against "n_boxplot_outliers" (y), coloured by status."""
    plt.figure(figsize=(12, 8))
    sns.set_theme(style='whitegrid', font_scale=2)
    scatter = sns.scatterplot(
        data=data,
        x="detections",
        y="n_boxplot_outliers",
        hue="Status",
        palette=status_palette,
    )
    scatter.set_xlabel("Number of detections as outlier in different methods")
    scatter.set_ylabel("Number of boxplot outliers")
    sns.move_legend(scatter, "upper left", bbox_to_anchor=(1, 1))
    _save_and_close(path_stem)


# Run the whole outlier analysis once per feature set ('origin' and 'scaled').
for feats_set in feats_sets:

    df_outliers = df.copy()

    out_dir = f"{path_save}/{path_local}/{feats_set}"
    for subset in ("all", "ctrl"):
        pathlib.Path(f"{out_dir}/{subset}").mkdir(parents=True, exist_ok=True)

    # Per-feature boxplot rule: a value is an outlier when it lies outside
    # [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]; values failing both comparisons (e.g. NaN) count as outliers.
    outlier_columns = []
    for f in feats_sets[feats_set]:
        q1 = df_outliers[f].quantile(0.25)
        q3 = df_outliers[f].quantile(0.75)
        iqr = q3 - q1
        inlier_mask = (df_outliers[f] >= q1 - 1.5 * iqr) & (df_outliers[f] <= q3 + 1.5 * iqr)
        df_outliers[f"{f}_boxplot_outlier"] = ~inlier_mask
        outlier_columns.append(f"{f}_boxplot_outlier")
    df_outliers["n_boxplot_outliers"] = df_outliers.loc[:, outlier_columns].sum(axis=1)

    # Samples with the most flagged features come first in the per-sample bar plots.
    df_outliers = df_outliers.sort_values(["n_boxplot_outliers"], ascending=[False])
    outliers_ctrl = _select_controls(df_outliers)

    _plot_sample_barplot(
        df_outliers, "Region", region_palette, f"{out_dir}/all/region_barplot_n_boxplot_outliers"
    )
    _plot_sample_barplot(
        df_outliers, "Status", status_palette, f"{out_dir}/all/status_barplot_n_boxplot_outliers"
    )
    _plot_sample_barplot(
        outliers_ctrl, "Region", region_palette, f"{out_dir}/ctrl/region_barplot_n_boxplot_outliers"
    )

    _plot_count_histplots(
        df_outliers, "n_boxplot_outliers", "Number of boxplot outliers", out_dir, "n_boxplot_outliers"
    )

    # missingno visualises NaNs, so flagged values (True) are turned into NaN and the flag
    # columns are renamed back to the feature names for readable axis labels.
    flag_to_feature = dict(zip(outlier_columns, feats_sets[feats_set]))
    df_msno_all = df_outliers.loc[:, outlier_columns].replace({True: np.nan}).rename(columns=flag_to_feature)
    df_msno_ctrl = outliers_ctrl.loc[:, outlier_columns].replace({True: np.nan}).rename(columns=flag_to_feature)

    msno_plots = [
        (msno.bar, "bar", {}),
        (msno.matrix, "mtx", {}),
        (msno.heatmap, "heatmap", {"cmap": "bwr", "fontsize": 12}),
    ]
    for msno_plot, tag, extra_kwargs in msno_plots:
        for subset, df_msno in (("all", df_msno_all), ("ctrl", df_msno_ctrl)):
            msno_plot(df=df_msno, label_rotation=90, **extra_kwargs)
            _save_and_close(f"{out_dir}/{subset}/msno_{tag}_n_boxplot_outliers")

    # Fresh detector instances for every feature set.
    outlier_methods = {}
    for name, detector in contamination_detectors.items():
        outlier_methods[name] = {
            "package": "pyod",
            "model": detector(contamination=contamination),
        }
    for name, detector in epoch_detectors.items():
        outlier_methods[name] = {
            "package": "pyod",
            "model": detector(contamination=contamination, epochs=epochs),
        }
    # LUNAR is built with its library defaults; `contamination` is not forwarded to it.
    outlier_methods['LUNAR'] = {
        "package": "pyod",
        "model": LUNAR(),
    }
    method_columns = list(outlier_methods)

    # Fit every detector on the full feature matrix and store its prediction per sample.
    # NOTE(review): no random seed is set, so stochastic detectors may differ between runs.
    X_outliers = df_outliers.loc[:, feats_sets[feats_set]].to_numpy()
    for method in outlier_methods:
        model = outlier_methods[method]["model"]
        model.fit(X_outliers)
        df_outliers[method] = model.predict(X_outliers)
        n_outliers = df_outliers[method].sum()
        print(f"Sum {method} {feats_set}: {n_outliers}")

    # "detections" is the row-wise sum of all detector columns.
    df_outliers["detections"] = df_outliers.loc[:, method_columns].sum(axis=1)
    df_outliers.to_excel(f"{out_dir}/df_outliers.xlsx", index=True)

    _plot_count_histplots(
        df_outliers, "detections", "Number of detections as outlier in different methods",
        out_dir, "detections_outliers"
    )

    df_outliers = df_outliers.sort_values(["detections"], ascending=[True])
    outliers_ctrl = _select_controls(df_outliers)

    _plot_detections_barplot(df_outliers, method_columns, f"{out_dir}/all/barplot_detections_outliers")
    _plot_detections_barplot(outliers_ctrl, method_columns, f"{out_dir}/ctrl/barplot_detections_outliers")

    _plot_single_vs_multiple(df_outliers, f"{out_dir}/all/scatter_single_vs_multiple")
    _plot_single_vs_multiple(outliers_ctrl, f"{out_dir}/ctrl/scatter_single_vs_multiple")

# Select without outliers

In [None]:
# When True, a ComBat-adjusted version of every filtered table is written as well.
is_combat = False

# Reload the outlier table produced for this feature set / contamination level.
feats_set = "origin"
contamination = 0.1
path_local = f"outliers/contamination_{contamination}"
df_outliers = pd.read_excel(f"{path_save}/{path_local}/{feats_set}/df_outliers.xlsx", index_col="index")

# A sample is kept when its count does not exceed the corresponding limit.
max_boxplot_outliers = 10
max_detections = 8

# Index labels of the samples that are KEPT under each criterion.
outliers_indexes = {
    'outliers_single': df_outliers.index[df_outliers["n_boxplot_outliers"] <= max_boxplot_outliers].tolist(),
    'outliers_multi': df_outliers.index[df_outliers["detections"] <= max_detections].tolist(),
}
for out_inxs in outliers_indexes:
    # Explicit copy: the selection is modified below and must never write through to df.
    df_all = df.loc[outliers_indexes[out_inxs], :].copy()
    df_all["Split"] = "tst"
    df_all.loc[df_all["Region"] == "Central", "Split"] = "trn_val"

    suffix = f"samples(all_wo_{out_inxs}_{df_all.shape[0]}_from_{samples})_proc({proc})_imp({imp})_replace({replace})"
    df_all.to_excel(f"{path_save}/df_{suffix}.xlsx", index=True, index_label="index")
    print(f"df_all: {df_all.shape[0]}")

    df_ctrl = df_all.loc[df_all["Status"] == "Control", :]
    suffix = f"samples(ctrl_wo_{out_inxs}_{df_ctrl.shape[0]}_from_{samples})_proc({proc})_imp({imp})_replace({replace})"
    df_ctrl.to_excel(f"{path_save}/df_{suffix}.xlsx", index=True, index_label="index")
    print(f"df_ctrl: {df_ctrl.shape[0]}")

    if is_combat:

        df_all_combat = df_all.copy()

        # Value range of every feature before adjustment (quantiles 0 and 1, i.e. min and max).
        left = 0.00
        right = 1.00
        ranges = {}
        for feat in feats:
            ranges[feat] = (df_all_combat[feat].quantile(left), df_all_combat[feat].quantile(right))

        Y = df_all_combat.loc[:, feats].values
        status_dummies = pd.get_dummies(df_all_combat['Status'], drop_first=True)
        sex_dummies = pd.get_dummies(df_all_combat['Sex'], drop_first=True)
        region_dummies = pd.get_dummies(df_all_combat['Region'], drop_first=True)
        # Cast to float so the design matrix is a numeric array whatever dtype get_dummies
        # returns (bool dummies next to a numeric Age column would give an object array).
        X = pd.concat(
            [df_all_combat.loc[:, 'Age'], status_dummies, sex_dummies, region_dummies], axis=1
        ).astype(float).values
        b = df_all_combat.loc[:, 'file'].values
        # NOTE(review): presumably b is the batch label and X the covariates ComBat should
        # preserve - confirm against the pycombat documentation.
        combat = Combat()
        Y_adjusted = combat.fit_transform(Y=Y, b=b, X=X)

        scalers = {}
        df_all_combat.loc[:, feats] = Y_adjusted
        df_all_combat.index.name = 'index'
        # Rescale every adjusted feature back into the range recorded before adjustment.
        for feat in feats:
            vals = df_all_combat.loc[:, feat].values
            scalers[feat] = MinMaxScaler(feature_range=ranges[feat])
            vals_scaled = scalers[feat].fit_transform(vals.reshape(-1, 1))
            df_all_combat.loc[:, feat] = vals_scaled

        suffix = f"samples(all_wo_{out_inxs}_{df_all.shape[0]}_from_{samples})_proc(combat)_imp({imp})_replace({replace})"
        df_all_combat.to_excel(f"{path_save}/df_{suffix}.xlsx", index=True, index_label="index")
        # The ranges and fitted scalers are stored together with the adjusted table.
        transforms = {
            "ranges": ranges,
            "scalers": scalers
        }
        with open(f"{path_save}/transforms_{suffix}.pickle", 'wb') as handle:
            pickle.dump(transforms, handle, protocol=pickle.HIGHEST_PROTOCOL)

        df_ctrl_combat = df_all_combat.loc[df_all_combat["Status"] == "Control", :]
        suffix = f"samples(ctrl_wo_{out_inxs}_{df_ctrl_combat.shape[0]}_from_{samples})_proc(combat)_imp({imp})_replace({replace})"
        df_ctrl_combat.to_excel(f"{path_save}/df_{suffix}.xlsx", index=True, index_label="index")

In [None]:

# Quantile levels that delimit each feature's value range (0.00 and 1.00 give min and max).
left = 0.00
right = 1.00

# feature name -> (lower quantile, upper quantile) of the current values in df.
ranges = {
    feat: (df[feat].quantile(left), df[feat].quantile(right))
    for feat in feats
}

In [None]:

# NOTE(review): this cell overwrites the feature columns of df in place, so running it twice
# applies the adjustment to already adjusted values - reload df before re-running.
Y = df.loc[:, feats].values

status_dummies = pd.get_dummies(df['Status'], drop_first=True)
sex_dummies = pd.get_dummies(df['Sex'], drop_first=True)
region_dummies = pd.get_dummies(df['Region'], drop_first=True)
# Cast to float so the design matrix is a numeric array whatever dtype get_dummies returns
# (bool dummies next to a numeric Age column would give an object array).
X = pd.concat([df.loc[:, 'Age'], status_dummies, sex_dummies, region_dummies], axis=1).astype(float).values
b = df.loc[:, 'file'].values
# NOTE(review): presumably b is the batch label and X the covariates ComBat should preserve -
# confirm against the pycombat documentation.
combat = Combat()
Y_adjusted = combat.fit_transform(Y=Y, b=b, X=X)

scalers = {}

df.loc[:, feats] = Y_adjusted
df.index.name = 'index'
# Rescale every adjusted feature into the range stored for it in `ranges`.
for feat in feats:
    vals = df.loc[:, feat].values
    scalers[feat] = MinMaxScaler(feature_range=ranges[feat])
    vals_scaled = scalers[feat].fit_transform(vals.reshape(-1, 1))
    df.loc[:, feat] = vals_scaled

# Label every sample "tst", then relabel the "Central" region as "trn_val".
df["Split"] = "tst"
df.loc[df["Region"] == "Central", "Split"] = "trn_val"
df_ctrl = df.loc[df["Status"] == "Control", :]
df_ctrl.to_excel(f"{path}/{platform}/{dataset}/data/immuno/df_samples(ctrl_from_{samples})_proc(minmax_left({left})_right({right})_combat)_imp({imp})_replace({replace}).xlsx", index=True)

df.to_excel(f"{path}/{platform}/{dataset}/data/immuno/df_samples({samples})_proc(minmax_left({left})_right({right})_combat)_imp({imp})_replace({replace}).xlsx", index=True)
# The ranges and fitted scalers are stored next to the adjusted table.
transforms = {
    "ranges": ranges,
    "scalers": scalers
}
with open(f"{path}/{platform}/{dataset}/data/immuno/transforms_samples({samples})_proc(minmax_left({left})_right({right})_combat)_imp({imp})_replace({replace}).pickle", 'wb') as handle:
    pickle.dump(transforms, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Read the transforms

In [None]:
# Load the stored transforms (dict with "ranges" and "scalers"). The file name is built from
# the same `imp` / `replace` settings that the writing cell uses, so both always point at the
# same file. pickle must only be used on files produced by this project.
with open(f"{path}/{platform}/{dataset}/data/immuno/transforms_samples({samples})_proc(minmax_left({left})_right({right})_combat)_imp({imp})_replace({replace}).pickle", 'rb') as handle:
    transforms = pickle.load(handle)