# Description


In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import plotly.express as px
import statsmodels.formula.api as smf
import plotly.graph_objects as go
from scripts.python.routines.manifest import get_manifest
from scripts.python.routines.plot.save import save_figure
from scripts.python.routines.plot.layout import add_layout
from statsmodels.stats.multitest import multipletests
import plotly.io as pio
pio.kaleido.scope.mathjax = None
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=False)
from scipy.stats import mannwhitneyu, median_test, kruskal, wilcoxon, friedmanchisquare
import matplotlib.pyplot as plt
from matplotlib import colors
import matplotlib
import matplotlib.patheffects as path_effects
import random
import pathlib
from tqdm import tqdm
from src.utils.plot.bioinfokit import mhat, volcano
import gseapy as gp
import mygene
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA, TruncatedSVD
from sklearn.decomposition import MiniBatchDictionaryLearning, FastICA
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.manifold import MDS, Isomap, TSNE, LocallyLinearEmbedding
import upsetplot
from matplotlib_venn import venn2, venn2_circles
from itertools import chain
from sklearn.metrics import mean_absolute_error
from scripts.python.routines.plot.colorscales import get_continuous_color
import plotly
from scripts.python.routines.plot.p_value import add_p_value_annotation
from scripts.python.routines.sections import get_sections
from statannotations.Annotator import Annotator
import functools
import matplotlib.lines as mlines
import patchworklib as pw
import pickle
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA, TruncatedSVD
from sklearn.decomposition import MiniBatchDictionaryLearning, FastICA
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.manifold import MDS, Isomap, TSNE, LocallyLinearEmbedding
import missingno as msno
from openTSNE import TSNE

from pyod.models.ecod import ECOD
from pyod.models.abod import ABOD
from pyod.models.copod import COPOD
from pyod.models.sos import SOS
from pyod.models.kde import KDE
from pyod.models.sampling import Sampling
from pyod.models.gmm import GMM

from pyod.models.kpca import KPCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lmdd import LMDD

from pyod.models.lof import LOF
from pyod.models.cof import COF
from pyod.models.cblof import CBLOF
from pyod.models.hbos import HBOS
from pyod.models.knn import KNN
from pyod.models.sod import SOD

from pyod.models.iforest import IForest
from pyod.models.inne import INNE
from pyod.models.loda import LODA
from pyod.models.suod import SUOD

from pyod.models.auto_encoder_torch import AutoEncoder
from pyod.models.vae import VAE
from pyod.models.deep_svdd import DeepSVDD

from pyod.models.lunar import LUNAR



def conjunction(conditions):
    """Combine boolean conditions element-wise with logical AND.

    ``np.logical_and`` is folded over ``conditions`` from left to right.
    A single condition is returned unchanged; an empty iterable raises
    ``TypeError``.
    """
    pending = iter(conditions)
    try:
        combined = next(pending)
    except StopIteration:
        raise TypeError("reduce() of empty iterable with no initial value") from None
    for condition in pending:
        combined = np.logical_and(combined, condition)
    return combined


def disjunction(conditions):
    """Combine boolean conditions element-wise with logical OR.

    ``np.logical_or`` is folded over ``conditions`` from left to right.
    A single condition is returned unchanged; an empty iterable raises
    ``TypeError``.
    """
    pending = iter(conditions)
    try:
        combined = next(pending)
    except StopIteration:
        raise TypeError("reduce() of empty iterable with no initial value") from None
    for condition in pending:
        combined = np.logical_or(combined, condition)
    return combined

# 1. Prepare data

## 1.1 Load basic data

In [None]:
# Root folder of the dataset; all input and output paths below are built from it.
path = "D:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN"

# Sample table: the first column of the sheet is the index, cast to str afterwards.
df = pd.read_excel(
    f"{path}/data/immuno/df_samples(all_1052_121222)_proc(raw)_imp(fast_knn)_replace(quarter).xlsx",
    index_col=0,
)
df.index = df.index.map(str)
df = df.rename(columns={'file': 'File'})

# Feature names are the index values of the features table.
feats = pd.read_excel(
    f"{path}/data/immuno/feats_con.xlsx",
    index_col=0,
).index.values

# Root output folder of this notebook.
path_save = f"{path}/special/053_proof_that_immunodata_is_shit"
pathlib.Path(path_save).mkdir(parents=True, exist_ok=True)

### Setup filtering

In [None]:
# Switch between the unfiltered ("origin") and the filtered output tree.
is_data_filter = False
path_save = f"{path}/special/053_proof_that_immunodata_is_shit/" + ("filtered" if is_data_filter else "origin")
pathlib.Path(path_save).mkdir(parents=True, exist_ok=True)

## 1.2 Get data with nans

In [None]:
# Rebuild the raw (non-imputed) feature table from the individual files, turning
# every cell that starts with "<" or ">" into NaN, and compare it with `df`.
path_curr = f"{path_save}/01_data_with_nans"
pathlib.Path(path_curr).mkdir(parents=True, exist_ok=True)

files = [
    "Aging L, Q, H, I",
    "Aging-Covid_05.01.2022",
    "Aging-Covid-05.05.22",
    "Covid_results_02_2021",
    "Covid-25.11.20",
    "MULTIPLEX_20_11_2020_ AGING",
    "Yakutiya + TR",
    "Мультиплекс_Agind&Covid",
]
df_immuno_genes = pd.read_excel(f"{path}/data/immuno/immuno_markers_genes.xlsx")
dict_immuno_genes = dict(zip(df_immuno_genes['immuno_marker'], df_immuno_genes['gene']))

# Cells whose text starts with "<" or ">" are treated as missing values.
censored_pattern = r'^([<>].*)$'

dfs_files = []
nans_by_features = {}  # feature -> set of distinct "<..."/">..." strings seen in the raw files
for file in files:
    df_file = pd.read_excel(f"{path}/data/immuno/files/processed/{file}.xlsx", index_col="Sample")
    df_file.rename(columns=dict_immuno_genes, inplace=True)
    df_file = df_file.loc[:, feats]

    # duplicates processing
    if file == "MULTIPLEX_20_11_2020_ AGING":
        # duplicated(keep='last') flags every repeated row except the last occurrence,
        # so for repeated samples the earlier rows are kept and the last one is dropped.
        df_file_doubled_unique = df_file.loc[~df_file.index.duplicated(keep=False), :]
        df_file_doubled_2 = df_file.loc[df_file.index.duplicated(keep='last'), :]
        df_file = pd.concat([df_file_doubled_2, df_file_doubled_unique], axis=0)
    # Report any sample IDs that are still duplicated within the file.
    df_file_duplicates = df_file.loc[df_file.index.duplicated(keep=False), :]
    if df_file_duplicates.shape[0] > 0:
        print(df_file_duplicates.index)

    # Collect the distinct censored strings of every feature before they are replaced.
    for feat in df_file:
        nan_vals = set(df_file.loc[df_file[feat].astype(str).str.contains(censored_pattern, regex=True), feat].values)
        if nan_vals:
            nans_by_features.setdefault(feat, set()).update(nan_vals)

    # Reassign instead of replacing in place: `df_file` may be a selection of another frame.
    df_file = df_file.replace(censored_pattern, 'NaN', regex=True)
    df_file = df_file.apply(pd.to_numeric, errors='coerce')
    dfs_files.append(df_file)

print(nans_by_features)

df_w_nans = pd.concat(dfs_files, verify_integrity=False)
df_w_nans.index = df_w_nans.index.map(str)
df_w_nans = df_w_nans.loc[df.index.values, :]
df_w_nans.to_excel(f"{path_curr}/df_w_nans.xlsx", index_label="Index")

# Checking values: wherever the raw table has a number it is compared with `df`.
# NaN cells are filled with 0, so the absolute difference is required here;
# a signed maximum would hide every negative deviation behind those zeros.
df_diff = df.loc[df.index.values, feats] - df_w_nans.loc[df.index.values, feats]
df_diff = df_diff.fillna(0.0)
max_diff = df_diff.abs().values.max()
print(f"max_diff: {max_diff}")

## 1.3 Problem definitions

In [None]:
def get_subsets(df):
    """Build the named sample subsets used throughout the notebook.

    Every subset is a dict holding a plot ``color``, the ``samples`` (index
    values of ``df``), an output sub-``path`` and, for all subsets except
    'All Samples', a ``value`` label.

    Side effects: the ``"Controls/Cases"`` column of ``df`` is written in place
    for the four status groups, and the size of every subset is printed.

    Returns the dict of subsets keyed by subset name.
    """
    file_names = [
        "Aging L, Q, H, I",
        "Aging-Covid_05.01.2022",
        "Aging-Covid-05.05.22",
        "Covid_results_02_2021",
        "Covid-25.11.20",
        "MULTIPLEX_20_11_2020_ AGING",
        "Yakutiya + TR",
        "Мультиплекс_Agind&Covid",
    ]

    subsets = {
        'All Samples': {
            'color': 'black',
            'samples': df.index.values,
            'path': 'All',
        },
    }

    # One subset per source file, coloured by position in the Plotly palette.
    for color_id, file_name in enumerate(file_names):
        subsets[f"File {file_name}"] = {
            'color': px.colors.qualitative.Plotly[color_id],
            'samples': df.index[df['File'] == file_name].values,
            'path': f"Files/{file_name}",
            'value': file_name,
        }

    # Controls are samples with Status "Control" or COVID-19 stage "Reconvalescent".
    is_control = (df['Status'] == "Control") | (df['COVID-19 stage'] == "Reconvalescent")
    subsets["Controls"] = {
        'color': 'lawngreen',
        'samples': df.index[is_control].values,
        'path': 'Status/Controls',
        'value': 'Controls',
    }
    subsets["Controls Central"] = {
        'color': 'gold',
        'samples': df.index[is_control & (df['Region'] == "Central")].values,
        'path': 'Controls/Central',
        'value': 'Central',
    }
    subsets["Controls Yakutia"] = {
        'color': 'silver',
        'samples': df.index[is_control & (df['Region'] == "Yakutia")].values,
        'path': 'Controls/Yakutia',
        'value': 'Yakutia',
    }
    # NOTE(review): this key is kept character-for-character because other cells
    # look the subset up by exactly this string.
    subsets["СOVID-19 Acute and Dynamics"] = {
        'color': 'crimson',
        'samples': df.index[df['COVID-19 stage'].isin(['Acute', 'Dynamics'])].values,
        'path': 'Status/COVID-19',
        'value': 'COVID-19 Acute and Dynamics',
    }
    subsets["Down Syndrome"] = {
        'color': 'darkorchid',
        'samples': df.index[df['Down syndrome status'].isin(['Down Syndrome'])].values,
        'path': 'Status/DownSyndrome',
        'value': 'Down Syndrome',
    }
    subsets["ESRD"] = {
        'color': 'saddlebrown',
        'samples': df.index[df['Status'].isin(['ESRD'])].values,
        'path': 'Status/ESRD',
        'value': 'ESRD',
    }

    # Label the status groups in `df`; later groups overwrite earlier ones on overlap.
    group_labels = [
        ("Controls", "Controls"),
        ("СOVID-19 Acute and Dynamics", "COVID-19 Acute and Dynamics"),
        ("Down Syndrome", "Down Syndrome"),
        ("ESRD", "ESRD"),
    ]
    for group_key, group_label in group_labels:
        df.loc[subsets[group_key]["samples"], "Controls/Cases"] = group_label

    for subset_name, subset in subsets.items():
        print(f"{subset_name}: {len(subset['samples'])}")

    return subsets

## 1.4. Prepare data for cleanlab.ai

In [None]:
# Export the control samples (features + Sex, Region, Age) with an age-decile label.
subsets = get_subsets(df)  # also fills the "Controls/Cases" column used right below
# Explicit copy: a column is added below, which must neither touch `df`
# nor trigger pandas' chained-assignment warning.
df_cl = df.loc[df["Controls/Cases"] == "Controls", list(feats) + ["Sex", "Region", "Age"]].copy()
age = df_cl['Age'].to_numpy()
ptp = np.ptp(age)
# Bin edges: the 10th..90th percentiles, padded by 5% of the age range on both
# sides so that the minimum and maximum ages fall inside the outer bins.
bins = np.concatenate((
    [np.min(age) - 0.05 * ptp],
    np.percentile(age, np.linspace(10, 90, 9)),
    [np.max(age) + 0.05 * ptp]
))
df_cl['AgeQuantile'] = np.digitize(age, bins) - 1
df_cl.to_excel(f"{path_save}/data_for_cleanlab.xlsx", index_label="index")

# 2. NaN analysis

## Filter features

In [None]:
# Features with more than `thld_feats` percent of NaNs are dropped in filtered mode.
thld_feats = 25
if is_data_filter:
    # NaN statistics written by the unfiltered ("origin") run for all samples.
    df_filter_feats = pd.read_excel(
        f"{path}/special/053_proof_that_immunodata_is_shit/origin/02_nan_analysis/All/df_nan_feats.xlsx",
        index_col=0,
    )
    feats = df_filter_feats.loc[df_filter_feats['% of NaNs'] <= thld_feats].index.values
    print(f"Number of filtered features: {len(feats)}")
    df_feats = pd.DataFrame(index=feats)
    df_feats.to_excel(f"{path_save}/feats.xlsx", index_label="Features")

In [None]:
# For every subset: missingness plots (bar / matrix / heatmap), per-feature and
# per-sample NaN statistics, and finally a summary of the NaN share per subset.
subsets = get_subsets(df)
df_nan_feats_by_group = pd.DataFrame(index=list(subsets.keys()))
for subset_name, subset in subsets.items():

    path_curr = f"{path_save}/02_nan_analysis/{subset['path']}"
    pathlib.Path(path_curr).mkdir(parents=True, exist_ok=True)
    subset_title = f"{subset_name} ({len(subset['samples'])})"

    # Raw values of the current subset; NaN where the source files had no numeric value.
    df_msno = df_w_nans.loc[subset['samples'], feats]

    # msno plots ===================================================================
    msno_bar = msno.bar(
        df=df_msno,
        label_rotation=90,
        color=subset['color'],
    )
    plt.xticks(ha='center')
    plt.setp(msno_bar.xaxis.get_majorticklabels(), ha="center")
    msno_bar.set_title(subset_title, fontdict={'fontsize': 22})
    msno_bar.set_ylabel("Non-NaN samples", fontdict={'fontsize': 22})
    plt.savefig(f"{path_curr}/msno_bar.png", bbox_inches='tight')
    plt.savefig(f"{path_curr}/msno_bar.pdf", bbox_inches='tight')
    # Close (not just clear) the figure so that figures do not pile up across subsets.
    plt.close()

    msno_mtx = msno.matrix(
        df=df_msno,
        label_rotation=90,
        color=colors.to_rgb(subset['color']),
    )
    plt.xticks(ha='center')
    # Centre the tick labels of the matrix plot itself.
    plt.setp(msno_mtx.xaxis.get_majorticklabels(), ha="center")
    msno_mtx.set_title(subset_title, fontdict={'fontsize': 22})
    msno_mtx.set_ylabel("Samples", fontdict={'fontsize': 22})
    plt.savefig(f"{path_curr}/msno_matrix.png", bbox_inches='tight')
    plt.savefig(f"{path_curr}/msno_matrix.pdf", bbox_inches='tight')
    plt.close()

    msno_heatmap = msno.heatmap(
        df=df_msno,
        label_rotation=90,
        cmap="bwr",
        fontsize=12
    )
    msno_heatmap.set_title(subset_title, fontdict={'fontsize': 22})
    plt.setp(msno_heatmap.xaxis.get_majorticklabels(), ha="center")
    msno_heatmap.collections[0].colorbar.ax.tick_params(labelsize=20)
    plt.savefig(f"{path_curr}/msno_heatmap.png", bbox_inches='tight')
    plt.savefig(f"{path_curr}/msno_heatmap.pdf", bbox_inches='tight')
    plt.close()

    # NaN features =================================================================
    df_nan_feats = df_msno.isna().sum(axis=0).to_frame(name="Number of NaNs")
    df_nan_feats["% of NaNs"] = df_nan_feats["Number of NaNs"] / len(subset['samples']) * 100
    df_nan_feats["Number of not-NaNs"] = df_msno.notna().sum(axis=0)
    # Features that are entirely NaN stay white; all others get the subset colour.
    df_nan_feats['Color'] = 'white'
    df_nan_feats.loc[df_nan_feats["% of NaNs"] < 100, 'Color'] = subset['color']
    df_nan_feats.sort_values(["% of NaNs"], ascending=[False], inplace=True)
    df_nan_feats.to_excel(f"{path_curr}/df_nan_feats.xlsx", index_label="Features")

    # Share of NaN cells among all cells (samples x features) of the subset.
    df_nan_feats_by_group.at[subset_name, "% of NaNs"] = df_nan_feats["Number of NaNs"].sum(axis=0) / df_msno.size * 100

    plt.figure(figsize=(14, 4))
    plt.xticks(rotation=90)
    sns.set_theme(style='whitegrid')
    barplot = sns.barplot(
        data=df_nan_feats,
        x=df_nan_feats.index,
        y="% of NaNs",
        edgecolor='black',
        palette=df_nan_feats['Color'].values,
        dodge=False
    )
    barplot.set_title(subset_title)
    plt.savefig(f"{path_curr}/feats_barplot.png", bbox_inches='tight', dpi=200)
    plt.savefig(f"{path_curr}/feats_barplot.pdf", bbox_inches='tight')
    plt.close()

    # NaN samples ==================================================================
    df_nan_spls = df_msno.isna().sum(axis=1).to_frame(name="Features with NaNs")
    df_nan_spls.to_excel(f"{path_curr}/df_nan_samples.xlsx", index_label="Samples")

    # One unit-wide bin per possible number of NaN features.
    hist_bins = np.linspace(0, len(feats), len(feats) + 1)
    fig = plt.figure(figsize=(6, 4))
    sns.set_theme(style='whitegrid')
    histplot = sns.histplot(
        data=df_nan_spls,
        bins=hist_bins,
        x="Features with NaNs",
        edgecolor='black',
        color=subset['color'],
    )
    histplot.set(xlim=(-0.5, len(feats)+0.5))
    histplot.set_title(subset_title)
    histplot.set_ylabel("Number of samples")
    plt.savefig(f"{path_curr}/spls_histplot.png", bbox_inches='tight', dpi=400)
    plt.savefig(f"{path_curr}/spls_histplot.pdf", bbox_inches='tight')
    plt.close(fig)

# NaN features count in all subsets ============================================
plt.figure(figsize=(4, 6))
sns.set_theme(style='whitegrid')
barplot = sns.barplot(
    data=df_nan_feats_by_group,
    y=df_nan_feats_by_group.index,
    x="% of NaNs",
    edgecolor='black',
    palette={k: v['color'] for k, v in subsets.items()},
    dodge=False,
    orient='h'
)
for x in barplot.containers:
    barplot.bar_label(x, fmt="%.1f")
plt.savefig(f"{path_save}/02_nan_analysis/df_nan_feats_by_group.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path_save}/02_nan_analysis/df_nan_feats_by_group.pdf", bbox_inches='tight')
plt.close()
df_nan_feats_by_group.to_excel(f"{path_save}/02_nan_analysis/df_nan_feats_by_group.xlsx", index_label="Subset")

## Filter samples

In [None]:
# Samples with more than `thld_spls` NaN features are dropped in filtered mode.
thld_spls = 6
if is_data_filter:
    # Per-sample NaN counts written by the NaN analysis of the filtered run.
    df_filter_spls = pd.read_excel(
        f"{path}/special/053_proof_that_immunodata_is_shit/filtered/02_nan_analysis/All/df_nan_samples.xlsx",
        index_col=0,
    )
    samples = df_filter_spls.loc[df_filter_spls['Features with NaNs'] <= thld_spls].index.values
    print(f"Number of filtered samples: {len(samples)}")
    df_spls_wo_nans = pd.DataFrame(index=samples)
    df_spls_wo_nans.to_excel(f"{path_save}/samples_wo_nans.xlsx", index_label="Samples")
    # Restrict both tables to the kept samples and rebuild the subsets on them.
    df = df.loc[samples]
    df_w_nans = df_w_nans.loc[samples]
    subsets = get_subsets(df)

# 3. Outlier analysis

## 3.1 IQR outliers

In [None]:
# For every subset: flag IQR outliers per feature, count them per sample and
# save the tables, histograms, bar plots and msno-style outlier maps.
for subset_name, subset in subsets.items():

    path_curr = f"{path_save}/03_outliers/IQR/{subset['path']}"
    pathlib.Path(path_curr).mkdir(parents=True, exist_ok=True)
    df_curr = df.loc[subset['samples'], :].copy()
    subset_title = f"{subset_name} ({len(subset['samples'])})"

    # A value is an outlier when it lies outside [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR],
    # with the quartiles computed within the current subset.
    out_columns = []
    for f in feats:
        q1 = df_curr[f].quantile(0.25)
        q3 = df_curr[f].quantile(0.75)
        iqr = q3 - q1
        is_inlier = (df_curr[f] >= q1 - 1.5 * iqr) & (df_curr[f] <= q3 + 1.5 * iqr)
        df_curr[f"{f}_iqr_out"] = ~is_inlier
        out_columns.append(f"{f}_iqr_out")
    df_curr["n_iqr_outs"] = df_curr.loc[:, out_columns].sum(axis=1)
    df_curr.sort_values(["n_iqr_outs"], ascending=[False], inplace=True)
    df_curr.loc[:, out_columns + ["n_iqr_outs"]].to_excel(f"{path_curr}/df.xlsx", index_label="Sample")

    # Histograms of the non-outlier values for the features with many NaNs (> 25%),
    # laid out on a grid with `n_cols` columns.
    df_feats_w_nan = pd.read_excel(f"{path_save}/02_nan_analysis/{subset['path']}/df_nan_feats.xlsx", index_col=0)
    feats_w_nan = df_feats_w_nan.index[df_feats_w_nan['% of NaNs'] > 25.0].values
    axs = {}
    pw_rows = []
    n_cols = 6
    n_rows = int(np.ceil(len(feats_w_nan) / n_cols))
    for r_id in range(n_rows):
        pw_cols = []
        for c_id in range(n_cols):
            rc_id = r_id * n_cols + c_id
            if rc_id < len(feats_w_nan):
                feat = feats_w_nan[rc_id]
                axs[feat] = pw.Brick(figsize=(2, 2))
                sns.set_theme(style='whitegrid')
                histplot = sns.histplot(
                    data=df_curr.loc[~df_curr[f"{feat}_iqr_out"], :],
                    x=feat,
                    multiple="stack",
                    bins=200,
                    edgecolor='k',
                    linewidth=1,
                    color=subset['color'],
                    ax=axs[feat]
                )
                # Red lines mark values that `df` holds at positions where the raw
                # table is NaN, if such a value occurs at least 10 times.
                # NOTE(review): this uses all samples of `df`, not only the current
                # subset — confirm that this is intended.
                val_counts = df.loc[df_w_nans[feat].isna(), feat].value_counts().to_frame(name="Number of NaNs")
                val_counts = val_counts.loc[val_counts["Number of NaNs"] >= 10, :]
                for x in val_counts.index.values:
                    axs[feat].axvline(x, color="red", linestyle=":", linewidth=0.5)
                pw_cols.append(axs[feat])
            else:
                # Pad an incomplete last row with invisible bricks.
                empty_fig = pw.Brick(figsize=(2.6, 2))
                empty_fig.axis('off')
                pw_cols.append(empty_fig)
        pw_rows.append(pw.stack(pw_cols, operator="|"))
    # Nothing to stack or save when no feature exceeds the NaN threshold.
    if pw_rows:
        pw_fig = pw.stack(pw_rows, operator="/")
        pw_fig.savefig(f"{path_curr}/feats_hists.pdf")
        pw_fig.savefig(f"{path_curr}/feats_hists.png")

    # Top-50 samples by number of IQR outliers.
    plt.figure(figsize=(6, 12))
    sns.set_theme(style='whitegrid')
    barplot = sns.barplot(
        data=df_curr.head(50),
        y=df_curr.head(50).index,
        x="n_iqr_outs",
        edgecolor='black',
        color=subset['color'],
        orient="h",
        dodge=False
    )
    barplot.set(xlim=(0, len(feats)))
    barplot.xaxis.tick_top()
    barplot.xaxis.set_label_position('top')
    barplot.set_xlabel("Number of IQR outliers")
    barplot.set_ylabel("Samples")
    barplot.set_title(subset_title)
    plt.savefig(f"{path_curr}/barplot.png", bbox_inches='tight')
    plt.savefig(f"{path_curr}/barplot.pdf", bbox_inches='tight')
    plt.close()

    # Distribution of the number of IQR outliers per sample.
    hist_bins = np.linspace(0, len(feats), len(feats) + 1)
    plt.figure()
    sns.set_theme(style='whitegrid')
    histplot = sns.histplot(
        data=df_curr,
        x="n_iqr_outs",
        multiple="stack",
        bins=hist_bins,
        edgecolor='k',
        linewidth=1,
        color=subset['color'],
    )
    histplot.set(xlim=(-0.5, len(feats)+0.5))
    histplot.set_title(subset_title)
    histplot.set_xlabel("Number of IQR outliers")
    plt.savefig(f"{path_curr}/histplot.png", bbox_inches='tight')
    plt.savefig(f"{path_curr}/histplot.pdf", bbox_inches='tight')
    # Close (not just clear) the figure so that figures do not pile up across subsets.
    plt.close()

    # Reuse the missingno plots by encoding "is outlier" as NaN.
    df_msno = df_curr.loc[:, out_columns].replace({True: np.nan})
    df_msno.rename(columns=dict(zip(out_columns, feats)), inplace=True)

    msno_bar = msno.bar(
        df=df_msno,
        label_rotation=90,
        color=subset['color'],
    )
    plt.xticks(ha='center')
    plt.setp(msno_bar.xaxis.get_majorticklabels(), ha="center")
    msno_bar.set_title(subset_title, fontdict={'fontsize': 22})
    msno_bar.set_ylabel("Non-outlier samples", fontdict={'fontsize': 22})
    plt.savefig(f"{path_curr}/msno_bar.png", bbox_inches='tight')
    plt.savefig(f"{path_curr}/msno_bar.pdf", bbox_inches='tight')
    plt.close()

    msno_mtx = msno.matrix(
        df=df_msno,
        label_rotation=90,
        color=colors.to_rgb(subset['color']),
    )
    plt.xticks(ha='center')
    # Centre the tick labels of the matrix plot itself.
    plt.setp(msno_mtx.xaxis.get_majorticklabels(), ha="center")
    msno_mtx.set_title(subset_title, fontdict={'fontsize': 22})
    msno_mtx.set_ylabel("Samples", fontdict={'fontsize': 22})
    plt.savefig(f"{path_curr}/msno_matrix.png", bbox_inches='tight')
    plt.savefig(f"{path_curr}/msno_matrix.pdf", bbox_inches='tight')
    plt.close()

    msno_heatmap = msno.heatmap(
        df=df_msno,
        label_rotation=90,
        cmap="bwr",
        fontsize=12
    )
    msno_heatmap.set_title(subset_title, fontdict={'fontsize': 22})
    plt.setp(msno_heatmap.xaxis.get_majorticklabels(), ha="center")
    msno_heatmap.collections[0].colorbar.ax.tick_params(labelsize=20)
    plt.savefig(f"{path_curr}/msno_heatmap.png", bbox_inches='tight')
    plt.savefig(f"{path_curr}/msno_heatmap.pdf", bbox_inches='tight')
    plt.close()

## 3.2 PyOD outliers

In [None]:
# For every subset: standardise the features, run a panel of PyOD detectors and
# save per-sample detections, anomaly scores and summary plots.
contamination = 0.1
epochs = 500

for subset_name, subset in subsets.items():

    path_curr = f"{path_save}/03_outliers/pyod_contam({contamination})_epochs({epochs})/{subset['path']}"
    pathlib.Path(path_curr).mkdir(parents=True, exist_ok=True)
    df_curr = df.loc[subset['samples'], :].copy()

    # One StandardScaler per feature, fitted on the current subset and pickled.
    scalers = {}
    feats_scaled = []
    for f in feats:
        feat_column = df_curr.loc[:, f].values.reshape(-1, 1)
        scaler = StandardScaler()
        scaler.fit(feat_column)
        scalers[f] = scaler
        feats_scaled.append(f"{f}_scaled")
        df_curr[f"{f}_scaled"] = scaler.transform(feat_column)
    with open(f"{path_curr}/scalers.pkl", 'wb') as handle:
        pickle.dump(scalers, handle, protocol=pickle.HIGHEST_PROTOCOL)

    feats_sets = {
        'scaled': feats_scaled
    }

    for feats_set in feats_sets:

        pathlib.Path(f"{path_curr}/{feats_set}").mkdir(parents=True, exist_ok=True)
        df_outs = df_curr.copy()

        # Fresh detector instances for every subset / feature set.
        pyod_methods = {
            'ECOD': ECOD(contamination=contamination),
            'ABOD': ABOD(contamination=contamination),
            'COPOD': COPOD(contamination=contamination),
            'SOS': SOS(contamination=contamination),
            'KDE': KDE(contamination=contamination),
            'Sampling': Sampling(contamination=contamination),
            'GMM': GMM(contamination=contamination),
            'KPCA': KPCA(contamination=contamination),
            'MCD': MCD(contamination=contamination),
            'OCSVM': OCSVM(contamination=contamination),
            'LMDD': LMDD(contamination=contamination),
            'LOF': LOF(contamination=contamination),
            'COF': COF(contamination=contamination),
            'CBLOF': CBLOF(contamination=contamination),
            'HBOS': HBOS(contamination=contamination),
            'KNN': KNN(contamination=contamination),
            'SOD': SOD(contamination=contamination),
            'IForest': IForest(contamination=contamination),
            'INNE': INNE(contamination=contamination),
            'LODA': LODA(contamination=contamination),
            'SUOD': SUOD(contamination=contamination, verbose=0),
            # 'AutoEncoder': AutoEncoder(contamination=contamination, epochs=epochs),
            'VAE': VAE(encoder_neurons=[64, 32, 16], decoder_neurons=[16, 32, 64], contamination=contamination, epochs=epochs, verbose=0),
            'DeepSVDD': DeepSVDD(contamination=contamination, epochs=epochs, verbose=0),
            # NOTE(review): LUNAR is the only detector created without `contamination`
            # — confirm that relying on its default is intended.
            'LUNAR': LUNAR(),
        }
        method_names = list(pyod_methods.keys())

        X_outliers = df_outs.loc[:, feats_sets[feats_set]].to_numpy()
        for method in pyod_methods:
            if subset_name == "Down Syndrome":
                # No detector is fitted for this subset: nothing is flagged, scores are 0.
                df_outs[f"{method}"] = 0
                df_outs[f"{method} anomaly score"] = 0.0
            else:
                pyod_methods[method].fit(X_outliers)
                df_outs[f"{method}"] = pyod_methods[method].predict(X_outliers)
                df_outs[f"{method} anomaly score"] = pyod_methods[method].decision_function(X_outliers)
            n_outliers = df_outs[f"{method}"].sum()
            print(f"{subset_name} {feats_set} {method}: {n_outliers}")
        # Number of detectors that flagged each sample.
        df_outs["Detections"] = df_outs.loc[:, method_names].sum(axis=1)
        df_outs.sort_values(["Detections"], ascending=[False], inplace=True)
        df_outs.loc[:, method_names + [f"{x} anomaly score" for x in pyod_methods] + ["Detections"]].to_excel(f"{path_curr}/{feats_set}/df.xlsx", index=True)

        hist_bins = np.linspace(0, len(pyod_methods), len(pyod_methods) + 1)
        plt.figure()
        sns.set_theme(style='whitegrid')
        histplot = sns.histplot(
            data=df_outs,
            x="Detections",
            multiple="stack",
            bins=hist_bins,
            discrete=True,
            edgecolor='k',
            linewidth=1,
            color=subset['color'],
        )
        histplot.set(xlim=(-0.5, len(pyod_methods) + 0.5))
        histplot.set_title(f"{subset_name} ({len(subset['samples'])})")
        histplot.set_xlabel("Number of detections as outlier in different methods")
        plt.savefig(f"{path_curr}/{feats_set}/histplot.png", bbox_inches='tight')
        plt.savefig(f"{path_curr}/{feats_set}/histplot.pdf", bbox_inches='tight')
        # Close (not just clear) the figure so that figures do not pile up across subsets.
        plt.close()

        # Stacked bars of the 50 most-flagged samples (reversed so the top one is drawn first).
        sns.set_theme(style='whitegrid')
        barplot = df_outs.loc[:, method_names].head(50).iloc[::-1].plot(
            figsize=(6, 12),
            width=1,
            kind='barh',
            stacked=True,
            color=px.colors.qualitative.Alphabet,
            edgecolor='black',
        )
        barplot.set(xlim=(0, len(pyod_methods)))
        barplot.xaxis.tick_top()
        barplot.xaxis.set_label_position('top')
        barplot.set_xlabel("Number of detections as outlier in different methods")
        barplot.set_ylabel("Samples")
        barplot.set_title(f"{subset_name} ({len(subset['samples'])})")
        sns.move_legend(barplot, "upper left", bbox_to_anchor=(1, 1))
        plt.savefig(f"{path_curr}/{feats_set}/barplot.png", bbox_inches='tight')
        plt.savefig(f"{path_curr}/{feats_set}/barplot.pdf", bbox_inches='tight')
        plt.close()

        # Grid of anomaly-score histograms, one brick per method.
        colors_methods = {m: px.colors.qualitative.Alphabet[m_id] for m_id, m in enumerate(pyod_methods)}
        n_cols = 4
        n_rows = int(np.ceil(len(pyod_methods) / n_cols))
        pw_rows = []
        for r_id in range(n_rows):
            pw_cols = []
            for c_id in range(n_cols):
                rc_id = r_id * n_cols + c_id
                if rc_id < len(pyod_methods):
                    method = method_names[rc_id]
                    print(method)
                    brick = pw.Brick(figsize=(3, 2))
                    sns.set_theme(style='whitegrid')
                    data_fig = df_outs[f"{method} anomaly score"].values
                    # NOTE(review): log10 of zero or negative scores yields inf / NaN
                    # (with a runtime warning); confirm that all plotted scores are positive.
                    data_fig = -np.log10(data_fig)
                    # Draw the histogram only when the scores are sufficiently varied.
                    if len(np.unique(data_fig)) > 0.1 * len(data_fig):
                        sns.histplot(
                            data=data_fig,
                            color=colors_methods[method],
                            multiple="stack",
                            edgecolor='k',
                            linewidth=1,
                            ax=brick
                        )
                    brick.set_title(f"{method}")
                    brick.set_xlabel(r'$-\log_{10}(\mathrm{Anomaly score})$')
                    brick.set_ylabel('Count')
                    pw_cols.append(brick)
                else:
                    # Pad an incomplete last row with invisible bricks.
                    brick = pw.Brick(figsize=(3.5, 2))
                    brick.axis('off')
                    pw_cols.append(brick)
            pw_rows.append(pw.stack(pw_cols, operator="|"))
        pw_fig = pw.stack(pw_rows, operator="/")
        pw_fig.savefig(f"{path_curr}/{feats_set}/methods_anomaly_score.pdf")

## Filter samples

In [None]:
# In filtered mode: drop every sample flagged as an outlier (IQR or PyOD) in any
# of the four status groups and save the remaining samples.
thld_outls_iqr = 6   # max allowed number of IQR-outlier features per sample
thld_outls_pyod = 6  # max allowed number of PyOD methods flagging a sample
subsets_names = ["Controls", "СOVID-19 Acute and Dynamics", "Down Syndrome", "ESRD"]
outs_all = set()
if is_data_filter:
    for subset_name in subsets_names:
        df_outs_iqr = pd.read_excel(f"{path}/special/053_proof_that_immunodata_is_shit/filtered/03_outliers/IQR/{subsets[subset_name]['path']}/df.xlsx", index_col=0)
        outs_iqr = df_outs_iqr.index[df_outs_iqr['n_iqr_outs'] > thld_outls_iqr].values
        print(f"{subset_name} IQR outliers: {len(outs_iqr)}")
        df_outs_pyod = pd.read_excel(f"{path}/special/053_proof_that_immunodata_is_shit/filtered/03_outliers/pyod_contam(0.1)_epochs(500)/{subsets[subset_name]['path']}/scaled/df.xlsx", index_col=0)
        outs_pyod = df_outs_pyod.index[df_outs_pyod['Detections'] > thld_outls_pyod].values
        print(f"{subset_name} PyOD outliers: {len(outs_pyod)}")
        # A sample is removed when either approach flags it.
        outs_union = set(outs_iqr) | set(outs_pyod)
        outs_all.update(outs_union)
        print(f"{subset_name} union outliers: {len(outs_union)}")
        outs_intxn = set(outs_iqr) & set(outs_pyod)
        print(f"{subset_name} intersection outliers: {len(outs_intxn)}")

    # Keep the original row order of `df`: a plain set difference would shuffle
    # the samples differently from run to run.
    samples = df.index[~df.index.isin(outs_all)].unique().tolist()
    df_spls_wo_nans_wo_outs = pd.DataFrame(index=samples)
    df_spls_wo_nans_wo_outs.to_excel(f"{path_save}/samples_wo_nans_wo_outs.xlsx", index_label="Samples")
    df = df.loc[samples, :]
    df_w_nans = df_w_nans.loc[samples, :]
    subsets = get_subsets(df)
    df.to_excel(f"{path_save}/df_wo_nans_wo_outs.xlsx", index_label="Samples")

# 4. Dimensionality reduction

In [None]:
# Sample sets on which dimensionality reduction is run.
is_control_dimred = (df['Status'] == "Control") | (df['COVID-19 stage'] == "Reconvalescent")
subsets_dimred = {
    'All Samples': df.index.values,
    "Controls": df.index[is_control_dimred].values,
}

# Colour maps (label -> colour) taken from the subset definitions:
# files by the "File " prefix, regions by the "Controls " prefix, statuses by name.
status_subsets = ["Controls", "СOVID-19 Acute and Dynamics", "Down Syndrome", "ESRD"]
colors_file = {
    subset['value']: subset['color']
    for subset_name, subset in subsets.items()
    if subset_name.startswith("File ")
}
colors_status = {
    subset['value']: subset['color']
    for subset_name, subset in subsets.items()
    if subset_name in status_subsets
}
colors_region = {
    subset['value']: subset['color']
    for subset_name, subset in subsets.items()
    if subset_name.startswith("Controls ")
}

for subset_name, samples in subsets_dimred.items():
    path_curr = f"{path_save}/04_dim_red/{subset_name}"
    pathlib.Path(f"{path_curr}").mkdir(parents=True, exist_ok=True)
    df_dim_red = df.loc[samples, :].copy()

    dim_red_labels = {
        'PCA': ['PC 1', 'PC 2'],
        'SVD': ['SVD 1', 'SVD 2'],
        't-SNE': ['t-SNE 1', 't-SNE 2'],
        'MDS': ['MDS 1', 'MDS 2'],
        'GRP': ['GRP 1', 'GRP 2'],
        'SRP': ['SRP 1', 'SRP 2'],
        'IsoMap': ['IsoMap 1', 'IsoMap 2'],
        'MBDL': ['MBDL 1', 'MBDL 2'],
        'ICA': ['ICA 1', 'ICA 2'],
    }
    dim_red_models = {}
    data_dim_red = df_dim_red.loc[:, feats].values

    dim_red_models['PCA'] = PCA(n_components=2, whiten=False).fit(data_dim_red)
    dim_red_models['SVD'] = TruncatedSVD(n_components=2, algorithm='randomized', n_iter=5).fit(data_dim_red)
    dim_red_models['t-SNE'] = TSNE(n_components=2).fit(data_dim_red)
    dim_red_models['MDS'] = MDS(n_components=2, metric=True)
    dim_red_models['GRP'] = GaussianRandomProjection(n_components=2, eps=0.5).fit(data_dim_red)
    dim_red_models['SRP'] = SparseRandomProjection(n_components=2, density='auto', eps=0.5, dense_output=False).fit(data_dim_red)
    dim_red_models['IsoMap'] = Isomap(n_components=2, n_neighbors=5).fit(data_dim_red)
    dim_red_models['MBDL'] = MiniBatchDictionaryLearning(n_components=2, batch_size=100, alpha=1, n_iter=25).fit(data_dim_red)
    dim_red_models['ICA'] = FastICA(n_components=2, algorithm='parallel', whiten=True, tol=1e-3, max_iter=1000)

    dim_red_cols = []
    for m, drm in dim_red_models.items():
        if m in ['MDS', 'ICA']:
            dim_red_res = drm.fit_transform(data_dim_red)
        else:
            dim_red_res = drm.transform(data_dim_red)
        df_dim_red.loc[:, dim_red_labels[m][0]] = dim_red_res[:, 0]
        df_dim_red.loc[:, dim_red_labels[m][1]] = dim_red_res[:, 1]
        dim_red_cols += dim_red_labels[m]
    df_dim_red.loc[:, dim_red_cols].to_excel(f"{path_curr}/df_dim_red.xlsx")

    if subset_name == "All Samples":
        pathlib.Path(f"{path_curr}/Files").mkdir(parents=True, exist_ok=True)
        pathlib.Path(f"{path_curr}/Status").mkdir(parents=True, exist_ok=True)
        for m, drm in dim_red_models.items():
            fig = plt.figure(figsize=(8, 6))
            sns.set_theme(style='whitegrid')
            scatterplot = sns.scatterplot(
                data=df_dim_red,
                x=dim_red_labels[m][0],
                y=dim_red_labels[m][1],
                palette=colors_file,
                hue='File',
                style='File',
                linewidth=0.5,
                alpha=0.75,
                edgecolor="k",
                marker='o',
                s=40,
            )
            sns.move_legend(scatterplot, "lower center", bbox_to_anchor=(.5, 1), ncol=4, frameon=False)
            for ha in scatterplot.legend_.legendHandles:
                ha.set_edgecolor("k")
                ha.set_linewidth(0.5)
                ha._sizes = [60]
            plt.setp(scatterplot.get_legend().get_texts(), fontsize='8') # for legend text
            plt.setp(scatterplot.get_legend().get_title(), fontsize='15')
            plt.savefig(f"{path_curr}/Files/{m}.png", bbox_inches='tight', dpi=200)
            plt.savefig(f"{path_curr}/Files/{m}.pdf", bbox_inches='tight')
            plt.close()

            fig = plt.figure(figsize=(8, 6))
            sns.set_theme(style='whitegrid')
            scatterplot = sns.scatterplot(
                data=df_dim_red.loc[df_dim_red["Controls/Cases"].isin(["Controls", "COVID-19 Acute and Dynamics", "Down Syndrome", "ESRD"]), :],
                x=dim_red_labels[m][0],
                y=dim_red_labels[m][1],
                palette=colors_status,
                hue='Controls/Cases',
                style='Controls/Cases',
                linewidth=0.5,
                alpha=0.75,
                edgecolor="k",
                marker='o',
                s=40,
            )
            sns.move_legend(scatterplot, "lower center", bbox_to_anchor=(.5, 1), ncol=4, frameon=False)
            for ha in scatterplot.legend_.legendHandles:
                ha.set_edgecolor("k")
                ha.set_linewidth(0.5)
                ha._sizes = [60]
            plt.setp(scatterplot.get_legend().get_texts(), fontsize='8') # for legend text
            plt.setp(scatterplot.get_legend().get_title(), fontsize='15')
            plt.savefig(f"{path_curr}/Status/{m}.png", bbox_inches='tight', dpi=200)
            plt.savefig(f"{path_curr}/Status/{m}.pdf", bbox_inches='tight')
            plt.close()
    elif subset_name == "Controls":
        pathlib.Path(f"{path_curr}/Region").mkdir(parents=True, exist_ok=True)
        for m, drm in dim_red_models.items():
            fig = plt.figure(figsize=(8, 6))
            sns.set_theme(style='whitegrid')
            scatterplot = sns.scatterplot(
                data=df_dim_red,
                x=dim_red_labels[m][0],
                y=dim_red_labels[m][1],
                palette=colors_region,
                hue='Region',
                style='Region',
                linewidth=0.5,
                alpha=0.75,
                edgecolor="k",
                marker='o',
                s=40,
            )
            sns.move_legend(scatterplot, "lower center", bbox_to_anchor=(.5, 1), ncol=4, frameon=False)
            for ha in scatterplot.legend_.legendHandles:
                ha.set_edgecolor("k")
                ha.set_linewidth(0.5)
                ha._sizes = [60]
            plt.setp(scatterplot.get_legend().get_texts(), fontsize='8') # for legend text
            plt.setp(scatterplot.get_legend().get_title(), fontsize='15')
            plt.savefig(f"{path_curr}/Region/{m}.png", bbox_inches='tight', dpi=200)
            plt.savefig(f"{path_curr}/Region/{m}.pdf", bbox_inches='tight')
            plt.close()


            legend_handles = []
            norm = plt.Normalize(df_dim_red['Age'].min(), df_dim_red['Age'].max())
            sm = plt.cm.ScalarMappable(cmap="spring", norm=norm)
            sm.set_array([])
            fig = plt.figure(figsize=(8, 6))
            sns.set_theme(style='whitegrid')
            scatter = sns.scatterplot(
                data=df_dim_red.loc[df_dim_red['Region'] == 'Central', :],
                x=dim_red_labels[m][0],
                y=dim_red_labels[m][1],
                palette='spring',
                hue='Age',
                linewidth=0.5,
                alpha=0.75,
                edgecolor="k",
                marker='o',
                s=50,
            )
            scatter.get_legend().remove()
            legend_handles.append(mlines.Line2D([], [], marker='o', linestyle='None', markeredgecolor='k', markerfacecolor='lightgrey', markersize=10, label='Central'))
            scatter = sns.scatterplot(
                data=df_dim_red.loc[df_dim_red['Region'] == 'Yakutia', :],
                x=dim_red_labels[m][0],
                y=dim_red_labels[m][1],
                palette='spring',
                hue='Age',
                linewidth=0.5,
                alpha=0.75,
                edgecolor="k",
                marker='X',
                s=50,
            )
            scatter.get_legend().remove()
            legend_handles.append(mlines.Line2D([], [], marker='X', linestyle='None', markeredgecolor='k', markerfacecolor='lightgrey', markersize=10, label='Yakutia'))
            plt.legend(handles=legend_handles, title="Region", bbox_to_anchor=(0, 1.02, 1, 0.2), loc="lower left", mode="expand", borderaxespad=0, ncol=2, frameon=False)
            fig.colorbar(sm, label="Age")
            plt.savefig(f"{path_curr}/Region/{m}_colorAge.png", bbox_inches='tight', dpi=200)
            plt.savefig(f"{path_curr}/Region/{m}_colorAge.pdf", bbox_inches='tight')
            plt.close()

# 5. Region tests

In [None]:
# Compare control samples from the 'Central' and 'Yakutia' regions:
# per-feature descriptive statistics and Mann-Whitney U tests (Benjamini-Hochberg
# adjusted across `feats`), one violin plot for 'SImAge acceleration' and a grid
# of violin plots for all features.
path_curr = f"{path_save}/05_region_tests"
pathlib.Path(f"{path_curr}/feats").mkdir(parents=True, exist_ok=True)

df_curr = df.loc[subsets['Controls']['samples'], :].copy()

colors_region.update({
    'Central': subsets['Controls Central']['color'],
    'Yakutia': subsets['Controls Yakutia']['color'],
})


def _draw_region_violin(feat, ax=None):
    """Draw the per-region violin plot of *feat* and return its axes."""
    return sns.violinplot(
        data=df_curr,
        x='Region',
        y=feat,
        palette=colors_region,
        scale='width',
        order=list(colors_region),
        saturation=0.75,
        ax=ax,
    )


def _annotate_region_pval(ax, feat, pval):
    """Write *pval* in scientific notation above the Central/Yakutia pair on *ax*."""
    annotator = Annotator(
        ax,
        pairs=[('Central', 'Yakutia')],
        data=df_curr,
        x='Region',
        y=feat,
        order=list(colors_region),
    )
    annotator.set_custom_annotations([f'{pval:.2e}'])
    annotator.configure(loc='outside')
    annotator.annotate()


# Descriptive statistics and raw Mann-Whitney p-values per feature.
df_stat = pd.DataFrame(index=list(feats))
for feat in [*feats, 'SImAge acceleration']:
    vals = {}
    for group in ('Central', 'Yakutia'):
        in_group = df_curr['Region'] == group
        vals[group] = df_curr.loc[in_group, feat].values
        q75, q25 = np.percentile(vals[group], [75, 25])
        df_stat.at[feat, f"mean_{group}"] = np.mean(vals[group])
        df_stat.at[feat, f"median_{group}"] = np.median(vals[group])
        df_stat.at[feat, f"q75_{group}"] = q75
        df_stat.at[feat, f"q25_{group}"] = q25
        df_stat.at[feat, f"iqr_{group}"] = q75 - q25
        if feat == 'SImAge acceleration':
            # MAE between the 'Age' and 'SImAge' columns within the region.
            df_stat.at[feat, f"MAE_{group}"] = mean_absolute_error(
                df_curr.loc[in_group, 'Age'].values,
                df_curr.loc[in_group, 'SImAge'].values,
            )
    mw_res = mannwhitneyu(vals['Central'], vals['Yakutia'], alternative='two-sided')
    df_stat.at[feat, "mw_pval"] = mw_res[1]

# The correction covers `feats` only; 'SImAge acceleration' keeps its raw p-value.
fdr_res = multipletests(df_stat.loc[feats, "mw_pval"], 0.05, method='fdr_bh')
df_stat.loc[feats, "mw_pval_fdr_bh"] = fdr_res[1]
df_stat.sort_values(["mw_pval_fdr_bh"], ascending=[True], inplace=True)
df_stat.to_excel(f"{path_curr}/kw_mw.xlsx", index_label='Features')

# Stand-alone figure for 'SImAge acceleration', annotated with the raw p-value.
feat = 'SImAge acceleration'
plt.figure(figsize=(6, 4))
sns.set_theme(style='whitegrid')
violin = _draw_region_violin(feat)
violin.set_xlabel("Region")
mw_pval = df_stat.at[feat, "mw_pval"]
_annotate_region_pval(violin, feat, mw_pval)
plt.savefig(f"{path_curr}/{feat}.png", bbox_inches='tight', dpi=200)
plt.savefig(f"{path_curr}/{feat}.pdf", bbox_inches='tight')
plt.close()

# Grid of violin plots, features ordered as in the sorted `df_stat`,
# each annotated with its adjusted p-value.
feats_sorted = df_stat.index[df_stat.index.isin(feats)].values
axs = {}
pw_rows = []
n_cols = 4
n_rows = int(np.ceil(len(feats_sorted) / n_cols))
for r_id in range(n_rows):
    row_feats = feats_sorted[r_id * n_cols:(r_id + 1) * n_cols]
    pw_cols = []
    for feat in row_feats:
        axs[feat] = pw.Brick(figsize=(3, 2))
        sns.set_theme(style='whitegrid')
        _draw_region_violin(feat, ax=axs[feat])
        axs[feat].set_ylabel(feat)
        axs[feat].set_xlabel("Region")
        mw_pval = df_stat.at[feat, "mw_pval_fdr_bh"]
        _annotate_region_pval(axs[feat], feat, mw_pval)
        pw_cols.append(axs[feat])
    # Pad an incomplete last row with blank bricks so every row has n_cols entries.
    for _ in range(n_cols - len(row_feats)):
        empty_fig = pw.Brick(figsize=(3.6, 2))
        empty_fig.axis('off')
        pw_cols.append(empty_fig)

    pw_rows.append(pw.stack(pw_cols, operator="|"))
pw_fig = pw.stack(pw_rows, operator="/")
pw_fig.savefig(f"{path_curr}/feats.pdf")