In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import pickle
import plotly.express as px
import statsmodels.formula.api as smf
import plotly.graph_objects as go
from scripts.python.routines.manifest import get_manifest
from scripts.python.routines.plot.save import save_figure
from scripts.python.routines.plot.layout import add_layout
from statsmodels.stats.multitest import multipletests
import plotly.io as pio
pio.kaleido.scope.mathjax = None
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=False)
from scipy.stats import mannwhitneyu, median_test
import matplotlib.pyplot as plt
import pathlib
from tqdm import tqdm
from src.utils.plot.bioinfokit import mhat, volcano
import gseapy as gp
import mygene
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA, TruncatedSVD
from sklearn.decomposition import MiniBatchDictionaryLearning, FastICA
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.manifold import MDS, Isomap, TSNE, LocallyLinearEmbedding
import upsetplot as upset
import missingno as msno
from pyod.models.lunar import LUNAR
from matplotlib_venn import venn2, venn2_circles
from glob import glob
from hydra import compose, initialize
from omegaconf import OmegaConf
import omegaconf
import os
import ast
import json

# 0. Setup

In [None]:
# Locations used throughout the notebook:
#   path_dataset - root folder of the dataset,
#   path_load    - folder with the treatment data tables,
#   path_save    - output folder for the artifacts of this analysis.
path_dataset = "D:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN"
path_load = f"{path_dataset}/data/covid/treatment"
path_save = f"{path_dataset}/special/041_covid_treatment"

# Make sure the output folder exists (no error if it is already there).
pathlib.Path(path_save).mkdir(parents=True, exist_ok=True)

# 1. Empty features

In [None]:
# Histogram of the number of missing values per feature in the raw data table.
path_local = "001_empty_features"
pathlib.Path(f"{path_save}/{path_local}").mkdir(parents=True, exist_ok=True)

df = pd.read_excel(f"{path_load}/data_0.xlsx", index_col="patient_id")

# Number of missing values per feature (column), most incomplete features first.
series_n_nan = df.isna().sum()
df_nan = pd.DataFrame({'n_nans': series_n_nan.values}, index=series_n_nan.index)
df_nan.sort_values(["n_nans"], ascending=[False], inplace=True)

hist_min = df_nan.loc[:, "n_nans"].min()
hist_max = df_nan.loc[:, "n_nans"].max()

# `discrete=True` draws one unit-wide bar per integer count, centred on the value,
# so explicit `bins` / `binwidth` are not passed: seaborn ignores them in discrete
# mode, and deriving the width as (max - min) / max evaluated 0 / 0 whenever no
# feature had a missing value.
fig = plt.figure()
sns.set_theme(style='whitegrid')
hist = sns.histplot(
    data=df_nan,
    x="n_nans",
    binrange=(hist_min, hist_max),
    discrete=True,
    edgecolor='k',
    linewidth=1
)
hist.set_xlabel("Number of missing values")
hist.set_ylabel("Number of features")
hist.set_title(f"Total features: {df.shape[1]}\nTotal samples: {df.shape[0]}")
plt.savefig(f"{path_save}/{path_local}/hist_n_nans.png", bbox_inches='tight')
plt.savefig(f"{path_save}/{path_local}/hist_n_nans.pdf", bbox_inches='tight')
# Close the figure instead of only clearing it, so it does not stay open
# (and does not get rendered as an empty figure) after the files are written.
plt.close(fig)

## Update features with nan info

In [None]:
# Attach the per-feature missing-value statistics (taken from `df_nan` / `df`
# built in the previous cell) to the features table and write the table back
# to the same file it was read from.
df_feats = pd.read_excel(f"{path_load}/features.xlsx", index_col="feature")

feat_ids = df_feats.index
n_nans_by_feat = df_nan.loc[feat_ids, 'n_nans']

df_feats.loc[feat_ids, 'n_nans'] = n_nans_by_feat
# Stored as a fraction of all samples (0..1), despite the column name.
df_feats.loc[feat_ids, 'percentage_nans'] = n_nans_by_feat / df.shape[0]

df_feats.to_excel(f"{path_load}/features.xlsx")

## Save filtered data

In [None]:
# Features with more than `lim_exclude` missing values are removed from `df`.
lim_exclude = 1675
is_too_sparse = df_nan["n_nans"] > lim_exclude
feats_exclude = df_nan.index[is_too_sparse].values
df.drop(columns=feats_exclude, inplace=True)

# Per-sample number of missing values among the remaining features.
df['n_nans'] = df.isna().sum(axis=1)

# The threshold is part of the output file name.
df.to_excel(f"{path_load}/data_exclude({lim_exclude}).xlsx")

# 2. Forms features

In [None]:
# Per-form overview: missing-value plots for every form, followed by one
# distribution plot per categorical / continuous feature of that form.
path_local = "002_forms_features"
pathlib.Path(f"{path_save}/{path_local}").mkdir(parents=True, exist_ok=True)

df = pd.read_excel(f"{path_load}/data_0.xlsx", index_col="patient_id")
df_feats = pd.read_excel(f"{path_load}/features.xlsx", index_col="feature")

forms = df_feats['form'].unique()

# Colour list for categorical plots. It is indexed modulo its length so that a
# feature with more categories than available colours does not raise IndexError.
cat_colors = px.colors.qualitative.Dark24

for form in forms:
    pathlib.Path(f"{path_save}/{path_local}/{form}").mkdir(parents=True, exist_ok=True)
    # Only categorical and continuous features of the current form are plotted.
    df_feats_form = df_feats.loc[(df_feats["form"] == form) & (df_feats["type"].isin(["cat", "cont"])), :]

    # Columns are renamed from feature ids to their English titles. A non-inplace
    # rename returns a fresh frame, so the assignments / replacements below do not
    # operate on a slice of `df` (no chained-assignment warnings).
    df_form = df.loc[:, df_feats_form.index].rename(
        columns=dict(zip(df_feats_form.index.values, df_feats_form["eng_title"].values))
    )
    feats_form = df_feats_form["eng_title"].values
    # Number of missing feature values per sample; used to sort the nullity matrix.
    df_form['Missed features'] = df_form.isnull().sum(axis=1)

    # Completeness bar chart over the form's features.
    msno.bar(
        df=df_form.loc[:, feats_form],
        label_rotation=90
    )
    plt.savefig(f"{path_save}/{path_local}/{form}/msno_bar.png", bbox_inches='tight', dpi=400)
    plt.savefig(f"{path_save}/{path_local}/{form}/msno_bar.pdf", bbox_inches='tight')
    plt.close()

    # Part of missing values per feature. Relies on the `percentage_nans` column
    # being present in features.xlsx; figure height scales with the number of titles.
    fig = plt.figure(figsize=(12, 0.4 * df_feats_form['eng_title'].value_counts(dropna=True).shape[0]))
    sns.set_theme(style='whitegrid', font_scale=1)
    bar = sns.barplot(
        data=df_feats_form,
        y='eng_title',
        x='percentage_nans',
        edgecolor='black',
        orient='h',
        palette=px.colors.qualitative.Alphabet,
        dodge=True
    )
    bar.set_xlabel("Part of NaNs")
    bar.set_ylabel("")
    bar.set_title(f"Features' missing values")
    plt.savefig(f"{path_save}/{path_local}/{form}/bar.png", bbox_inches='tight', dpi=400)
    plt.savefig(f"{path_save}/{path_local}/{form}/bar.pdf", bbox_inches='tight')
    plt.close(fig)

    # Nullity matrix in the original sample order.
    msno.matrix(
        df=df_form.loc[:, feats_form],
        label_rotation=90
    )
    plt.savefig(f"{path_save}/{path_local}/{form}/msno_mtx.png", bbox_inches='tight', dpi=400)
    plt.savefig(f"{path_save}/{path_local}/{form}/msno_mtx.pdf", bbox_inches='tight')
    plt.close()

    # Nullity matrix with samples sorted by their number of missing features.
    msno.matrix(
        df=df_form.sort_values(["Missed features"], ascending=[False]).loc[:, feats_form],
        label_rotation=90
    )
    plt.savefig(f"{path_save}/{path_local}/{form}/msno_mtx_sorted.png", bbox_inches='tight', dpi=400)
    plt.savefig(f"{path_save}/{path_local}/{form}/msno_mtx_sorted.pdf", bbox_inches='tight')
    plt.close()

    for feat, row in df_feats_form.iterrows():
        if row['type'] == 'cat':

            pathlib.Path(f"{path_save}/{path_local}/{form}/cat").mkdir(parents=True, exist_ok=True)

            if not pd.isna(row['eng_values']):
                # `eng_values` is a dict literal mapping raw codes to English labels;
                # the codes in the column are replaced by these labels.
                dict_values = ast.literal_eval(row['eng_values'])
                df_form.replace({row['eng_title']: dict_values}, inplace=True)
                palette = {
                    x: cat_colors[x_id % len(cat_colors)]
                    for x_id, x in enumerate(dict_values.values())
                }
                order = list(dict_values.values())
            else:
                # No label mapping: plot the raw values in order of appearance.
                palette = cat_colors
                order = df_form[row['eng_title']].unique()

            # Figure height scales with the number of observed categories.
            fig = plt.figure(figsize=(12, 0.4 * df_form[row['eng_title']].value_counts(dropna=True).shape[0]))
            sns.set_theme(style='whitegrid', font_scale=1)
            countplot = sns.countplot(
                data=df_form,
                y=row['eng_title'],
                edgecolor='black',
                orient='h',
                palette=palette,
                order=order
            )
            countplot.bar_label(countplot.containers[0])
            countplot.set_xlabel("Count")
            countplot.set_ylabel("")
            # The title shows the number of non-missing values of the feature.
            countplot.set_title(f"{row['eng_title']} ({df_form[row['eng_title']].count()})")
            plt.savefig(f"{path_save}/{path_local}/{form}/cat/{feat}.png", bbox_inches='tight', dpi=400)
            plt.savefig(f"{path_save}/{path_local}/{form}/cat/{feat}.pdf", bbox_inches='tight')
            plt.close(fig)

        # Continuous features are plotted only when more than 5 values are present.
        elif row['type'] == 'cont' and df_form[row['eng_title']].count() > 5:

            # Progress output: id of the feature being plotted.
            print(feat)

            pathlib.Path(f"{path_save}/{path_local}/{form}/cont").mkdir(parents=True, exist_ok=True)

            sns.set_theme(style='whitegrid')

            # Narrow box plot on top of a histogram, sharing the x axis.
            fig, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)})

            box = sns.boxplot(df_form[row['eng_title']].values, orient='h', flierprops={"marker": "x"}, ax=ax_box)
            box.set_xlabel("")
            box.set_yticks([])
            box.set_title(f"Total samples: {df_form[row['eng_title']].count()}")
            sns.despine(ax=ax_box, left=False, right=False, bottom=False, top=False)

            if not pd.isna(row['hist_bins']):
                # `hist_bins` is a JSON list [first edge, last edge, number of edges].
                hist_bins_raw = list(map(float, json.loads(row['hist_bins'])))
                hist_bins = np.linspace(hist_bins_raw[0], hist_bins_raw[1], int(hist_bins_raw[2]))

                hist = sns.histplot(
                    data=df_form,
                    x=row['eng_title'],
                    bins=hist_bins,
                    edgecolor='k',
                    linewidth=1,
                    ax=ax_hist
                )
            else:
                # Default binning: 20 equal-width bins over the observed range.
                hist_n_bins = 20
                hist_min = df_form.loc[:, row['eng_title']].min()
                hist_max = df_form.loc[:, row['eng_title']].max()
                hist_width = hist_max - hist_min
                if hist_width > 0:
                    hist_bin_width = hist_width / hist_n_bins
                    hist_range_kwargs = dict(
                        binrange=(hist_min, hist_max),
                        binwidth=hist_bin_width
                    )
                else:
                    # All values are identical: a zero bin width cannot be used to
                    # build bin edges, so let the bin edges be derived from `bins` only.
                    hist_range_kwargs = {}
                hist = sns.histplot(
                    data=df_form,
                    x=row['eng_title'],
                    bins=hist_n_bins,
                    discrete=False,
                    edgecolor='k',
                    linewidth=1,
                    ax=ax_hist,
                    **hist_range_kwargs
                )

            plt.savefig(f"{path_save}/{path_local}/{form}/cont/{feat}.png", bbox_inches='tight', dpi=400)
            plt.savefig(f"{path_save}/{path_local}/{form}/cont/{feat}.pdf", bbox_inches='tight')
            plt.close(fig)

# 3. Data filtering

In [None]:
# Build the analysis table: filter samples, recompute missing-value statistics
# on the filtered samples, and keep only the features marked for inclusion.
path_local = "003_data_filtering"
pathlib.Path(f"{path_save}/{path_local}").mkdir(parents=True, exist_ok=True)
df = pd.read_excel(f"{path_load}/data_0.xlsx", index_col="patient_id")
df_feats = pd.read_excel(f"{path_load}/features.xlsx", index_col="feature")

# Samples only with positive PCR
is_pcr_positive = df["f04v1_pcr_id"].eq(2)
df = df[is_pcr_positive]

# NaNs preprocessing: drop samples that miss a value in any feature whose
# "preprocessing" field is "drop_rows_with_na"
is_required = df_feats["preprocessing"].eq("drop_rows_with_na")
feats_drop_rows = df_feats.index[is_required].values
df = df.dropna(subset=feats_drop_rows)

# Calculate missing values parts for filtered data
series_n_nan = df.isna().sum()
df_nan = series_n_nan.to_frame(name='n_nans').sort_values("n_nans", ascending=False)
feat_ids = df_feats.index
n_nans_by_feat = df_nan.loc[feat_ids, 'n_nans']
df_feats.loc[feat_ids, 'n_nans'] = n_nans_by_feat
df_feats.loc[feat_ids, 'percentage_nans'] = n_nans_by_feat / df.shape[0]
df_feats.to_excel(f"{path_save}/{path_local}/feats.xlsx")

# Include features
is_included = df_feats["include"].eq("yes")
feats_to_include = df_feats.index[is_included].values
df = df.loc[:, feats_to_include]
df.to_excel(f"{path_save}/{path_local}/data.xlsx")

# 4. Features plot

In [None]:
# Inputs for the feature plots: the filtered data and features tables written
# to the 003_data_filtering folder.
path_local = "004_features"
pathlib.Path(f"{path_save}/{path_local}").mkdir(parents=True, exist_ok=True)
df = pd.read_excel(f"{path_save}/003_data_filtering/data.xlsx", index_col="patient_id")
df_feats = pd.read_excel(f"{path_save}/003_data_filtering/feats.xlsx", index_col="feature")

# Fixed colours for the known categorical value sets. The key order inside each
# palette is also used as the plotting order of the categories.
colors_cat = {
    "Sex": {'F': 'crimson', 'M': 'dodgerblue'},
    "Binary": {'Yes': 'lavender', 'No': 'dimgray'},
    "Temperature": {
        '< 37.0': 'lawngreen',
        '37.0 - 38.5': 'gold',
        '38.6 - 39.0': 'orangered',
        '> 39.0': 'firebrick',
    },
    "Status": {
        'Recovered': 'lime',
        'Discharged with improvement': 'yellowgreen',
        'No changes': 'yellow',
        'Health deterioration': 'orange',
        'Lethal': 'red',
    },
}

# Groups of the included features, most populated group first.
is_included = df_feats["include"] == "yes"
feat_groups = df_feats.loc[is_included, "feat_group"].value_counts().index.values
# Exact `eng_values` strings that have a dedicated palette in `colors_cat`
# (value: key of that palette).
palette_by_eng_values = {
    '{1: "M", 2: "F"}': "Sex",
    '{1: "< 37.0", 2: "37.0 - 38.5", 3: "38.6 - 39.0", 4: "> 39.0"}': "Temperature",
    '{0: "No", 1: "Yes"}': "Binary",
    '{0: "No changes", 1: "Discharged with improvement", 2: "Recovered", 3: "Lethal", 4: "Health deterioration"}': "Status",
}

for feat_group in feat_groups:
    pathlib.Path(f"{path_save}/{path_local}/{feat_group}").mkdir(parents=True, exist_ok=True)

    # Categorical features
    df_feats_group_cat = df_feats.loc[(df_feats["include"] == "yes") & (df_feats["feat_group"] == feat_group) & (df_feats["type"].isin(["cat"])), :]
    # Columns are renamed from feature ids to English titles; the non-inplace
    # rename yields a fresh frame, so later in-place replacements do not act on
    # a slice of `df`.
    df_group = df.loc[:, df_feats_group_cat.index].rename(
        columns=dict(zip(df_feats_group_cat.index.values, df_feats_group_cat["eng_title"].values))
    )

    feats = df_feats_group_cat.index.values

    # Keep features with more than one distinct value (NaN counts as a value);
    # each subplot's height is proportional to that number of values.
    feats_passed = []
    height_ratios = []
    for feat in feats:
        feat_title = df_feats_group_cat.at[feat, 'eng_title']
        n_cats = len(df_group[feat_title].unique())
        if n_cats > 1:
            height_ratios.append(n_cats)
            feats_passed.append(feat)

    # Without any feature to draw, `plt.subplots(nrows=0)` would raise and abort
    # the rest of this iteration, so only the categorical figure is skipped.
    if len(feats_passed) > 0:
        # `squeeze=False` always returns a 2-D array of axes, so indexing by
        # feature position also works when a single feature passed.
        fig, axs = plt.subplots(
            nrows=len(feats_passed),
            ncols=1,
            sharex=True,
            figsize=(18, 0.7 * sum(height_ratios)),
            gridspec_kw={'height_ratios': height_ratios},
            squeeze=False
        )
        axs = axs[:, 0]

        for feat_id, feat in enumerate(feats_passed):
            feat_title = df_feats_group_cat.at[feat, 'eng_title']
            feat_values = df_feats_group_cat.at[feat, 'eng_values']
            if not pd.isna(feat_values):
                # `eng_values` is a dict literal mapping raw codes to English labels.
                dict_values = ast.literal_eval(feat_values)
                df_group.replace({feat_title: dict_values}, inplace=True)
                palette_name = palette_by_eng_values.get(feat_values)
                if palette_name is not None:
                    # Work on a copy: categories are popped from `palette` below, and
                    # popping from the shared `colors_cat` entry would permanently
                    # remove them for every feature plotted afterwards.
                    palette = dict(colors_cat[palette_name])
                    order = list(palette.keys())
                else:
                    dark24 = px.colors.qualitative.Dark24
                    # Modulo indexing: no IndexError for more labels than colours.
                    palette = {x: dark24[x_id % len(dark24)] for x_id, x in enumerate(dict_values.values())}
                    order = list(dict_values.values())

                # If some categories not exist
                cats_not_exist = list(set(palette.keys()) - set(df_group[feat_title].unique()))
                if len(cats_not_exist) > 0:
                    for cat in cats_not_exist:
                        palette.pop(cat, None)
                    order = list(palette.keys())
            else:
                # No label mapping: plot the raw values in order of appearance.
                palette = px.colors.qualitative.Dark24
                order = df_group[feat_title].unique()

            sns.set_theme(style='whitegrid', font_scale=1)
            countplot = sns.countplot(
                data=df_group,
                y=feat_title,
                edgecolor='black',
                orient='h',
                palette=palette,
                order=order,
                ax=axs[feat_id]
            )
            countplot.bar_label(countplot.containers[0])
            # Only the bottom subplot carries the shared x-axis label.
            if feat_id == len(feats_passed) - 1:
                countplot.set_xlabel("Count", fontsize=20)
            else:
                countplot.set_xlabel("")
            countplot.set_ylabel(f"")
            countplot.set_title(f"{feat_title}", fontsize=20)

        fig.tight_layout()
        plt.savefig(f"{path_save}/{path_local}/{feat_group}/cat.png", bbox_inches='tight', dpi=400)
        plt.savefig(f"{path_save}/{path_local}/{feat_group}/cat.pdf", bbox_inches='tight')
        plt.close(fig)

    # Continuous features