In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import pickle
import plotly.express as px
import statsmodels.formula.api as smf
import plotly.graph_objects as go
from scripts.python.routines.manifest import get_manifest
from scripts.python.routines.plot.save import save_figure
from scripts.python.routines.plot.layout import add_layout
from statsmodels.stats.multitest import multipletests
from scipy.stats import chi2_contingency
from scipy.stats import kruskal, mannwhitneyu
import plotly.io as pio
pio.kaleido.scope.mathjax = None
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=False)
from scipy.stats import mannwhitneyu, median_test
import matplotlib.pyplot as plt
import pathlib
from matplotlib.patches import Rectangle
from tqdm import tqdm
import plotly.colors
from src.utils.plot.bioinfokit import mhat, volcano
import gseapy as gp
import mygene
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA, TruncatedSVD
from sklearn.decomposition import MiniBatchDictionaryLearning, FastICA
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.manifold import MDS, Isomap, TSNE, LocallyLinearEmbedding
import upsetplot as upset
import missingno as msno
from pyod.models.lunar import LUNAR
from matplotlib_venn import venn2, venn2_circles
from glob import glob
from hydra import compose, initialize
from omegaconf import OmegaConf
import omegaconf
import os
import ast
import json
from sklearn.preprocessing import MinMaxScaler
from scripts.python.routines.plot.colorscales import get_continuous_color
from src.datamodules.cross_validation import RepeatedStratifiedKFoldCVSplitter
from src.datamodules.tabular import TabularDataModule
from lifelines import CoxPHFitter
from lifelines.statistics import proportional_hazard_test
from sklearn.metrics import mean_absolute_error
from scipy.stats import wilcoxon, friedmanchisquare
from suffix_trees import STree
from itertools import combinations

# 1. Collect ML results

In [22]:
path = "D:/YandexDisk/Work/eeg/alpha_ext/real"
model = 'widedeep_tab_mlp_trn_val_tst'
path_runs = f"{path}/models/{model}/multiruns"

files = glob(f"{path_runs}/*/*/metrics_val_best_*.xlsx")

df_tmp = pd.read_excel(files[0], index_col="metric")
head, tail = os.path.split(files[0])
cfg = OmegaConf.load(f"{head}/.hydra/overrides.yaml")
params = []
for param_pair in cfg:
    param, val = param_pair.split('=')
    params.append(param)
df_res = pd.DataFrame(index=files)
for file in files:
    # Validation
    df_val = pd.read_excel(file, index_col="metric")
    for metric in df_val.index.values:
        df_res.at[file, metric + "_val"] = df_val.at[metric, "val"]

    # Train
    head, tail = os.path.split(file)
    tail = tail.replace('val', 'trn')
    df_trn = pd.read_excel(f"{head}/{tail}", index_col="metric")
    for metric in df_trn.index.values:
        df_res.at[file, metric + "_trn"] = df_trn.at[metric, "trn"]

    # Params
    cfg = OmegaConf.load(f"{head}/.hydra/overrides.yaml")
    for param_pair in cfg:
        param, val = param_pair.split('=')
        df_res.at[file, param] = val

first_columns = [
    'accuracy_weighted_trn',
    'accuracy_weighted_cv_mean_trn',
    'accuracy_weighted_cv_std_trn',
    'accuracy_weighted_val',
    'accuracy_weighted_cv_mean_val',
    'accuracy_weighted_cv_std_val'
]
df_res = df_res[first_columns + [col for col in df_res.columns if col not in first_columns]]
df_res.to_excel(f"{path_runs}/summary.xlsx", index=True, index_label="file")


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

# 2. Plot hyperparameter optimization results

In [None]:
path = "D:/YandexDisk/Work/eeg/many_freqs"
movement_type = "real"
model = 'lightgbm_trn_val_tst'
path_runs = f"{path}/{movement_type}/models/{model}/multiruns"

df_runs = pd.read_excel(f"{path_runs}/summary.xlsx", index_col=0)
cols_dict = {
    "accuracy_weighted_trn": ("Train Best Accuracy", "deepskyblue"),
    "accuracy_weighted_cv_mean_trn": ("Train Mean Accuracy", "dodgerblue"),
    "accuracy_weighted_val": ("Validation Best Accuracy", "lightcoral"),
    "accuracy_weighted_cv_mean_val": ("Validation Mean Accuracy", "crimson"),
}
df_fig = df_runs.loc[:, list(cols_dict.keys())].copy()
df_fig.rename(columns={k: v[0] for k,v in cols_dict.items()}, inplace=True)
for col in cols_dict:
    fig = plt.figure()
    sns.set_theme(style='whitegrid')
    sns.histplot(
        data=df_fig,
        x=cols_dict[col][0],
        color=cols_dict[col][1],
    )
    plt.savefig(f"{path_runs}/{col}.png", bbox_inches='tight', dpi=400)
    plt.savefig(f"{path_runs}/{col}.pdf", bbox_inches='tight')
    plt.close(fig)

# 3. Plot bad splits distributions

In [29]:
path = "D:/YandexDisk/Work/eeg/alpha/real"
model = 'widedeep_tab_mlp_trn_val_tst'
path_runs = f"{path}/models/{model}/multiruns"
path_best_run = f"2023-04-20_18-13-22_1337/4"

metric_main = "val_accuracy_weighted"
metric_thld = 0.55

df_cv_metrics = pd.read_excel(f"{path_runs}/{path_best_run}/cv_progress.xlsx", index_col=0)
pathlib.Path(f"{path_runs}/{path_best_run}/split_details").mkdir(parents=True, exist_ok=True)
df_fig = df_cv_metrics.loc[:, [metric_main]].copy()
df_fig.rename(columns={metric_main: "Validation Accuracy"}, inplace=True)
fig = plt.figure()
sns.set_theme(style='whitegrid')
sns.histplot(
    data=df_fig,
    x="Validation Accuracy",
    color="lightcoral",
)
plt.savefig(f"{path_runs}/{path_best_run}/split_details/metric_main.png", bbox_inches='tight', dpi=400)
plt.savefig(f"{path_runs}/{path_best_run}/split_details/metric_main.pdf", bbox_inches='tight')
plt.close(fig)


splits = df_cv_metrics.index[df_cv_metrics[metric_main] < metric_thld].values

df_cv_splits = pd.read_excel(f"{path_runs}/{path_best_run}/cv_ids.xlsx", index_col=0)
dict_count = {}
for split in splits:
    col = f"fold_{split:04d}"
    subjs = set(df_cv_splits.loc[df_cv_splits[col] == "val", "subject"].values)
    for subj in subjs:
        if subj in dict_count:
            dict_count[subj] += 1
        else:
            dict_count[subj] = 1
df_count = pd.DataFrame(index=list(dict_count.keys()))
df_count["Count"] = list(dict_count.values())
df_count.sort_values([f"Count"], ascending=[False], inplace=True)

palette = {f"S{s_id}": px.colors.qualitative.Dark24[s_id] for s_id in range(15)}

fig = plt.figure(figsize=(12, 0.4 * df_count['Count'].value_counts(dropna=True).shape[0]))
sns.set_theme(style='whitegrid', font_scale=1)
bar = sns.barplot(
    data=df_count,
    y=df_count.index,
    x='Count',
    edgecolor='black',
    orient='h',
    palette=palette,
    dodge=True
)
bar.set_xlabel("Occurrence in Validation dataset")
bar.set_ylabel("")
bar.set_title(f"")
plt.savefig(f"{path_runs}/{path_best_run}/split_details/subjects_val_count_thld({metric_thld}).png", bbox_inches='tight', dpi=400)
plt.savefig(f"{path_runs}/{path_best_run}/split_details/subjects_val_count_thld({metric_thld}).pdf", bbox_inches='tight')
plt.close()

# 4. Dimensionality reduction

In [30]:
path = "D:/YandexDisk/Work/eeg"
movement_type = "real"
features_type = "alpha"
feat_target = "class_simp"

dim_red_methods_dict = {
    'PCA': ['PC 1', 'PC 2'],
    'SVD': ['SVD 1', 'SVD 2'],
    'MDS': ['MDS 1', 'MDS 2'],
    'T-SNE': ['t-SNE 1', 't-SNE 2'],
}
palette = {f"S{s_id}": px.colors.qualitative.Dark24[s_id] for s_id in range(15)}

df_data = pd.read_excel(f"{path}/data.xlsx", index_col=0)
df_feats = pd.read_excel(f"{path}/feats_cont_{features_type}.xlsx", index_col=0)
feats = df_feats.index.values
df_classes = pd.read_excel(f"{path}/classes_{movement_type}.xlsx", index_col=0)
classes = df_classes.index.values

df_data = df_data.loc[df_data[feat_target].isin(classes), :]
df_data.rename(columns={'subject': 'Subject'}, inplace=True)
data_vals = df_data.loc[:, feats].values

pca = PCA(n_components=2, whiten=False)
data_pca = pca.fit_transform(data_vals)
df_data['PC 1'] = data_pca[:, 0]
df_data['PC 2'] = data_pca[:, 1]
tsvd = TruncatedSVD(n_components=2, algorithm='randomized', n_iter=5)
tsvd.fit(data_vals)
data_svd = tsvd.transform(data_vals)
df_data['SVD 1'] = data_svd[:, 0]
df_data['SVD 2'] = data_svd[:, 1]
mds = MDS(n_components=2, metric=True)
data_mds = mds.fit_transform(data_vals)
df_data['MDS 1'] = data_mds[:, 0]
df_data['MDS 2'] = data_mds[:, 1]
tsne = TSNE(n_components=2, learning_rate=300, perplexity=30, early_exaggeration=12, init='random')
data_tsne = tsne.fit_transform(data_vals)
df_data['t-SNE 1'] = data_tsne[:, 0]
df_data['t-SNE 2'] = data_tsne[:, 1]

for m in dim_red_methods_dict:
    x_name = dim_red_methods_dict[m][0]
    y_name = dim_red_methods_dict[m][1]

    plt.figure()
    sns.set_theme(style='whitegrid')
    scatter = sns.scatterplot(
        data=df_data,
        x=x_name,
        y=y_name,
        hue="Subject",
        linewidth=0.1,
        palette=palette,
        alpha=0.85,
        edgecolor="k",
        s=10,
        hue_order=palette
    )
    sns.move_legend(scatter, "center left", bbox_to_anchor=(1, 0.5))
    plt.savefig(f"{path}/{features_type}/{movement_type}/{m}.png", bbox_inches='tight', dpi=400)
    plt.savefig(f"{path}/{features_type}/{movement_type}/{m}.pdf", bbox_inches='tight')
    plt.close()



