In [3]:
import sys
sys.path.insert(0, '../')
from utils import balance_sample, univariate_classifier, bad_participant, correct_name_markers

import os
import numpy as np
import pandas as pd 
from tqdm.notebook import tqdm


import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as pgo
import plotly.offline as pyo
import plotly.io as pio
from plotly.subplots import make_subplots

pyo.init_notebook_mode(connected = True)


from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from pymer4.models import Lmer

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import multipletests

Using MNE with API > 0.19


In [4]:
# plotting parameters
grey = "#21201F"
green = "#9AC529"
lblue = "#42B9B2"
pink = "#DE237B"
orange = "#F38A31"

nt_colors = [green, lblue, pink, orange]

plt.style.use("ggplot")
fig_width = 2  # width in inches
fig_height = 8  # height in inches
fig_size = [fig_width, fig_height]
plt.rcParams["figure.figsize"] = fig_size
plt.rcParams["figure.autolayout"] = True

sns.set(
    style="white",
    context="notebook",
    font_scale=1.5,
    rc={
        "axes.labelcolor": grey,
        "text.color": grey,
        "axes.edgecolor": grey,
        "xtick.color": grey,
        "ytick.color": grey,
        'figure.figsize': fig_size
    },
)

sns.set_palette(sns.color_palette(nt_colors))

# Load Data

In [36]:
data_path = "../../Data/"
results_path = "../../Results/"
fig_path = "../../Results/Figs/"

df = pd.read_csv(os.path.join(data_path, 'all_markers.csv'), index_col = 0)

In [6]:
#markers names
all_participants = ['VP07','VP08','VP09', 'VP10','VP11','VP12','VP13','VP14','VP18','VP19','VP20','VP22','VP23','VP24','VP25','VP26','VP27','VP28','VP29','VP30','VP31','VP32','VP33','VP35','VP36','VP37']
#selection of good participants. Not used.
good_participants = all_participants[1:2] +  all_participants[6:10] +  all_participants[12:15]  + all_participants[18:23] + [all_participants[25]]
len(good_participants)

14

In [37]:
markers = ['wSMI_1', 'wSMI_2', 'wSMI_4', 'wSMI_8', 'p_e_1', 'p_e_2',
       'p_e_4', 'p_e_8', 'k', 'se','msf', 'sef90', 'sef95', 'b', 'b_n', 'g',
       'g_n', 't', 't_n', 'd', 'd_n', 'a_n', 'a', 'CNV', 'P1', 'P3a', 'P3b',]
#           'ft', 'ft_n']
erps =['CNV', 'P1', 'P3a', 'P3b']
# erps = [r'$CNV$', r'$P1$', r'$P3a$',r'$P3b$']

# markers =  [r'$\delta$',r'$|\delta|$',r'$\theta$', r'$|\theta|$',r'$\alpha$', r'$|\alpha|$',r'$\beta$', r'$|\beta|$',r'$\gamma$', r'$|\gamma|$',
#             r'$PE\gamma$',r'$PE\beta$',r'$PE\alpha$',r'$PE\theta$',
#             r'$wSMI\gamma$',r'$wSMI\beta$',r'$wSMI\alpha$',r'$wSMI\theta$', 
#             r'$K$',r'$SE$',r'$MSF$', r'$SEF90$', r'$SEF95$', 
#             r'$CNV$', r'$P1$', r'$P3a$',r'$P3b$'
#            ]


# df_subtracted = df.query("preproc == 'subtracted'").drop(columns = erps+['preproc'])
# df_erp = df.query("preproc == 'erp'").drop(columns = np.setdiff1d(markers,erps).tolist()+['preproc'])

# df_markers = df_subtracted.merge(df_erp, 'inner', on =np.setdiff1d(df_subtracted.columns, markers).tolist() )

df_markers = (df
              .query("stimuli == 'go'")
              .query("correct == 'correct'")
              .query('prev_trial < 5')
              .drop(['stimuli', 'correct', 'prev_trial', 'label', 'events',  'epoch_type', 'preproc', 'ft', 'ft_n'], axis = 1)
              .query("mind in ['on-task','dMW', 'sMW']")
              .groupby(['segment', 'participant']).filter(lambda x: len(x) > 1)
             )

df_markers['segment'] = df_markers['segment'].str.replace('s', '').astype(int)

# By Segment Univariate analyses

## On-task Vs Mind- Wandering
This can only be performed for PC probes  as they are the only ones with On-task reports.

In [6]:
agg_dict = {k:['mean', 'std'] for k in markers }
agg_dict.update({k:'first' for k in df_markers.drop(markers, axis=1).columns})

df_mind = (
    df_markers
    .query("probe == 'PC'")
    .groupby(['segment', 'participant'], as_index = False).agg(agg_dict)
    .assign(
    mind2 = lambda df: np.where(df.mind == 'on-task', 'on-task', 'mw'))
)

############################################################
################ Use normal names################
############################################################

df_mind.columns = df_mind.columns.map("_".join)

df_mind  = (df_mind
            .rename(columns = {'participant_first':'participant', 'probe_first':'probe', 'mind_first':'mind', 'segment_first':'segment', 'mind2_':'mind2'})
#             .query("mind != 'dMW'") #if you want to test against just one of the mw            
            .drop([ 'probe', 'mind', 'segment'], axis = 1) 
           )

############################################################
################ Use latex command for nmaes################
############################################################

##it slow downs the computer, just for final figures.

# df_mind = correct_name_markers(df_mind)

# df_mind.columns = df_mind.columns.map("$_{".join).map(lambda x: x + '}$').map(lambda x: x.replace('$$', ''))

# df_mind  = (df_mind
#             .rename(columns = {'participant$_{first}$':'participant', 'probe$_{first}$':'probe', 'mind$_{first}$':'mind', 'segment$_{first}$':'segment', 'mind2$_{}$':'mind2'})
# #             .query("mind != 'dMW'") #if you want to test against just one of the mw            
#             .drop(['probe', 'mind', 'segment'], axis = 1) 
        #    )
        
df_mind['mind2_numeric'] = (df_mind['mind2'] == 'mw').astype(int)


In [27]:
results_df = pd.DataFrame(columns=['Marker', 'Log-Likelihood', 'AIC', 'Random Effects Var', 
                                   'Estimate','P_val', 'Z_stat', 
                                   'AUC_mean', 'AUC_std', 'AUC_sem', 'AUC_range'])


for marker in tqdm(df_mind.drop(['mind2', 'mind2_numeric', 'participant'], axis = 1).columns, desc="Markers"):
    formula = f"mind2_numeric ~ {marker} + (1|participant)"
    
    # Fitting the LMER model
    model = Lmer(formula, data=df_mind, family="binomial")
    model.fit(verbose = False)
    
    # Stratified KFold for ROC AUC
    skf = StratifiedKFold(n_splits=5)
    X = df_mind[marker].values.reshape(-1, 1)
    y = df_mind['mind2_numeric'].values
    auc_scores = []
    
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model_kfold = Lmer(formula, data=df_mind.iloc[train_index],  family="binomial")
        model_kfold.fit(verbose = False)
        

        predicted_probabilities = model_kfold.predict(df_mind.iloc[test_index], use_rfx=True, verify_predictions=False)

        
        auc = roc_auc_score(y_test, predicted_probabilities)
        auc_scores.append(auc)
    
    # Compute AUC statistics
    auc_mean = np.mean(auc_scores)
    auc_std = np.std(auc_scores)
    auc_sem = auc_std / np.sqrt(len(auc_scores))
    auc_range = np.ptp(auc_scores)
    
    # Save to DataFrame
    results_df = results_df.append({
        'Marker': marker,
        'Log-Likelihood': model.logLike,
        'AIC': model.AIC,
        'Random Effects Var': model.ranef_var['Var'],
        'Estimate':model.coefs['Estimate'][0], 
        'P_val': model.coefs['P-val'][0],
        'Z_stat': model.coefs['Z-stat'][0],
        'AUC_mean': auc_mean,
        'AUC_std': auc_std,
        'AUC_sem': auc_sem,
        'AUC_range': auc_range
    }, ignore_index=True)

mind_glmm = results_df.assign(
                    p_corrected = lambda df: multipletests(df.P_val, method = 'fdr_bh')[1],
                    significant = lambda df: np.select([(df.P_val < 0.05) & (df.p_corrected < 0.05), (df.P_val < 0.05) & (df.p_corrected > 0.05),  
                                                 (df.P_val > 0.05) & (df.p_corrected > 0.05)], ['p < 0.05 FDR corrected','p < 0.05 uncorrected', 'p > 0.05'])
                   )

mind_glmm.to_csv(os.path.join(results_path,'univariate_glmm_mind.csv'))


Markers:   0%|          | 0/55 [00:00<?, ?it/s]

Linear mixed model fit by maximum likelihood  ['lmerMod']
Formula: mind2_numeric~wSMI_1_mean+(1|participant)

Family: binomial	 Inference: parametric

Number of observations: 176	 Groups: {'participant': 24.0}

Log-likelihood: -116.115 	 AIC: 238.230

Random effects:

                    Name    Var    Std
participant  (Intercept)  1.054  1.027

No random effect correlations specified

Fixed effects:

Linear mixed model fit by maximum likelihood  ['lmerMod']
Formula: mind2_numeric~wSMI_1_mean+(1|participant)

Family: binomial	 Inference: parametric

Number of observations: 140	 Groups: {'participant': 24.0}

Log-likelihood: -90.286 	 AIC: 186.572

Random effects:

                    Name    Var    Std
participant  (Intercept)  1.477  1.215

No random effect correlations specified

Fixed effects:

Linear mixed model fit by maximum likelihood  ['lmerMod']
Formula: mind2_numeric~wSMI_1_mean+(1|participant)

Family: binomial	 Inference: parametric

Number of observations: 141	 Groups: {'p

In [29]:
mind_glmm = pd.read_csv(os.path.join(results_path,'univariate_glmm_mind.csv'))

# segment_mind_roc = segment_mind_roc.sort_values(by = 'AUC', ascending = False).head(10).append(segment_mind_roc.sort_values(by = 'AUC', ascending = False).tail(10))

fig = px.scatter(mind_glmm.sort_values(by = 'AUC_mean'),x = 'AUC_mean', y = 'Marker', template = "plotly_white", symbol = 'significant', 
                 symbol_sequence = ['circle-open','circle','hexagram' ],
#                  color = 'significant',
                 color_discrete_sequence = [pink, green,orange, pink], 
                 
                 category_orders = {'significant': ['p > 0.05','p < 0.05 uncorrected', 'p < 0.05 FDR corrected']},
                 labels = {'AUC': 'TUT>OT                      TUT<OT', 'significant': 'Statistical Significance', 'markers':''}
                )
fig.add_vline(x=0.5, line_width=3, line_dash="dash", line_color="grey")
fig.update_traces(marker=dict(size = 13))

fig.update_layout(
    width=850,
    height=1300,
#     autosize = True, 
    template = 'plotly_white',
        font=dict(
        family="Times new roman",
        size=20,
        color="black"
    ),
    xaxis = dict(
            visible=True,
            range = [0.45,0.70], 
            tickfont = {"size": 20},
        ),
    yaxis = dict(
        tickfont = {"size": 20},
        autorange = False,    
        automargin = True,
        range = [-1,len(mind_glmm)],
        dtick = 1
        ),
    showlegend=True, 

)

fig.show()

fig.write_image(os.path.join(fig_path,'univariate_glmm_mind.png'))
fig.write_image(os.path.join(fig_path,'univariate_glmm_mind.pdf'))

## dMW Vs sMW
This will be only performed in SC as they have more trials

In [46]:
agg_dict = {k:['mean', 'std'] for k in markers }
agg_dict.update({k:'first' for k in df_markers.drop(markers, axis=1).columns})

df_mw = (
    df_markers
    .query("probe == 'SC'")
    .query("mind != 'on-task'")
    .groupby(['segment', 'participant'], as_index = False).agg(agg_dict)
)

############################################################
################ Use normal names################
############################################################
df_mw.columns = df_mw.columns.map("_".join)

df_mw  = (df_mw
            .rename(columns = {'participant_first':'participant', 'probe_first':'probe', 'mind_first':'mind', 'segment_first':'segment'})
            .drop([ 'probe',], axis = 1) 
           )


############################################################
################ Use latex command for nmaes################
############################################################

# df_mw = correct_name_markers(df_mw)

# df_mw.columns = df_mw.columns.map("$_{".join).map(lambda x: x + '}$').map(lambda x: x.replace('$$', ''))

# df_mw  = (df_mw
#             .rename(columns = {'participant$_{first}$':'participant', 'probe$_{first}$':'probe', 'mind$_{first}$':'mind', 'segment$_{first}$':'segment', 'mind$_{}$':'mind'})
# #             .query("mind != 'dMW'") #if you want to test against just one of the mw   
#             .drop(['participant', 'probe',  'segment'], axis = 1)

#            )


df_mw['mind_numeric'] = (df_mw['mind'] == 'sMW').astype(int)

df_mw.to_csv(os.path.join(results_path,'data_mw.csv'))

In [39]:
from scipy.stats import zscore

# Function to remove outliers based on Z-score
def remove_outliers(df, columns, z_threshold=3):
    for col in columns:
        col_zscore = zscore(df[col])
        df = df[(col_zscore < z_threshold) & (col_zscore > -z_threshold)]
    return df

# Columns to remove outliers from
columns_to_check = df_mw.drop(['mind', 'mind_numeric', 'participant', 'segment'], axis = 1).columns

# Remove outliers
df_mw_filtered = remove_outliers(df_mw, columns_to_check, z_threshold=4)
df_mw = df_mw_filtered

In [58]:
px.scatter(x=df_mw['a_mean'], y=df_mw['a_std'], color=df_mw['mind'], template = "plotly_white")

In [44]:
results_df = pd.DataFrame(columns=['Marker', 'Log-Likelihood', 'AIC', 'Random Effects Var', 
                                   'Estimate','P_val', 'Z_stat', 
                                   'AUC_mean', 'AUC_std', 'AUC_sem', 'AUC_range'])


for marker in tqdm(df_mw.drop(['mind', 'mind_numeric', 'participant', 'segment'], axis = 1).columns, desc="Markers"):
    # formula = f"mind_numeric ~ {marker} + (1|participant)"
    # formula = f"mind_numeric ~ {marker} + (1 + {marker}|participant)"
    # formula = f"mind_numeric ~ {marker} + (1|participant/segment)"
    formula = f"mind_numeric ~ {marker} + (1|participant) + (1|segment)"
    
    
    # Fitting the LMER model
    model = Lmer(formula, data=df_mw, family="binomial")
    model.fit(verbose = False)
    
    # Stratified KFold for ROC AUC
    skf = StratifiedKFold(n_splits=5)
    X = df_mw[marker].values.reshape(-1, 1)
    y = df_mw['mind_numeric'].values
    auc_scores = []
    
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model_kfold = Lmer(formula, data=df_mw.iloc[train_index],  family="binomial")
        model_kfold.fit(verbose = False)
        

        predicted_probabilities = model_kfold.predict(df_mw.iloc[test_index], use_rfx=True, verify_predictions=False)
        

        
        auc = roc_auc_score(y_test, predicted_probabilities)
        auc_scores.append(auc)
    
    # Compute AUC statistics
    auc_mean = np.mean(auc_scores)
    auc_std = np.std(auc_scores)
    auc_sem = auc_std / np.sqrt(len(auc_scores))
    auc_range = np.ptp(auc_scores)
    
    # Save to DataFrame
    results_df = results_df.append({
        'Marker': marker,
        'Log-Likelihood': model.logLike,
        'AIC': model.AIC,
        'Random Effects Var': model.ranef_var['Var'],
        'Estimate':model.coefs['Estimate'][0], 
        'P_val': model.coefs['P-val'][0],
        'Z_stat': model.coefs['Z-stat'][0],
        'AUC_mean': auc_mean,
        'AUC_std': auc_std,
        'AUC_sem': auc_sem,
        'AUC_range': auc_range
    }, ignore_index=True)

mw_glmm = results_df.assign(
                    p_corrected = lambda df: multipletests(df.P_val, method = 'fdr_bh')[1],
                    significant = lambda df: np.select([(df.P_val < 0.05) & (df.p_corrected < 0.05), (df.P_val < 0.05) & (df.p_corrected > 0.05),  
                                                (df.P_val > 0.05) & (df.p_corrected > 0.05)], ['p < 0.05 FDR corrected','p < 0.05 uncorrected', 'p > 0.05'])
                    )

mw_glmm.to_csv(os.path.join(results_path,'univariate_glmm_mw.csv'))


Markers:   0%|          | 0/54 [00:00<?, ?it/s]

Linear mixed model fit by maximum likelihood  ['lmerMod']
Formula: mind_numeric~wSMI_1_mean+(1|participant)+(1|segment)

Family: binomial	 Inference: parametric

Number of observations: 466	 Groups: {'segment': 115.0, 'participant': 20.0}

Log-likelihood: -292.327 	 AIC: 592.654

Random effects:

                    Name    Var    Std
segment      (Intercept)  0.000  0.000
participant  (Intercept)  0.692  0.832

No random effect correlations specified

Fixed effects:

boundary (singular) fit: see ?isSingular 

Linear mixed model fit by maximum likelihood  ['lmerMod']
Formula: mind_numeric~wSMI_1_mean+(1|participant)+(1|segment)

Family: binomial	 Inference: parametric

Number of observations: 372	 Groups: {'segment': 101.0, 'participant': 18.0}

Log-likelihood: -224.905 	 AIC: 457.811

Random effects:

                    Name    Var    Std
segment      (Intercept)  0.000  0.000
participant  (Intercept)  1.134  1.065

No random effect correlations specified

Fixed effects:

Linear mixe

In [54]:
from merf import MERF
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
from tqdm import tqdm
from statsmodels.sandbox.stats.multicomp import multipletests
import os

# Initialize results DataFrame
results_df = pd.DataFrame(columns=['Marker', 'AUC_mean', 'AUC_std', 'AUC_sem', 'AUC_range'])

# Loop through each marker
for marker in tqdm(df_mw.drop(['mind', 'mind_numeric', 'participant', 'segment'], axis=1).columns, desc="Markers"):
    
    # Prepare data
    X = df_mw[marker]
    Z = np.ones((X.shape[0], 1))  # Random effects design matrix
    clusters = df_mw['participant']
    y = df_mw['mind_numeric']

    # Initialize StratifiedKFold and AUC scores list
    skf = StratifiedKFold(n_splits=5)
    auc_scores = []

    # Perform Stratified KFold Cross-Validation
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        X_train = np.array(X_train).reshape(-1, 1)
        X_test = np.array(X_test).reshape(-1, 1)

        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        clusters_train, clusters_test = clusters.iloc[train_index], clusters.iloc[test_index]
        
        # Initialize and train MERF
        merf = MERF()
        merf.fit(X_train, Z[train_index], clusters_train, y_train)

        # Make predictions
        y_pred = merf.predict(X_test, Z[test_index], clusters_test)

        # Compute AUC and append to list
        auc = roc_auc_score(y_test, y_pred)
        auc_scores.append(auc)

    # Compute AUC statistics
    auc_mean = np.mean(auc_scores)
    auc_std = np.std(auc_scores)
    auc_sem = auc_std / np.sqrt(len(auc_scores))
    auc_range = np.ptp(auc_scores)

    # Append results to DataFrame
    results_df = results_df.append({
        'Marker': marker,
        'AUC_mean': auc_mean,
        'AUC_std': auc_std,
        'AUC_sem': auc_sem,
        'AUC_range': auc_range
    }, ignore_index=True)

# Save results to CSV
results_df.to_csv(os.path.join(results_path, 'univariate_merf_mw.csv'))


Markers:   0%|          | 0/54 [00:00<?, ?it/s]

INFO     [merf.py:307] Training GLL is -867.6841047857208 at iteration 1.
INFO     [merf.py:307] Training GLL is -962.8968536555981 at iteration 2.
INFO     [merf.py:307] Training GLL is -968.049919554243 at iteration 3.
INFO     [merf.py:307] Training GLL is -968.7343222998188 at iteration 4.
INFO     [merf.py:307] Training GLL is -971.6042147097471 at iteration 5.
INFO     [merf.py:307] Training GLL is -970.2674706957099 at iteration 6.
INFO     [merf.py:307] Training GLL is -964.7085445960471 at iteration 7.
INFO     [merf.py:307] Training GLL is -964.5541677461651 at iteration 8.
INFO     [merf.py:307] Training GLL is -965.8797129682979 at iteration 9.
INFO     [merf.py:307] Training GLL is -963.7494215714811 at iteration 10.
INFO     [merf.py:307] Training GLL is -967.2960109536933 at iteration 11.
INFO     [merf.py:307] Training GLL is -969.0943282025991 at iteration 12.
INFO     [merf.py:307] Training GLL is -962.8175155691094 at iteration 13.
INFO     [merf.py:307] Training GLL

In [56]:
mw_glmm = pd.read_csv(os.path.join(results_path, 'univariate_glmm_mw.csv'))

# segment_mw_roc = segment_mw_roc.sort_values(by = 'AUC', ascending = False).head(10).append(segment_mw_roc.sort_values(by = 'AUC', ascending = False).tail(10))

fig = px.scatter(mw_glmm.sort_values(by = 'AUC_mean'),x = 'AUC_mean', y = 'Marker', template = "plotly_white", symbol = 'significant', 
                 symbol_sequence = ['circle-open','circle','hexagram' ],
#                  color = 'significant',
                 color_discrete_sequence = [lblue, green,orange, pink], 
                 
                 category_orders = {'significant': ['p > 0.05','p < 0.05 uncorrected', 'p < 0.05 FDR corrected']},
                 labels = {'AUC': 'sTUT>dTUT              sTUT<dTUT', 'significant': 'Statistical Significance', 'markers':''}
                )
fig.add_vline(x=0.5, line_width=3, line_dash="dash", line_color="grey")
fig.update_traces(marker=dict(size = 13))

fig.update_layout(
    width=850,
    height=1300,
#     autosize = True, 
    template = 'plotly_white',
        font=dict(
        family="Times new roman",
        size=20,
        color="black"
    ),
    xaxis = dict(
            visible=True,
            range = [0.45,0.75], 
            tickfont = {"size": 20},
        ),
    yaxis = dict(
        tickfont = {"size": 20},
        autorange = False,    
        automargin = True,
        range = [-1,len(mw_glmm)],
        dtick = 1
        ),
    showlegend=True, 

)

fig.show()
# pio.write_json(fig, 'Figs/univariate_roc_mw_segment.plotly')
fig.write_image( os.path.join(fig_path, 'univariate_roc_mw_segment.png'))
fig.write_image( os.path.join(fig_path, 'univariate_roc_mw_segment.pdf'))

## Probe Caught vs Self-Caught


In [17]:
agg_dict = {k:['mean', 'std'] for k in markers }
agg_dict.update({k:'first' for k in df_markers.drop(markers, axis=1).columns})

df_probe = (
    df_markers
    .query("mind != 'on-task'")
    .groupby(['segment', 'participant'], as_index = False).agg(agg_dict)
)

# df_probe.columns = df_probe.columns.map("_".join)

# df_probe  = (df_probe
#             .rename(columns = {'participant_first':'participant', 'probe_first':'probe', 'mind_first':'mind', 'segment_first':'segment'})
#             .drop(['participant', 'mind', 'segment'], axis = 1) 
#            )

#### Use latex command for nmaes###
##it slow downs the computer, just for final figures.

df_probe = correct_name_markers(df_probe)

df_probe.columns = df_probe.columns.map("$_{".join).map(lambda x: x + '}$').map(lambda x: x.replace('$$', ''))

df_probe  = (df_probe
            .rename(columns = {'participant$_{first}$':'participant', 'probe$_{first}$':'probe', 'mind$_{first}$':'mind', 'segment$_{first}$':'segment'})
           
            .drop(['participant', 'mind', 'segment'], axis = 1) 
           )

In [19]:
sc = df_probe[df_probe.probe == 'SC']
pc = df_probe[df_probe.probe == 'PC']
print(f'SC {len(sc)}, PC: {len(pc)}')

AUC = []
pvalues = {}
for i in df_probe.drop('probe', axis = 1).columns:
    rus = RandomUnderSampler(random_state=42, replacement=True)# fit predictor and target variable

    probe_rus = rus.fit_resample(df_probe[i].astype("float32").values.reshape(-1,1), df_probe.probe)

    df_probe_rus = pd.DataFrame(probe_rus[0], columns =['marker']).assign(probe = probe_rus[1])
    
    sc = df_probe_rus.query("probe == 'SC'")['marker']
    pc = df_probe_rus.query("probe == 'PC'")['marker']
    auc = roc_auc_score(df_probe_rus['probe'], df_probe_rus['marker'])
    print(f'AUC {i} = {auc}', f'{mannwhitneyu(x = sc, y = pc)}')
    AUC.append([i, auc])
    pvalues[i] = mannwhitneyu(x = sc, y =pc).pvalue 
        
probe_roc = pd.DataFrame(AUC, columns = ['markers', 'AUC'])

p_df =pd.DataFrame.from_dict(pvalues, orient = 'index', columns = ['p_value']).reset_index().rename(columns ={'index': 'markers'})
probe_roc_rus = (probe_roc
            .merge(p_df, on = 'markers', how = 'inner')
            .assign(
                    p_corrected = lambda df: multipletests(df.p_value, method = 'fdr_bh')[1],
                    significant = lambda df: np.select([(df.p_value < 0.05) & (df.p_corrected < 0.05), (df.p_value < 0.05) & (df.p_corrected > 0.05),  
                                                 (df.p_value > 0.05) & (df.p_corrected > 0.05)], ['p < 0.05 FDR corrected','p < 0.05 uncorrected', 'p > 0.05']),
                                    balance = 'under-sample'
                   )
           )

AUC = []
pvalues = {}
for i in df_probe.drop('probe', axis = 1).columns:
    ros = SMOTE(random_state=42)# fit predictor and target variable

    probe_ros = ros.fit_resample(df_probe[i].astype("float32").values.reshape(-1,1), df_probe.probe)

    df_probe_ros = pd.DataFrame(probe_ros[0], columns =['marker']).assign(probe = probe_ros[1])
    
    sc = df_probe_ros.query("probe == 'SC'")['marker']
    pc = df_probe_ros.query("probe == 'PC'")['marker']
    auc = roc_auc_score(df_probe_ros['probe'], df_probe_ros['marker'])
    print(f'AUC {i} = {auc}', f'{mannwhitneyu(x = sc, y = pc)}')
    AUC.append([i, auc])
    pvalues[i] = mannwhitneyu(x = sc, y =pc).pvalue 
        
probe_roc = pd.DataFrame(AUC, columns = ['markers', 'AUC'])

p_df =pd.DataFrame.from_dict(pvalues, orient = 'index', columns = ['p_value']).reset_index().rename(columns ={'index': 'markers'})
probe_roc_ros = (probe_roc
            .merge(p_df, on = 'markers', how = 'inner')
            .assign(
                    p_corrected = lambda df: multipletests(df.p_value, method = 'fdr_bh')[1],
                    significant = lambda df: np.select([(df.p_value < 0.05) & (df.p_corrected < 0.05), (df.p_value < 0.05) & (df.p_corrected > 0.05),  
                                                 (df.p_value > 0.05) & (df.p_corrected > 0.05)], ['p < 0.05 FDR corrected','p < 0.05 uncorrected', 'p > 0.05']),
                    balance = 'over-sample'
                   )
           )

SC 536, PC: 82
AUC $wSMI\gamma_{mean}$ = 0.4805175490779299 MannwhitneyuResult(statistic=3231.0, pvalue=0.6677869786100235)
AUC $wSMI\gamma_{std}$ = 0.46519928613920286 MannwhitneyuResult(statistic=3128.0, pvalue=0.44252716973948825)
AUC $wSMI\beta_{mean}$ = 0.5001487209994051 MannwhitneyuResult(statistic=3363.0, pvalue=0.9986879628094777)
AUC $wSMI\beta_{std}$ = 0.5028256989886972 MannwhitneyuResult(statistic=3381.0, pvalue=0.9514845365519969)
AUC $wSMI\alpha_{mean}$ = 0.5355443188578228 MannwhitneyuResult(statistic=3601.0, pvalue=0.43281913312030584)
AUC $wSMI\alpha_{std}$ = 0.5093694229625223 MannwhitneyuResult(statistic=3425.0, pvalue=0.8371428769757068)
AUC $wSMI\theta_{mean}$ = 0.5553242117787031 MannwhitneyuResult(statistic=3734.0, pvalue=0.22178861827367669)
AUC $wSMI\theta_{std}$ = 0.48393813206424746 MannwhitneyuResult(statistic=3254.0, pvalue=0.7236798744881828)
AUC $PE\gamma_{mean}$ = 0.4867638310529447 MannwhitneyuResult(statistic=3273.0, pvalue=0.7710069583144967)
AUC $PE

AUC $MSF_{std}$ = 0.46460440214158233 MannwhitneyuResult(statistic=3124.0, pvalue=0.43475081989767006)
AUC $SEF90_{mean}$ = 0.4884741225461035 MannwhitneyuResult(statistic=3284.5, pvalue=0.8000850149280309)
AUC $SEF90_{std}$ = 0.45226055919095776 MannwhitneyuResult(statistic=3041.0, pvalue=0.2918565852096886)
AUC $SEF95_{mean}$ = 0.4842355740630577 MannwhitneyuResult(statistic=3256.0, pvalue=0.7286157721304451)
AUC $SEF95_{std}$ = 0.470553242117787 MannwhitneyuResult(statistic=3164.0, pvalue=0.5159918412210651)
AUC $\beta_{mean}$ = 0.5132361689470553 MannwhitneyuResult(statistic=3451.0, pvalue=0.7710069583144967)
AUC $\beta_{std}$ = 0.4950922070196312 MannwhitneyuResult(statistic=3329.0, pvalue=0.9148796519591063)
AUC $|\beta|_{mean}$ = 0.4629684711481261 MannwhitneyuResult(statistic=3113.0, pvalue=0.4137772529749967)
AUC $|\beta|_{std}$ = 0.46460440214158244 MannwhitneyuResult(statistic=3124.0, pvalue=0.43475081989767006)
AUC $\gamma_{mean}$ = 0.4863176680547293 MannwhitneyuResult(sta

In [21]:
probe_roc = pd.concat([probe_roc_ros, probe_roc_rus])
fig = px.scatter(probe_roc.sort_values(by = 'AUC'),x = 'AUC', y = 'markers', template = "plotly_white", symbol = 'significant', 
                 symbol_sequence = ['circle-open','circle','hexagram' ],
                 facet_col = 'balance',
#                  color = 'significant',
                 color_discrete_sequence = [orange], 
                 labels = {'AUC': 'PC>SC                PC<SC'},
                 
                 
                 category_orders = {'significant': ['p > 0.05','p < 0.05 uncorrected', 'p < 0.05 FDR corrected']})
fig.add_vline(x=0.5, line_width=3, line_dash="dash", line_color="black")

fig.update_traces(marker=dict(size = 8))
fig.update_layout(
    autosize=False,
    width=800,
    height=800,
    yaxis = {
            'showticklabels': True,
            'tickmode': 'linear',
        }
    
)
fig.show()
# pio.write_json(fig, 'Figs/univariate_roc_probe.plotly')
# fig.write_image('Figs/univariate_roc_mw_segment.png')