In [1]:
import os
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

from scipy.special import expit 

import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
import plotly.io as pio
from plotly.subplots import make_subplots

pyo.init_notebook_mode(connected = True)


from sklearn.metrics import roc_auc_score, f1_score
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import (
    GridSearchCV,
    train_test_split,
    cross_val_score,
    GroupShuffleSplit,
    permutation_test_score,
    StratifiedKFold,
    KFold,
    cross_validate
)

from tqdm import tqdm 

from sklearn.ensemble import ExtraTreesClassifier, RandomForestRegressor

import optuna
from merf import MERF
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, GroupKFold


import sys
sys.path.insert(0, '../')
# from utils import multivariate_classifier, correct_name_markers, plot_univariate
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

In [2]:
# plotting parameters
grey = "#21201F"
green = "#9AC529"
lblue = "#42B9B2"
pink = "#DE237B"
orange = "#F38A31"

nt_colors = [green, lblue, pink, orange]

plt.style.use("ggplot")
fig_width = 12  # width in inches
fig_height = 9  # height in inches
fig_size = [fig_width, fig_height]
plt.rcParams["figure.figsize"] = fig_size
plt.rcParams["figure.autolayout"] = True

sns.set(
    style="white",
    context="notebook",
    font_scale=1,
    rc={
        "axes.labelcolor": grey,
        "text.color": grey,
        "axes.edgecolor": grey,
        "xtick.color": grey,
        "ytick.color": grey,
    },
)

sns.set_palette(sns.color_palette
(nt_colors))

# Load Data
Loads data from the computed markers. From `Data` directory

In [3]:
data_path = "../../Data/"
results_path = "../../Results/"
fig_path = "../../Results/Figs/"

df = pd.read_csv(os.path.join(data_path, 'all_markers.csv'), index_col = 0)

In [4]:
markers = ['wSMI_1', 'wSMI_2', 'wSMI_4', 'wSMI_8', 'p_e_1', 'p_e_2',
       'p_e_4', 'p_e_8', 'k', 'se','msf', 'sef90', 'sef95', 'b', 'b_n', 'g',
       'g_n', 't', 't_n', 'd', 'd_n', 'a_n', 'a', 'CNV', 'P1', 'P3a', 'P3b',]

erps =['CNV', 'P1', 'P3a', 'P3b']

df_markers = (df
              .query("stimuli == 'go'") # only go trials
              .query("correct == 'correct'") #only correct trials
              .query('prev_trial < 5') # only last 5 trials before each probe. 
              .drop(['stimuli', 'correct', 'prev_trial', 'label', 'events',  'epoch_type', 'preproc', 'ft', 'ft_n'], axis = 1) # drop unnecessary columns
              .query("mind in ['on-task','dMW', 'sMW']") # only mind wandering and on-task trials
              .groupby(['segment', 'participant']).filter(lambda x: len(x) > 1) # drop participants with less than 2 trials per segment
             )

# By segment Mulivariate analysis

## On-task Vs Mind- Wandering
This can only be performed for PC probes  as they are the only ones with On-task reports.


Prepares the dataframe for the analysis

In [5]:
agg_dict = {k:['mean', 'std'] for k in markers }
agg_dict.update({k:'first' for k in df_markers.drop(markers, axis=1).columns})

df_mind = (
    df_markers
    .query("probe == 'PC'")
    .groupby(['segment', 'participant'], as_index = False).agg(agg_dict)
    .assign(
    mind2 = lambda df: np.where(df.mind == 'on-task', 'on-task', 'mw'))
)


############################################################
################ Use normal names################
############################################################

df_mind.columns = df_mind.columns.map("_".join)

df_mind  = (df_mind
            .rename(columns = {'participant_first':'participant', 'probe_first':'probe', 'mind_first':'mind', 'segment_first':'segment', 'mind2_':'mind2'})
            # .query("mind != 'dMW'") #if you want to test against just one of the mw            
            .drop([ 'probe', 'mind',], axis = 1) 
           )

############################################################
################ Use latex command for nmaes################
############################################################

##it slow downs the computer, just for final figures.

# df_mind = correct_name_markers(df_mind)

# df_mind.columns = df_mind.columns.map("$_{".join).map(lambda x: x + '}$').map(lambda x: x.replace('$$', ''))

# df_mind  = (df_mind
#             .rename(columns = {'participant$_{first}$':'participant', 'probe$_{first}$':'probe', 'mind$_{first}$':'mind', 'segment$_{first}$':'segment', 'mind2$_{}$':'mind2'})
# #             .query("mind != 'dMW'") #if you want to test against just one of the mw            
#             .drop(['probe', 'mind', 'segment'], axis = 1) 
        #    )
        
df_mind['mind2_numeric'] = (df_mind['mind2'] == 'mw').astype(int)

# def filter_participants(group):
#     counts = group['mind2'].value_counts()
#     # Check if there is only one level of 'mind2' for the participant
#     if len(counts) == 1:
#         return False
#     return all(count >= 2 for count in counts)

# df_mind = df_mind.groupby('participant').filter(filter_participants)


from scipy.stats import zscore
def replace_outliers_with_participant_mean(df, columns, participant_column='participant', z_threshold=3):
    df_copy = df.copy()
    
    for col in columns:
        for participant in df[participant_column].unique():
            subset = df[df[participant_column] == participant]
            col_zscore = zscore(subset[col])
            mean_value = np.mean(subset[col][np.abs(col_zscore) < z_threshold])
            
            # Count the outliers for each participant
            outlier_count = np.sum(np.abs(col_zscore) >= z_threshold)
            total = len(col_zscore)
            
            # Replace outliers with the mean value for each participant
            subset_indices = subset.index[np.abs(col_zscore) >= z_threshold]
            df_copy.loc[subset_indices, col] = mean_value
            
            if outlier_count > 0:
                print(f"Replaced {outlier_count} outliers in column '{col}' out of {total} observations for participant {participant} with the mean value.")
    
    return df_copy
# Columns to remove outliers from
columns_to_check = df_mind.drop(['mind2', 'mind2_numeric', 'participant', 'segment'], axis = 1).columns

# Remove outliers
df_mind_filtered = replace_outliers_with_participant_mean(df_mind, columns_to_check, z_threshold=3)

df_mind = df_mind_filtered

def filter_participants(group):
    counts = group['mind2_numeric'].value_counts()
    # Check if there is only one level of 'mind2' for the participant
    if len(counts) == 1:
        return False
    return all(count >= 1 for count in counts)
# df_mind = df_mind.groupby('participant').filter(filter_participants)

Replaced 1 outliers in column 'msf_mean' out of 12 observations for participant 17 with the mean value.
Replaced 1 outliers in column 't_std' out of 12 observations for participant 17 with the mean value.
Replaced 1 outliers in column 'd_mean' out of 16 observations for participant 3 with the mean value.


In [6]:
# Initialize lists to store AUC and permutation scores
scores = []
perm_auc_scores_all = []
optimal_cutoffs = []

n_splits = 4
group_kfold = GroupKFold(n_splits=n_splits)

# Assuming df_mw['participant'] contains the participant IDs
# Prepare data
X = df_mind.drop(['mind2', 'mind2_numeric', 'participant','segment'], axis=1)
Z = np.ones((X.shape[0], 1))  # Random effects design matrix
groups = df_mind['participant']
y = df_mind['mind2_numeric']

# Global dictionary for storing trial data
trial_data = {}
def objective(trial):
    global trial_data
    # Hyperparameters to be optimized
    n_estimators = trial.suggest_int('n_estimators', 2, 100)
    max_depth = trial.suggest_int('max_depth', 2, 32, log=True)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    max_features = trial.suggest_uniform('max_features', 0.1, 1.0)

    scores = []
    
    feature_importances = None
    for train_index, test_index in group_kfold.split(X, y, groups.values):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        X_train = StandardScaler().fit_transform(X_train)
        X_test = StandardScaler().fit_transform(X_test)
        
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        clusters_train, clusters_test = groups.iloc[train_index], groups.iloc[test_index]
        
        # Initialize MERF with trial-suggested parameters
        merf = MERF(fixed_effects_model=RandomForestRegressor(
                    n_estimators=n_estimators,
                    max_depth=max_depth,
                    min_samples_leaf=min_samples_leaf,
                    max_features=max_features,
                    random_state=42,
                    n_jobs=-1
        ),
        gll_early_stop_threshold=1,
        max_iterations=50
        )
        
        merf.fit(X_train, Z[train_index], clusters_train, y_train)
        
        
        y_pred = merf.predict(X_test, Z[test_index], clusters_test)

        # Convert continuous outputs to probabilities
        y_pred_proba = expit(y_pred)
        
        # Find optimal cutoff point based on F1 score
        thresholds = np.linspace(0, 1, 100)
        f1_scores = [f1_score(y_test, (y_pred_proba > t).astype(int)) for t in thresholds]
        optimal_threshold = thresholds[np.argmax(f1_scores)]
        optimal_cutoffs.append(optimal_threshold)
        
        # Thresholding to get class labels
        y_pred_class = (y_pred_proba > optimal_threshold).astype(int)

        # Compute AUC for probabilities
        auc = roc_auc_score(y_test, y_pred_proba)
        scores.append(auc)
        
        # Get feature importances (only need to do this once per trial)
        if feature_importances is None:
            feature_importances = merf.fe_model.feature_importances_

    # Store trial data
    trial_data[trial.number] = {
        'fold_aucs': scores,
        'feature_importances': feature_importances
    }

    # Average AUC over all folds
    avg_auc = np.mean(scores)
    return avg_auc

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1000)


# Construct the file path for the study database
study_db_path = os.path.join(results_path, 'multivariate_merf_mind_study.db')
# Use the SQLite database at the specified path for storage
study = optuna.create_study(direction="maximize", study_name="mind_mvpa_study", storage=f'sqlite:///{study_db_path}', load_if_exists=True)

# Best hyperparameters.
# Save the best trial's parameters to a file
best_params_path = os.path.join(results_path, 'mvpa_merf_mind_best_params.txt')
with open(best_params_path, 'w') as file:
    for key, value in study.best_trial.params.items():
        file.write(f'{key}: {value}\n')

# print('Best trial:', study.best_trial.params)
study_df = study.trials_dataframe()
for trial_num in trial_data:
    study_df.loc[study_df.number == trial_num, 'fold_aucs'] = str(trial_data[trial_num]['fold_aucs'])
    study_df.loc[study_df.number == trial_num, 'feature_importances'] = str(trial_data[trial_num]['feature_importances'])
    
study_df.sort_values('value', ascending=False).to_csv(os.path.join(results_path, 'mvpa_merf_mind_opt_trials.csv'))



best_params = study.best_trial.params

# Initialize lists to store AUC and permutation scores
scores = []
perm_auc_scores_all = []
optimal_cutoffs = []

n_splits = 4
group_kfold = GroupKFold(n_splits=n_splits)


# Replace with your actual file path
best_params_path = os.path.join(results_path, 'mvpa_merf_mind_best_params.txt')

# Read the parameters from the file and store them in a dictionary
best_params = {}
with open(best_params_path, 'r') as file:
    for line in file:
        key, value = line.split(': ')
        best_params[key] = float(value.strip())

scores = []

for train_index, test_index in group_kfold.split(X, y, groups.values):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    X_train = StandardScaler().fit_transform(X_train)
    X_test = StandardScaler().fit_transform(X_test)
    
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clusters_train, clusters_test = groups.iloc[train_index], groups.iloc[test_index]
    
    # Create and configure the model with the best parameters
    merf = MERF(
        RandomForestRegressor(
            n_estimators=int(best_params["n_estimators"]),
            max_depth=int(best_params["max_depth"]),
            min_samples_leaf=int(best_params["min_samples_leaf"]),
            max_features=best_params["max_features"],
            random_state=42,  # Assuming you want to keep the random state fixed
            n_jobs=-1
            ),
        gll_early_stop_threshold=0.5,
        max_iterations=50
        )
    
    merf.fit(X_train, Z[train_index], clusters_train, y_train)
    
    
    y_pred = merf.predict(X_test, Z[test_index], clusters_test)

    # Convert continuous outputs to probabilities
    y_pred_proba = expit(y_pred)
    
    # Find optimal cutoff point based on F1 score
    thresholds = np.linspace(0, 1, 100)
    f1_scores = [f1_score(y_test, (y_pred_proba > t).astype(int)) for t in thresholds]
    optimal_threshold = thresholds[np.argmax(f1_scores)]
    optimal_cutoffs.append(optimal_threshold)
    
    # Thresholding to get class labels
    y_pred_class = (y_pred_proba > optimal_threshold).astype(int)

    # Compute AUC for probabilities
    auc = roc_auc_score(y_test, y_pred_proba)
    scores.append(auc)

# Average AUC over all folds
avg_auc = np.mean(scores)
print(scores)
print(avg_auc)

[I 2024-01-05 12:20:57,713] A new study created in memory with name: no-name-a0c6e071-e039-476a-9431-aef14730e9bf

suggest_uniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float instead.

INFO     [merf.py:307] Training GLL is -136.67008978234202 at iteration 1.
INFO     [merf.py:307] Training GLL is -162.80744685615008 at iteration 2.
INFO     [merf.py:321] Gll -162.80744685615008 less than threshold 0.19124416407008937, stopping early ...
INFO     [merf.py:307] Training GLL is -148.35718593464057 at iteration 1.
INFO     [merf.py:307] Training GLL is -181.99552323287156 at iteration 2.
INFO     [merf.py:321] Gll -181.99552323287156 less than threshold 0.22673884710276532, stopping early ...
INFO     [merf.py:307] Training GLL is -133.0758801517491 at iteration 1.
INFO     [merf.py:307] Training GLL is -159.95889636030262 at iteration 2.
INFO     [merf.py:321] Gll -159.95889636030262 le

In [7]:
# Extract feature importances
feature_importances = merf.fe_model.feature_importances_

features = df_mind.drop(['mind2', 'mind2_numeric', 'participant','segment'], axis=1).columns
# Combine names and importances into a DataFrame
importances_df = pd.DataFrame({
    'Feature': features,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

importances_df.to_csv(os.path.join(results_path, f'mvpa_merf_feat_imp.csv'))


fig = px.scatter(importances_df, x='Importance', y='Feature', orientation='h',
                title=f'Feat Imp MERF, AUC: {avg_auc:.3f}', template = "plotly_white",
                color_discrete_sequence = [pink],
                labels = {'value':'Feature importance', 'features': 'Markers'}

                )

fig.update_traces(marker=dict(size = 8))

fig.update_layout(
    width=650,
    height=900,
#     autosize = True, 
    template = 'plotly_white',
        font=dict(
        family="Times new roman",
        size=20,
        color="black"
    ),
    xaxis = dict(
            visible=True,
            # range = [0.37,0.63], 
            tickfont = {"size": 20},
            title = 'Feature Importance'
        ),
    yaxis = dict(
        categoryorder =  'total ascending',
        tickfont = {"size": 20},
        # autorange = False,    
        automargin = True,
        # range = [-1,len(feat_import_mw)],
        dtick = 1
        ),
    showlegend=True, 

)

fig.show()
# pio.write_json(fig, 'Figs/univariate_roc_mw_segment.plotly')
fig.write_image(os.path.join(fig_path, 'feat_importance_mind_optuna.png'))
fig.write_image(os.path.join(fig_path, 'feat_importance_mind_optuna.svg'))

## dMW Vs sMW
This will be only performed in SC as they have more trials

In [17]:
agg_dict = {k:['mean', 'std'] for k in markers }
agg_dict.update({k:'first' for k in df_markers.drop(markers, axis=1).columns})

df_mw = (
    df_markers
    .query("probe == 'PC'")
    .query("mind != 'on-task'")
    .groupby(['segment', 'participant'], as_index = False).agg(agg_dict)
)

############################################################
################ Use normal names################
############################################################
df_mw.columns = df_mw.columns.map("_".join)

df_mw  = (df_mw
            .rename(columns = {'participant_first':'participant', 'probe_first':'probe', 'mind_first':'mind', 'segment_first':'segment'})
            .drop([ 'probe',], axis = 1) 
           )


############################################################
################ Use latex command for nmaes################
############################################################

# df_mw = correct_name_markers(df_mw)

# df_mw.columns = df_mw.columns.map("$_{".join).map(lambda x: x + '}$').map(lambda x: x.replace('$$', ''))

# df_mw  = (df_mw
#             .rename(columns = {'participant$_{first}$':'participant', 'probe$_{first}$':'probe', 'mind$_{first}$':'mind', 'segment$_{first}$':'segment', 'mind$_{}$':'mind'})
# #             .query("mind != 'dMW'") #if you want to test against just one of the mw   
#             .drop(['participant', 'probe',  'segment'], axis = 1)

#            )


df_mw['mind_numeric'] = (df_mw['mind'] == 'sMW').astype(int)

df_mw.to_csv(os.path.join(results_path,'data_mw.csv'))

# Columns to remove outliers from
columns_to_check = df_mw.drop(['mind', 'mind_numeric', 'participant', 'segment'], axis = 1).columns

def filter_participants(group):
    counts = group['mind_numeric'].value_counts()
    # Check if there is only one level of 'mind2' for the participant
    if len(counts) == 1:
        return False
    return all(count >= 1 for count in counts)
df_mw = df_mw.groupby('participant').filter(filter_participants)
# Remove outliers
df_mw_filtered = replace_outliers_with_participant_mean(df_mw, columns_to_check, z_threshold=3)

# df_mw = df_mw_filtered


In [24]:
# Initialize lists to store AUC and permutation scores
scores = []
perm_auc_scores_all = []
optimal_cutoffs = []

n_splits = 4
group_kfold = GroupKFold(n_splits=n_splits)

# Assuming df_mw['participant'] contains the participant IDs
# Prepare data
X = df_mw.drop(['mind', 'mind_numeric', 'participant','segment'], axis=1)
Z = np.ones((X.shape[0], 1))  # Random effects design matrix
groups = df_mw['participant']
y = df_mw['mind_numeric']


# Global dictionary for storing trial data
trial_data = {}
def objective(trial):
    global trial_data
    # Hyperparameters to be optimized
    n_estimators = trial.suggest_int('n_estimators', 2, 100)
    max_depth = trial.suggest_int('max_depth', 2, 32, log=True)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    max_features = trial.suggest_uniform('max_features', 0.1, 1.0)

    scores = []
    
    feature_importances = None
    for train_index, test_index in group_kfold.split(X, y, groups.values):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        X_train = StandardScaler().fit_transform(X_train)
        X_test = StandardScaler().fit_transform(X_test)
        
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        clusters_train, clusters_test = groups.iloc[train_index], groups.iloc[test_index]
        
        # Initialize MERF with trial-suggested parameters
        merf = MERF(fixed_effects_model=RandomForestRegressor(
                    n_estimators=n_estimators,
                    max_depth=max_depth,
                    min_samples_leaf=min_samples_leaf,
                    max_features=max_features,
                    random_state=42,
                    n_jobs=-1
        ),
        gll_early_stop_threshold=1,
        max_iterations=50
        )
        
        merf.fit(X_train, Z[train_index], clusters_train, y_train)
        
        
        y_pred = merf.predict(X_test, Z[test_index], clusters_test)

        # Convert continuous outputs to probabilities
        y_pred_proba = expit(y_pred)
        
        # Find optimal cutoff point based on F1 score
        thresholds = np.linspace(0, 1, 100)
        f1_scores = [f1_score(y_test, (y_pred_proba > t).astype(int)) for t in thresholds]
        optimal_threshold = thresholds[np.argmax(f1_scores)]
        optimal_cutoffs.append(optimal_threshold)
        
        # Thresholding to get class labels
        y_pred_class = (y_pred_proba > optimal_threshold).astype(int)

        # Compute AUC for probabilities
        auc = roc_auc_score(y_test, y_pred_proba)
        scores.append(auc)
        
        # Get feature importances (only need to do this once per trial)
        if feature_importances is None:
            feature_importances = merf.fe_model.feature_importances_

    # Store trial data
    trial_data[trial.number] = {
        'fold_aucs': scores,
        'feature_importances': feature_importances
    }

    # Average AUC over all folds
    avg_auc = np.mean(scores)
    return avg_auc

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1000)


# Construct the file path for the study database
study_db_path = os.path.join(results_path, 'multivariate_merf_mw_study.db')
# Use the SQLite database at the specified path for storage
study = optuna.create_study(direction="maximize", study_name="mw_mvpa_study", storage=f'sqlite:///{study_db_path}', load_if_exists=True)

# Best hyperparameters.
# Save the best trial's parameters to a file
best_params_path = os.path.join(results_path, 'mvpa_merf_mw_best_params.txt')
with open(best_params_path, 'w') as file:
    for key, value in study.best_trial.params.items():
        file.write(f'{key}: {value}\n')

# print('Best trial:', study.best_trial.params)
study_df = study.trials_dataframe()
for trial_num in trial_data:
    study_df.loc[study_df.number == trial_num, 'fold_aucs'] = str(trial_data[trial_num]['fold_aucs'])
    study_df.loc[study_df.number == trial_num, 'feature_importances'] = str(trial_data[trial_num]['feature_importances'])
    
study_df.sort_values('value', ascending=False).to_csv(os.path.join(results_path, 'mvpa_merf_mw_opt_trials.csv'))


best_params = study.best_trial.params

# Initialize lists to store AUC and permutation scores
scores = []
perm_auc_scores_all = []
optimal_cutoffs = []

n_splits = 4
group_kfold = GroupKFold(n_splits=n_splits)


# Replace with your actual file path
best_params_path = os.path.join(results_path, 'mvpa_merf_mw_best_params.txt')

# Read the parameters from the file and store them in a dictionary
best_params = {}
with open(best_params_path, 'r') as file:
    for line in file:
        key, value = line.split(': ')
        best_params[key] = float(value.strip())

scores = []

for train_index, test_index in group_kfold.split(X, y, groups.values):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    X_train = StandardScaler().fit_transform(X_train)
    X_test = StandardScaler().fit_transform(X_test)
    
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clusters_train, clusters_test = groups.iloc[train_index], groups.iloc[test_index]
    
    # Create and configure the model with the best parameters
    merf = MERF(
        RandomForestRegressor(
            n_estimators=int(best_params["n_estimators"]),
            max_depth=int(best_params["max_depth"]),
            min_samples_leaf=int(best_params["min_samples_leaf"]),
            max_features=best_params["max_features"],
            random_state=42,  # Assuming you want to keep the random state fixed
            n_jobs=-1
            ),
        gll_early_stop_threshold=0.5,
        max_iterations=50
        )
    
    merf.fit(X_train, Z[train_index], clusters_train, y_train)
    
    
    y_pred = merf.predict(X_test, Z[test_index], clusters_test)

    # Convert continuous outputs to probabilities
    y_pred_proba = expit(y_pred)
    
    # Find optimal cutoff point based on F1 score
    thresholds = np.linspace(0, 1, 100)
    f1_scores = [f1_score(y_test, (y_pred_proba > t).astype(int)) for t in thresholds]
    optimal_threshold = thresholds[np.argmax(f1_scores)]
    optimal_cutoffs.append(optimal_threshold)
    
    # Thresholding to get class labels
    y_pred_class = (y_pred_proba > optimal_threshold).astype(int)

    # Compute AUC for probabilities
    auc = roc_auc_score(y_test, y_pred_proba)
    scores.append(auc)

# Average AUC over all folds
avg_auc = np.mean(scores)
print(scores)
print(avg_auc)

[I 2023-12-29 09:39:22,919] A new study created in memory with name: no-name-ed7c8306-e004-427d-bd50-4ba0e0ce10fc

suggest_uniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float instead.

INFO     [merf.py:307] Training GLL is -51.59652948530491 at iteration 1.
INFO     [merf.py:307] Training GLL is -70.1409782926503 at iteration 2.
INFO     [merf.py:321] Gll -70.1409782926503 less than threshold 0.3594127161716757, stopping early ...
INFO     [merf.py:307] Training GLL is -51.62425264005837 at iteration 1.
INFO     [merf.py:307] Training GLL is -69.92745170971098 at iteration 2.
INFO     [merf.py:321] Gll -69.92745170971098 less than threshold 0.3545465190027768, stopping early ...
INFO     [merf.py:307] Training GLL is -50.470475757017994 at iteration 1.
INFO     [merf.py:307] Training GLL is -68.60546696430218 at iteration 2.
INFO     [merf.py:321] Gll -68.60546696430218 less than thr

[0.75, 0.746031746031746, 0.75, 0.6825396825396826]
0.7321428571428572


In [25]:
# Extract feature importances
feature_importances = merf.fe_model.feature_importances_

features = df_mw.drop(['mind', 'mind_numeric', 'participant','segment'], axis=1).columns
# Combine names and importances into a DataFrame
importances_df = pd.DataFrame({
    'Feature': features,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

importances_df.to_csv(os.path.join(results_path, f'mvpa_mw_merf_feat_imp.csv'))


fig = px.scatter(importances_df, x='Importance', y='Feature', orientation='h',
                title=f'Feat Imp MERF, AUC: {avg_auc:.3f}', template = "plotly_white",
                color_discrete_sequence = [lblue],
                labels = {'value':'Feature importance', 'features': 'Markers'}

                )

fig.update_traces(marker=dict(size = 8))

fig.update_layout(
    width=650,
    height=900,
#     autosize = True, 
    template = 'plotly_white',
        font=dict(
        family="Times new roman",
        size=20,
        color="black"
    ),
    xaxis = dict(
            visible=True,
            # range = [0.37,0.63], 
            tickfont = {"size": 20},
            title = 'Feature Importance'
        ),
    yaxis = dict(
        categoryorder =  'total ascending',
        tickfont = {"size": 20},
        # autorange = False,    
        automargin = True,
        # range = [-1,len(feat_import_mw)],
        dtick = 1
        ),
    showlegend=True, 

)

fig.show()
# pio.write_json(fig, 'Figs/univariate_roc_mw_segment.plotly')
fig.write_image(os.path.join(fig_path, 'feat_importance_mw.png'))
fig.write_image(os.path.join(fig_path, 'feat_importance_mw.svg'))