In [60]:
# %%
import os
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
import plotly.io as pio
from plotly.subplots import make_subplots

from scipy.stats import wilcoxon, mannwhitneyu, trim_mean, rankdata
from statsmodels.stats.multitest import multipletests
from sklearn.metrics import roc_auc_score


import sys
sys.path.insert(0, '../../')


# %%
# plotting parameters
grey = "#21201F"
green = "#9AC529"
lblue = "#42B9B2"
pink = "#DE237B"
orange = "#F38A31"

colors = [pink,  green, orange, lblue]

comparisons = ['on-task_vs_mw','on-task_vs_dMW', 'on-task_vs_sMW', 'dMW_vs_sMW']

# %% [markdown]
# # Load Data
# Loads data from the computed markers. From `Data` directory
data_path = "../../Data/"
results_path = "../..//Results/Stats/"
fig_path = results_path + "Figs/"

df = pd.read_csv(os.path.join(data_path, 'all_markers.csv'), index_col = 0)

# %%
markers = ['wSMI_1', 'wSMI_2', 'wSMI_4', 'wSMI_8', 'p_e_1', 'p_e_2',
       'p_e_4', 'p_e_8', 'k', 'se','msf', 'sef90', 'sef95', 'b', 'b_n', 'g',
       'g_n', 't', 't_n', 'd', 'd_n', 'a_n', 'a', 'CNV', 'P1', 'P3a', 'P3b',]

erps =['CNV', 'P1', 'P3a', 'P3b']

df_markers = (df
              .dropna()
            #   .query("stimuli == 'go'") # only go trials
              .query("correct == 'correct'") #only correct trials
              .query('prev_trial < 6') # only last 5 trials before each probe. 
              .drop(['stimuli', 'correct', 'prev_trial', 'label', 'events',  'epoch_type', 'preproc', 'ft', 'ft_n'], axis = 1) # drop unnecessary columns
              .query("mind in ['on-task','dMW', 'sMW']") # only mind wandering and on-task trials
            #   .groupby(['segment', 'participant']).filter(lambda x: len(x) > 1) # drop participants with less than 2 trials per segment
             )


comparisons = ['on-task_vs_mw','on-task_vs_dMW', 'on-task_vs_sMW', 'dMW_vs_sMW']

def preprocess_data(df_markers, markers, probe_type, comparison=None, only_full_participants=False, average_participants = False,latex_names=False, results_path=None):
    # Filtering and grouping
    df = df_markers.query(f"probe == '{probe_type}'")

    # Adjust mind categories based on comparison type
    if comparison:
        if comparison == 'on-task_vs_mw':
            df['mind_category'] = df['mind'].replace({'dMW': 'mw', 'sMW': 'mw'})
        elif comparison in ['on-task_vs_dMW', 'on-task_vs_sMW', 'dMW_vs_sMW']:
            mind_types = comparison.split('_vs_')
            df = df[df['mind'].isin(mind_types)]
            df['mind_category'] = df['mind']
    else:
        df['mind_category'] = df['mind']

    # Aggregation dictionary
    agg_dict = {k: [apply_trim_mean,'std'] for k in markers}
    agg_dict.update({k: 'first' for k in df.drop(markers, axis=1).columns})
    df = df.groupby(['segment', 'participant'], as_index=False).agg(agg_dict)
    # df = df.groupby(['mind_category', 'participant'], as_index=False).agg(agg_dict)
    

    # Renaming columns
    df.columns = df.columns.map("_".join)
    rename_dict = {
        'participant_first': 'participant',
        'probe_first': 'probe',
        'segment_first': 'segment',
        'mind_first': 'mind',
        'mind_category_first': 'mind_category'
    }
        # Update rename_dict for mean columns
    for marker in markers:
        rename_dict[f"{marker}_apply_trim_mean"] = f"{marker}_mean"
        
    df = df.rename(columns=rename_dict)

    # Dropping unnecessary columns
    df = df.drop(['probe', 'segment'], axis=1)

    if latex_names:
        # Apply latex naming
        df = correct_name_markers(df)
        df.columns = df.columns.map("$_{".join).map(lambda x: x + '}$').map(lambda x: x.replace('$$', ''))

    # Convert mind category to numeric for analysis
    mind_categories = df['mind_category'].unique()
    mind_category_numeric = {cat: i for i, cat in enumerate(mind_categories)}
    df['mind_numeric'] = df['mind_category'].map(mind_category_numeric)

    # Remove outliers
    columns_to_check = df.drop(['mind_category', 'mind_numeric', 'participant'], axis=1).columns
    # df = replace_outliers_with_participant_mean(df, columns_to_check, z_threshold=3)

    if only_full_participants:
        # Filter participants
        df = df.dropna().groupby('participant').filter(lambda group: filter_participants(group, 'mind_numeric'))
        
    if average_participants:
        df = df.drop(['mind', 'mind_category'], axis = 1).groupby(['participant', 'mind_numeric']).mean().reset_index()

    # Save to CSV if a path is provided
    if results_path:
        df.to_csv(os.path.join(results_path, f'data_{comparison}.csv'))

    return df

# Helper functions
from scipy.stats import zscore
def replace_outliers_with_participant_mean(df, columns, participant_column='participant', z_threshold=3):
    df_copy = df.copy()

    # Identify numeric columns
    numeric_cols = df_copy.select_dtypes(include=[np.number]).columns

    for col in columns:
        if col in numeric_cols:
            for participant in df_copy[participant_column].unique():
                subset = df_copy[df_copy[participant_column] == participant]
                col_zscore = zscore(subset[col])
                mean_value = np.mean(subset[col][np.abs(col_zscore) < z_threshold])

                # Count the outliers for each participant
                outlier_count = np.sum(np.abs(col_zscore) >= z_threshold)
                total = len(col_zscore)

                # Replace outliers with the mean value for each participant
                subset_indices = subset.index[np.abs(col_zscore) >= z_threshold]
                df_copy.loc[subset_indices, col] = mean_value

                if outlier_count > 0:
                    print(f"Replaced {outlier_count} outliers in column '{col}' out of {total} observations for participant {participant} with the mean value.")
    return df_copy


def filter_participants(group, mind_col_numeric):
    counts = group[mind_col_numeric].value_counts()
    # Check if there is only one level of mind state for the participant
    if len(counts) == 1:
        return False
    return all(count >= 1 for count in counts)

# Define a function to apply the trimmed mean
def apply_trim_mean(group):
    return trim_mean(group, 0.1)

# Function to calculate Cliff's Delta
def cliffs_delta(lst1, lst2):
    all_data = np.concatenate([lst1, lst2])
    ranks = rankdata(all_data)
    rank1 = ranks[:len(lst1)]
    rank2 = ranks[len(lst1):]
    delta = 2 * np.mean(rank1 > rank2) - 1
    return delta

def cohens_d(group1, group2):
    # Calculate the size of each group
    n1, n2 = len(group1), len(group2)
    
    # Calculate the variance of each group
    var1, var2 = np.var(group1, ddof=1), np.var(group2, ddof=1)
    
    # Calculate the pooled standard deviation (using unbiased estimator)
    pooled_std = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))
    
    # Calculate Cohen's d
    d = (np.mean(group1) - np.mean(group2)) / pooled_std
    return d

def rosenthals_r(statistic, n_pairs):    
    # Calculate the mean and standard deviation of the rank sum distribution under the null hypothesis
    mean_rank_sum = n_pairs * (n_pairs + 1) / 4
    std_rank_sum = np.sqrt(n_pairs * (n_pairs + 1) * (2 * n_pairs + 1) / 24)
    
    # Calculate the Z-score
    z = (statistic - mean_rank_sum) / std_rank_sum
    
    # Calculate Rosenthal's r
    r = z / np.sqrt(n_pairs)
    return r


# On-task vs Off-task 

In [67]:
for idx, contrast in enumerate(comparisons):

    df_mind = preprocess_data(df_markers, markers, probe_type = 'PC', comparison= contrast, only_full_participants=True, average_participants=True,latex_names=False, results_path=None)
    print('Participants included in analysis:', df_mind.shape[0]/2)


    AUC = []
    pvalues = {}
    effect_sizes = {}
    for i in df_mind.drop(['participant', 'mind_numeric'], axis=1).columns:
        mind_0 = df_mind.query("mind_numeric == 0")[i]
        mind_1 = df_mind.query("mind_numeric == 1")[i]
        auc = roc_auc_score(df_mind['mind_numeric'], df_mind[i])
        AUC.append([i, auc])
        test_result = wilcoxon(x=mind_0, y=mind_1, alternative='two-sided', zero_method='zsplit')
        pvalues[i] = test_result.pvalue
        n1 = len(mind_0) 
        n2 = len(mind_1)
        rank_biserial = (2 * test_result.statistic) / (n1 * n2) - 1
        delta = cliffs_delta(mind_0, mind_1)
        d = cohens_d(mind_0, mind_1)
        r = rosenthals_r(test_result.statistic, n1)

        effect_sizes[i] = (rank_biserial, delta, d, r)

    wilcoxon_df = pd.DataFrame.from_dict(pvalues, orient='index', columns=['p_value']).reset_index().rename(columns={'index': 'markers'})
    wilcoxon_df['AUC'] = wilcoxon_df['markers'].map(dict(AUC))

    # Adding effect sizes to the DataFrame
    wilcoxon_df['Rank_Biserial'] = wilcoxon_df['markers'].apply(lambda x: effect_sizes[x][0])
    wilcoxon_df['Cliffs_Delta'] = wilcoxon_df['markers'].apply(lambda x: effect_sizes[x][1])
    wilcoxon_df['Cohens_d'] = wilcoxon_df['markers'].apply(lambda x: effect_sizes[x][2])
    wilcoxon_df['Rosenthals_r'] = wilcoxon_df['markers'].apply(lambda x: effect_sizes[x][3])
    


    wilcoxon_df = (wilcoxon_df
                .assign(
                        p_corrected = lambda df: multipletests(df.p_value, method='fdr_bh')[1],
                        significant = lambda df: np.select([(df.p_value < 0.05) & (df.p_corrected < 0.05), (df.p_value < 0.05) & (df.p_corrected > 0.05),  
                                                    (df.p_value > 0.05) & (df.p_corrected > 0.05)], ['p < 0.05 FDR corrected','p < 0.05 uncorrected', 'p > 0.05'])
                    )
            )


    wilcoxon_df.to_csv(os.path.join(results_path,f'wilcoxon_{contrast}.csv'))


    # segment_mind_roc = segment_mind_roc.sort_values(by = 'AUC', ascending = False).head(10).append(segment_mind_roc.sort_values(by = 'AUC', ascending = False).tail(10))

    fig = px.scatter(wilcoxon_df.sort_values(by = 'Cohens_d'),x = 'Cohens_d', y = 'markers', template = "plotly_white", symbol = 'significant', 
                    symbol_sequence = ['circle-open','circle','hexagram' ],
    #                  color = 'significant',
                    color_discrete_sequence = [colors[idx]], 
                    title= f'Wilcoxon size effect for {contrast} (n = {int(df_mind.shape[0]/2)})',
                    
                    category_orders = {'significant': ['p > 0.05','p < 0.05 uncorrected', 'p < 0.05 FDR corrected']},
                    labels = {
                        'AUC': '1>0                      0<1', 
                            'significant': 'Statistical Significance', 'markers':''}
                    )
    fig.add_vline(x=0, line_width=3, line_dash="dash", line_color="grey")
    fig.update_traces(marker=dict(size = 13))

    fig.update_layout(
        width=850,
        height=1300,
    #     autosize = True, 
        template = 'plotly_white',
            font=dict(
            family="Times new roman",
            size=20,
            color="black"
        ),
        xaxis = dict(
                visible=True,
                # range = [0.25,0.70], 
                tickfont = {"size": 20},
            ),
        yaxis = dict(
            tickfont = {"size": 20},
            autorange = False,    
            automargin = True,
            range = [-1,len(wilcoxon_df)],
            dtick = 1
            ),
        showlegend=True, 

    )

    fig.show()

    fig.write_image(os.path.join(fig_path,f'wilcoxon_{contrast}.png'))
    fig.write_image(os.path.join(fig_path,f'wilcoxon_{contrast}.pdf'))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Participants included in analysis: 17.0


Participants included in analysis: 14.0


Participants included in analysis: 13.0


Participants included in analysis: 12.0
