In [1]:
import os
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
import plotly.io as pio
from plotly.subplots import make_subplots
import plotly.graph_objects as go


import statsmodels.api as sm

# Where the optimisation results live and where exported figures are written.
results_path = "../../Results/Multivariate_all_against_all/"
fig_path = f"{results_path}Figs/"

# %%
# Feature (marker) names, listed as consecutive <name>_mean / <name>_std pairs.
# The order matters: it is used to label the columns of the importance vectors.
markers = [
    'wSMI_1_mean', 'wSMI_1_std',
    'wSMI_2_mean', 'wSMI_2_std',
    'wSMI_4_mean', 'wSMI_4_std',
    'wSMI_8_mean', 'wSMI_8_std',
    'p_e_1_mean', 'p_e_1_std',
    'p_e_2_mean', 'p_e_2_std',
    'p_e_4_mean', 'p_e_4_std',
    'p_e_8_mean', 'p_e_8_std',
    'k_mean', 'k_std',
    'se_mean', 'se_std',
    'msf_mean', 'msf_std',
    'sef90_mean', 'sef90_std',
    'sef95_mean', 'sef95_std',
    'b_mean', 'b_std',
    'b_n_mean', 'b_n_std',
    'g_mean', 'g_std',
    'g_n_mean', 'g_n_std',
    't_mean', 't_std',
    't_n_mean', 't_n_std',
    'd_mean', 'd_std',
    'd_n_mean', 'd_n_std',
    'a_n_mean', 'a_n_std',
    'a_mean', 'a_std',
    'CNV_mean', 'CNV_std',
    'P1_mean', 'P1_std',
    'P3a_mean', 'P3a_std',
    'P3b_mean', 'P3b_std',
]

# Colour palette used by the plots.
grey = "#21201F"
green = "#9AC529"
lblue = "#42B9B2"
pink = "#DE237B"
orange = "#F38A31"
# colors[k] is the colour used for comparisons[k].
colors = [pink, green, orange, lblue]

# Contrasts to analyse; each one has its own "<contrast>_PC_K4_6_opt_trials.csv".
comparisons = [
    'on-task_vs_mw',
    'on-task_vs_dMW',
    'on-task_vs_sMW',
    'dMW_vs_sMW',
]
# For a quick single-contrast run, use instead: comparisons = ['on-task_vs_mw',]




In [2]:
# Contrast inspected in this cell.
contrast = 'on-task_vs_sMW'

# Read the optimisation trials of that contrast, most recent trial first.
trials_csv = os.path.join(results_path, f'{contrast}_PC_K4_6_opt_trials.csv')
df = pd.read_csv(trials_csv, index_col=0)
df = df.sort_values('datetime_start', ascending=False)

# Keep the 10 trials with the largest 'value' (the AUC, per the original notes).
top_models = df.nlargest(10, 'value')

def parse_importances(importances_str):
    """Convert a stringified vector of feature importances into floats.

    Accepts both whitespace-separated values (possibly spanning several
    lines), e.g. "[0.1 0.2\n 0.3]", and comma-separated values, e.g.
    "[0.1, 0.2, 0.3]". Square brackets are ignored.

    Parameters
    ----------
    importances_str : str
        Text representation of a flat sequence of numbers.

    Returns
    -------
    list of float
        The parsed values, in their original order (empty for "[]").

    Raises
    ------
    ValueError
        If a token cannot be converted with ``float``.
    """
    # Commas become whitespace so that a plain list repr parses the same way
    # as a whitespace-separated one.
    cleaned = importances_str.replace('[', '').replace(']', '').replace(',', ' ')
    return [float(val) for val in cleaned.split()]

# Parse the stored importance string of every top model (rows without one are
# skipped), giving one list of floats per model.
importance_strings = top_models['feature_importances'].dropna()
parsed_importances = importance_strings.apply(parse_importances).tolist()

In [7]:
def parse_importances(importances_str):
    """Convert a stringified vector of feature importances into a list of floats.

    Handles whitespace-separated values (e.g. "[0.1 0.2\n 0.3]") as well as
    comma-separated ones (e.g. "[0.1, 0.2, 0.3]"); square brackets are ignored.
    Raises ValueError if a token is not a valid float.
    """
    cleaned = importances_str.replace('[', '').replace(']', '').replace(',', ' ')
    return [float(val) for val in cleaned.split()]


# Column labels for the importance vectors: the same names, in the same order,
# as `markers` (previously a hand-copied duplicate of that list).
feature_names = list(markers)

# write_image fails if the target folder is missing, so create it up front.
os.makedirs(fig_path, exist_ok=True)

# Long-format importances of the top models, keyed by contrast name.
dfs = {}
for i, contrast in enumerate(comparisons):

    # Trials of this contrast, oldest first.
    df = pd.read_csv(
        os.path.join(results_path, f'{contrast}_PC_K4_6_opt_trials.csv'),
        index_col=0,
    ).sort_values('datetime_start', ascending=True)

    # Keep the 10 trials with the largest 'value' (the AUC, per the original notes).
    top_models = df.nlargest(10, 'value')

    # One list of floats per top model; rows without stored importances are skipped.
    parsed_importances = top_models['feature_importances'].dropna().apply(parse_importances).tolist()

    # Wide table (one row per model, one column per feature), then long format
    # (Feature, Importance) for the box plot.
    feat_importances = pd.DataFrame(parsed_importances, columns=feature_names)
    boxplot_data = feat_importances.melt(var_name='Feature', value_name='Importance')

    dfs[contrast] = boxplot_data

    # Ordering of the features on the y axis. NOTE: despite the variable name
    # (kept for backward compatibility), this is the MEAN importance per feature.
    medians = boxplot_data.groupby('Feature')['Importance'].mean().sort_values()
    feature_order = medians.index.tolist()

    # Horizontal box per feature with every model drawn as a small point on the box.
    fig = go.Figure()
    fig.add_trace(go.Box(
        y=boxplot_data['Feature'],
        x=boxplot_data['Importance'],
        orientation='h',
        # Modulo keeps this valid should `comparisons` outgrow `colors`.
        marker=dict(color=colors[i % len(colors)], size=3),
        boxpoints='all',  # Show all points
        pointpos=0,       # Points are overlaid on the box
        jitter=0,         # No jitter: points stay aligned with the box
    ))

    # Single layout update (the original applied two overlapping ones; the
    # resulting figure is the same).
    fig.update_layout(
        width=650,
        height=1100,
        template='plotly_white',
        font=dict(
            family="Times new roman",
            size=20,
            color="black"
        ),
        xaxis=dict(
            visible=True,
            range=[0, 0.3],
            tickfont={"size": 20},
            title='Feature Importance'
        ),
        yaxis=dict(
            title='Feature',
            tickfont={"size": 20},
            showgrid=True,
            automargin=True,
            range=[-1, len(feature_names)],
            dtick=1,
            categoryorder='array',
            categoryarray=feature_order,
        ),
        showlegend=True,
    )
    fig.show()

    # Export the figure in every format needed.
    filename = os.path.join(fig_path, f'{contrast}_top_10_feat_importances_PC_K4')
    for extension in ('.png', '.pdf', '.svg'):
        fig.write_image(filename + extension)

    # Per-feature min / max / mean / std across the top models, saved next to the figure.
    summary_stats = feat_importances.agg(['min', 'max', 'mean', 'std']).T
    summary_stats.to_csv(filename + '.csv')

In [9]:
# Re-draw the figure of a single contrast with a wider x axis (0 to 0.9).
contrast = 'on-task_vs_sMW'
# Position of the contrast in `comparisons`, so it keeps the colour it has in
# the all-contrasts loop (replaces the hard-coded index 2, same value).
i = comparisons.index(contrast)


def parse_importances(importances_str):
    """Convert a stringified vector of feature importances into a list of floats.

    Handles whitespace-separated values (e.g. "[0.1 0.2\n 0.3]") as well as
    comma-separated ones (e.g. "[0.1, 0.2, 0.3]"); square brackets are ignored.
    Raises ValueError if a token is not a valid float.
    """
    cleaned = importances_str.replace('[', '').replace(']', '').replace(',', ' ')
    return [float(val) for val in cleaned.split()]


# Trials of this contrast, oldest first.
df = pd.read_csv(
    os.path.join(results_path, f'{contrast}_PC_K4_6_opt_trials.csv'),
    index_col=0,
).sort_values('datetime_start', ascending=True)

# Keep the 10 trials with the largest 'value' (the AUC, per the original notes).
top_models = df.nlargest(10, 'value')

# One list of floats per top model; rows without stored importances are skipped.
parsed_importances = top_models['feature_importances'].dropna().apply(parse_importances).tolist()

# Column labels for the importance vectors: the same names, in the same order,
# as `markers` (previously a hand-copied duplicate of that list).
feature_names = list(markers)

# Wide table (one row per model, one column per feature), then long format
# (Feature, Importance) for the box plot.
feat_importances = pd.DataFrame(parsed_importances, columns=feature_names)
boxplot_data = feat_importances.melt(var_name='Feature', value_name='Importance')

# Ordering of the features on the y axis. NOTE: despite the variable name
# (kept for backward compatibility), this is the MEAN importance per feature.
medians = boxplot_data.groupby('Feature')['Importance'].mean().sort_values()
feature_order = medians.index.tolist()

# Horizontal box per feature with every model drawn as a small point on the box.
fig = go.Figure()
fig.add_trace(go.Box(
    y=boxplot_data['Feature'],
    x=boxplot_data['Importance'],
    orientation='h',
    marker=dict(color=colors[i % len(colors)], size=3),
    boxpoints='all',  # Show all points
    pointpos=0,       # Points are overlaid on the box
    jitter=0,         # No jitter: points stay aligned with the box
))

# Single layout update (the original applied two overlapping ones; the
# resulting figure is the same).
fig.update_layout(
    width=650,
    height=1100,
    template='plotly_white',
    font=dict(
        family="Times new roman",
        size=20,
        color="black"
    ),
    xaxis=dict(
        visible=True,
        range=[0, 0.9],
        tickfont={"size": 20},
        title='Feature Importance'
    ),
    yaxis=dict(
        title='Feature',
        tickfont={"size": 20},
        automargin=True,
        showgrid=True,
        range=[-1, len(feature_names)],
        dtick=1,
        categoryorder='array',
        categoryarray=feature_order,
    ),
    showlegend=True,
)
fig.show()

# write_image fails if the target folder is missing, so create it first.
os.makedirs(fig_path, exist_ok=True)

# Export the figure in every format needed. These paths are the same ones the
# all-contrasts loop writes for this contrast, so those files are overwritten.
filename = os.path.join(fig_path, f'{contrast}_top_10_feat_importances_PC_K4')
for extension in ('.png', '.pdf', '.svg'):
    fig.write_image(filename + extension)

# Per-feature min / max / mean / std across the top models, saved next to the figure.
summary_stats = feat_importances.agg(['min', 'max', 'mean', 'std']).T
summary_stats.to_csv(filename + '.csv')

In [41]:
# Mean importance per feature for the two contrasts being compared. Both tables
# are sorted by feature name (descending) so that their rows line up.
df1 = dfs['on-task_vs_sMW'].groupby('Feature').mean().sort_values('Feature', ascending = False).reset_index()
df2 = dfs['dMW_vs_sMW'].groupby('Feature').mean().sort_values('Feature', ascending = False).reset_index()

# The scatter pairs df1 and df2 row by row, so both must list the same features.
assert df1.Feature.equals(df2.Feature), "df1 and df2 do not describe the same features"

# Line of best fit for the scatter below. The scatter has df1 on the x axis and
# df2 on the y axis, so df2 is the dependent variable (endog) and df1 the
# regressor (exog). The original fit had the two swapped, which produced fitted
# values for the x variable instead of the y variable.
regression = sm.OLS(df2.Importance, sm.add_constant(df1.Importance)).fit().fittedvalues

fig = go.Figure()

# One labelled point per feature: x = importance in on-task_vs_sMW,
# y = importance in dMW_vs_sMW.
fig.add_trace(go.Scatter(
    x=df1.Importance,
    y=df2.Importance,
    text = df1.Feature,
    mode = 'markers+text',
    marker = {'color':pink, 'size': 5},
))

# Reference line from the origin to (max of df1, max of df2).
# NOTE(review): this is the identity line only if both maxima are equal -
# confirm whether y = x was intended.
fig.add_trace(px.line(x=[0,df1.Importance.max()], y=[0,df2.Importance.max()]).data[0])

# Optional: draw the fitted regression line.
# fig.add_trace(go.Scatter(name='line of best fit', x=df1.Importance, y=regression, mode='lines', line_color=pink))

fig.update_layout(
    width=400,
    height=400,
    template = "plotly_white",
    showlegend=False,
)

fig.show()
# pio.write_json(fig, 'Figs/average_fluctuations_auc_all.plotly')