# Set env:

In [None]:
import os
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import re
import scipy
from scipy.stats import norm, linregress, pearsonr
from scipy.optimize import Bounds, LinearConstraint, minimize, SR1
import seaborn as sns
import sys
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
from matplotlib.colors import LinearSegmentedColormap
from IPython.display import display, HTML
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, roc_curve, auc
from tqdm import tqdm 
pd.set_option('display.width', 1000)


# Fixed grid of candidate parameters shared by the analysis below.
# Candidate PSS values: 100, 150, ..., 400 (the stop value 401 is exclusive).
pred_pss_values = np.arange(100, 401, 50)
# Candidate accuracies: 0.0, 0.1, ..., 1.0, rounded to one decimal so they
# compare equal to literals such as 0.3.
pred_accuracies = np.round(np.arange(0, 1.1, 0.1), 1)

# Define functions:

REFERENCE SAMPLE: Function to compute response counts

In [None]:
def compute_response_counts_ref(participants_df, participant_id, i):
    """Tabulate response codes per staircase for one simulated participant.

    Rows are counted per (Staircase_name, Response_code) pair and spread into
    one column per response code; combinations that never occur count as 0.
    Codes -1, 0 and 1 are relabelled Before_Response, Same_Response and
    After_Response. Every row of the result carries the identifier
    "<participant_id>_<i>" in a 'participant_id' column.
    """
    code_labels = {
        -1: "Before_Response",
        0: "Same_Response",
        1: "After_Response",
    }
    tally = (
        participants_df
        .groupby(['Staircase_name', 'Response_code'])
        .size()
        .unstack(fill_value=0)
        .rename(columns=code_labels)
        .reset_index()  # Staircase_name becomes an ordinary column
    )
    tally['participant_id'] = f"{participant_id}_{i}"
    return tally

REFERENCE SAMPLE: Function to create the reference (simulation) matrix. Optionally plots the simulated stimulus-delay trajectories for each point of subjective simultaneity (PSS) value and accuracy level.

In [None]:
def reference_matrix(simulation_df, pred_pss_values, pred_accuracies, drop=True, plots=True, n_iterations=100):
    """Build the reference matrix of mean delays from simulated staircases.

    Parameters
    ----------
    simulation_df : DataFrame
        Needs the columns 'Staircase_name', 'Trial', 'PSS_value', 'Accuracy'
        and 'Current Delay'. A 'Recode_Trial' column is written to this frame
        in place (side effect kept for existing callers).
    pred_pss_values, pred_accuracies : sequence
        PSS values (one subplot each) and accuracy levels (one colour each)
        to draw. Only used when ``plots`` is true.
    drop : bool
        When true, recoded trials 1-5 and 16-20 (the first five trials of
        each staircase) are excluded from the matrix.
    plots : bool
        When true, draw and save the trajectory figure.
    n_iterations : int
        Sample size used for the plotted 95% confidence band,
        1.96 * std / sqrt(n_iterations).

    Returns
    -------
    DataFrame
        Mean 'Current Delay' indexed by (PSS_value, Recode_Trial) with one
        column per Accuracy. PSS values 50, 450, 500, 550 and 600 are
        removed and missing cells are filled with 0.
    """
    # Staircases whose name contains '400' keep their trial number; every
    # other staircase is shifted by 15 so trials run continuously (1-30 when
    # each staircase has 15 trials).
    simulation_df['Recode_Trial'] = np.where(
        simulation_df['Staircase_name'].str.contains('400'),
        simulation_df['Trial'],
        simulation_df['Trial'] + 15,
    )

    if plots:
        _plot_reference_trajectories(simulation_df, pred_pss_values, pred_accuracies, n_iterations)

    if drop:
        # Keep recoded trials 6-15 and 21-30 only. To keep just the last five
        # trials of each staircase, extend the list with 6-10 and 21-25.
        dropped_trials = [1, 2, 3, 4, 5, 16, 17, 18, 19, 20]
        simulation_df = simulation_df[~simulation_df['Recode_Trial'].isin(dropped_trials)]

    pivot_df = simulation_df.pivot_table(
        index=['PSS_value', 'Recode_Trial'],
        columns='Accuracy',
        values='Current Delay',
        aggfunc='mean',
    )

    # Restrict the matrix to PSS = 100-400.
    excluded_pss = [50, 450, 500, 550, 600]
    pivot_df = pivot_df.loc[~pivot_df.index.get_level_values('PSS_value').isin(excluded_pss)]

    return pivot_df.fillna(0)


def _plot_reference_trajectories(simulation_df, pred_pss_values, pred_accuracies, n_iterations):
    """Plot mean delay with a 95% band per trial, one subplot per PSS value, and save it."""
    # Defined before the loop so it exists even when no PSS value is plotted.
    save_path = '/PATH/Reference_trajectories_100_400.png'
    colors = plt.cm.viridis(np.linspace(0, 1, len(pred_accuracies)))

    # 4 x 2 grid as before; extra rows are added only if more than eight PSS
    # values are requested (ceiling division without importing math).
    n_cols = 2
    n_rows = max(4, -(-len(pred_pss_values) // n_cols))
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(12, 3 * n_rows), dpi=300)
    axs = axs.ravel()

    for ax, pss_value in zip(axs, pred_pss_values):
        df_pss = simulation_df[simulation_df['PSS_value'] == pss_value]
        ax.set_title(f'PSS Value: {pss_value} ms', fontsize=14)

        for color, accuracy in zip(colors, pred_accuracies):
            df_accuracy = df_pss[df_pss['Accuracy'] == accuracy]
            delays = df_accuracy.groupby(['Staircase_name', 'Trial'])['Current Delay']
            mean_stim_values = delays.mean()
            std_stim_values = delays.std()

            # One curve per staircase.
            for staircase_name, group in mean_stim_values.groupby(level=0):
                trials = group.index.get_level_values(1)
                ci_95 = 1.96 * std_stim_values.loc[staircase_name].values / np.sqrt(n_iterations)

                ax.plot(trials, group.values,
                        label=f'{staircase_name} - Accuracy {accuracy*100:.0f}%',
                        color=color,
                        marker='o')
                ax.fill_between(trials,
                                group.values - ci_95,
                                group.values + ci_95,
                                color=color,
                                alpha=0.2)

        ax.set_xlabel('Trial')
        ax.set_ylabel('Current Delay (ms)')
        ax.set_ylim(0, 800)
        # Thin red band marking the simulated PSS value.
        ax.axhspan((pss_value - 1), (pss_value + 1), color='red', alpha=1.0, linestyle='--', zorder=10)

    # Layout is computed once, after all subplots are populated.
    plt.tight_layout()
    plt.savefig(save_path)
    plt.show()
    print(f"Saved to {save_path}")


REAL SAMPLE: Function to process and plot data for each PSS value, grouped by Staircase_name and Accuracy


In [None]:
def plot_simulations_with_participant_data(simulation_df, participant_df, participant_id, pss_value_list, accuracy_list, n_iterations, trials_per_iteration=15):
    """Plot simulated staircase trajectories with one participant's data overlaid.

    One subplot is drawn per PSS value. Within a subplot, every accuracy level
    gets one colour and every simulated staircase one curve (mean
    'Current Delay' per trial with a 95% band). The participant's mean
    trajectory per 'Modified_Staircase_name' is drawn in red on every subplot.
    The figure is saved under /PATH/Real_participants_trajectory_plots/ and
    then shown.

    Parameters
    ----------
    simulation_df : DataFrame
        Needs 'PSS_value', 'Accuracy', 'Staircase_name', 'Trial' and
        'Current Delay'.
    participant_df : DataFrame
        Needs 'Modified_Staircase_name', 'Trial' and 'Current Delay'.
    participant_id : str or int
        Used in the output file name.
    pss_value_list, accuracy_list : sequence
        PSS values (subplots) and accuracy levels (colours) to draw.
    n_iterations : int
        Number of simulated runs behind each simulated mean; used for the
        simulated band, 1.96 * std / sqrt(n_iterations). This argument was
        previously overwritten with 100 inside the function.
    trials_per_iteration : int
        Unused; kept so existing calls remain valid.
    """
    colors = plt.cm.viridis(np.linspace(0, 1, len(accuracy_list)))

    # 3 x 3 grid as before; rows are added only when more than nine PSS values
    # are requested (ceiling division without importing math).
    n_cols = 3
    n_rows = max(3, -(-len(pss_value_list) // n_cols))
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(18, 4 * n_rows))
    axs = axs.ravel()

    # The participant overlay does not depend on the PSS value, so it is
    # aggregated once rather than once per subplot.
    participant_stats = (
        participant_df
        .groupby(['Modified_Staircase_name', 'Trial'])['Current Delay']
        .agg(['mean', 'std', 'count'])
    )
    participant_staircases = participant_df['Modified_Staircase_name'].unique()

    for ax, pss_value in zip(axs, pss_value_list):
        df_pss = simulation_df[simulation_df['PSS_value'] == pss_value]
        ax.set_title(f'PSS Value: {pss_value} ms', fontsize=14)

        # Simulated trajectories: one colour per accuracy level.
        for color, accuracy in zip(colors, accuracy_list):
            df_accuracy = df_pss[df_pss['Accuracy'] == accuracy]
            delays = df_accuracy.groupby(['Staircase_name', 'Trial'])['Current Delay']
            mean_stim_values = delays.mean()
            std_stim_values = delays.std()

            for staircase_name, group in mean_stim_values.groupby(level=0):
                trials = group.index.get_level_values(1)
                ci_95 = 1.96 * std_stim_values.loc[staircase_name].values / np.sqrt(n_iterations)

                ax.plot(trials, group.values,
                        label=f'{staircase_name} - Accuracy {accuracy*100:.0f}%',
                        color=color,
                        marker='o')
                ax.fill_between(trials,
                                group.values - ci_95,
                                group.values + ci_95,
                                color=color,
                                alpha=0.2)

        # Participant overlay (mean of repeated staircases).
        for staircase_name in participant_staircases:
            per_trial = participant_stats.loc[staircase_name]
            if len(per_trial) > 1:  # need more than one trial to draw a line
                trial_numbers = per_trial.index
                mean_delays = per_trial['mean'].values

                # Standard error uses the number of observations that actually
                # contributed to each mean, not the simulation count.
                ci_participant_95 = 1.96 * (per_trial['std'] / np.sqrt(per_trial['count'])).values

                ax.plot(trial_numbers, mean_delays,
                        label=f'Participant ({staircase_name})',
                        color='red',
                        linewidth=2,
                        marker='x')
                ax.fill_between(trial_numbers,
                                mean_delays - ci_participant_95,
                                mean_delays + ci_participant_95,
                                color='red', alpha=0.2)

        ax.set_xlabel('Trial')
        ax.set_ylabel('Current Delay (ms)')
        ax.set_ylim(0, 800)
        #ax.legend(loc='upper right')

    # f-string so a non-string participant_id does not raise on concatenation.
    file_name = f'/PATH/Real_participants_trajectory_plots/Trajectory_sub-{participant_id}.png'

    plt.tight_layout()
    plt.savefig(file_name)
    plt.show()


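REAL SAMPLE: Function to plot one participant's staircase trajectories on a single figure (each individual staircase, plus the mean across repeats of the same staircase), without simulated data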


In [None]:
def plot_participant_trajectory(participant_df, participant_id, trials_per_iteration=15):
    """Plot one participant's staircase trajectories on a single axis.

    Blue curves show each individual staircase ('Staircase_name'); red curves
    show the per-trial mean for each 'Modified_Staircase_name', with a 95%
    band around the mean. The figure is saved under
    /PATH/Real_participants_trajectory_plots/ and then shown.

    Parameters
    ----------
    participant_df : DataFrame
        Needs 'Staircase_name', 'Modified_Staircase_name', 'Trial' and
        'Current Delay'.
    participant_id : str or int
        Used in the output file name.
    trials_per_iteration : int
        Unused; kept so existing calls remain valid.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    # Individual staircases.
    for staircase_name in participant_df['Staircase_name'].unique():
        staircase_data = participant_df[participant_df['Staircase_name'] == staircase_name]
        _plot_mean_trajectories(ax, staircase_data, 'blue')

    # Mean across repeats. This loop used to be nested inside the loop above,
    # which redrew the same red curves once per individual staircase and
    # stacked their translucent bands.
    for modified_name in participant_df['Modified_Staircase_name'].unique():
        staircase_data = participant_df[participant_df['Modified_Staircase_name'] == modified_name]
        _plot_mean_trajectories(ax, staircase_data, 'red')

    ax.set_xlabel('Trial')
    ax.set_ylabel('Current Delay (ms)')
    ax.set_ylim(0, 800)
    #ax.legend(loc='upper right')

    file_name = f'/PATH/Real_participants_trajectory_plots/Trajectory_nosim_sub-{participant_id}.png'
    plt.tight_layout()
    plt.savefig(file_name)
    plt.show()


def _plot_mean_trajectories(ax, staircase_data, color):
    """Draw mean delay per trial (with 95% band) for each modified staircase in the subset."""
    stats = (
        staircase_data
        .groupby(['Modified_Staircase_name', 'Trial'])['Current Delay']
        .agg(['mean', 'std', 'count'])
    )
    for name in staircase_data['Modified_Staircase_name'].unique():
        if name not in stats.index:
            continue
        per_trial = stats.loc[name]
        trial_numbers = per_trial.index
        mean_delays = per_trial['mean'].values

        # Standard error from the real number of observations per trial. With
        # a single observation the std is NaN and no band is drawn.
        half_width = 1.96 * (per_trial['std'] / np.sqrt(per_trial['count'])).values

        ax.plot(trial_numbers, mean_delays,
                label=f'Participant ({name})',
                linewidth=2,
                color=color,
                marker='x')
        ax.fill_between(trial_numbers,
                        mean_delays - half_width,
                        mean_delays + half_width,
                        color=color,
                        alpha=0.2)

REAL SAMPLE: Function to generate 'participant_matrix'. For each participant returns pred_pss_value, pred_accuracy, rmse, rmse_table, and optional heatmap.

In [None]:
def participant_matrix(df, participant_id, pivot_df, drop=True, plots=False, verbose=False):
    """Find the reference (PSS, accuracy) cell that best matches one participant.

    The participant's mean 'Current Delay' per recoded trial is compared with
    every (PSS_value, Accuracy) trajectory in ``pivot_df`` using the root mean
    squared error (RMSE); the cell with the lowest RMSE is the prediction.

    Parameters
    ----------
    df : DataFrame
        One participant's trials with 'Staircase_name', 'Trial' and
        'Current Delay'. A 'Recode_Trial' column is written to it in place.
        The recoded means must cover exactly 30 trials.
    participant_id : str or int
        Used in the file names of the optional plots.
    pivot_df : DataFrame
        Reference matrix indexed by (PSS_value, Recode_Trial) with accuracy
        levels 0.0, 0.1, ..., 1.0 as columns.
    drop : bool
        When true, recoded trials 1-5 and 16-20 are removed from the
        participant's means (``pivot_df`` is expected to omit them as well).
    plots : bool
        When true, draw an RMSE heatmap, an RMSE density plot and an
        accuracy-vs-RMSE scatter plot; the heatmap and scatter are saved.
    verbose : bool
        When true, print/display intermediate tables.

    Returns
    -------
    tuple
        (pred_pss_value, pred_accuracy, rmse, rmse_table), or four ``None``
        values when ``df`` is empty.
    """
    participant_data = df

    if participant_data.empty:
        # Same arity as the normal return so callers can always unpack four
        # values (this path used to return five).
        return None, None, None, None

    # Recode trial numbers: staircases containing '400' keep 1-15, all others
    # are shifted to 16-30.
    participant_data.loc[:, 'Recode_Trial'] = np.where(
        participant_data['Staircase_name'].str.contains('400'),
        participant_data['Trial'],
        participant_data['Trial'] + 15,
    )

    # Mean delay per recoded trial, averaged over repeated staircases.
    mean_delays_400 = (
        participant_data[participant_data['Staircase_name'].str.contains('400')]
        .groupby('Recode_Trial')['Current Delay'].mean()
    )
    mean_delays_100 = (
        participant_data[participant_data['Staircase_name'].str.contains('100')]
        .groupby('Recode_Trial')['Current Delay'].mean()
    )
    mean_delays = pd.concat([mean_delays_400, mean_delays_100])

    # Raises ValueError unless exactly 30 trial means are present.
    mean_delays.index = np.arange(1, 31)

    if drop:
        # Keep recoded trials 6-15 and 21-30. To keep only the last five
        # trials of each staircase, also drop 6-10 and 21-25.
        mean_delays = mean_delays.drop([1, 2, 3, 4, 5, 16, 17, 18, 19, 20])

    if verbose:
        print('Mean Delays:')
        print(mean_delays)

    # RMSE for every (PSS value, accuracy level) cell; cells without data stay NaN.
    rmse_table = pd.DataFrame(index=pivot_df.index.get_level_values('PSS_value').unique(),
                              columns=[round(i * 0.1, 1) for i in range(11)],
                              dtype=float)

    for pss in rmse_table.index:
        # Slice the reference once per PSS value instead of twice per accuracy.
        reference = pivot_df.xs(pss, level='PSS_value')
        observed = mean_delays.loc[reference.index].values

        for accuracy in rmse_table.columns:
            expected = reference.loc[:, accuracy].values
            if observed.size > 0 and expected.size > 0:
                rmse_table.loc[pss, accuracy] = np.sqrt(np.mean((observed - expected) ** 2))

    if verbose:
        display(HTML(rmse_table.to_html()))

    # Five best-matching cells; unstacking puts the accuracy level first.
    lowest_rmse = rmse_table.unstack().sort_values().head(5)
    lowest_rmse_table = pd.DataFrame(lowest_rmse).reset_index()
    lowest_rmse_table.columns = ['Accuracy_Level', 'PSS_Value', 'RMSE']

    if verbose:
        display(HTML(lowest_rmse_table.to_html()))
        print(f"{lowest_rmse_table[0:1]['PSS_Value']} \n {lowest_rmse_table[0:1]['Accuracy_Level']} \n {lowest_rmse_table[0:1]['RMSE']}")

    # The prediction is the single best cell.
    pred_pss_value = lowest_rmse_table['PSS_Value'].values[0]
    pred_accuracy = lowest_rmse_table['Accuracy_Level'].values[0]
    rmse = lowest_rmse_table['RMSE'].values[0]

    if plots:
        # Heatmap of the full RMSE table.
        plt.figure(figsize=(12, 8))
        sns.heatmap(rmse_table, annot=True, cmap="coolwarm", cbar_kws={'label': 'RMSE'}, fmt='.1f')
        plt.title("Heatmap of RMSE for Different PSS Values and Accuracy Levels")
        plt.xlabel("Accuracy Levels")
        plt.ylabel("PSS Values")
        file_name = f'/PATH/RMSE_heatmaps/PSS_stimulus_progression_sub-{participant_id}.png'
        plt.savefig(file_name)
        #plt.show()

        # Density of all RMSE values, with the five lowest marked.
        # NOTE(review): this figure is neither saved nor explicitly shown.
        rmse_values = rmse_table.values.flatten().round()
        plt.figure(figsize=(12, 8))
        # `fill` replaces the `shade` keyword, which newer seaborn no longer accepts.
        sns.kdeplot(rmse_values, fill=True, color='skyblue', bw_adjust=0.5)

        # The previous code assigned these values to rmse_table['RMSE'], which
        # added an all-NaN column to the returned table and clashed with the
        # value_name used by melt() below.
        rmse_line_values = lowest_rmse_table['RMSE'].tolist()

        for value in rmse_line_values:
            plt.axvline(x=value, color='red', linestyle='--', linewidth=1)
            plt.text(value + 0.2, 0.002, f'RMSE = {value:.0f}', color='red', rotation=45)

        plt.title("Density Plot of RMSE Values with Annotated Vertical Lines")
        plt.xlabel("RMSE Value")
        plt.ylabel("Density")
        #plt.show()

        # Scatter of accuracy level against RMSE, coloured by PSS value.
        rmse_table_reset = rmse_table.reset_index()
        rmse_table_melted = rmse_table_reset.melt(id_vars=['PSS_value'], var_name='Accuracy_Level', value_name='RMSE')

        plt.figure(figsize=(10, 6))
        sns.scatterplot(data=rmse_table_melted, x='Accuracy_Level', y='RMSE', hue='PSS_value', palette='viridis', s=100, marker='o')

        plt.title("Accuracy vs RMSE (Colored by PSS Value)", fontsize=16)
        plt.xlabel("Accuracy Level", fontsize=14)
        plt.ylabel("RMSE", fontsize=14)

        plt.legend(title='PSS Value', bbox_to_anchor=(1.05, 1), loc='upper left')
        file_name = f'/PATH/RMSE_heatmaps/PSS_stimulus_progression_sub-{participant_id}_2.png'
        plt.savefig(file_name)
        #plt.show()

    return pred_pss_value, pred_accuracy, rmse, rmse_table




REAL SAMPLE: Function to compute response counts

In [None]:
def compute_response_counts(participants_df, participant_id):
    """Count how often each response code occurs within every staircase.

    The result has one row per 'Staircase_name' and one column per response
    code present in the data (-1, 0 and 1 are renamed to Before_Response,
    Same_Response and After_Response). Missing combinations are reported as
    0, and 'participant_id' is attached as a constant column.
    """
    per_staircase = participants_df.groupby(['Staircase_name', 'Response_code']).size()
    counts_wide = per_staircase.unstack(fill_value=0)

    counts_wide = counts_wide.rename(
        columns={-1: "Before_Response", 0: "Same_Response", 1: "After_Response"}
    )

    # Turn the Staircase_name index into a regular column.
    counts_wide = counts_wide.reset_index()
    counts_wide['participant_id'] = participant_id
    return counts_wide


REAL SAMPLE: Function to get p-value of rmse for allocated PSS, relative to the null, for each participant

In [None]:
def compute_p_values(participant_data, df_vector_across_slices, plotting=False):
    """Attach a one-tailed p-value to every participant row.

    For each row, the null values stored at
    ``df_vector_across_slices.loc[Pred_PSS_Value, Pred_Accuracy]`` are
    summarised by their mean and sample standard deviation, and the p-value
    is the normal CDF of the participant's 'RMSE' under that distribution,
    i.e. Pr(X < RMSE).

    Special cases:
    * cell not found (KeyError): a warning is printed and the p-value is NaN;
    * fewer than two non-NaN null values: a message is printed and the
      p-value is set to 1/10000.

    The p-values are written to a new 'p_value' column of
    ``participant_data`` (modified in place), which is also returned.
    """
    collected = []

    for _, row in participant_data.iterrows():
        participant_rmse = row['RMSE']
        participant_pss = row['Pred_PSS_Value']
        participant_accuracy = row['Pred_Accuracy']

        # Look up the null distribution for this (PSS, accuracy) cell.
        try:
            raw_null = df_vector_across_slices.loc[participant_pss, participant_accuracy]
            null_sample = [value for value in raw_null if not np.isnan(value)]
        except KeyError:
            print(f"Warning: PSS={participant_pss}, Accuracy={participant_accuracy} not found in df_vector_across_slices.")
            collected.append(np.nan)
            continue

        if len(null_sample) < 2:
            # Too few values to estimate a spread; use a fixed small p-value.
            print(f"Extreme case: PSS={participant_pss}, Accuracy={participant_accuracy} Participant RMSE of {participant_rmse:.2f} is out of range of the null distribution.")
            collected.append(1/10000)
            continue

        null_mean = np.mean(null_sample)
        null_std = np.std(null_sample, ddof=1)  # sample standard deviation
        p_value = norm.cdf(participant_rmse, loc=null_mean, scale=null_std)
        collected.append(p_value)

        if plotting:
            # NOTE(review): the figure is built and closed without being
            # saved or shown - confirm whether a savefig call is missing.
            plt.figure(figsize=(8, 6))
            sns.histplot(null_sample, bins=50, kde=False, color='#F27B5A', label='Null Distribution', alpha=0.1)
            plt.axvline(participant_rmse, color='red', linestyle='--', linewidth=2, label=f'Participant RMSE ({participant_rmse:.2f})')
            plt.title(f'Participant RMSE vs Null Distribution\nPSS={participant_pss}, Accuracy={participant_accuracy}, p-value={p_value:.3f}')
            plt.xlabel('RMSE')
            plt.ylabel('Frequency')
            plt.legend()
            plt.close()

    participant_data['p_value'] = collected
    return participant_data

REAL SAMPLE: Function to list the participants who pressed any response button ≤5 or ≥15 times for a given staircase type

In [None]:
def count_participants_with_values_outside_range_for_staircase_type(pivoted_df, staircase_type='100'):
    """
    List the participants with an extreme response count for one staircase type.

    A participant is flagged when any of the columns
    'Before_Response_<type>', 'Same_Response_<type>' or
    'After_Response_<type>' holds a count of 5 or less, or of 15 or more
    (both bounds are inclusive).

    Parameters:
    pivoted_df : DataFrame
        The pivoted DataFrame with response counts; its index supplies the
        identifiers that are returned.
    staircase_type : str
        The staircase type suffix ('100' or '400') selecting the columns.

    Returns:
    list
        Index labels of the flagged rows, in DataFrame order.
    """
    selected = [
        f"{prefix}_{staircase_type}"
        for prefix in ('Before_Response', 'Same_Response', 'After_Response')
    ]
    counts = pivoted_df[selected]

    # Element-wise test, then collapse to one flag per row.
    too_extreme = counts.le(5) | counts.ge(15)
    flagged_rows = too_extreme.any(axis=1)

    return pivoted_df[flagged_rows].index.tolist()


REAL SAMPLE: Function to list the unique participants who pressed any response button ≤5 or ≥15 times in either staircase type

In [None]:
def count_overall_outside_range(pivoted_df):
    """
    Collect the participants flagged for staircase type '100', '400', or both.

    The per-type check is delegated to
    count_participants_with_values_outside_range_for_staircase_type.

    Parameters:
    pivoted_df : DataFrame
        The pivoted DataFrame with response counts.

    Returns:
    unique_participants_outside_range : list
        Each flagged participant ID exactly once; the order is that of a
        set and therefore not guaranteed.
    """
    flagged_100 = count_participants_with_values_outside_range_for_staircase_type(
        pivoted_df, staircase_type='100'
    )
    flagged_400 = count_participants_with_values_outside_range_for_staircase_type(
        pivoted_df, staircase_type='400'
    )

    # The set union removes participants flagged under both types.
    flagged_either = set(flagged_100) | set(flagged_400)
    return list(flagged_either)

REAL SAMPLE: Filter to get unique participants who have Before_Response, Same_Response, or After_Response counts == either 0 or 1


In [None]:
def count_participants_with_0_or_1(response_counts_df):
    """Return how many distinct participants have a response count of 0 or 1.

    A row qualifies when any of 'Before_Response', 'Same_Response' or
    'After_Response' equals 0 or 1; the distinct 'participant_id' values of
    the qualifying rows are then counted.
    """
    counts = response_counts_df[["Before_Response", "Same_Response", "After_Response"]]
    row_has_rare_response = counts.isin([0, 1]).any(axis=1)

    flagged = response_counts_df[row_has_rare_response]
    return flagged['participant_id'].nunique()

Define colour palettes:

In [None]:
# Each palette is a two-colour gradient; a random 10x10 heatmap is drawn as a
# quick visual preview of the resulting colormap.

# Reference:
# Stored under its own names. It was previously assigned to
# null_colors/null_cmap and immediately overwritten by the Null palette below.
reference_colors = ["#B5D5E0", "#4A738F"]
reference_cmap = LinearSegmentedColormap.from_list("custom_heatmap", reference_colors, N=256)
data = np.random.rand(10, 10)
sns.heatmap(data, cmap=reference_cmap)
plt.show()

# Null:
null_colors = ["#B1DFEE", "#5085E1"]
null_cmap = LinearSegmentedColormap.from_list("custom_heatmap", null_colors, N=256)
data = np.random.rand(10, 10)
sns.heatmap(data, cmap=null_cmap)
plt.show()


# Simulated Participants:
sim_participant_colors = ["#EFD0B6", "#e86a36"]
sim_participant_cmap = LinearSegmentedColormap.from_list("custom_heatmap", sim_participant_colors, N=256)
data = np.random.rand(10, 10)
sns.heatmap(data, cmap=sim_participant_cmap)
plt.show()


# Real Participants:
real_participant_colors = ["#EEB0C4", "#e54e40"]
real_participant_cmap = LinearSegmentedColormap.from_list("custom_heatmap", real_participant_colors, N=256)
data = np.random.rand(10, 10)
sns.heatmap(data, cmap=real_participant_cmap)
plt.show()


# Demographics of real participants

In [None]:
# Load the REDCap export; the first CSV row supplies the column names.
file_path = '/PATH/Redcap_data.csv'
df = pd.read_csv(file_path)
#print(df)

# Age summary (mean ± std)
age_mean = df['demo_age'].mean()
age_std = df['demo_age'].std()

print(f"Mean Age: {age_mean:.2f} ± {age_std:.2f}")

# Gender and sex distributions: counts and percentages per category, with
# missing values reported as their own "Missing" category.
distributions = {}
for heading, column in (("\nGender distribution:", 'demo_gender'),
                        ("\nSex distribution:", 'demo_sex')):
    counts = df[column].value_counts(dropna=False)
    percent = df[column].value_counts(normalize=True, dropna=False) * 100

    print(heading)
    for val in counts.index:
        label = f"{val}" if pd.notna(val) else "Missing"
        print(f"  {label}: {counts[val]} ({percent[val]:.2f}%)")

    distributions[column] = (counts, percent)

# Keep the per-variable names available to later cells.
gender_counts, gender_percent = distributions['demo_gender']
sex_counts, sex_percent = distributions['demo_sex']

# Run analysis:

[If all files have been generated click here](#skip-here)

REAL SAMPLE: 

Generate the participant trajectories
Optional plotting
Optional dropping of the first 5 trials

For: PSS = 100-400; Accuracy = 0 - 1.0

In [None]:
pss_value_list = list(range(100, 401, 50))  # [100, 150, ..., 400]
accuracy_list = [round(i * 0.1, 1) for i in range(0, 11)]  # [0.0, 0.1, ..., 1.0]
n_iterations = 100

# Load simulation data
simulation_df = pd.read_csv('/PATH/Reference_simulations.tsv', sep='\t')

# Clean column names (remove tabs, strip spaces)
simulation_df.columns = [col.strip().replace('\t', '') for col in simulation_df.columns]

simulation_df['Staircase_name'] = simulation_df['Staircase_name'].astype(str)


data_directory = '/PATH/SUBJECT_DATA/'

# Rows with these staircase names are not task trials and are excluded.
non_task_rows = ['Training', 'Post_task_question']

# Sorted so participants are processed in a reproducible order
# (os.listdir returns entries in arbitrary order).
for filename in tqdm(sorted(os.listdir(data_directory))):
    if not filename.endswith('.tsv'):  # Process only TSV files
        continue

    # Extract participant ID: keep only the numeric part (e.g., 'sub-005d' -> '005')
    match = re.search(r'sub-(\d+)', filename)
    if not match:
        continue  # Skip files that do not match the expected format
    participant_id = match.group(1)
    print(participant_id)

    # Read the participant's data file; the first line of the file is skipped.
    participants_df = pd.read_csv(os.path.join(data_directory, filename), sep='\t', skiprows=1)
    participants_df.columns = [col.strip().replace('\t', '') for col in participants_df.columns]

    # Explicit copy so the column assignments below act on an independent
    # frame instead of a filtered view (avoids SettingWithCopyWarning).
    participants_df = participants_df[~participants_df['Staircase_name'].isin(non_task_rows)].copy()

    # Strip the last two characters of the staircase name so that repeats of
    # the same staircase share one label.
    participants_df['Modified_Staircase_name'] = participants_df['Staircase_name'].str[:-2]
    participants_df['Trial'] = participants_df['Trial'].astype(int)

    #plot_simulations_with_participant_data(simulation_df, participants_df, participant_id, pss_value_list, accuracy_list, n_iterations)
    plot_participant_trajectory(participants_df, participant_id)

        

In [None]:
# Directory that holds the real participants' data files
data_directory = '/PATH/SUBJECT_DATA/'

# Containers that later cells fill with per-participant results
final_pred_pss_accuracy = pd.DataFrame(
    columns=['Participant_ID', 'Pred_PSS_Value', 'Pred_Accuracy', 'RMSE']
)
all_mean_dfs = []

# Fixed prediction grid
pred_pss_values = np.arange(100, 401, 50)  # Pred_PSS_Value: 100 to 400 in steps of 50
pred_accuracies = np.arange(0, 1.1, 0.1).round(1)  # Pred_Accuracy: 0.0 to 1.0 in steps of 0.1

# Map each grid value to its position along the corresponding axis
pss_index = dict(zip(pred_pss_values, range(len(pred_pss_values))))
accuracy_index = dict(zip(pred_accuracies, range(len(pred_accuracies))))

# 🎲 IDENTIFY RANDOM VS NON-RANDOM RESPONSES: Real Sample

In [None]:
# Build the per-staircase response-count table for the simulated null sample:
# the first 100 participants from each of the 100 simulation files.
# Frames are collected in a list and concatenated once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all accumulated rows on
# every iteration.
null_response_frames = []

for i in tqdm(range(0, 100)):
    participants_file = f'/PATH/SIMULATED_NULL/10000_random_participant_simulations_{i}.tsv'
    all_null_df = pd.read_csv(participants_file, sep='\t')

    # Get participant IDs
    participant_ids = all_null_df['Participant_ID'].unique().tolist()

    # Loop over the first 100 participant IDs of this file
    for participant_id in participant_ids[0:100]:
        # .copy() so the column assignments below act on an independent frame
        # instead of a slice of all_null_df (avoids SettingWithCopyWarning).
        participants_df = all_null_df[all_null_df['Participant_ID'] == participant_id].copy()
        participants_df['Trial'] = participants_df['Trial'].astype(int)
        participants_df['Modified_Staircase_name'] = participants_df['Staircase_name'].str[:-2]

        # Recode trial numbers: rows whose staircase name contains '400' keep
        # their trial number, all other rows are shifted by 15.
        participants_df['Recode_Trial'] = np.where(
            participants_df['Staircase_name'].str.contains('400'),
            participants_df['Trial'],
            participants_df['Trial'] + 15
        )

        # Calculate response counts per Staircase_name and Response_code
        response_counts_df = compute_response_counts_ref(participants_df, participant_id, i)
        null_response_frames.append(response_counts_df)

# pd.concat raises on an empty list, so fall back to an empty frame
null_response_counts_df = (
    pd.concat(null_response_frames, ignore_index=True)
    if null_response_frames
    else pd.DataFrame()
)

print(null_response_counts_df.head())


In [None]:
# Build the per-staircase response-count table for the real participants and
# save it to 'participant_response_counts_df.csv'.
# Frames are collected in a list and concatenated once at the end instead of
# re-copying the accumulated table on every iteration.
participant_response_frames = []

for filename in tqdm(os.listdir(data_directory)):
    if not filename.endswith('.tsv'):  # Process only TSV files
        continue

    # Extract participant ID: keep only the numeric part (e.g., 'sub-005d' -> '005')
    match = re.search(r'sub-(\d+)', filename)
    if not match:
        continue  # Skip files that do not match the expected format
    participant_id = match.group(1)

    # Read the participant's data file (the first line of the file is skipped)
    participants_df = pd.read_csv(os.path.join(data_directory, filename), sep='\t', skiprows=1)
    participants_df.columns = [col.strip().replace('\t', '') for col in participants_df.columns]

    # Keep only staircase rows; .copy() makes the column assignments below
    # safe (no SettingWithCopyWarning, no writes into a view).
    participants_df = participants_df[
        ~participants_df['Staircase_name'].isin(['Training', 'Post_task_question'])
    ].copy()
    participants_df['Modified_Staircase_name'] = participants_df['Staircase_name'].str[:-2]
    participants_df['Trial'] = participants_df['Trial'].astype(int)

    # Recode trial numbers: rows whose staircase name contains '400' keep
    # their trial number, all other rows are shifted by 15.
    participants_df['Recode_Trial'] = np.where(
        participants_df['Staircase_name'].str.contains('400'),
        participants_df['Trial'],
        participants_df['Trial'] + 15
    )

    # Calculate response counts per Staircase_name and Response_code
    response_counts_df = compute_response_counts(participants_df, participant_id)
    participant_response_frames.append(response_counts_df)

# pd.concat raises on an empty list, so fall back to an empty frame
participant_response_counts_df = (
    pd.concat(participant_response_frames, ignore_index=True)
    if participant_response_frames
    else pd.DataFrame()
)

print(participant_response_counts_df.head())
participant_response_counts_df.to_csv('participant_response_counts_df.csv')

In [None]:
# Compare the real participants' summed response counts with the simulated
# null sample: build one row per participant (one column per response type
# and staircase type), plot the distributions, and derive the keep/exclude
# lists of participant IDs.

# --- Real participants ------------------------------------------------------
# Create the 'Modified_Staircase_name' column if not done already
participant_response_counts_df['Modified_Staircase_name'] = participant_response_counts_df['Staircase_name'].str[:-2]

# Separate the data into two groups based on the 'Modified_Staircase_name' prefix
df_100 = participant_response_counts_df[participant_response_counts_df['Modified_Staircase_name'].str.startswith('100')]
df_400 = participant_response_counts_df[participant_response_counts_df['Modified_Staircase_name'].str.startswith('400')]

# Group by participant_id and Modified_Staircase_name, and sum the responses
summed_100 = df_100.groupby(['participant_id', 'Modified_Staircase_name'])[['Before_Response', 'Same_Response', 'After_Response']].sum()
summed_400 = df_400.groupby(['participant_id', 'Modified_Staircase_name'])[['Before_Response', 'Same_Response', 'After_Response']].sum()

# Now we concatenate the results for both 100 and 400 staircases
combined_sums = pd.concat([summed_100, summed_400], axis=0)

# Reset index to make 'participant_id' and 'Modified_Staircase_name' regular columns
combined_sums.reset_index(inplace=True)

# Pivot the table so that each participant has one row, and each staircase is a separate column
pivoted_df = combined_sums.pivot_table(index='participant_id',
                                       columns='Modified_Staircase_name',
                                       values=['Before_Response', 'Same_Response', 'After_Response'],
                                       aggfunc='sum')

# Flatten the multi-index columns into '<Response>_<Staircase>' names
pivoted_df.columns = ['_'.join(col).strip() for col in pivoted_df.columns.values]
#print(pivoted_df)


# --- Simulated null sample (same steps as above) ----------------------------
# Create the 'Modified_Staircase_name' column for null_response_counts_df
null_response_counts_df['Modified_Staircase_name'] = null_response_counts_df['Staircase_name'].str[:-2]

# Separate the data into two groups based on the 'Modified_Staircase_name' prefix
df_100_null = null_response_counts_df[null_response_counts_df['Modified_Staircase_name'].str.startswith('100')]
df_400_null = null_response_counts_df[null_response_counts_df['Modified_Staircase_name'].str.startswith('400')]

# Group by participant_id and Modified_Staircase_name, and sum the responses
summed_100_null = df_100_null.groupby(['participant_id', 'Modified_Staircase_name'])[['Before_Response', 'Same_Response', 'After_Response']].sum()
summed_400_null = df_400_null.groupby(['participant_id', 'Modified_Staircase_name'])[['Before_Response', 'Same_Response', 'After_Response']].sum()

# Now we concatenate the results for both 100 and 400 staircases
combined_sums_null = pd.concat([summed_100_null, summed_400_null], axis=0)
combined_sums_null.reset_index(inplace=True)

# Pivot the table so that each participant has one row, and each staircase is a separate column
pivoted_null_df = combined_sums_null.pivot_table(index='participant_id',
                                                 columns='Modified_Staircase_name',
                                                 values=['Before_Response', 'Same_Response', 'After_Response'],
                                                 aggfunc='sum')

pivoted_null_df.columns = ['_'.join(col).strip() for col in pivoted_null_df.columns.values]
print(pivoted_null_df.head())

# Append the column-wise mean of the null sample as an extra row labelled 'Null'
mean_null_row = pivoted_null_df.mean().to_frame().T
mean_null_row.index = ['Null']  # Rename the index to 'Null'
print(len(pivoted_null_df))
# Now merge the mean row with the pivoted_df (participant's data)
pivoted_df_with_null = pd.concat([pivoted_df, mean_null_row])

# Show the resulting DataFrame
print(pivoted_df_with_null)

# --- KDE of every numeric column (at most 6 panels) --------------------------
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 12))  # 3 rows, 2 columns
axes = axes.flatten()
numeric_columns = pivoted_df_with_null.select_dtypes(include=['number']).columns
num_plots = min(len(numeric_columns), len(axes))
for i in range(num_plots):
    col = numeric_columns[i]
    kde_ax = axes[i]  # Get the corresponding subplot
    sns.kdeplot(pivoted_df_with_null[col], ax=kde_ax, fill=True, color='steelblue', alpha=0.6)
    kde_ax.set_title(f'KDE of {col}')
    kde_ax.set_xlabel('Value')
    kde_ax.set_ylabel('Frequency')
    kde_ax.set_xlim(0,30)
plt.tight_layout()
plt.show()


# --- Stacked bar charts per staircase type -----------------------------------
# Filter the DataFrame for '100' responses
pivoted_100_df = pivoted_df_with_null.filter(like='100', axis=1)

real_colors = [ '#e0736f', '#e69fab','#dc5746']

# Plot for all '100' responses. DataFrame.plot creates its own figure and
# returns the axes it drew on; keep that handle so the spine styling below is
# applied to the bar chart (previously it hit the last KDE subplot).
ax = pivoted_100_df.plot(kind='bar', stacked=True, figsize=(20, 6),color=real_colors)
plt.xlabel("Participant")
plt.ylabel("Sum of Responses")
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.title("Summed '100' Response Percentages by Participant")
plt.legend(title="Response Type", loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.savefig("100_Responses.png", bbox_inches='tight')
plt.show()

# Filter the DataFrame for '400' responses
pivoted_400_df = pivoted_df_with_null.filter(like='400', axis=1)

# Plot for all '400' responses (same axes handling as above)
ax = pivoted_400_df.plot(kind='bar', stacked=True, figsize=(20, 6), color=real_colors)
plt.xlabel("Participant")
plt.ylabel("Sum of Responses")
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.title("Summed '400' Response Percentages by Participant")
plt.legend(title="Response Type", loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.savefig("400_Responses.png", bbox_inches='tight')
plt.show()


# --- Participants with counts outside the expected range ---------------------
# NOTE(review): the helpers below are defined elsewhere in the notebook; the
# [5, 15] range is taken from the messages printed here. The frame passed in
# also contains the 'Null' mean row - confirm how the helpers treat it.

# Get participants whose any response values for staircase '100' fall outside the range [5, 15]
participants_100_outside_range = count_participants_with_values_outside_range_for_staircase_type(pivoted_df_with_null, staircase_type='100')
print(f"Participants with any response values outside the range [5, 15] for staircase '100': {len(participants_100_outside_range)}")

# Get participants whose any response values for staircase '400' fall outside the range [5, 15]
participants_400_outside_range = count_participants_with_values_outside_range_for_staircase_type(pivoted_df_with_null, staircase_type='400')
print(f"Participants with any response values == or outside the range [5, 15] for staircase '400': {len(participants_400_outside_range)}")

# Get all unique participants whose any response values fall outside the range [5, 15] for either staircase type
overall_outside_range_participants = count_overall_outside_range(pivoted_df_with_null)
print(f"Overall unique participants with any response values == or outside the range [5, 15] (for both '100' or '400'): {len(overall_outside_range_participants)}")

# Fraction of the real participants (rows of pivoted_df) that were flagged
perc = (len(overall_outside_range_participants)/ len(pivoted_df))
print(perc)

# Optionally print the actual IDs of participants
print("Keep - Unique participants who meet the condition (either or both staircase types):")
print(overall_outside_range_participants)
print(len(overall_outside_range_participants))


# Participants that are NOT in the keep list are excluded
all_ids = set(participant_response_counts_df['participant_id'].astype(str))
keep_ids = set(overall_outside_range_participants)

exclude_ids_random = list(all_ids - keep_ids)
print('Exclude: ', exclude_ids_random)
print(len(exclude_ids_random))

In [None]:
# Load the simulated participants' response counts and reshape them to one row
# per participant, so they can be plotted together with the other samples.
sim_participant_response_counts_df = pd.read_csv('sim_participant_response_counts_df.csv')
sim_participant_response_counts_df['Modified_Staircase_name'] = (
    sim_participant_response_counts_df['Staircase_name'].str[:-2]
)

_sim_staircase = sim_participant_response_counts_df['Modified_Staircase_name']
_sim_keys = ['participant_id', 'Modified_Staircase_name']
_sim_responses = ['Before_Response', 'Same_Response', 'After_Response']

# Rows belonging to the '100' and to the '400' staircases
df_100 = sim_participant_response_counts_df[_sim_staircase.str.startswith('100')]
df_400 = sim_participant_response_counts_df[_sim_staircase.str.startswith('400')]

# Sum the three response counts per participant and staircase type
summed_100 = df_100.groupby(_sim_keys)[_sim_responses].sum()
summed_400 = df_400.groupby(_sim_keys)[_sim_responses].sum()

# Stack both staircase types and turn the group keys back into columns
combined_sums = pd.concat([summed_100, summed_400], axis=0).reset_index()

# One row per participant, one column per (response, staircase) pair
pivoted_df_sim = combined_sums.pivot_table(
    index='participant_id',
    columns='Modified_Staircase_name',
    values=_sim_responses,
    aggfunc='sum',
)

# Flatten the two-level column index into '<Response>_<Staircase>' names
pivoted_df_sim.columns = ['_'.join(pair).strip() for pair in pivoted_df_sim.columns.values]


## COMBINED PLOT settings
# 7 pt font for all following figures
plt.rcParams.update({'font.size': 7})

# Figure size given in centimetres (6 x 7.5 cm) and converted to inches
fig_width_cm = 6
fig_height_cm = 7.5  # adjust as needed
fig_size_in = tuple(cm / 2.54 for cm in (fig_width_cm, fig_height_cm))

# One three-colour palette per sample (one colour per response type)
null_colors = ['#87afe6', '#b1d8ec', '#6991e1']
sim_colors = ['#e49971', '#ecd1b7', '#e17a4b']
real_colors = [ '#e0736f', '#e69fab','#dc5746']

def get_cols_by_response_and_staircase(df, response_type, staircase_prefix):
    """Return the columns of *df* named exactly '<response_type>_<staircase_prefix>'.

    The result is a list: empty when no column has that name, otherwise one
    entry per matching column.
    """
    wanted = f"{response_type}_{staircase_prefix}"
    return [name for name in df.columns if name == wanted]

# One figure per staircase type ('100', then '400'). Each figure overlays the
# KDE of the null, simulated and real samples for the three response types.
for staircase in ('100', '400'):
    plt.figure(figsize=fig_size_in, dpi = 300)
    for i, response in enumerate(['Before_Response', 'Same_Response', 'After_Response']):
        cols = get_cols_by_response_and_staircase(pivoted_null_df, response, staircase)
        if len(cols) == 0:
            print(f"No columns found for {response} {staircase}")
            continue
        # cols holds the exact column name; the same name is looked up in all three samples
        column = cols[0]
        short_name = response.split("_")[0]
        samples = (
            ('Null', pivoted_null_df, null_colors),
            ('Sim', pivoted_df_sim, sim_colors),
            ('Real', pivoted_df, real_colors),
        )
        for sample_name, sample_df, palette in samples:
            sns.kdeplot(
                sample_df[column],
                fill=True,
                alpha=0.3,
                color=palette[i],
                label=f'{sample_name} {staircase} - {short_name}',
            )
    plt.title(f"KDE of '{staircase}' Staircases Responses")
    plt.xlabel('Number of button presses')
    plt.ylabel('Density')
    plt.xlim(0, 30)
    #plt.legend()
    sns.despine()
    plt.tight_layout()
    plt.show()


In [None]:
# For each response type, draw a grouped bar chart of the mean response count
# per participant (x axis) and staircase (one bar series per Staircase_name).

# Define response types
response_types = ["Before_Response", "Same_Response", "After_Response"]

# Extract unique participant IDs
participant_ids = participant_response_counts_df['participant_id'].unique()

# Create a color mapping: gray for "Null", otherwise tab20 colors used cyclically
colors = plt.cm.tab20.colors  # Get the tab20 colormap
color_mapping = {
    participant: ("gray" if participant == "Null" else colors[i % len(colors)])
    for i, participant in enumerate(participant_ids)
}

# Plot for each response type
for response in response_types:
    # Create exactly one figure per response type and draw onto its axes.
    # (Calling plt.figure() and then DataFrame.plot() without ax= opened an
    # additional, empty figure on every iteration.)
    fig, ax = plt.subplots(figsize=(20, 5))

    # Select the columns needed for this response type
    response_data = participant_response_counts_df[['participant_id', 'Staircase_name', response]]

    # Pivot the data: participants as rows, staircases as columns
    pivot_data = response_data.pivot_table(index='participant_id', columns='Staircase_name', values=response, aggfunc='mean')

    # One color entry per (participant, staircase) pair, in row-major order.
    # NOTE(review): pandas appears to apply a color list per column (series),
    # not per bar, so the bars may not end up colored per participant - verify
    # the rendered figure against the intent.
    bar_colors = [
        color_mapping[participant]
        for participant in pivot_data.index
        for _ in pivot_data.columns
    ]

    # Plot the bars on the axes created above
    pivot_data.plot(
        kind='bar',
        stacked=False,
        color=bar_colors,
        ax=ax,
        legend=True
    )

    # Add a horizontal line at y = 1
    ax.axhline(y=1, color='black', linestyle='--', linewidth=1)

    # Customize the plot
    plt.xlabel("Participant ID")
    plt.ylabel("Count")
    plt.title(f"{response} Responses")
    plt.xticks(rotation=45, fontsize=8)
    plt.legend(title="Staircase Name")

    plt.show()

In [None]:
# Apply count_participants_with_0_or_1 to the null table and to the real
# participants' table, then report both results.
null_count = count_participants_with_0_or_1(null_response_counts_df)
participant_count = count_participants_with_0_or_1(participant_response_counts_df)

for table_name, n_found in (
    ('null_response_counts_df', null_count),
    ('participant_response_counts_df', participant_count),
):
    print(f"Number of unique participants with 0 or 1 response count in '{table_name}': {n_found}")


In [None]:
# Stack the staircase trials of all real participants into one long table
# and save it as 'trial_by_trial_button_press.tsv'.
# Per-participant frames are collected in a list and concatenated once,
# instead of re-copying the growing table on every iteration.
testme_participant_frames = []

for filename in os.listdir(data_directory):
    if not filename.endswith('.tsv'):  # Process only TSV files
        continue

    # Extract participant ID: keep only the numeric part (e.g., 'sub-005d' -> '005')
    match = re.search(r'sub-(\d+)', filename)
    if not match:
        continue  # Skip files that do not match the expected format
    participant_id = match.group(1)

    # Read the participant's data file (the first line of the file is skipped)
    participants_df = pd.read_csv(os.path.join(data_directory, filename), sep='\t', skiprows=1)
    # Clean column names (remove tabs, strip spaces)
    participants_df.columns = [col.strip().replace('\t', '') for col in participants_df.columns]

    # Keep only staircase rows; .copy() makes the column assignments below
    # safe (no SettingWithCopyWarning, no writes into a view).
    participants_df = participants_df[
        ~participants_df['Staircase_name'].isin(['Training', 'Post_task_question'])
    ].copy()
    participants_df['Participant_ID'] = participant_id

    participants_df['Trial'] = participants_df['Trial'].astype(int)
    participants_df['Modified_Staircase_name'] = participants_df['Staircase_name'].str[:-2]

    # Recode trial numbers: rows whose staircase name contains '400' keep
    # their trial number, all other rows are shifted by 15.
    participants_df['Recode_Trial'] = np.where(
        participants_df['Staircase_name'].str.contains('400'),
        participants_df['Trial'],
        participants_df['Trial'] + 15
    )

    testme_participant_frames.append(participants_df)

# pd.concat raises on an empty list, so fall back to an empty frame
testme_participant_df = (
    pd.concat(testme_participant_frames, ignore_index=True)
    if testme_participant_frames
    else pd.DataFrame()
)

print(testme_participant_df)
testme_participant_df.to_csv('trial_by_trial_button_press.tsv', sep = '\t')

# 🧠 IDENTIFY COGNITIVE STRATEGY: Real Sample

Check whether the two staircase trajectories are parallel, diverging, or converging.

In [None]:
# For the simulated null sample, compute per participant the mean delay of the
# '400' and '100' staircases at the first and last trial and the difference
# between the two staircase types (start_diff / end_diff).
# Rows are collected as dicts and turned into a DataFrame once; concatenating
# a one-row frame 10,000 times re-copies the table on every iteration.
null_diff_columns = ['Participant_ID', 'start_400', 'end_400',
                     'start_100', 'end_100', 'start_diff', 'end_diff']
null_diff_rows = []

for i in tqdm(range(0,100)):
    participants_file = f'/PATH/SIMULATED_NULL/10000_random_participant_simulations_{i}.tsv'
    all_null_df = pd.read_csv(participants_file, sep='\t')

    # Get participant IDs
    participant_ids = all_null_df['Participant_ID'].unique().tolist()

    # Loop over the first 100 participant IDs of this file
    for participant_id in participant_ids[0:100]:
        participants_df = all_null_df[all_null_df['Participant_ID'] == participant_id].copy()
        participants_df.loc[:, 'Trial'] = participants_df['Trial'].astype(int)
        participants_df.loc[:, 'Modified_Staircase_name'] = participants_df['Staircase_name'].str[:-2]
        # Rows whose staircase name contains '400' keep their trial number,
        # all other rows are shifted by 15.
        participants_df.loc[:, 'Recode_Trial'] = np.where(
            participants_df['Staircase_name'].str.contains('400'),
            participants_df['Trial'],
            participants_df['Trial'] + 15
        )

        # For 400ms (trials 1-15): mean delay per recoded trial
        mean_delays_400 = participants_df[participants_df['Staircase_name'].str.contains('400')] \
                            .groupby('Recode_Trial')['Current Delay'].mean()

        # For 100ms (trials 16-30): mean delay per recoded trial
        mean_delays_100 = participants_df[participants_df['Staircase_name'].str.contains('100')] \
                            .groupby('Recode_Trial')['Current Delay'].mean()

        # Combine both series: the 400ms means first, then the 100ms means
        mean_delays = pd.concat([mean_delays_400, mean_delays_100])

        # Relabel the trials 1-30 (1-15 for 400ms, 16-30 for 100ms).
        # This raises if the participant does not have exactly 30 rows here.
        mean_delays.index = np.arange(1, 31)

        start_diff = mean_delays[1] - mean_delays[16]
        end_diff = mean_delays[15] - mean_delays[30]

        null_diff_rows.append({
            'Participant_ID': participant_id,
            'start_400': mean_delays[1],
            'end_400': mean_delays[15],
            'start_100': mean_delays[16],
            'end_100': mean_delays[30],
            'start_diff': start_diff,
            'end_diff': end_diff
        })

null_diff_df = pd.DataFrame(null_diff_rows, columns=null_diff_columns)

#Calculate: The distance moved (-ve = converging; +ve = diverging)
null_diff_df['movement'] = null_diff_df['end_diff'] - null_diff_df['start_diff']

In [None]:
# For every real participant, compute the mean delay of the '400' and '100'
# staircases at the first and last trial and the difference between the two
# staircase types, then classify participants as parallel / divergent /
# convergent movers and save the table.
real_diff_columns = ['Participant_ID', 'start_400', 'end_400',
                     'start_100', 'end_100', 'start_diff', 'end_diff']
real_diff_rows = []

for filename in os.listdir(data_directory):
    if not filename.endswith('.tsv'):  # Process only TSV files
        continue

    # Extract participant ID: keep only the numeric part (e.g., 'sub-005d' -> '005')
    match = re.search(r'sub-(\d+)', filename)
    if not match:
        continue  # Skip files that do not match the expected format
    participant_id = match.group(1)

    # Read the participant's data file (the first line of the file is skipped)
    participants_df = pd.read_csv(os.path.join(data_directory, filename), sep='\t', skiprows=1)
    # Clean column names (remove tabs, strip spaces)
    participants_df.columns = [col.strip().replace('\t', '') for col in participants_df.columns]

    # Keep only staircase rows; .copy() makes the column assignments below
    # safe (no SettingWithCopyWarning, no writes into a view).
    participants_df = participants_df[
        ~participants_df['Staircase_name'].isin(['Training', 'Post_task_question'])
    ].copy()

    participants_df['Trial'] = participants_df['Trial'].astype(int)
    participants_df['Modified_Staircase_name'] = participants_df['Staircase_name'].str[:-2]

    # Recode trial numbers: rows whose staircase name contains '400' keep
    # their trial number, all other rows are shifted by 15.
    participants_df['Recode_Trial'] = np.where(
        participants_df['Staircase_name'].str.contains('400'),
        participants_df['Trial'],
        participants_df['Trial'] + 15
    )

    # For 400ms (trials 1-15): mean delay per recoded trial
    mean_delays_400 = participants_df[participants_df['Staircase_name'].str.contains('400')] \
                        .groupby('Recode_Trial')['Current Delay'].mean()

    # For 100ms (trials 16-30): mean delay per recoded trial
    mean_delays_100 = participants_df[participants_df['Staircase_name'].str.contains('100')] \
                        .groupby('Recode_Trial')['Current Delay'].mean()

    # Combine both series: the 400ms means first, then the 100ms means
    mean_delays = pd.concat([mean_delays_400, mean_delays_100])

    # Relabel the trials 1-30 (1-15 for 400ms, 16-30 for 100ms).
    # This raises if the participant does not have exactly 30 rows here.
    mean_delays.index = np.arange(1, 31)

    start_diff = mean_delays[1] - mean_delays[16]
    end_diff = mean_delays[15] - mean_delays[30]

    real_diff_rows.append({
        'Participant_ID': participant_id,
        'start_400': mean_delays[1],
        'end_400': mean_delays[15],
        'start_100': mean_delays[16],
        'end_100': mean_delays[30],
        'start_diff': start_diff,
        'end_diff': end_diff
    })

# Build the table once instead of concatenating one-row frames in the loop
real_participant_diff_df = pd.DataFrame(real_diff_rows, columns=real_diff_columns)

display(HTML(real_participant_diff_df.head().to_html()))

#⚠️ IF EXCLUDING PARTICIPANTS WHO HAVE RANDOM BUTTON PRESS: RUN THIS LINE
# .copy() so that adding the 'movement' column below does not assign into a
# filtered view of the full table.
real_participant_diff_df = real_participant_diff_df[
    real_participant_diff_df['Participant_ID'].isin(overall_outside_range_participants)
].copy()

print(len(real_participant_diff_df))


#Calculate: The distance moved (-ve = converging; +ve = diverging)
real_participant_diff_df['movement'] = real_participant_diff_df['end_diff'] - real_participant_diff_df['start_diff']

# NOTE(review): the three groups below do not cover every case - end_diff
# strictly between 300 and 301, and end_diff == 300 with end_400 == 400, fall
# into none of them. Confirm this is intended.

# Parallel: End_diff = Start_diff = 300ms
real_participant_parallel_movers = real_participant_diff_df[((real_participant_diff_df["end_diff"] == 300) & (real_participant_diff_df["end_400"] != 400))]

# Divergent: End_diff > Start_diff
real_participant_divergent_movers = real_participant_diff_df[(real_participant_diff_df["end_diff"] >= 301)]

# Convergent: End_diff < Start_diff
real_participant_converge_movers = real_participant_diff_df[(real_participant_diff_df["end_diff"] < 300)]

real_participant_diff_df.to_csv("Real_participant_diff_df_100_400.csv", index=False)

In [None]:
# Print summary statistics of the parallel / divergent / convergent movers and
# plot the real participants' end differences against the reference sample.

def _percent_of_sample(count, total):
    """Return *count* as a percentage of *total* (NaN when *total* is 0)."""
    return count / total * 100 if total else float('nan')


#Summary:
# Percentages are relative to the participants currently in
# real_participant_diff_df (i.e. after any exclusion applied when the table
# was built), not to a hard-coded sample size.
n_participants = len(real_participant_diff_df)
print(len(real_participant_parallel_movers),':', _percent_of_sample(len(real_participant_parallel_movers), n_participants),'%: Parallel lines')
print(len(real_participant_divergent_movers),':', _percent_of_sample(len(real_participant_divergent_movers), n_participants),'%: Have directions away from converging i.e. acc less than 0.3')
print(len(real_participant_converge_movers),':', _percent_of_sample(len(real_participant_converge_movers), n_participants),'%: Have directions towards converging i.e. acc > than 0.3')
# These two statistics are computed on the real participants' table
print('\n\nMean end delay difference of real participants: ', real_participant_diff_df['end_diff'].mean())
print('STD end delay difference of real participants: ', real_participant_diff_df['end_diff'].std())

#Parallel:
print('\n\nMean end delay of 400 staircase of parallel movers: ', real_participant_parallel_movers['end_400'].mean())
print('Mean end delay 100 staircase of parallel movers: ',real_participant_parallel_movers['end_100'].mean())

#Divergent:
print('\n\nMean change from 300ms diff of divergent movers: ',real_participant_divergent_movers['movement'].mean())
print('STD change from 300ms diff of divergent movers: ',real_participant_divergent_movers['movement'].std())
print('Median end delay difference of divergent movers: ', real_participant_divergent_movers['end_diff'].median())
print('Min end delay difference of divergent movers: ',real_participant_divergent_movers['end_diff'].min())
print('Max end delay difference of divergent movers: ',real_participant_divergent_movers['end_diff'].max())

#Convergent:
print('\n\nMean change from 300ms diff of convergent movers: ', real_participant_converge_movers['movement'].mean())
print('STD change from 300ms diff of convergent movers: ', real_participant_converge_movers['movement'].std())
print('Median end delay difference of convergent movers: ', real_participant_converge_movers['end_diff'].median())
print('Mean end delay difference of convergent movers: ',real_participant_converge_movers['end_diff'].mean())
print('STD end delay difference of convergent movers: ',real_participant_converge_movers['end_diff'].std())
print('Min end delay difference of convergent movers: ',real_participant_converge_movers['end_diff'].min())
print('Max end delay difference of convergent movers: ',real_participant_converge_movers['end_diff'].max())
# Participants whose end difference is zero or negative
count = (real_participant_diff_df['end_diff'] <= 0).sum()
print('Number of convergent movers who converge completely:',count)

#REFERENCE FILES:
ref_diff_df = pd.read_csv("/PATH/Reference_diff_df_100_400.csv")

# Plot histogram: reference sample and real participants on the same axes
plt.figure(figsize=(6,4))
sns.histplot(ref_diff_df['end_diff'], bins=150, kde=False, color="#4A738F")
#sns.histplot(null_diff_df['end_diff'], bins=150, kde=True, color="#5085E1")
sns.histplot(real_participant_diff_df['end_diff'], bins=150, kde=False, color="#e54e40")
plt.xlabel("End Difference (ms)", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.title("End difference in \n Reference and Real Participants", fontsize=14)
plt.show()

# Scatter plot. The shaded bands mark 299-301 (around the 300 ms parallel
# value) and 301-600 (the divergent range used above).
plt.figure(figsize=(6, 4))
plt.axhspan(299, 301, color='grey', alpha=0.3)
plt.axhspan(301, 600, color='lightgrey', alpha=0.3)
# Reference sample: x is the PSS value, colored by accuracy
scatter = sns.scatterplot(
    x=ref_diff_df['PSS_Value'],
    y=ref_diff_df["end_diff"],
    hue=ref_diff_df["Accuracy"],
    palette="viridis",
    sizes=(20, 200),
    edgecolor="black"
)
# Real participants: x is the row index of the table
scatter = sns.scatterplot(
    x=real_participant_diff_df.index,
    y=real_participant_diff_df["end_diff"],
    #hue=real_participant_diff_df["Participant_ID"],
    color="#e54e40",
    sizes=(20, 200),
    edgecolor="black",
    alpha = 0.5
)

plt.xlabel("Participants / PSS")
plt.ylabel("End Difference (end_diff)")
plt.title("Scatter Plot of end_diff vs Index")

plt.legend(title="Accuracy / Participant", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.show()

# IDs of the parallel movers, to be excluded in later steps
exclude_ids_parallel = set(real_participant_parallel_movers['Participant_ID'].astype(str))
print('Exclude: ', exclude_ids_parallel)

In [None]:
# Overlay the KDE of 'end_diff' for the null, simulated and real samples,
# each loaded from its saved CSV file.
sim_participant_diff_df = pd.read_csv("10000_simulated_participants_diff_df_100_400.csv")
null_diff_df = pd.read_csv("Null_diff_df_100_400.csv")
# Read the real participants' IDs as strings: they are zero-padded (e.g.
# '005'), and the default type inference would turn them into integers, so
# they would no longer match the string IDs used elsewhere in the notebook.
real_participant_diff_df = pd.read_csv(
    "Real_participant_diff_df_100_400.csv",
    dtype={'Participant_ID': str},
)


# Colors (third from each palette)
null_color = null_colors[2]   # '#6991e1'
sim_color = sim_colors[2]     # '#e17a4b'
real_color = real_colors[2] # '#dc5746'

# Figure size given in centimetres and converted to inches
fig_width_cm = 6
fig_height_cm = 7.8  # adjust as needed
fig_size_in = (fig_width_cm / 2.54, fig_height_cm / 2.54)  # convert cm to inches

plt.figure(figsize=fig_size_in, dpi=300)

# KDE plots, one per dataset
for diff_df, kde_color, kde_label in (
    (null_diff_df, null_color, "Null - end_diff"),
    (sim_participant_diff_df, sim_color, "Simulated - end_diff"),
    (real_participant_diff_df, real_color, "Real - end_diff"),
):
    sns.kdeplot(diff_df["end_diff"], fill=True, alpha=0.3, color=kde_color, label=kde_label)

# Styling
plt.title("Distribution of 'end_diff' Across Datasets")
plt.xlabel("Difference in mean delay value of \n100ms and 400ms staircase at trial 15")
plt.ylabel("Density")
plt.xlim(0, 600)  # Adjust based on your data range
sns.despine()
#plt.legend()
plt.tight_layout()
plt.show()


# ❤️ASSIGN PSS AND ACCURACY VALUES: Real Sample

In [None]:
#⚠️ IF EXCLUDING PARTICIPANTS WHO HAVE RANDOM BUTTON PRESS AND PARALLEL LINES
excluded_ids = set(exclude_ids_parallel) | set(exclude_ids_random)
# IDs parsed from file names are zero-padded strings (e.g. '005'). Pad the
# exclusion IDs the same way so that '5' and '005' refer to the same participant.
excluded_id_keys = {str(pid).zfill(3) for pid in excluded_ids}

# Load simulation data and build the reference matrix the participants are matched against
simulation_df = pd.read_csv('/PATH/Reference_simulations.tsv', sep='\t')
simulation_df['Staircase_name'] = simulation_df['Staircase_name'].astype(str)
pivot_df = reference_matrix(simulation_df, pred_pss_values, pred_accuracies, drop = True, plots=False)

result_columns = ['Participant_ID', 'Pred_PSS_Value', 'Pred_Accuracy', 'RMSE']

# Full (PSS, accuracy) grid used to align each participant's RMSE.
# It does not depend on the participant, so it is built once.
group_structure = pd.DataFrame(
    [(v, a) for v in pred_pss_values for a in pred_accuracies],
    columns=['Pred_PSS_Value', 'Pred_Accuracy']
)

# Results are collected in plain lists and turned into DataFrames once at the
# end; growing a DataFrame with pd.concat inside the loop copies it every time.
participant_rows = []
rmse_slices = []

# Loop over each file in the directory
for filename in os.listdir(data_directory):
    if not filename.endswith('.tsv'):
        continue
    match = re.search(r'sub-(\d+)', filename)
    if not match:
        continue

    participant_id = match.group(1)

    # ⚠️ Exclude if participant_id is in either exclusion set
    if participant_id.zfill(3) in excluded_id_keys:
        print(f"Skipping excluded participant: {participant_id}")
        continue

    # Read the participant's data file and clean column names (remove tabs, strip spaces)
    participants_df = pd.read_csv(os.path.join(data_directory, filename), sep='\t',  skiprows=1)
    participants_df.columns = [col.strip().replace('\t', '') for col in participants_df.columns]

    # Keep staircase trials only. `.copy()` makes the later column assignments
    # act on an independent frame instead of a view of the filtered one.
    participants_df = participants_df[
        ~participants_df['Staircase_name'].isin(['Training', 'Post_task_question'])
    ].copy()

    participants_df['Trial'] = participants_df['Trial'].astype(int)
    participants_df['Modified_Staircase_name'] = participants_df['Staircase_name'].str[:-2]

    # Recode trial numbers: '400' staircases keep their trial number, all others are shifted by 15
    participants_df['Recode_Trial'] = np.where(
        participants_df['Staircase_name'].str.contains('400'),
        participants_df['Trial'],
        participants_df['Trial'] + 15
    )

    # Best-matching (PSS, accuracy) cell of the reference matrix and its RMSE
    pred_pss_value, pred_accuracy, rmse, rmse_table = participant_matrix(
        participants_df, participant_id, pivot_df, drop=True, plots=False, verbose=False
    )

    # Single-row result table for the current participant
    Pred_PSS_Accuracy = pd.DataFrame({
        'Participant_ID': [participant_id],
        'Pred_PSS_Value': [pred_pss_value],
        'Pred_Accuracy': [pred_accuracy],
        'RMSE': [rmse]
    })
    participant_rows.append(Pred_PSS_Accuracy.iloc[0].to_dict())

    # Place the participant's RMSE on the full (PSS x accuracy) grid
    grouped_rmse_mean = Pred_PSS_Accuracy.groupby(["Pred_PSS_Value", "Pred_Accuracy"])["RMSE"].mean().reset_index()
    file_mean = group_structure.merge(grouped_rmse_mean, on=['Pred_PSS_Value', 'Pred_Accuracy'], how='left')
    mean_df = file_mean.pivot(index="Pred_PSS_Value", columns="Pred_Accuracy", values="RMSE")

    # Convert mean_df to a 2D slice (NaN wherever this participant has no value)
    slice_matrix = np.full((len(pred_pss_values), len(pred_accuracies)), np.nan)
    for pss_val, grid_row in mean_df.iterrows():
        for acc_val, cell_rmse in grid_row.items():
            if not pd.isna(cell_rmse):
                slice_matrix[pss_index[pss_val], accuracy_index[acc_val]] = cell_rmse

    rmse_slices.append(pd.DataFrame(slice_matrix))

# Stack all slices row-wise and assemble the per-participant result table
all_mean_dfs = pd.concat(rmse_slices, ignore_index=True) if rmse_slices else pd.DataFrame()
final_pred_pss_accuracy = pd.DataFrame(participant_rows, columns=result_columns)

# Save the accumulated results to CSV
final_pred_pss_accuracy.to_csv("Real_participants_pred_pss_accuracy_100_400.tsv", index=False,  sep='\t')

print('Done!')


# If all files have been generated, run from here on:
<a id="skip-here"></a>

In [None]:
# Reference sample: end_diff table
ref_diff_df = pd.read_csv("/PATH/Reference_diff_df_100_400.csv")

# Null simulations: end_diff table, predicted PSS/accuracy table and stacked RMSE array
null_diff_df = pd.read_csv("/PATH/Null_diff_df_100_400.csv")
null_pred_pss_accuracy = pd.read_csv("/PATH/10000_null_simulations_100_400_pred_pss_accuracy.csv")
null_all_mean_dfs = np.load("/PATH/10000_null_simulations_100_400_all_mean_rmse_plot.npy")

# Simulated participants: end_diff table, predicted PSS/accuracy table and stacked RMSE array
sim_participant_diff_df = pd.read_csv("/PATH/10000_simulated_participants_diff_df_100_400.csv")
sim_participant_pred_pss_accuracy = pd.read_csv('/PATH/10000_simulated_participants_100_400_pred_pss_accuracy.tsv', sep='\t')
sim_participant_all_mean_df = np.load("/PATH/10000_simulated_participants_100_400_all_mean_rmse_plot.npy")

# Real participants: predicted PSS/accuracy table (printed for a quick visual check)
real_participant_pred_pss_accuracy = pd.read_csv('/PATH/Real_participants_pred_pss_accuracy_100_400.tsv', sep='\t')
print(real_participant_pred_pss_accuracy)

In [None]:
True_and_Pred = real_participant_pred_pss_accuracy

# Full grid of PSS values (100..400 in steps of 50) and accuracies (0.0..1.0 in steps of 0.1)
pss_value_list = list(range(100, 401, 50))
accuracy_list = [round(i * 0.1, 1) for i in range(0, 11)]

# Figure size given in cm, converted to inches
fig_width_cm = 8
fig_height_cm = 6
fig_size_in = (fig_width_cm / 2.54, fig_height_cm / 2.54)

# Not used by the plots below; kept because they are notebook-level names.
full_accuracy_vals = np.arange(0, 101, 10)
accuracy_percent_labels = [int(x) for x in full_accuracy_vals]


def _draw_pss_accuracy_heatmap(table, value_fmt, colour_max, colourbar_label, png_name):
    """Draw one annotated PSS x accuracy heatmap, save it to `png_name`, show it, and return its axes."""
    plt.figure(figsize=fig_size_in, dpi=300)
    heat_ax = sns.heatmap(
        table,
        cmap=real_participant_cmap,
        annot=True,
        fmt=value_fmt,
        vmin=0,
        vmax=colour_max,
        annot_kws={"size": 5},
        cbar_kws={'label': colourbar_label},
    )
    # One tick per accuracy column, centred on the cell, labelled in percent
    heat_ax.set_xticks(np.arange(len(accuracy_list)) + 0.5)
    heat_ax.set_xticklabels([int(x*100) for x in accuracy_list], fontsize=7)
    heat_ax.set_yticklabels(heat_ax.get_yticklabels(), fontsize=7)
    heat_ax.set_xlabel("Predicted Accuracy (%)", fontsize=7)
    heat_ax.set_ylabel("Predicted PSS Value (ms)", fontsize=7)
    heat_ax.collections[0].colorbar.ax.tick_params(labelsize=7)
    plt.tight_layout()
    plt.savefig(png_name)
    plt.show()
    return heat_ax


# Frequency heatmap: number of participants assigned to each (PSS, accuracy) cell
grouped_pred = True_and_Pred.groupby(["Pred_PSS_Value", "Pred_Accuracy"]).size().reset_index(name="Frequency")
heatmap_data_pred = grouped_pred.pivot(index="Pred_PSS_Value", columns="Pred_Accuracy", values="Frequency")
# Reindex onto the full grid without filling (missing cells stay blank); PSS axis is reversed
heatmap_data_pred = heatmap_data_pred.reindex(index=pss_value_list[::-1], columns=accuracy_list)
ax = _draw_pss_accuracy_heatmap(
    heatmap_data_pred, ".0f", 4, 'Frequency of sample',
    "Real_participants_100_400_pred_pss_accuracy.png",
)

# Mean RMSE heatmap on the same grid
mean_rmse = True_and_Pred.groupby(['Pred_PSS_Value', 'Pred_Accuracy'])['RMSE'].mean().unstack()
mean_rmse = mean_rmse.reindex(index=pss_value_list[::-1], columns=accuracy_list)
ax2 = _draw_pss_accuracy_heatmap(
    mean_rmse, '.1f', 30, 'Mean RMSE',
    "Real_participants_100_400_all_mean_rmse_plot.png",
)


In [None]:
# For every (row, column) position of the stacked null RMSE array, collect the
# values found at that position across all slices (axis 0) as a plain list.
n_grid_rows, n_grid_cols = null_all_mean_dfs.shape[1], null_all_mean_dfs.shape[2]
vector_across_slices = [
    [null_all_mean_dfs[:, y, x].tolist() for x in range(n_grid_cols)]
    for y in range(n_grid_rows)
]

# One DataFrame cell per grid position, each holding that position's list of
# null values; rows are labelled with pred_pss_values, columns with pred_accuracies.
df_vector_across_slices = pd.DataFrame(vector_across_slices)
df_vector_across_slices.index = pred_pss_values
df_vector_across_slices.columns = pred_accuracies

# Compute p-values for the real participants against the null distributions
# (plotting=True is passed through to compute_p_values) ...
real_participant_pred_pss_accuracy = compute_p_values(real_participant_pred_pss_accuracy, df_vector_across_slices, plotting=True)
# ... and save the table returned by compute_p_values to a TSV file.
real_participant_pred_pss_accuracy.to_csv('Real_participants_pred_pss_accuracy_100_400_pvals.tsv', sep='\t', index=False)

In [None]:
# p-values of the real participants
p_values = real_participant_pred_pss_accuracy['p_value']

# Shared histogram styling for the main plot and the zoomed inset
hist_style = dict(bins=100, kde=False, color='#e54e40', alpha=0.6)

# Main histogram over the full p-value range
plt.figure(figsize=(8, 6))
main_ax = plt.gca()
sns.histplot(p_values, ax=main_ax, **hist_style)
main_ax.set_title('Distribution of P-values')
main_ax.set_xlabel('P-value')
main_ax.set_ylabel('Frequency')

# Inset in the upper-right corner zooming in on p-values between 0 and 0.08
ax_inset = inset_axes(main_ax, width="60%", height="40%", loc="upper right")
sns.histplot(p_values, ax=ax_inset, **hist_style)
ax_inset.set_xlim(0, 0.08)
ax_inset.set_xticks([0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07])
# The inset shows no y ticks
ax_inset.set_yticks([])

plt.show()


Mean RMSE across participants, significant vs. non-significant

In [None]:
# Full grid of PSS values and accuracies
pss_value_list = list(range(100, 401, 50))  # [100, 150, 200, 250, 300, 350, 400]
accuracy_list = [round(i * 0.1, 1) for i in range(0, 11)]  # [0.0, 0.1, 0.2, ..., 1.0]

# Split participants at p = 0.05
below_alpha = real_participant_pred_pss_accuracy['p_value'] < 0.05
at_or_above_alpha = real_participant_pred_pss_accuracy['p_value'] >= 0.05
significant_participants = real_participant_pred_pss_accuracy[below_alpha]
non_significant_participants = real_participant_pred_pss_accuracy[at_or_above_alpha]

# Mean RMSE per (PSS, accuracy) cell for each group, reindexed onto the full
# grid (PSS axis reversed, missing cells left as NaN)
mean_rmse_significant = (
    significant_participants.groupby(['Pred_PSS_Value', 'Pred_Accuracy'])['RMSE'].mean().unstack()
    .reindex(index=pss_value_list[::-1], columns=accuracy_list)
)
mean_rmse_non_significant = (
    non_significant_participants.groupby(['Pred_PSS_Value', 'Pred_Accuracy'])['RMSE'].mean().unstack()
    .reindex(index=pss_value_list[::-1], columns=accuracy_list)
)

# One heatmap per group: significant first, then non-significant
for group_table, group_title in (
    (mean_rmse_significant, 'Heatmap of Mean RMSE for Participants with p < 0.05'),
    (mean_rmse_non_significant, 'Heatmap of Mean RMSE for Participants with p >= 0.05'),
):
    plt.figure(figsize=(6, 4))
    sns.heatmap(group_table, cmap=real_participant_cmap, annot=True, fmt='.1f', cbar_kws={'label': 'Mean RMSE'}, vmin=0, vmax=30)
    plt.title(group_title)
    plt.xlabel('Pred Accuracy')
    plt.ylabel('Pred PSS Value')
    plt.show()


In [None]:
# Split participants at p = 0.05
significant_participants = real_participant_pred_pss_accuracy[real_participant_pred_pss_accuracy['p_value'] < 0.05]
non_significant_participants = real_participant_pred_pss_accuracy[real_participant_pred_pss_accuracy['p_value'] >= 0.05]

# One subplot per cell of the null-distribution table
num_rows, num_cols = df_vector_across_slices.shape

# Shared x range: 0 up to the (rounded-up) largest null value
min_xvalue = 0
max_xvalue = math.ceil(np.nanmax([item for sublist in df_vector_across_slices.values for item in sublist]) + 0.05)

# Shared y range: the tallest histogram bar over all cells
max_yvalue = 0
for cell_row, cell_col in np.ndindex(num_rows, num_cols):
    bar_heights, _ = np.histogram(
        df_vector_across_slices.iloc[cell_row, cell_col], bins=50, range=(min_xvalue, max_xvalue)
    )
    max_yvalue = max(max_yvalue, bar_heights.max())

fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(15,10))

# Participant groups and the extra line styling each one gets
# (both are drawn in the same red; non-significant lines are dashed)
overlay_groups = (
    (significant_participants, {'label': 'Significant'}),
    (non_significant_participants, {'linestyles': 'dashed', 'label': 'Non-Significant'}),
)

for cell_row, cell_col in np.ndindex(num_rows, num_cols):
    cell_values = df_vector_across_slices.iloc[cell_row, cell_col]
    # Row order is flipped: table row 0 is drawn in the bottom subplot row
    ax = axes[num_rows - 1 - cell_row, cell_col]

    # Null distribution for this cell
    sns.histplot(cell_values, bins=50, kde=False, color='#5085E1', edgecolor='#5085E1', alpha=0.7, ax=ax,
                 binrange=(min_xvalue, max_xvalue))

    # One vertical line per participant assigned to this (PSS, accuracy) cell
    for group_df, extra_style in overlay_groups:
        in_cell = (
            (group_df['Pred_PSS_Value'] == pred_pss_values[cell_row]) &
            (group_df['Pred_Accuracy'] == pred_accuracies[cell_col])
        )
        for rmse_value in group_df.loc[in_cell, 'RMSE']:
            ax.vlines(rmse_value, 0, max_yvalue, color='#e54e40', alpha=0.8, linewidth=1.5, **extra_style)

    # Start from a bare subplot: no ticks, no y label
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_ylabel('')

    ax.set_xlim(min_xvalue, max_xvalue)
    ax.set_ylim(0, max_yvalue)

    # Top, left and bottom spines are always drawn; the right spine only in the last column
    for side in ('top', 'left', 'bottom'):
        ax.spines[side].set_visible(True)
    ax.spines['right'].set_visible(cell_col == num_cols - 1)

    # Only the cell at table position (0, 0) carries axis ticks (range end points)
    if cell_row == 0 and cell_col == 0:
        ax.set_xticks(np.linspace(min_xvalue, max_xvalue, num=2))
        ax.set_yticks(np.linspace(0, max_yvalue, num=2))

# Left helper axes: carries the Pred_PSS_Value labels
ax_left = fig.add_axes([0.065, 0.1, 0.01, 0.75])
ax_left.set_xticks([])
ax_left.tick_params(axis="y", direction="out", length=12, width=1.2)
ax_left.set_yticks(np.arange(num_rows) + 0.5)
ax_left.set_yticklabels(pred_pss_values, fontsize=10, ha='right')
ax_left.set_ylabel("Predicted PSS Value\nNumber of instances", fontsize=12, labelpad=20)
ax_left.tick_params(left=False, labelleft=True, right=False, labelright=False)
for side in ('top', 'bottom', 'left', 'right'):
    ax_left.spines[side].set_visible(False)

# Bottom helper axes: carries the Pred_Accuracy labels
ax_bottom = fig.add_axes([0.16, 0.06, 0.7, 0.01])
ax_bottom.set_yticks([])
ax_bottom.set_xticks(np.arange(num_cols))
ax_bottom.set_xticklabels(pred_accuracies, rotation=0, fontsize=10, ha='center')
ax_bottom.set_xlabel("RMSE\nPredicted Accuracy", fontsize=12, labelpad=15)
ax_bottom.tick_params(axis="x", direction="inout", length=6, width=1.2)
for side in ('top', 'bottom', 'left', 'right'):
    ax_bottom.spines[side].set_visible(False)

# No gaps between the grid subplots
plt.subplots_adjust(wspace=0, hspace=0)
plt.savefig("Real_participants_overlayed_vertical_line_null_distribution_plots.png")
plt.show()

### ➕ Additional Analyses: 

In [None]:
def calculate_auroc_shdt(participant_data, participant_id):
    """Compute the ROC curve and its area for one participant.

    The 'Confidence' column is used as the score and the 'Recode_Accuracy'
    column as the true label.

    Returns a dict with keys 'Participant' (the given participant_id),
    'fpr', 'tpr' (as returned by roc_curve) and 'roc_auc' (area under that curve).
    """
    # Work on a copy of the input table
    trials = participant_data.copy()

    fpr, tpr, _ = roc_curve(trials['Recode_Accuracy'], trials['Confidence'])

    return {
        'Participant': participant_id,
        'fpr': fpr,
        'tpr': tpr,
        'roc_auc': auc(fpr, tpr),
    }


In [None]:
# Compute a confidence-vs-accuracy ROC curve / AUC for every real participant
data_directory = '/PATH/SUBJECT_DATA/'

real_participant_pred_pss_accuracy = pd.read_csv('/PATH/Real_participants_pred_pss_accuracy_100_400.tsv', sep='\t')

# Results table indexed by zero-padded participant ID (e.g. '005'), so IDs
# parsed from file names can be looked up directly. The first row per ID is
# kept, matching a `.values[0]` lookup.
pred_lookup = real_participant_pred_pss_accuracy.copy()
pred_lookup['_id_key'] = pred_lookup['Participant_ID'].astype(str).str.zfill(3)
pred_lookup = pred_lookup.drop_duplicates('_id_key').set_index('_id_key')


def classify_response(row, pss_value):
    """Return 1 if the response matches the delay/PSS relation, 0 if not, None for unknown codes.

    NOTE(review): response codes follow the mapping used by
    compute_response_counts_ref (-1 = before, 0 = same, 1 = after). The
    previous version of this cell treated -1 as 'same' and 0 as 'before',
    which contradicted that mapping — confirm against the task's data dictionary.
    """
    code, delay = row['Response_code'], row['Current Delay']
    if code == -1:  # 'before'
        return 1 if delay < pss_value else 0
    if code == 0:  # 'same time'
        return 1 if delay == pss_value else 0
    if code == 1:  # 'after'
        return 1 if delay > pss_value else 0
    return None


roc_results = []
plt.figure(figsize=(8, 6))  # Prepare figure for group-level ROC plot

#⚠️ IF EXCLUDING PARTICIPANTS WHO HAVE RANDOM BUTTON PRESS AND PARALLEL LINES
excluded_ids = set(exclude_ids_parallel) | set(exclude_ids_random)
# Pad the exclusion IDs like the file-name IDs so '5' and '005' match.
excluded_id_keys = {str(pid).zfill(3) for pid in excluded_ids}

for filename in os.listdir(data_directory):
    if not filename.endswith('.tsv'):  # Process only TSV files
        continue
    # Extract participant ID: keep only the numeric part (e.g. 'sub-005d' → '005')
    match = re.search(r'sub-(\d+)', filename)
    if not match:
        continue  # Skip files that do not match the expected format
    participant_id = match.group(1)
    id_key = participant_id.zfill(3)

    # ⚠️ Exclude if participant_id is in either exclusion set
    if id_key in excluded_id_keys:
        print(f"Skipping excluded participant: {participant_id}")
        continue

    # A data file without a row in the results table cannot be scored
    if id_key not in pred_lookup.index:
        print(f"Skipping participant without predicted PSS/accuracy: {participant_id}")
        continue

    # Read the participant's data file and clean column names (remove tabs, strip spaces)
    participants_df = pd.read_csv(os.path.join(data_directory, filename), sep='\t',  skiprows=1)
    participants_df.columns = [col.strip().replace('\t', '') for col in participants_df.columns]

    # Keep staircase trials only; `.copy()` so the column assignments below
    # act on an independent frame rather than a view of the filtered one.
    participants_df = participants_df[
        ~participants_df['Staircase_name'].isin(['Training', 'Post_task_question'])
    ].copy()

    participants_df['Trial'] = participants_df['Trial'].astype(int)
    participants_df['Modified_Staircase_name'] = participants_df['Staircase_name'].str[:-2]

    # Recode trial numbers: '400' staircases keep their trial number, all others are shifted by 15
    participants_df['Recode_Trial'] = np.where(
        participants_df['Staircase_name'].str.contains('400'),
        participants_df['Trial'],
        participants_df['Trial'] + 15
    )

    pss_value = pred_lookup.at[id_key, 'Pred_PSS_Value']
    print('PSS:', pss_value)

    accuracy_value = pred_lookup.at[id_key, 'Pred_Accuracy']
    print('Accuracy:', accuracy_value)

    # Score every trial against the participant's predicted PSS
    participants_df['Recode_Accuracy'] = participants_df.apply(classify_response, axis=1, pss_value=pss_value)

    # Create trial-by-trial DataFrame
    trial_data = participants_df[['Staircase_name', 'Recode_Trial', 'Current Delay', 'Button', 'Response_code', 'Recode_Accuracy', 'Confidence']]

    # Proportion of correct trials, relative to the actual number of trials
    print('Recoded Accuracy:', trial_data['Recode_Accuracy'].sum() / len(trial_data), '\n')

    # Calculate ROC
    roc_result = calculate_auroc_shdt(trial_data, participant_id)
    roc_results.append(roc_result)
    plt.plot(roc_result['fpr'], roc_result['tpr'], alpha = 0.4, label=f'Participant {participant_id} (AUC = {roc_result["roc_auc"]:.2f})')


# Finalize group-level ROC plot
plt.plot([0, 1], [0, 1], linestyle='--', color='black', alpha = 1)  # Diagonal line (random classifier)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves')
#plt.legend(loc='lower right')
plt.show()

# Explicit columns keep the table well-formed even when no participant was processed
df_auc = pd.DataFrame(
    [{'Participant_ID': r['Participant'], 'AUC': r['roc_auc']} for r in roc_results],
    columns=['Participant_ID', 'AUC'],
)

# Create a box plot for AUC values
plt.figure(figsize=(6, 4))
sns.boxplot(y=df_auc['AUC'])
plt.title('Distribution of ROC AUC Values')
plt.ylabel('AUC')
plt.show()

# Merge keys must share a dtype: file-name IDs are strings, the TSV IDs are converted to int too
df_auc['Participant_ID'] = df_auc['Participant_ID'].astype(int)
real_participant_pred_pss_accuracy['Participant_ID'] = real_participant_pred_pss_accuracy['Participant_ID'].astype(int)
real_participant_pred_pss_accuracy_auc = pd.merge(real_participant_pred_pss_accuracy, df_auc, on='Participant_ID', how='left')
print(real_participant_pred_pss_accuracy_auc)

# Correlation between AUC and predicted accuracy. The left merge leaves AUC
# as NaN for participants without a ROC result; those rows are left out of the correlation.
auc_vs_accuracy = real_participant_pred_pss_accuracy_auc.dropna(subset=['AUC', 'Pred_Accuracy'])
correlation, p_value = pearsonr(auc_vs_accuracy['AUC'], auc_vs_accuracy['Pred_Accuracy'])

plt.figure(figsize=(6, 4))
sns.scatterplot(x=auc_vs_accuracy['AUC'], y=auc_vs_accuracy['Pred_Accuracy'])
plt.xlabel('AUC')
plt.ylabel('Real Participant PSS Accuracy')
plt.title(f'Correlation: {correlation:.2f} (p = {p_value:.3f})')
plt.show()

print(f'Pearson correlation: {correlation:.2f}, p-value: {p_value:.3f}')

In [None]:
# A notebook cell only displays its last expression, so the four statistics
# are computed in a single expression to show all of them.
real_participant_pred_pss_accuracy_auc['AUC'].agg(['mean', 'std', 'min', 'max'])

DIFF PER TRIAL

In [None]:
# Per-trial difference between the mean 400ms-staircase delay and the mean
# 100ms-staircase delay, one row per real participant.
# Rows are collected in a list and converted once at the end
# (DataFrame.append / Series.append no longer exist in pandas >= 2.0).
participant_rows = []

for filename in tqdm(os.listdir(data_directory)):
    if not filename.endswith('.tsv'):  # Process only TSV files
        continue
    # Extract participant ID: keep only the numeric part (e.g. 'sub-005d' → '005')
    match = re.search(r'sub-(\d+)', filename)
    if not match:
        continue  # Skip files that do not match the expected format
    participant_id = match.group(1)

    # Read the participant's data file and clean column names
    participants_df = pd.read_csv(os.path.join(data_directory, filename), sep='\t',  skiprows=1)
    participants_df.columns = [col.strip().replace('\t', '') for col in participants_df.columns]

    # Keep staircase trials only; `.copy()` so the column assignments below
    # act on an independent frame rather than a view of the filtered one.
    participants_df = participants_df[
        ~participants_df['Staircase_name'].isin(['Training', 'Post_task_question'])
    ].copy()
    participants_df['Modified_Staircase_name'] = participants_df['Staircase_name'].str[:-2]
    participants_df['Trial'] = participants_df['Trial'].astype(int)

    # Recode trial numbers: '400' staircases keep their trial number, all others are shifted by 15
    participants_df['Recode_Trial'] = np.where(
        participants_df['Staircase_name'].str.contains('400'),
        participants_df['Trial'],
        participants_df['Trial'] + 15
    )

    # Mean delay per recoded trial, separately for the '400' and '100' staircases
    is_400 = participants_df['Staircase_name'].str.contains('400')
    is_100 = participants_df['Staircase_name'].str.contains('100')
    mean_delays_400 = participants_df[is_400].groupby('Recode_Trial')['Current Delay'].mean()
    mean_delays_100 = participants_df[is_100].groupby('Recode_Trial')['Current Delay'].mean()

    # Stack both series and relabel them 1-15 (400ms) and 16-30 (100ms).
    # Assigning the index raises if there are not exactly 30 values.
    mean_delays = pd.concat([mean_delays_400, mean_delays_100])
    mean_delays.index = np.arange(1, 31)

    participant_data = {'Participant_ID': participant_id}

    # Difference (400ms minus 100ms) at each of the 15 trials
    for trial in range(1, 16):
        participant_data[f'trial_{trial}_diff'] = mean_delays.loc[trial] - mean_delays.loc[trial + 15]

    participant_rows.append(participant_data)

participant_per_trial_diff_df = pd.DataFrame(participant_rows)

display(HTML(participant_per_trial_diff_df.head().to_html()))

In [None]:
# Per-trial difference between the mean 400ms-staircase delay and the mean
# 100ms-staircase delay, one row per simulated participant.
sim_participants_file = '/PATH/10000_simulated_participants_100_400.tsv'
sim_participants_df = pd.read_csv(sim_participants_file, sep='\t')

sim_participants_df['Trial'] = sim_participants_df['Trial'].astype(int)
sim_participants_df['Modified_Staircase_name'] = sim_participants_df['Staircase_name'].str[:-2]

# Recode trial numbers: '400' staircases keep their trial number, all others are shifted by 15
sim_participants_df['Recode_Trial'] = np.where(
    sim_participants_df['Staircase_name'].str.contains('400'),
    sim_participants_df['Trial'],
    sim_participants_df['Trial'] + 15
)

# Participant IDs in order of first appearance; at most the first 10001 are processed
participant_ids = sim_participants_df['Participant_ID'].unique().tolist()
selected_ids = participant_ids[0:10001]
selected_df = sim_participants_df[sim_participants_df['Participant_ID'].isin(selected_ids)]

# Rows are collected in a list and converted once at the end
# (DataFrame.append / Series.append no longer exist in pandas >= 2.0).
sim_rows = []

# A single groupby pass replaces re-filtering the whole table once per participant.
# sort=False keeps the participants in order of first appearance.
for participant_id, participants_df in selected_df.groupby('Participant_ID', sort=False):
    # Mean delay per recoded trial, separately for the '400' and '100' staircases
    is_400 = participants_df['Staircase_name'].str.contains('400')
    is_100 = participants_df['Staircase_name'].str.contains('100')
    mean_delays_400 = participants_df[is_400].groupby('Recode_Trial')['Current Delay'].mean()
    mean_delays_100 = participants_df[is_100].groupby('Recode_Trial')['Current Delay'].mean()

    # Stack both series and relabel them 1-15 (400ms) and 16-30 (100ms).
    # Assigning the index raises if there are not exactly 30 values.
    mean_delays = pd.concat([mean_delays_400, mean_delays_100])
    mean_delays.index = np.arange(1, 31)

    participant_data = {'Participant_ID': participant_id}

    # Difference (400ms minus 100ms) at each of the 15 trials
    for trial in range(1, 16):
        participant_data[f'trial_{trial}_diff'] = mean_delays.loc[trial] - mean_delays.loc[trial + 15]

    sim_rows.append(participant_data)

sim_participants_per_trial_diff_df = pd.DataFrame(sim_rows)

display(HTML(sim_participants_per_trial_diff_df.head().to_html()))

In [None]:
# 3x5 grid: one panel per trial (1-15), each overlaying the per-trial
# difference KDEs of the null, simulated and real samples.
fig, axes = plt.subplots(3, 5, figsize=(15, 9))
fig.tight_layout(pad=4.0)

# (table, line colour) in drawing order
kde_sources = (
    (null_per_trial_diff_df, "#5085E1"),
    (sim_participants_per_trial_diff_df, "#e86a36"),
    (participant_per_trial_diff_df, "#e54e40"),
)

for trial, ax in enumerate(axes.ravel(), start=1):
    diff_col = f'trial_{trial}_diff'
    for source_df, line_colour in kde_sources:
        sns.kdeplot(source_df[diff_col], color=line_colour, ax=ax)
    ax.set_xlabel(f"Difference at Trial {trial} (ms)", fontsize=10)
    ax.set_ylabel("Density", fontsize=10)
    ax.set_title(f"Trial {trial}", fontsize=12)

plt.show()

In [None]:
# For every second trial (3, 5, ..., 15), print the percentage of rows in the
# null table whose per-trial difference lies below the threshold.
threshold = 100

for trial in range(3, 16, 2):
    diff_col = f'trial_{trial}_diff'
    filtered_participants = null_per_trial_diff_df[null_per_trial_diff_df[diff_col] < threshold]
    print(f"Null with {diff_col} < {threshold}:")
    print(len(filtered_participants[['Participant_ID', diff_col]]) / len(null_per_trial_diff_df) * 100)

# Confidence Ratings:

In [None]:
# Add mean confidence ratings:
# read every participant .tsv, compute the mean of the 'Confidence' column per
# participant and merge it into real_participant_pred_pss_accuracy_auc.
data_directory = '/PATH/SUBJECT_DATA/'
conf_list = []  # One DataFrame per participant file

#⚠️ IF EXCLUDING PARTICIPANTS WHO HAVE RANDOM BUTTON PRESS AND PARALLEL LINES
# NOTE(review): excluded_ids is built here but not applied anywhere in this cell.
excluded_ids = set(exclude_ids_parallel) | set(exclude_ids_random)

# sorted() makes the row order of conf_df independent of the filesystem's listing order
for filename in sorted(os.listdir(data_directory)):
    if not filename.endswith('.tsv'):  # Process only TSV files
        continue

    # Extract participant ID: keep only the numeric part (e.g., 'sub-005d' → '005')
    match = re.search(r'sub-(\d+)', filename)
    if not match:
        continue  # Skip files that do not match the expected format
    participant_id = match.group(1)

    # Read the participant's data file (the first line of the file is skipped)
    file_path = os.path.join(data_directory, filename)
    participants_df = pd.read_csv(file_path, sep='\t', skiprows=1)

    # Clean column names
    participants_df.columns = [col.strip().replace('\t', '') for col in participants_df.columns]

    # Tag every row with the participant it came from (as first column)
    participants_df.insert(0, 'Participant_ID', participant_id)

    conf_list.append(participants_df)

if not conf_list:
    # pd.concat([]) raises a ValueError anyway; fail with a message that names the cause
    raise ValueError(f"No participant .tsv files found in {data_directory}")

conf_df = pd.concat(conf_list, ignore_index=True)
conf_df['Participant_ID'] = conf_df['Participant_ID'].astype(int)

# Calculate mean confidence for each participant
mean_confidence = conf_df.groupby('Participant_ID')['Confidence'].mean().reset_index()
#mean_confidence.rename(columns={'Confidence': 'SHDT_Mean_Confidence'}, inplace=True)

# The merged table is assigned back to real_participant_pred_pss_accuracy_auc as well,
# so re-running this cell would otherwise merge a second 'Confidence' column and
# produce 'Confidence_x' / 'Confidence_y'. Dropping any existing 'Confidence'
# column first keeps the cell idempotent.
# NOTE(review): assumes the input table has no unrelated 'Confidence' column — confirm.
merge_base = real_participant_pred_pss_accuracy_auc.drop(columns='Confidence', errors='ignore')
real_participant_pred_pss_accuracy_auc_conf = real_participant_pred_pss_accuracy_auc = pd.merge(
    merge_base, mean_confidence, on='Participant_ID', how='left'
)
real_participant_pred_pss_accuracy_auc_conf.to_csv('/PATH/Real_participant_pred_pss_accuracy_auc_conf.tsv', sep='\t', index=False)

print(real_participant_pred_pss_accuracy_auc_conf)

In [None]:
## Add Subjective Reports
# Collect the 'Post_task_question' rows of every participant file, reshape them
# to one row per participant / one column per 'Trial' value, then plot each
# question as a horizontal box plot with the individual ratings overlaid.
posttask_subjective_ratings_list = []  # One DataFrame of post-task rows per file
for filename in os.listdir(data_directory):
    if not filename.endswith('.tsv'):  # Process only TSV files
        continue

    # Extract participant ID: keep only the numeric part (e.g., 'sub-005d' → '005')
    match = re.search(r'sub-(\d+)', filename)
    if not match:
        continue  # Skip files that do not match the expected format
    participant_id = match.group(1)

    # Read only the columns needed for the ratings
    file_path = os.path.join(data_directory, filename)
    participants_df = pd.read_csv(file_path, sep='\t', skiprows=1, usecols=['Staircase_name', 'Trial', 'Current Delay'])

    # Clean column names
    participants_df.columns = [col.strip().replace('\t', '') for col in participants_df.columns]

    # Add a new column for Participant_ID (as first column)
    participants_df.insert(0, 'Participant_ID', participant_id)

    # Keep only the post-task question rows. The explicit .copy() makes the
    # column assignment/rename below operate on an independent frame instead of
    # a slice of the full table (avoids pandas' SettingWithCopyWarning).
    participants_df = participants_df[participants_df['Staircase_name'] == 'Post_task_question'].copy()

    # For these rows the rating is stored in 'Current Delay'
    participants_df['Current Delay'] = participants_df['Current Delay'].astype(int)
    participants_df.rename(columns={'Current Delay': 'Rating'}, inplace=True)

    posttask_subjective_ratings_list.append(participants_df)

# Combine all DataFrames into a single one
posttask_subjective_ratings_df = pd.concat(posttask_subjective_ratings_list, ignore_index=True)
# One row per participant, one column per 'Trial' value
# (presumably the question name for these rows; the plot below relies on the
# columns 'TaskGeneral', 'Difficulty' and 'Breathless' existing).
posttask_subjective_ratings_df = posttask_subjective_ratings_df.pivot(index='Participant_ID', columns='Trial', values='Rating')
# Reset index to make it a normal DataFrame
posttask_subjective_ratings_df.reset_index(inplace=True)
posttask_subjective_ratings_df.columns.name = None  # Remove the automatic name given to columns
posttask_subjective_ratings_df['Participant_ID'] = posttask_subjective_ratings_df['Participant_ID'].astype(int)

#merged = HDT.merge(posttask_subjective_ratings_df, on='Participant_ID', how='left')
#display(HTML(merged.head().to_html()))


# Questions to plot (hard-coded)
question_types = ['TaskGeneral', 'Difficulty', 'Breathless']
print(question_types)

# Set up the plot style
sns.set(style="whitegrid")

# One subplot row per question, single column
n = len(question_types)
cols = 1
rows = n

# Create a figure with subplots
fig, axes = plt.subplots(rows, cols, figsize=(15,5))
axes = axes.flatten()  # Flatten axes for easy iteration

# Loop over each question to fill its subplot
for i, question in enumerate(question_types):
    ax = axes[i]
    question_data = posttask_subjective_ratings_df[question]  # Ratings for the current question

    # Horizontal box plot of the ratings (outliers hidden; the points are drawn below)
    sns.boxplot(
        data=question_data,
        orient='h',        # Make the box plot horizontal
        dodge=False,
        color='white',
        linewidth=1.5,
        ax=ax,
        showfliers=False,
        width=0.2
    )

    # Overlay the individual participant ratings
    sns.stripplot(
        data=question_data,
        color='red',
        orient='h',        # Align with the box plot horizontally
        dodge=False,
        jitter=False,      # No jitter: points sit on the box plot's centre line
        size=6,            # Size of the scatter points
        alpha=1.0,         # Fully opaque points
        ax=ax
    )

    # Customize the subplot
    ax.set_xlabel('', fontsize=12)
    ax.set_ylabel((f"{question}"), fontsize=12)
    ax.legend([], [], frameon=False)  # Remove redundant legend from individual plots
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    ax.set_xlim(-5, 105)
plt.xlabel('Score')  # Applied to the current axes only

plt.tight_layout()

plt_name = 'CARDIAC_subjective_ratings.png'
#plt_save_path = save_dir + '/'+ plt_name
#plt.savefig(plt_save_path, dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Colours: [0] KDE fill, [1] dots of excluded participants, [2] dots of included participants
real_colors = ['#e0736f', '#e69fab', '#dc5746']
excluded_participants = set(exclude_ids_parallel) | set(exclude_ids_random)


def color_by_participant(pid):
    """Return the dot colour for `pid`: real_colors[1] if excluded, else real_colors[2]."""
    if pid in excluded_participants:
        return real_colors[1]
    return real_colors[2]


# Questions to plot and global style
question_types = ['TaskGeneral', 'Difficulty', 'Breathless']
sns.set(style="whitegrid")

# Figure size is specified in centimetres and converted to inches for matplotlib
fig_width_cm = 16
fig_height_cm = 5
fig_size_in = (fig_width_cm / 2.54, fig_height_cm / 2.54)
fig, axes = plt.subplots(
    len(question_types),
    1,
    figsize=fig_size_in,
    gridspec_kw={'height_ratios': [1] * len(question_types)},
    dpi=300,
)

for ax, question in zip(axes, question_types):
    # Ratings for this question (missing values removed) and the matching participant IDs
    data = posttask_subjective_ratings_df[question].dropna()
    participant_ids = posttask_subjective_ratings_df.loc[data.index, 'Participant_ID']

    # --- KDE plot in an inset placed above the row's main axes, sharing its x-axis ---
    ax_kde = ax.inset_axes([0, 1.15, 1, 0.2], sharex=ax)
    sns.kdeplot(data, fill=True, alpha=0.3, color=real_colors[0], ax=ax_kde)
    ax_kde.set_ylabel('')
    ax_kde.set_xlabel('')
    ax_kde.set_yticks([])
    ax_kde.set_xticks([])
    ax_kde.tick_params(bottom=False, labelbottom=False)  # Ensure no x ticks or labels
    for side in ('bottom', 'top', 'left', 'right'):
        ax_kde.spines[side].set_visible(False)
    ax_kde.grid(False)

    # --- Horizontal box plot of the ratings ---
    sns.boxplot(
        x=data,
        orient='h',
        color='white',
        linewidth=1.5,
        width=0.3,
        showfliers=False,
        ax=ax,
    )

    # --- Individual ratings, coloured by inclusion status, with a small random vertical offset ---
    dot_colors = [color_by_participant(pid) for pid in participant_ids]
    jitter = np.random.uniform(-0.01, 0.01, size=len(data))
    ax.scatter(
        data.values,
        jitter,
        c=dot_colors,
        edgecolor='gray',
        linewidth=0,
        s=25,
        alpha=0.5,
        zorder=5,
    )

    # --- Labels and styling ---
    ax.set_ylabel(question, fontsize=7)
    ax.set_xlabel('', fontsize=7)
    ax.set_xlim(-5, 105)
    ax.set_yticks([])
    ax.set_xticks(list(range(0, 101, 10)))
    ax.tick_params(axis='x', labelsize=7)
    ax.grid(False)
    ax.spines['left'].set_visible(False)

# Legend entries for included/excluded participants (only used if the fig.legend call is enabled)
legend_elements = [
    Patch(facecolor=real_colors[2], edgecolor='gray', label='Included Participants'),
    Patch(facecolor=real_colors[1], edgecolor='gray', label='Excluded Participants'),
]
#fig.legend(handles=legend_elements, loc='upper right', fontsize=12, frameon=False)

plt.xlabel('Score', fontsize=7)
#plt.tight_layout(rect=[0, 0, 0.9, 1])
sns.despine()
plt.subplots_adjust(hspace=1.15)  # more vertical spacing between rows
plt.show()


In [None]:
# Render the descriptive statistics of `merged` as an HTML table
merged_summary = merged.describe()
display(HTML(merged_summary.to_html()))

In [None]:
# Difficulty rating vs. mean confidence, coloured by Pred_Accuracy, plus their Pearson correlation.

# Set plot style
sns.set(style="white")

# Colour normalisation for the hue. Deliberately NOT called `norm`: that name is
# imported from scipy.stats at the top of the file and rebinding it here would
# break any cell that uses the scipy distribution afterwards.
color_norm = plt.Normalize(0, 1)
cbar_ticks = np.arange(0, 1.1, 0.1)

# Create exactly one figure. Calling plt.figure(figsize=...) and then
# plt.subplots() would leave an empty extra figure and ignore the requested size.
fig, ax = plt.subplots(figsize=(10, 6))
scatter = sns.scatterplot(
    data=merged,
    x='Difficulty',
    y='SHDT_Mean_Confidence',
    #size='Pred_PSS_Value',  # Scale dot size
    hue='Pred_Accuracy',     # Colour by predicted accuracy
    palette='viridis',
    hue_norm=color_norm,
    alpha=1.0,               # Fully opaque points
    legend=False,            # No legend
    ax=ax,
)
ax.set_xlim(0, 100)
ax.set_ylim(0, 100)

# Correlate only rows where both values are present: the scatter plot silently
# skips incomplete rows, whereas pearsonr gives no usable result if any value is missing.
paired = merged[['SHDT_Mean_Confidence', 'Difficulty']].dropna()
S_HDT_pearsonR_conf_difficulty = pearsonr(paired['SHDT_Mean_Confidence'], paired['Difficulty'])
print('S_HDT: Is confidence related to difficulty? \n R =', S_HDT_pearsonR_conf_difficulty[0], '\n P-value:',S_HDT_pearsonR_conf_difficulty[1], '\n R2 = ', S_HDT_pearsonR_conf_difficulty[0]**2 )

In [None]:
# TaskGeneral rating vs. mean confidence, coloured by Pred_Accuracy, plus their Pearson correlation.

# Set plot style
sns.set(style="white")

# Colour normalisation for the hue. Deliberately NOT called `norm`: that name is
# imported from scipy.stats at the top of the file and rebinding it here would
# break any cell that uses the scipy distribution afterwards.
color_norm = plt.Normalize(0, 1)
cbar_ticks = np.arange(0, 1.1, 0.1)

# Create exactly one figure. Calling plt.figure(figsize=...) and then
# plt.subplots() would leave an empty extra figure and ignore the requested size.
fig, ax = plt.subplots(figsize=(10, 6))
scatter = sns.scatterplot(
    data=merged,
    x='TaskGeneral',
    y='SHDT_Mean_Confidence',
    #size='Pred_PSS_Value',  # Scale dot size
    hue='Pred_Accuracy',     # Colour by predicted accuracy
    palette='viridis',
    hue_norm=color_norm,
    alpha=1.0,               # Fully opaque points
    legend=False,            # No legend
    ax=ax,
)
ax.set_xlim(0, 100)
ax.set_ylim(0, 100)

# Correlate only rows where both values are present: the scatter plot silently
# skips incomplete rows, whereas pearsonr gives no usable result if any value is missing.
paired = merged[['SHDT_Mean_Confidence', 'TaskGeneral']].dropna()
S_HDT_pearsonR_conf_taskG = pearsonr(paired['SHDT_Mean_Confidence'], paired['TaskGeneral'])
print('S_HDT: Is confidence related to how pleasant they found the task? \n R =', S_HDT_pearsonR_conf_taskG[0], '\n P-value:',S_HDT_pearsonR_conf_taskG[1], '\n R2 = ', S_HDT_pearsonR_conf_taskG[0]**2 )

In [None]:
# Mean confidence against (1) Pred_Accuracy scaled by 100 and (2) Pred_PSS_Value,
# each followed by the Pearson correlation of the unscaled columns.

# Set plot style
sns.set(style="white")

# Deliberately NOT called `norm`: that name is imported from scipy.stats at the
# top of the file and rebinding it here would break any cell that uses the
# scipy distribution afterwards.
color_norm = plt.Normalize(0, 1)
cbar_ticks = np.arange(0, 1.1, 0.1)

# --- Scatter plot 1: accuracy (x100) vs. mean confidence ---
# One figure per plot: plt.figure(figsize=...) followed by plt.subplots() would
# leave an empty extra figure and ignore the requested size.
fig, ax = plt.subplots(figsize=(10, 6))
scatter = sns.scatterplot(
    data=merged,
    x=(merged['Pred_Accuracy'] * 100),
    y='SHDT_Mean_Confidence',
    #size='Pred_PSS_Value',  # Scale dot size
    #hue='Participant_ID',    # Color by participant
    #palette='viridis',
    #hue_norm=(0, 1),
    color='red',
    alpha=1.0,               # Fully opaque points
    ax=ax,
)
ax.set_xlim(0, 100)
ax.set_ylim(0, 100)

# Correlate only rows where both values are present: the scatter plot silently
# skips incomplete rows, whereas pearsonr gives no usable result if any value is missing.
paired = merged[['SHDT_Mean_Confidence', 'Pred_Accuracy']].dropna()
S_HDT_pearsonR_acc_conf = pearsonr(paired['SHDT_Mean_Confidence'], paired['Pred_Accuracy'])
print('S_HDT: Is confidence related to how accurate they are? \n R =', S_HDT_pearsonR_acc_conf[0], '\n P-value:',S_HDT_pearsonR_acc_conf[1], '\n R2 = ', S_HDT_pearsonR_acc_conf[0]**2 )


# --- Scatter plot 2: PSS vs. mean confidence ---
fig, ax = plt.subplots(figsize=(10, 6))
scatter = sns.scatterplot(
    data=merged,
    x='Pred_PSS_Value',
    y='SHDT_Mean_Confidence',
    #size='Pred_PSS_Value',  # Scale dot size
    #hue='Participant_ID',    # Color by participant
    #palette='viridis',
    #hue_norm=(0, 1),
    color='red',
    alpha=1.0,               # Fully opaque points
    ax=ax,
)
ax.set_xlim(95, 405)
ax.set_ylim(0, 100)

paired = merged[['SHDT_Mean_Confidence', 'Pred_PSS_Value']].dropna()
S_HDT_pearsonR_pss_conf = pearsonr(paired['SHDT_Mean_Confidence'], paired['Pred_PSS_Value'])
print('S_HDT: Is confidence related to their PSS? \n R =', S_HDT_pearsonR_pss_conf[0], '\n P-value:',S_HDT_pearsonR_pss_conf[1], '\n R2 = ', S_HDT_pearsonR_pss_conf[0]**2 )

# Training: 

In [None]:
# Summarise the 'Training' staircase across participants:
# number of trials completed, delays used and confidence ratings.
data_directory = '/PATH/SUBJECT_PATH/'
training_list = []  # One DataFrame of training rows per participant file
for filename in os.listdir(data_directory):
    if not filename.endswith('.tsv'):  # Process only TSV files
        continue

    # Extract participant ID: keep only the numeric part (e.g., 'sub-005d' → '005')
    match = re.search(r'sub-(\d+)', filename)
    if not match:
        continue  # Skip files that do not match the expected format
    participant_id = match.group(1)

    # Read the participant's data file and clean the column names
    participants_df = pd.read_csv(os.path.join(data_directory, filename), sep='\t', skiprows=1)
    participants_df.columns = [col.strip().replace('\t', '') for col in participants_df.columns]

    # Keep only training rows. The explicit .copy() makes the column assignment
    # below operate on an independent frame instead of a slice of the full table
    # (avoids pandas' SettingWithCopyWarning).
    participants_df = participants_df[participants_df['Staircase_name'] == 'Training'].copy()
    participants_df['Participant_ID'] = participant_id

    training_list.append(participants_df)

# Combine all DataFrames into a single one
training_df = pd.concat(training_list, ignore_index=True)
training_df['Trial'] = training_df['Trial'].astype(int)

#print(training_df)

# Highest training trial number reached by each participant
max_trial_per_participant = training_df.groupby('Participant_ID')['Trial'].max()
print('TRAINING: How many trials were completed: \n', max_trial_per_participant.describe())
print('Median: \n', max_trial_per_participant.median())


print('\n TRAINING: What delays were used: \n', training_df['Current Delay'].describe())
print('Median:', training_df['Current Delay'].median())

plt.figure(figsize=(8, 6))
sns.histplot(training_df['Current Delay'], bins=100, kde=True, color='#4C72B0', alpha=0.6)
plt.show()


# Highest confidence rating given by each participant during training.
# NOTE(review): this summarises the per-participant maximum, not the mean — confirm that is intended.
max_confidence_per_participant = training_df.groupby('Participant_ID')['Confidence'].max()
print('TRAINING: How confident were people: \n', max_confidence_per_participant.describe())
# The median must come from the same Confidence series as the summary above
# (it was previously computed from 'Trial', a copy-paste slip).
print('Median: \n', max_confidence_per_participant.median())

# Distribution of all training confidence ratings
plt.figure(figsize=(8, 6))
sns.histplot(training_df['Confidence'], bins=100, kde=True, color='#4C72B0', alpha=0.6)
plt.show()



In [None]:

# AUC_x plotted against AUC_y, one point per row of aucs_merged
plt.figure(figsize=(8, 6))
sns.scatterplot(data=aucs_merged, x='AUC_x', y='AUC_y', s=100)

# Axis labels and title
plt.title('Scatter Plot of AUC_x vs AUC_y by Participant ID')
plt.xlabel('AUC_sHDT')
plt.ylabel('AUC_HDT')

# Display the plot
plt.show()

print(aucs_merged)

# Pearson correlation between the two AUC columns
insight_pearsonR = pearsonr(aucs_merged['AUC_x'], aucs_merged['AUC_y'])
auc_r = insight_pearsonR[0]
auc_p = insight_pearsonR[1]
print('S_HDT: Is AUC_SHDT related to AUC_HDT on task? \n R =', auc_r, '\n P-value:', auc_p, '\n R2 = ', auc_r**2)

In [None]:
# Build null_per_trial_diff_df: for the first 100 participants of each of the
# 100 simulated-null files, the per-trial difference between the mean
# 'Current Delay' of the '400' staircases and that of the '100' staircases.
#
# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append / Series.append were removed in pandas 2.0, and appending to
# a DataFrame row by row copies the whole frame on every call.
null_rows = []

for i in tqdm(range(0, 100)):
    participants_file = f'/PATH/SIMULATED_NULL/10000_random_participant_simulations_{i}.tsv'
    all_null_df = pd.read_csv(participants_file, sep='\t')

    # Participant IDs in order of first appearance
    participant_ids = all_null_df['Participant_ID'].unique().tolist()

    # Only the first 100 participants of each file are used
    for participant_id in participant_ids[0:100]:
        # .copy() so the column assignments below act on an independent frame
        # rather than a slice of all_null_df (avoids SettingWithCopyWarning)
        participants_df = all_null_df[all_null_df['Participant_ID'] == participant_id].copy()
        participants_df['Trial'] = participants_df['Trial'].astype(int)
        participants_df['Modified_Staircase_name'] = participants_df['Staircase_name'].str[:-2]

        is_400 = participants_df['Staircase_name'].str.contains('400')
        is_100 = participants_df['Staircase_name'].str.contains('100')

        # Recode trial numbers: '400' staircases keep their trial number,
        # every other staircase is shifted by 15
        participants_df['Recode_Trial'] = np.where(
            is_400,
            participants_df['Trial'],
            participants_df['Trial'] + 15
        )

        # Mean delay per recoded trial, averaged over all staircases of each kind
        mean_delays_400 = participants_df[is_400].groupby('Recode_Trial')['Current Delay'].mean()
        mean_delays_100 = participants_df[is_100].groupby('Recode_Trial')['Current Delay'].mean()

        # Stack the two series: '400' means first, then '100' means
        mean_delays = pd.concat([mean_delays_400, mean_delays_100])

        # Relabel as 1-30; this raises if the participant does not have
        # exactly 30 trial means (15 per staircase kind expected)
        mean_delays.index = np.arange(1, 31)

        participant_data = {'Participant_ID': participant_id}

        # Difference between the '400' and '100' mean delay at each of the 15 trials
        for trial in range(1, 16):
            value_400 = mean_delays[trial]
            value_100 = mean_delays[trial + 15]
            participant_data[f'trial_{trial}_diff'] = value_400 - value_100

        null_rows.append(participant_data)

# One row per simulated participant
null_per_trial_diff_df = pd.DataFrame(null_rows)

display(HTML(null_per_trial_diff_df.head().to_html()))