In [1]:
# Import required libraries
import glob
import os.path as op
import warnings
from datetime import timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [2]:
# Setup directory and visualization style
home_dir = op.abspath('./')
# glob returns paths in arbitrary, filesystem-dependent order; sorting makes the
# row order of the concatenated data reproducible across machines and runs.
data_files = sorted(glob.glob(op.join(home_dir, 'data', '*.csv')))
sns.set_context('talk')

# Quantile-binning settings per similarity source ('v2' / 'it'): number of bins
# and the display label of each bin. Only the two extreme bins carry text; the
# middle labels are whitespace strings of different lengths (presumably so that
# every label stays distinct while rendering as blank on plots).
column_params = {
    'v2': {'n_cats': 5, 'labels': ['Least Similar', '', '  ', '   ', 'Most Similar']},
    'it': {'n_cats': 5, 'labels': ['Least Similar', '', '  ', '   ', 'Most Similar']}
}

scaler = StandardScaler()

In [3]:
def remove_unit_variance(df, col, unit, group=None, suffix="_within"):
    """Return a copy of ``df`` with between-unit mean differences removed from ``col``.

    A new column named ``col + suffix`` is added. Each value has the mean of its
    own ``unit`` subtracted and the overall mean of ``col`` added back. When
    ``group`` is given, this is done separately inside every level of ``group``
    (the mean added back is then that level's mean).

    Args:
        df: Input DataFrame (not modified).
        col: Name of the column to adjust.
        unit: Column (or grouper) identifying the units whose means are removed.
        group: Optional column (or grouper) defining independent subsets.
        suffix: Suffix appended to ``col`` to name the new column.

    Returns:
        A copy of ``df`` with the additional adjusted column.
    """
    out_name = col + suffix
    result = df.copy()

    def _within(frame):
        # centre every unit on zero, then shift back to the mean of this frame
        centred = frame.groupby(unit)[col].transform(lambda values: values - values.mean())
        return centred + frame[col].mean()

    if group is not None:
        result[out_name] = np.nan
        for _, subset in result.groupby(group):
            adjusted = _within(subset)
            result.loc[adjusted.index, out_name] = adjusted
        return result

    result[out_name] = _within(result)
    return result

def parse_dates(series):
    """Parse the first date string of ``series`` into a timestamp.

    Only ``series.iloc[0]`` is read. Strings are expected in the form
    ``YYYY-MM-DD_HHhMM.SS.ffffff``. A string containing ``"24h"`` is parsed
    with that text replaced by ``"00h"`` and then moved forward by one day.

    Returns:
        The parsed timestamp.
    """
    stamp_format = '%Y-%m-%d_%Hh%M.%S.%f'
    raw = series.iloc[0]

    if "24h" not in raw:
        return pd.to_datetime(raw, format=stamp_format)

    # hour 24 is not a valid %H value: parse as 00h and roll over to the next day
    midnight = pd.to_datetime(raw.replace("24h", "00h"), format=stamp_format)
    return midnight + timedelta(days=1)

In [4]:
def process_values_time(value):
    """Convert a bracketed, comma-separated string into a list of floats.

    Args:
        value: Raw cell value, e.g. ``"[0.52, 1.31]"``.

    Returns:
        A list of floats, or ``None`` when ``value`` is not a string wrapped in
        square brackets, contains no entries (``"[]"``, ``"[ ]"``, ``"[ , ]"``),
        or contains an entry that cannot be converted to float.
    """
    if not (isinstance(value, str) and value.startswith('[') and value.endswith(']')):
        return None

    # Blank tokens (empty list, stray commas, whitespace) carry no data
    tokens = [token for token in value.strip('[]').split(',') if token.strip()]
    if not tokens:
        # Treat every token-less list the same way, not only the literal '[]'
        return None

    try:
        return [float(token) for token in tokens]
    except ValueError:
        return None

def process_values_click(value):
    """Split a bracketed, comma-separated string into a list of raw items.

    Items are returned exactly as they appear between the commas (surrounding
    whitespace and quote characters are kept); blank items are dropped.

    Args:
        value: Raw cell value, e.g. ``"['img1', 'img2']"``.

    Returns:
        A list of item strings, or ``None`` when ``value`` is not a string
        wrapped in square brackets or contains no non-blank item
        (``"[]"``, ``"[ ]"``, ``"[ , ]"``).
    """
    if not (isinstance(value, str) and value.startswith('[') and value.endswith(']')):
        return None

    items = [item for item in value.strip('[]').split(',') if item.strip()]
    # Treat every item-less list the same way, not only the literal '[]'
    return items or None

In [5]:
def df_creation(data_files, start_date, end_date):
    """Load the CSV files and keep the rows recorded within a date range.

    Every readable file is loaded and tagged with its path in a ``filename``
    column. Files that cannot be read are skipped with a warning. Rows are kept
    when their ``date`` parses with the format ``YYYY-MM-DD_HHhMM.SS.ffffff``,
    lies between ``start_date`` and ``end_date`` (both inclusive) and their
    ``V2_diff`` is not null. ``reliability`` is cast to float and a
    ``Retrocue Reliability`` label is added ('high' when reliability > 0.75,
    otherwise 'low').

    Notes:
        * A date-only ``end_date`` such as ``'2024-10-30'`` converts to midnight,
          so rows stamped later on that day fall outside the range.
        * Date strings that do not match the format (including an hour written
          as ``24h``) are coerced to NaT and therefore dropped.

    Args:
        data_files: Iterable of CSV paths.
        start_date: Lower bound, anything accepted by ``pd.to_datetime``.
        end_date: Upper bound, anything accepted by ``pd.to_datetime``.

    Returns:
        ``(df, numeric_columns)`` where ``numeric_columns`` is the index of
        float64/int64 column names, or ``(empty DataFrame, [])`` when no file
        could be read.
    """
    loaded = []
    for file_path in data_files:
        try:
            session = pd.read_csv(file_path)
        except Exception as exc:
            # Best effort: keep going, but make the skipped file visible
            warnings.warn(f"Skipping unreadable data file {file_path}: {exc}", stacklevel=2)
            continue
        session['filename'] = file_path
        loaded.append(session)

    if not loaded:
        # Return empty DataFrame if no files were processed
        return pd.DataFrame(), []

    df = pd.concat(loaded, ignore_index=True)

    # Unparseable dates become NaT; `between` is False for NaT, so they are dropped
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d_%Hh%M.%S.%f', errors='coerce')
    in_range = df['date'].between(pd.to_datetime(start_date), pd.to_datetime(end_date))
    has_v2_diff = df['V2_diff'].notnull()
    df = df.loc[in_range & has_v2_diff].reset_index(drop=True)

    # Convert reliability to float and create Retrocue Reliability
    df['reliability'] = df['reliability'].astype(float)
    df['Retrocue Reliability'] = np.where(df['reliability'] > 0.75, 'high', 'low')

    # Get numeric columns for later use
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns

    return df, numeric_columns

# Load data from both pilot periods
df1, numeric_columns1 = df_creation(data_files, '2024-10-08', '2024-10-30')  # pilot5
df2, numeric_columns2 = df_creation(data_files, '2024-11-22', '2025-01-30')  # pilot6

# Concatenate the data from both pilots. ignore_index=True gives the combined
# frame a fresh, unique row index instead of repeating each pilot's own labels.
df = pd.concat([df1, df2], axis=0, ignore_index=True)
numeric_columns = numeric_columns1  # Use columns from first dataset

print(f"Loaded {len(df)} rows of data")
print(f"Total unique participants: {df['participant'].nunique()}")

Loaded 99014 rows of data
Total unique participants: 330


In [6]:
# Identify each participant's earliest session file. Files are ranked by their
# recorded timestamps rather than by row order: row order only reflects the
# order in which the CSVs were read and concatenated, which is not guaranteed
# to be chronological. The stable sort keeps the existing order for ties.
first_files = (
    df.sort_values('date', kind='stable')
    .groupby('participant', sort=False)['filename']
    .first()
    .reset_index()
    .rename(columns={'filename': 'earliest_filename'})
)
print("\nEarliest file for each participant:")
print(first_files)

# Keep only the rows that come from that earliest file
df = pd.merge(df, first_files, on='participant', how='left')
df = df[df['filename'] == df['earliest_filename']].copy()
df = df.drop('earliest_filename', axis=1)

print(f"\nCleaned DataFrame (only earliest file entries per participant): {len(df)} rows")



Earliest file for each participant:
     participant                                  earliest_filename
0         156250  /Users/lana/Desktop/psychoPyExperiments/wm_dee...
1         157210  /Users/lana/Desktop/psychoPyExperiments/wm_dee...
2         152017  /Users/lana/Desktop/psychoPyExperiments/wm_dee...
3         148645  /Users/lana/Desktop/psychoPyExperiments/wm_dee...
4         156991  /Users/lana/Desktop/psychoPyExperiments/wm_dee...
..           ...                                                ...
325       145480  /Users/lana/Desktop/psychoPyExperiments/wm_dee...
326       154459  /Users/lana/Desktop/psychoPyExperiments/wm_dee...
327       153394  /Users/lana/Desktop/psychoPyExperiments/wm_dee...
328       136771  /Users/lana/Desktop/psychoPyExperiments/wm_dee...
329       167431  /Users/lana/Desktop/psychoPyExperiments/wm_dee...

[330 rows x 2 columns]

Cleaned DataFrame (only earliest file entries per participant):


In [7]:
# Sanity check: number of distinct participants after keeping one file per participant
len(set(df['participant']))

330

In [8]:
# Keep only rows recorded during the main task; rows from any other task phase are dropped
df = df[df['taskPhase'] == 'mainTask']


In [9]:
def identify_participants_in_multiple_files(dataframe):
    """Report participants whose rows come from more than one file.

    Prints how many such participants exist (and the table itself when it is
    not empty).

    Args:
        dataframe: Frame with ``participant`` and ``filename`` columns.

    Returns:
        DataFrame with columns ``participant`` and ``filename``, where
        ``filename`` holds the number of distinct files (always > 1).
    """
    files_per_participant = (
        dataframe.groupby('participant')['filename'].nunique().reset_index()
    )
    has_several_files = files_per_participant['filename'] > 1
    repeated = files_per_participant[has_several_files]

    print(f"Found {len(repeated)} participants who appear in multiple files:")
    if len(repeated) > 0:
        print(repeated)

    return repeated

# Check whether any participant still has rows from more than one file
multi_file_participants = identify_participants_in_multiple_files(df)

# Rows belonging to those participants (empty when none were found)
multi_file_participant_data = df[df['participant'].isin(multi_file_participants['participant'])]


Found 0 participants who appear in multiple files:


In [10]:
# Sanity check: number of distinct participants after the main-task filter
len(set(df['participant']))

330

In [11]:
def add_trial_info(df, participant_col, trials_per_batch=30):
    """Number each participant's rows and assign them to consecutive batches.

    Args:
        df: Input DataFrame (not modified); rows are numbered in their current order.
        participant_col: Column identifying the participant.
        trials_per_batch: Number of consecutive trials per batch.

    Returns:
        A copy of ``df`` with ``Trial_Number`` (1-based, per participant) and
        ``Trial_Batch`` (1-based) columns.
    """
    zero_based = df.groupby(participant_col).cumcount()
    return df.assign(
        Trial_Number=zero_based + 1,
        Trial_Batch=zero_based // trials_per_batch + 1,
    )

# Number every participant's trials (in current row order) and batch them
df = add_trial_info(df, participant_col='participant')

# Add condition batch information.
# Each participant's row with trial == 182 is used as the reference row.
# NOTE(review): why trial 182 specifically is not evident from this notebook - confirm.
df_trial_182 = df[df['trial'] == 182][['participant', 'trial', 'cond_file', 'root', 'IT_diff']].copy()
# Participants sharing the same (cond_file, root, IT_diff) on that row get the same 1-based batch id
df_trial_182['conditions_batch'] = df_trial_182.groupby(['cond_file', 'root', 'IT_diff']).ngroup() + 1

# Count participants per batch (and keep the array of their ids)
batch_info = df_trial_182.groupby('conditions_batch').agg(
    participants_count=('participant', 'nunique'),
    participants_list=('participant', 'unique')
).reset_index()

# Attach the per-batch counts to the reference rows
df_trial_182_with_counts = df_trial_182.merge(
    batch_info[['conditions_batch', 'participants_count', 'participants_list']],
    on='conditions_batch',
    how='left'
)

# Then merge this back to the main dataframe, one batch per participant.
# Left merge: participants without a trial-182 row get NaN in the batch columns.
# NOTE(review): a participant with more than one trial-182 row would have all of
# their rows duplicated by this merge - verify each participant has exactly one.
df = df.merge(
    df_trial_182_with_counts[['participant', 'conditions_batch', 'participants_count', 'participants_list']],
    on='participant',
    how='left'
)

print(f"Added trial numbers and batch information. Max trial number: {df['Trial_Number'].max()}")

Added trial numbers and batch information. Max trial number: 300


In [12]:
# Sanity check: number of distinct participants after merging the batch information
len(set(df['participant']))

330

In [13]:
def calculate_differences(df):
    """Derive per-trial IT and V2 similarity columns relative to the cued/tested item.

    For each source (``IT``, ``V2``) the ``<source>_root_im1`` / ``<source>_root_im2``
    values are re-expressed as:

    * ``*_sim_dis_attend`` / ``*_sim_dis_unattend``: value of the image named in
      ``attend`` / of the other image.
    * ``*_sim_dis_test`` / ``*_sim_dis_untest``: value of the image named in
      ``test_item`` / of the other image.
    * ``*_sim_dis_diff``: attended minus unattended value.
    * ``*_sim_dis_diff_test``: tested minus untested value.
    * ``*_im1_im2``: copy of ``<source>_im1_im2``.

    Any ``attend`` / ``test_item`` value other than ``'img1'`` is treated as
    referring to image 2.

    Returns:
        A copy of ``df`` with the 14 derived columns added.
    """
    sources = ('IT', 'V2')
    derived = {}

    # Pick the im1 value where the mask holds, otherwise the im2 value
    selectors = (
        ('attend', df['attend'] == 'img1'),
        ('test', df['test_item'] == 'img1'),
        ('unattend', df['attend'] != 'img1'),
        ('untest', df['test_item'] != 'img1'),
    )
    for suffix, use_im1 in selectors:
        for source in sources:
            derived[f'{source.lower()}_sim_dis_{suffix}'] = np.where(
                use_im1, df[f'{source}_root_im1'], df[f'{source}_root_im2']
            )

    # Signed difference: focused image minus the other image
    for suffix, focus_col in (('diff', 'attend'), ('diff_test', 'test_item')):
        focus_is_im1 = df[focus_col] == 'img1'
        for source in sources:
            im1 = df[f'{source}_root_im1']
            im2 = df[f'{source}_root_im2']
            derived[f'{source.lower()}_sim_dis_{suffix}'] = np.where(
                focus_is_im1, im1 - im2, im2 - im1
            )

    # Copy image similarity values
    for source in sources:
        derived[f'{source.lower()}_im1_im2'] = df[f'{source}_im1_im2']

    return df.assign(**derived)

In [14]:
# Preference labels derived from the difference columns built by calculate_differences
def add_preferences(df):
    """Label each trial by the sign of its similarity differences and bin them.

    Adds:

    * ``v2_converges``: 'V2/IT agree' when ``it_sim_dis_diff`` and
      ``v2_sim_dis_diff`` are both > 0 or both < 0, otherwise 'V2/IT disagree'.
    * ``v2_prefers`` / ``it_prefers``: 'Prioritized' when the attend-based
      difference is > 0, else 'Deprioritized'.
    * ``v2_prefers_test`` / ``it_prefers_test``: 'Tested' when the test-based
      difference is > 0, else 'Untested'.
    * Four 'Distractor ... Similarity Preference' columns duplicating the above
      under display names.
    * ``IT_diff_binned``, ``V2_diff_binned`` and their ``_test`` variants:
      quantile bins (5 requested, duplicate edges dropped) of the differences.

    Returns:
        A copy of ``df`` with these columns added.
    """
    df_pref = df.copy()

    # Agreement = both differences strictly on the same side of zero
    it_diff = df_pref['it_sim_dis_diff']
    v2_diff = df_pref['v2_sim_dis_diff']
    same_sign = ((it_diff > 0) & (v2_diff > 0)) | ((it_diff < 0) & (v2_diff < 0))
    df_pref['v2_converges'] = np.where(same_sign, 'V2/IT agree', 'V2/IT disagree')

    # (new column, source difference, label when > 0, label otherwise)
    preference_specs = (
        ('v2_prefers', 'v2_sim_dis_diff', 'Prioritized', 'Deprioritized'),
        ('it_prefers', 'it_sim_dis_diff', 'Prioritized', 'Deprioritized'),
        ('v2_prefers_test', 'v2_sim_dis_diff_test', 'Tested', 'Untested'),
        ('it_prefers_test', 'it_sim_dis_diff_test', 'Tested', 'Untested'),
    )
    for target, source, positive, otherwise in preference_specs:
        df_pref[target] = np.where(df_pref[source] > 0, positive, otherwise)

    # Display-name duplicates of the preference columns
    display_names = (
        ('Distractor V2 Similarity Preference Tested', 'v2_prefers_test'),
        ('Distractor IT Similarity Preference Tested', 'it_prefers_test'),
        ('Distractor V2 Similarity Preference', 'v2_prefers'),
        ('Distractor IT Similarity Preference', 'it_prefers'),
    )
    for display_name, source in display_names:
        df_pref[display_name] = df_pref[source]

    # Quantile-binned versions of the differences
    binned_sources = (
        ('IT_diff_binned', 'it_sim_dis_diff'),
        ('V2_diff_binned', 'v2_sim_dis_diff'),
        ('IT_diff_binned_test', 'it_sim_dis_diff_test'),
        ('V2_diff_binned_test', 'v2_sim_dis_diff_test'),
    )
    for target, source in binned_sources:
        df_pref[target] = pd.qcut(df_pref[source], 5, duplicates='drop')

    return df_pref

# Derive the similarity columns first, then the preference labels that are built from them
df = calculate_differences(df)
df = add_preferences(df)
print(f"Calculated signal differences and added preferences")

Calculated signal differences and added preferences


In [15]:
# Sanity check: adding derived columns should not change the number of participants
len(set(df['participant']))

330

In [16]:
def categorize_columns(df, column_params):
    """Add quantile-binned ``*_cat`` versions of the similarity columns.

    Each of the 14 similarity columns is cut with ``pd.qcut`` using the bin
    count and labels found in ``column_params['v2']`` when the column name
    contains ``'v2'`` and in ``column_params['it']`` otherwise.

    Args:
        df: Input DataFrame (not modified).
        column_params: Mapping ``{'v2': {...}, 'it': {...}}`` whose entries
            provide ``'n_cats'`` and ``'labels'``.

    Returns:
        A copy of ``df`` with one ``<column>_cat`` column per similarity column.
    """
    similarity_columns = (
        'it_sim_dis_attend', 'v2_sim_dis_attend', 'it_sim_dis_unattend', 'v2_sim_dis_unattend',
        'it_sim_dis_diff', 'v2_sim_dis_diff', 'it_im1_im2', 'v2_im1_im2',
        'it_sim_dis_test', 'v2_sim_dis_test', 'it_sim_dis_untest', 'v2_sim_dis_untest',
        'it_sim_dis_diff_test', 'v2_sim_dis_diff_test',
    )

    binned = {}
    for source in similarity_columns:
        # Settings are chosen by the source encoded in the column name
        settings = column_params['v2' if 'v2' in source else 'it']
        binned[source + '_cat'] = pd.qcut(
            df[source],
            q=settings['n_cats'],
            labels=settings['labels'],
            duplicates='drop',
        )

    return df.assign(**binned)

def validity_assignment(df):
    """Add a 'Tested Item' column derived from ``validity``.

    Rows whose ``validity`` equals 'valid' are labelled 'prioritized'; all
    other rows are labelled 'deprioritized'.

    Returns:
        A copy of ``df`` with the 'Tested Item' column.
    """
    tested_prioritized = df['validity'] == 'valid'
    labels = np.where(tested_prioritized, 'prioritized', 'deprioritized')
    return df.assign(**{'Tested Item': labels})

def df_column_addition(df):
    """Duplicate analysis columns under the names used for plots and models.

    Copies the ``*_cat`` and ``*_binned`` columns to their display names, copies
    'Tested Item' / 'Retrocue Reliability' to ``tested_item`` / ``ret_rel``, and
    adds 0/1 indicators ``validity_binary`` (1 when 'Tested Item' is
    'prioritized') and ``reliability_binary`` (1 when 'Retrocue Reliability' is
    'high').

    Returns:
        A copy of ``df`` with the additional columns.
    """
    # (new column name, existing source column), in output order
    renamed_copies = (
        ('V2 Distractor Similarity\nto Prioritized Item', 'v2_sim_dis_attend_cat'),
        ('IT Distractor Similarity\nto Prioritized Item', 'it_sim_dis_attend_cat'),
        ('V2 Distractor Similarity\nto Deprioritized Item', 'v2_sim_dis_unattend_cat'),
        ('IT Distractor Similarity\nto Deprioritized Item', 'it_sim_dis_unattend_cat'),
        ('Prioritized - Deprioritized IT Distractor Similarity', 'it_sim_dis_diff_cat'),
        ('Prioritized - Deprioritized V2 Distractor Similarity', 'v2_sim_dis_diff_cat'),
        ('V2 Distractor Similarity\nto Tested Item', 'v2_sim_dis_test_cat'),
        ('IT Distractor Similarity\nto Tested Item', 'it_sim_dis_test_cat'),
        ('V2 Distractor Similarity\nto Untested Item', 'v2_sim_dis_untest_cat'),
        ('IT Distractor Similarity\nto Untested Item', 'it_sim_dis_untest_cat'),
        ('Tested - Untested IT Distractor Similarity', 'it_sim_dis_diff_test_cat'),
        ('Tested - Untested V2 Distractor Similarity', 'v2_sim_dis_diff_test_cat'),
        ('Prioritized - Deprioritized V2 Distractor Similarity Ranges', 'V2_diff_binned'),
        ('Prioritized - Deprioritized IT Distractor Similarity Ranges', 'IT_diff_binned'),
        ('Tested - Untested V2 Distractor Similarity Ranges', 'V2_diff_binned_test'),
        ('Tested - Untested IT Distractor Similarity Ranges', 'IT_diff_binned_test'),
        ('tested_item', 'Tested Item'),
        ('ret_rel', 'Retrocue Reliability'),
    )

    df_add = df.copy()
    for new_name, source in renamed_copies:
        df_add[new_name] = df[source]

    # 0/1 indicators for the two categorical factors
    df_add['validity_binary'] = df['Tested Item'].apply(
        lambda item: int(item == 'prioritized')
    )
    df_add['reliability_binary'] = df['Retrocue Reliability'].apply(
        lambda level: int(level == 'high')
    )

    return df_add

# Apply processing. Order matters: df_column_addition reads 'Tested Item'
# (from validity_assignment) and the '*_cat' columns (from categorize_columns).
df = validity_assignment(df)
df = categorize_columns(df, column_params)
df = df_column_addition(df)

print(f"Added categorized columns and user interface labels")

Added categorized columns and user interface labels


In [17]:
def count_empty_clicked_names(df):
    """Count, per participant, the rows whose click record is empty.

    A row counts as empty when ``mouse.clicked_name`` is an empty list or a
    string that equals ``'[]'`` after stripping whitespace. Other values
    (including missing values) are not counted.

    Args:
        df: Frame with ``participant`` and ``mouse.clicked_name`` columns.
            It is not modified.

    Returns:
        DataFrame with columns ``participant`` and ``none_clicked_count``.
    """
    def _is_empty(clicked):
        return (isinstance(clicked, list) and len(clicked) == 0) or \
               (isinstance(clicked, str) and clicked.strip() == '[]')

    # Keep the flags in a local Series so the caller's frame gains no helper column
    none_clicked = df['mouse.clicked_name'].apply(_is_empty)

    # Summing the Booleans counts the True values per participant
    return (
        none_clicked.groupby(df['participant'])
        .sum()
        .rename('none_clicked_count')
        .reset_index()
    )

In [18]:

def clean_participants_by_empty_clicked(df, threshold=1/3):
    """Drop participants whose share of empty click records exceeds ``threshold``.

    A row counts as empty when ``mouse.clicked_name`` is an empty list or a
    string that equals ``'[]'`` after stripping whitespace. Other values
    (including missing values) are not counted as empty.

    Args:
        df: Frame with ``participant`` and ``mouse.clicked_name`` columns.
            It is not modified.
        threshold: Participants with ``empty_ratio`` strictly greater than this
            value are removed.

    Returns:
        ``(df_cleaned, participant_stats)``. ``df_cleaned`` holds the rows of
        the retained participants plus a Boolean ``empty_clicked`` column;
        ``participant_stats`` has one row per participant (including removed
        ones) with ``total_entries``, ``empty_count`` and ``empty_ratio``.
    """
    def _is_empty(clicked):
        return (isinstance(clicked, list) and len(clicked) == 0) or \
               (isinstance(clicked, str) and clicked.strip() == '[]')

    # Work on a copy so the caller's frame is not given an extra column
    flagged = df.copy()
    flagged['empty_clicked'] = flagged['mouse.clicked_name'].apply(_is_empty)

    participant_stats = flagged.groupby('participant').agg(
        total_entries=('participant', 'size'),
        empty_count=('empty_clicked', 'sum')
    ).reset_index()
    participant_stats['empty_ratio'] = (
        participant_stats['empty_count'] / participant_stats['total_entries']
    )

    participants_to_drop = participant_stats.loc[
        participant_stats['empty_ratio'] > threshold, 'participant'
    ]
    df_cleaned = flagged[~flagged['participant'].isin(participants_to_drop)].copy()

    return df_cleaned, participant_stats

cleaned_df, stats_df = clean_participants_by_empty_clicked(df, 0.33)

print("\nParticipant statistics (total entries, empty count, empty ratio):")
# print(stats_df)

# The threshold passed above is 0.33, so participants are removed when more than
# about one third (not two thirds) of their entries are empty.
print("\nCleaned DataFrame (removed participants with >1/3 empty entries):")
# print(cleaned_df)



Participant statistics (total entries, empty count, empty ratio):

Cleaned DataFrame (removed participants with >2/3 empty entries):


In [19]:
# Sanity check: participants remaining after dropping those with too many empty clicks
len(set(cleaned_df['participant']))

297

In [20]:
# Recompute per-participant totals on the cleaned data
participant_stats = cleaned_df.groupby('participant').agg(
        total_entries=('participant', 'size'),
        empty_count=('empty_clicked', 'sum')
    ).reset_index()
# Spot-check a single participant id; an empty result means it is absent from cleaned_df.
# NOTE(review): presumably 159733 is a participant expected to have been dropped - confirm.
participant_stats[participant_stats['participant'] == 159733]

Unnamed: 0,participant,total_entries,empty_count


In [21]:
# From here on, continue the pipeline with the cleaned data
df = cleaned_df
# Sanity check: number of distinct participants carried forward
len(set(cleaned_df['participant']))

297

In [22]:
def process_mouse_data(df):
    """Parse the raw mouse columns and extract summary values per row.

    ``mouse.time`` and ``mouse.clicked_name`` are parsed with
    ``process_values_time`` / ``process_values_click`` into
    ``processed_mouse.time`` / ``processed_mouse.click``. From these, the
    number of time stamps (0 when nothing was parsed) and the first/last time
    stamp and clicked name (``None`` when unavailable) are derived.

    Returns:
        A copy of ``df`` with the seven additional columns.
    """
    def _count(entries):
        return len(entries) if isinstance(entries, list) else 0

    def _element_at(position):
        # Build a picker for the first (0) or last (-1) element of a parsed list
        def pick(entries):
            if isinstance(entries, list) and len(entries) > 0:
                return entries[position]
            return None
        return pick

    df_mouse = df.copy()

    # Parse the raw strings
    df_mouse['processed_mouse.time'] = df['mouse.time'].apply(process_values_time)
    df_mouse['processed_mouse.click'] = df['mouse.clicked_name'].apply(process_values_click)

    # Summaries of the parsed lists, added in a fixed order
    summaries = (
        ('mouse.time_length', 'processed_mouse.time', _count),
        ('mouse.time_first', 'processed_mouse.time', _element_at(0)),
        ('mouse.time_last', 'processed_mouse.time', _element_at(-1)),
        ('mouse.clicked_name_first', 'processed_mouse.click', _element_at(0)),
        ('mouse.clicked_name_last', 'processed_mouse.click', _element_at(-1)),
    )
    for target, source, summarise in summaries:
        df_mouse[target] = df_mouse[source].apply(summarise)

    return df_mouse

# Parse the raw mouse columns and add the derived time/click summary columns
df = process_mouse_data(df)
print(f"Processed mouse data")

Processed mouse data


In [23]:
# Sanity check: processing mouse data should not change the number of participants
len(set(df['participant']))

297

In [24]:
def clean_dataframe_from_nonresp(df):
    """Keep only rows with usable parsed mouse time and click data.

    A row is kept when both ``processed_mouse.time`` and
    ``processed_mouse.click`` are not ``None`` and contain no falsy element
    (so a list holding ``0.0`` or an empty string is rejected, while an empty
    list passes).

    Returns:
        The filtered DataFrame.
    """
    def _is_usable(entries):
        # None marks a row where nothing was parsed
        if entries is None:
            return False
        return all(entries)

    # Filter on time first, then on click among the remaining rows
    with_times = df[df['processed_mouse.time'].apply(_is_usable)]
    with_clicks = with_times[with_times['processed_mouse.click'].apply(_is_usable)]
    return with_clicks

def df_with_threshold(df, numeric_columns, threshold=0.4):
    """Keep participants whose mean ``resp_correct`` is at least ``threshold``.

    After filtering, ``remove_unit_variance`` is applied to ``resp_correct``
    with participants as units, and the result is exposed as ``Accuracy``.

    Args:
        df: Trial-level frame with ``participant`` and ``resp_correct`` columns.
        numeric_columns: Kept for backward compatibility; no longer needed
            because mean accuracy is computed from ``resp_correct`` directly.
        threshold: Minimum mean accuracy (inclusive) for a participant to be kept.

    Returns:
        A new DataFrame restricted to the retained participants, with
        ``resp_correct_within`` and ``Accuracy`` columns added.
    """
    # Mean accuracy per participant. Grouping the single column avoids relying on
    # 'participant' and 'resp_correct' both being listed in numeric_columns.
    mean_accuracy = df.groupby('participant')['resp_correct'].mean()

    # Find participants above threshold
    above_threshold_subs = mean_accuracy.index[mean_accuracy >= threshold]

    # Filter dataframe
    df_filtered = df[df['participant'].isin(above_threshold_subs)]

    # Remove between-participant differences in mean accuracy
    df_filtered = remove_unit_variance(df_filtered, 'resp_correct', 'participant')
    df_filtered['Accuracy'] = df_filtered['resp_correct_within']

    return df_filtered


In [25]:
# Count rows per (participant, file) pair for inspection; nothing is removed in this cell
participant_file_counts_cleaned = df.groupby(['participant', 'filename']).agg(
    entry_count=pd.NamedAgg(column='participant', aggfunc='size')
).reset_index()
participant_file_counts_cleaned

Unnamed: 0,participant,filename,entry_count
0,116851,/Users/lana/Desktop/psychoPyExperiments/wm_dee...,300
1,121021,/Users/lana/Desktop/psychoPyExperiments/wm_dee...,300
2,123262,/Users/lana/Desktop/psychoPyExperiments/wm_dee...,300
3,123790,/Users/lana/Desktop/psychoPyExperiments/wm_dee...,300
4,123985,/Users/lana/Desktop/psychoPyExperiments/wm_dee...,300
...,...,...,...
292,170536,/Users/lana/Desktop/psychoPyExperiments/wm_dee...,300
293,170665,/Users/lana/Desktop/psychoPyExperiments/wm_dee...,300
294,171031,/Users/lana/Desktop/psychoPyExperiments/wm_dee...,300
295,171253,/Users/lana/Desktop/psychoPyExperiments/wm_dee...,300


In [26]:
# Sanity check: number of distinct participants before any file-level removal
len(set(df['participant']))

297

In [27]:
# Per (participant, file): total rows and rows whose parsed mouse time is missing
participant_file_counts = df.groupby(['participant', 'filename']).agg(
    entry_count=pd.NamedAgg(column='participant', aggfunc='size'),
    none_mouse=pd.NamedAgg(column='processed_mouse.time', aggfunc=lambda x: x.isna().sum())
).reset_index()

# Remove files with more than 100 rows lacking parsed mouse times.
# Rows are matched on the (participant, filename) pair via a temporary MultiIndex.
files_with_high_none_mouse = participant_file_counts[participant_file_counts['none_mouse'] > 100]
df = df[~df.set_index(['participant', 'filename']).index.isin(
    files_with_high_none_mouse.set_index(['participant', 'filename']).index
)]

# Recount rows per (participant, file) after that removal; the low-count files
# themselves are not removed in this cell
participant_file_counts_cleaned = df.groupby(['participant', 'filename']).agg(
    entry_count=pd.NamedAgg(column='participant', aggfunc='size')
).reset_index()



In [28]:
# Sanity check: participants remaining after removing files with many missing mouse times
len(set(df['participant']))

297

In [29]:
# Remove files with fewer than 200 rows, matching on the (participant, filename) pair
files_with_low_entries = participant_file_counts_cleaned[participant_file_counts_cleaned['entry_count'] < 200]
print(files_with_low_entries)
df = df[~df.set_index(['participant', 'filename']).index.isin(
    files_with_low_entries.set_index(['participant', 'filename']).index
)]

     participant                                           filename  \
101       149227  /Users/lana/Desktop/psychoPyExperiments/wm_dee...   
113       151639  /Users/lana/Desktop/psychoPyExperiments/wm_dee...   
194       164275  /Users/lana/Desktop/psychoPyExperiments/wm_dee...   
269       168529  /Users/lana/Desktop/psychoPyExperiments/wm_dee...   

     entry_count  
101           30  
113          108  
194           23  
269          197  


In [30]:
# Sanity check: participants remaining after removing files with too few rows
len(set(df['participant']))

293

In [31]:
# Check whether any participant has more than 300 rows (the result is displayed below)
participant_counts = df['participant'].value_counts()
participants_over_300 = participant_counts[participant_counts > 300]
participants_over_300

Series([], Name: count, dtype: int64)

In [32]:
# Sanity check: the row-count inspection above does not change the participant set
len(set(df['participant']))

293

In [33]:
# Build a response-only dataset: drop rows without usable mouse time/click data
df_nonresp = clean_dataframe_from_nonresp(df)
print(f"After filtering non-responses: {len(df_nonresp)} rows")


# Apply the accuracy threshold (0.4) using accuracy computed on response trials only
df_nonresp = df_with_threshold(df_nonresp, numeric_columns, 0.4)
print(f"After accuracy threshold filtering: {len(df_nonresp)} rows with {df_nonresp['participant'].nunique()} participants")

# Convert the last mouse time stamp to numeric and remove between-participant
# mean differences (values are mean-adjusted, not scaled).
# This runs on `df`, which still includes non-response rows and participants
# that the accuracy threshold below has not yet removed.
df['mouse.time_last'] = pd.to_numeric(df['mouse.time_last'], errors='coerce')
df = remove_unit_variance(df, 'mouse.time_last', 'participant')
df['analysis_rt'] = df['mouse.time_last_within']
df['rt'] = df['mouse.time_last']



# Reports the size of df_nonresp again (unchanged since the threshold step above)
print(f"After filtering non-responses and thresholding: {len(df_nonresp)} rows")


# Apply the same accuracy threshold to the full dataset; here accuracy is computed
# over all rows of `df`, so the retained participants can differ from df_nonresp
df = df_with_threshold(df, numeric_columns, 0.4)
print(f"After accuracy threshold filtering: {len(df)} rows with {df['participant'].nunique()} participants")


After filtering non-responses: 83781 rows
After accuracy threshold filtering: 78410 rows with 273 participants
After filtering non-responses: 78410 rows
After accuracy threshold filtering: 81000 rows with 270 participants


In [34]:
# Sanity check: participants remaining after the accuracy threshold
len(set(df['participant']))

270

In [None]:
def df_z_score(df, list_of_variables):
    """Add squared and standardized versions of the given columns.

    For every variable ``v`` three columns are added, in this order:
    ``v_sq`` (the squared values), ``v_sq_z`` (``v_sq`` passed through a fresh
    ``StandardScaler``) and ``v_z`` (``v`` passed through a fresh
    ``StandardScaler``). Each scaler is fitted on the whole column.

    Args:
        df: Input DataFrame (not modified).
        list_of_variables: Names of the columns to transform.

    Returns:
        A copy of ``df`` with the additional columns.
    """
    df_z = df.copy()
    derived = {}

    for name in list_of_variables:
        squared = df_z[name] ** 2
        derived[f"{name}_sq"] = squared
        # fit_transform returns an (n, 1) array; flatten it back to one dimension
        derived[f"{name}_sq_z"] = StandardScaler().fit_transform(squared.to_frame()).flatten()
        derived[f"{name}_z"] = StandardScaler().fit_transform(df_z[[name]]).flatten()

    for column, values in derived.items():
        df_z[column] = values

    return df_z

# Similarity metrics that receive squared and z-scored companion columns
sim_variables = [
    'it_sim_dis_diff', 'v2_sim_dis_diff', 
    'it_sim_dis_attend', 'v2_sim_dis_attend',
    'it_sim_dis_unattend', 'v2_sim_dis_unattend', 
    'it_sim_dis_test', 'v2_sim_dis_test',
    'it_sim_dis_untest', 'v2_sim_dis_untest', 
    'it_sim_dis_diff_test', 'v2_sim_dis_diff_test'
]

# Add '<var>_sq', '<var>_sq_z' and '<var>_z' for every metric
df = df_z_score(df, sim_variables)


scaler_v = StandardScaler()
scaler_r = StandardScaler()

# Rebuild the binary indicators from the raw columns and add z-scored versions.
# NOTE(review): 'Retrocue Reliability' is defined with reliability > 0.75 when the
# data is loaded, whereas the cut-off here is 0.7; the two disagree for any
# reliability value in (0.7, 0.75]. Confirm which threshold is intended.
df['validity_binary'] = (df['validity'] == 'valid').astype(int)
df['reliability_binary'] = (df['reliability'] > 0.7).astype(int)
df['validity_binary_z'] = scaler_v.fit_transform(df[['validity_binary']])
df['reliability_binary_z'] = scaler_r.fit_transform(df[['reliability_binary']])

# Expose the similarity metrics under display names.
# The values assigned here are the raw (not z-scored, not binned) columns.
# NOTE(review): several of these names already exist as quantile-category columns
# created by df_column_addition and are overwritten here - confirm this is intended.
ui_columns = {
    'V2 Distractor Similarity to Tested Item': df['v2_sim_dis_test'],
    'IT Distractor Similarity to Tested Item': df['it_sim_dis_test'],
    'Tested - Untested V2 Distractor Similarity': df['v2_sim_dis_diff_test'],
    'Tested - Untested IT Distractor Similarity': df['it_sim_dis_diff_test'],
    'V2 Distractor Similarity\nto Prioritized Item': df['v2_sim_dis_attend'],
    'IT Distractor Similarity\nto Prioritized Item': df['it_sim_dis_attend'],
    'V2 Distractor Similarity\nto Deprioritized Item': df['v2_sim_dis_unattend'],
    'IT Distractor Similarity\nto Deprioritized Item': df['it_sim_dis_unattend'],
    'Prioritized - Deprioritized IT Distractor Similarity': df['it_sim_dis_diff'],
    'Prioritized - Deprioritized V2 Distractor Similarity': df['v2_sim_dis_diff']
}

# Assign the display-name columns one by one
for col_name, values in ui_columns.items():
    df[col_name] = values

print(f"Added standardized variables and transformations")

In [None]:
def flip_z_sq_z(df, column_name):
    """Add magnitude-based transforms of ``column_name``.

    Despite the ``_sign`` suffix, the first derived column removes the sign:
    values <= 0 are multiplied by -1 and all others by +1, which yields the
    absolute value. Four columns are added, in this order:

    * ``<col>_sign``: absolute value of the column.
    * ``<col>_sign_z``: ``<col>_sign`` passed through a fresh ``StandardScaler``.
    * ``<col>_sign_sq``: the square of ``<col>_sign_z`` (the z-scored values,
      not the raw magnitudes, are squared).
    * ``<col>_sign_sq_z``: ``<col>_sign_sq`` passed through a fresh ``StandardScaler``.

    Returns:
        A copy of ``df`` with the four additional columns.
    """
    values = df[column_name]

    # Flip non-positive values so that only the magnitude remains
    magnitude = np.where(values <= 0, -1, 1) * values
    magnitude_z = StandardScaler().fit_transform(pd.DataFrame(magnitude)).flatten()

    # Square the standardized magnitude, then standardize the result again
    magnitude_z_squared = magnitude_z ** 2
    magnitude_z_squared_z = StandardScaler().fit_transform(
        pd.DataFrame(magnitude_z_squared)
    ).flatten()

    df_flip = df.copy()
    df_flip[column_name + '_sign'] = magnitude
    df_flip[column_name + '_sign_z'] = magnitude_z
    df_flip[column_name + '_sign_sq'] = magnitude_z_squared
    df_flip[column_name + '_sign_sq_z'] = magnitude_z_squared_z
    return df_flip

# Add the '<col>_sign', '<col>_sign_z', '<col>_sign_sq' and '<col>_sign_sq_z' columns
# for each metric. Note that flip_z_sq_z stores the absolute value in '<col>_sign'.
for column in ['it_sim_dis_diff_test', 'v2_sim_dis_diff_test', 'it_sim_dis_test', 'v2_sim_dis_test']:
    df = flip_z_sq_z(df, column)

print(f"Added sign-preserving transformations")

In [None]:
# Sign indicators for the tested-minus-untested differences:
# -1 where the difference is <= 0, +1 where it is > 0
pos_neg_columns = {
    'it_pos_neg': np.where(df['it_sim_dis_diff_test'] <= 0, -1, 1),
    'v2_pos_neg': np.where(df['v2_sim_dis_diff_test'] <= 0, -1, 1),
    # 'it_pos_neg_abs': np.where(df['it_sim_dis_test'] <= 0, -1, 1),
    # 'v2_pos_neg_abs': np.where(df['v2_sim_dis_test'] <= 0, -1, 1)
}

# Add each indicator together with a z-scored copy (fresh scaler per column)
for col_name, values in pos_neg_columns.items():
    df[col_name] = values
    scaler_pn = StandardScaler()
    df[col_name + '_z'] = scaler_pn.fit_transform(df[[col_name]])

# Interaction terms: the sign indicator multiplied by the standardized magnitude
# columns ('*_sign_z' and '*_sign_sq_z') of the same difference
interaction_terms = {
    'it_int_rel': df['it_pos_neg'] * df['it_sim_dis_diff_test_sign_z'],
    'v2_int_rel': df['v2_pos_neg'] * df['v2_sim_dis_diff_test_sign_z'],
    # 'it_int_abs': df['it_pos_neg_abs'] * df['it_sim_dis_test_sign_z'],
    # 'v2_int_abs': df['v2_pos_neg_abs'] * df['v2_sim_dis_test_sign_z'],
    'it_int_rel_sq': df['it_pos_neg'] * df['it_sim_dis_diff_test_sign_sq_z'],
    'v2_int_rel_sq': df['v2_pos_neg'] * df['v2_sim_dis_diff_test_sign_sq_z'],
    # 'it_int_abs_sq': df['it_pos_neg_abs'] * df['it_sim_dis_test_sign_sq_z'],
    # 'v2_int_abs_sq': df['v2_pos_neg_abs'] * df['v2_sim_dis_test_sign_sq_z']
}

# Assign the interaction terms to the dataframe
for col_name, values in interaction_terms.items():
    df[col_name] = values

print(f"Added interaction terms")

In [None]:
# Overall share of trials flagged as non-responses (empty click record)
total_trials = len(df)
nonresponse_count = df['empty_clicked'].sum()
nonresponse_percentage = (nonresponse_count / total_trials) * 100

print(f"Number of nonresponse trials in final dataframe: {nonresponse_count}")
print(f"Percentage of nonresponse trials: {nonresponse_percentage:.2f}%")

# Per-participant breakdown: flagged trials, all trials, and their ratio in percent
participant_nonresponse = (
    df.groupby('participant')['empty_clicked']
    .agg(nonresponse_count='sum', total_trials='size')
    .reset_index()
)
participant_nonresponse['nonresponse_percentage'] = (
    participant_nonresponse['nonresponse_count'] / participant_nonresponse['total_trials']
) * 100

percentages = participant_nonresponse['nonresponse_percentage']
print("\nSummary of nonresponses by participant:")
print(f"Mean nonresponse percentage per participant: {percentages.mean():.2f}%")
print(f"Max nonresponse percentage for any participant: {percentages.max():.2f}%")
print(f"Number of participants with any nonresponses: {(participant_nonresponse['nonresponse_count'] > 0).sum()}")

# Ten participants with the highest nonresponse rates
print("\nParticipants with highest nonresponse rates:")
print(participant_nonresponse.sort_values('nonresponse_percentage', ascending=False).head(10))

In [None]:
# Display the final analysis dataframe
df