In [None]:
# OPTIONAL: Visualization of data distribution
# Uncomment to create exploratory visualizations

# def explore_data_distributions(df):
#     """Create exploratory visualizations of key variables."""
#     # Set up a figure with subplots
#     fig, axes = plt.subplots(2, 2, figsize=(16, 12))
#     
#     # Plot IT and V2 difference distributions
#     sns.histplot(df['it_sim_dis_diff'], kde=True, ax=axes[0, 0])
#     axes[0, 0].set_title('IT Similarity Difference Distribution')
#     
#     sns.histplot(df['v2_sim_dis_diff'], kde=True, ax=axes[0, 1])
#     axes[0, 1].set_title('V2 Similarity Difference Distribution')
#     
#     # Plot accuracy by condition
#     sns.boxplot(x='Retrocue Reliability', y='Accuracy', data=df, ax=axes[1, 0])
#     axes[1, 0].set_title('Accuracy by Retrocue Reliability')
#     
#     sns.boxplot(x='Tested Item', y='Accuracy', data=df, ax=axes[1, 1])
#     axes[1, 1].set_title('Accuracy by Tested Item')
#     
#     plt.tight_layout()
#     plt.show()
# 
# # Create exploratory visualizations
# explore_data_distributions(merged_df)

def summarize_final_dataset(df):
    """Print a short text report describing the final dataset.

    The report covers the row count, the number of distinct participants
    (overall and for pilot numbers 1 and 2), the task phases present, the
    number of columns, a tally of column dtypes and the memory footprint
    reported by ``DataFrame.memory_usage``.

    Parameters
    ----------
    df : DataFrame
        Must contain the ``participant``, ``taskPhase`` and ``pilot_number``
        columns.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    def count_distinct(series):
        # len(unique()) rather than nunique(): a missing value counts as one entry
        return len(series.unique())

    print("\n=== Final Dataset Summary ===")
    print(f"Total rows: {len(df)}")
    print(f"Unique participants: {count_distinct(df['participant'])}")

    phases = df['taskPhase'].unique()
    print(f"Task phases present: {phases}")

    for pilot in (1, 2):
        pilot_participants = df.loc[df['pilot_number'] == pilot, 'participant']
        print(f"Pilot {pilot} participants: {count_distinct(pilot_participants)}")

    print(f"Columns in final dataset: {len(df.columns)}")

    print("Data types in final dataset:")
    dtype_tally = df.dtypes.value_counts()
    print(dtype_tally)

    megabytes = df.memory_usage().sum() / 1e6
    print("\nMemory usage:", megabytes, "MB")

# Summarize the final dataset
# NOTE(review): `merged_df` is assigned by the merge_with_questionnaires call
# that appears further down in this export; the cells look like they are
# listed in reverse execution order -- confirm before running top to bottom.
summarize_final_dataset(merged_df)

# Save the final processed dataframe
# Written relative to the current working directory; index=False keeps the
# row index out of the CSV.
merged_df.to_csv('pilot_TOTAL_merged_questionnaires.csv', index=False)
print("Saved merged dataset to pilot_TOTAL_merged_questionnaires.csv")

def merge_with_questionnaires(df, questionnaire_file='wm_questionnaires_preprocessed.csv'):
    """Merge behavioral data with questionnaire data.

    Parameters
    ----------
    df : DataFrame
        Behavioral data. Must contain a ``participant`` column.
    questionnaire_file : str, optional
        Path to a CSV file that provides at least the ``subject_id``,
        ``vviq_sum`` and ``osivq_visual_mean`` columns.

    Returns
    -------
    DataFrame
        ``df`` left-joined with the questionnaire table
        (``participant`` == ``subject_id``), with every row removed whose
        ``vviq_sum`` or ``osivq_visual_mean`` is missing after the join.
    """
    # Questionnaire columns that must be present for a row to be kept
    required_columns = ['vviq_sum', 'osivq_visual_mean']

    # Load questionnaire data
    df_quest = pd.read_csv(questionnaire_file)
    print(f"Loaded questionnaire data with {len(df_quest)} rows")

    # Check for correspondence between datasets
    unique_subject_ids = df_quest['subject_id'].unique()
    unique_participants = df['participant'].unique()

    print(f"Number of unique subject_ids in questionnaire data: {len(unique_subject_ids)}")
    print(f"Number of unique participants in behavioral data: {len(unique_participants)}")

    common_participants = np.intersect1d(unique_participants, unique_subject_ids)
    print(f"Number of participants in both datasets: {len(common_participants)}")

    # Guard the ratio: an empty behavioral frame would otherwise divide by zero
    if len(unique_participants) > 0:
        coverage = len(common_participants) / len(unique_participants) * 100
        print(f"Percentage of behavioral participants in questionnaire data: {coverage:.2f}%")
    else:
        print("Percentage of behavioral participants in questionnaire data: n/a (no behavioral participants)")

    # Merge dataframes; how='left' keeps every behavioral row
    merged_df = df.merge(
        df_quest,
        left_on='participant',
        right_on='subject_id',
        how='left'
    )

    # Report missing questionnaire data for every column rows are dropped on
    for column in required_columns:
        na_count = merged_df[column].isna().sum()
        print(f"Number of rows with missing {column}: {na_count}")

    # Remove rows with missing questionnaire data
    merged_df = merged_df.dropna(subset=required_columns)

    print(f"Merged dataframe has {len(merged_df)} rows after removing missing questionnaire data")
    print(f"Merged dataframe has {len(merged_df['participant'].unique())} unique participants")

    return merged_df

# Merge with questionnaire data
# Uses the default questionnaire file name; the result is bound to a new
# name (`merged_df`), so `df` itself is left as it was.
merged_df = merge_with_questionnaires(df)

def add_memorability_features(df):
    """Add memorability features from separate prediction files for each pilot.

    Rows whose ``date`` compares ``<= '2024-10-31'`` are labelled pilot 1,
    all others pilot 2. Each pilot is matched against its own predictions
    file (``predictions_pilot5.csv`` / ``predictions_pilot6.csv``) and the
    two parts are concatenated again, pilot 1 rows first.

    Parameters
    ----------
    df : DataFrame
        Must contain ``date``, ``participant``, ``img1``, ``img2``,
        ``ping_img``, ``test_item`` and ``attend``. The input frame is not
        modified.

    Returns
    -------
    DataFrame
        New frame with a fresh 0..n-1 index and these added columns:
        ``pilot_number``, ``tested_/untested_/attended_/unattended_memorability_resmem``,
        ``distractor_memorability`` and ``tested_memorability_resmem_z``
        (scaled separately within each pilot by the module-level ``scaler``).
    """
    # assign() returns a new frame, so the caller's dataframe is left untouched
    labelled = df.assign(pilot_number=np.where(df['date'] <= '2024-10-31', 1, 2))

    # Split dataframe by pilot
    df_1 = labelled[labelled['pilot_number'] == 1].copy()
    df_2 = labelled[labelled['pilot_number'] == 2].copy()

    # Print participant counts
    print(f"Pilot 1 participants: {len(set(df_1['participant']))}")
    print(f"Pilot 2 participants: {len(set(df_2['participant']))}")

    def add_memorability_to_df(df_subset, mem_file):
        """Add memorability features to a dataframe subset (modified in place)."""
        # Load memorability data; filenames are rewritten from 'new_stimuli'
        # to 'stimuli' before they are used as lookup keys
        df_memorability = pd.read_csv(mem_file)
        df_memorability['filename'] = df_memorability['filename'].str.replace(
            'new_stimuli', 'stimuli', regex=False
        )
        memorability_dict = dict(zip(df_memorability['filename'], df_memorability['predictions']))

        # Look each image up once and reuse the result for all four columns
        img1_scores = df_subset['img1'].astype(str).map(memorability_dict)
        img2_scores = df_subset['img2'].astype(str).map(memorability_dict)
        img1_tested = df_subset['test_item'] == 'img1'
        img1_attended = df_subset['attend'] == 'img1'

        df_subset['tested_memorability_resmem'] = np.where(img1_tested, img1_scores, img2_scores)
        df_subset['untested_memorability_resmem'] = np.where(img1_tested, img2_scores, img1_scores)
        df_subset['attended_memorability_resmem'] = np.where(img1_attended, img1_scores, img2_scores)
        df_subset['unattended_memorability_resmem'] = np.where(img1_attended, img2_scores, img1_scores)

        df_subset['distractor_memorability'] = df_subset['ping_img'].astype(str).map(memorability_dict)

        if len(df_subset) > 0:
            df_subset['tested_memorability_resmem_z'] = scaler.fit_transform(
                df_subset[['tested_memorability_resmem']]
            )
        else:
            # Nothing to fit on; keep the column so both pilots share a schema
            df_subset['tested_memorability_resmem_z'] = np.nan

        return df_subset

    # Add memorability features to each pilot subset
    df_1 = add_memorability_to_df(df_1, 'predictions_pilot5.csv')
    df_2 = add_memorability_to_df(df_2, 'predictions_pilot6.csv')

    # Combine the datasets
    combined_df = pd.concat([df_1, df_2], ignore_index=True)

    return combined_df

# Add memorability features
# The function returns a re-concatenated frame, so `df` is rebound to the
# result (its rows come back grouped by pilot with a fresh index).
df = add_memorability_features(df)
print(f"Added memorability features. Total columns: {len(df.columns)}")

def handle_participants_over_300(df, max_rows=300):
    """Keep at most ``max_rows`` rows per participant (the first ones).

    Participants with more than ``max_rows`` rows are reported together with
    the files their rows come from, then truncated to their first
    ``max_rows`` rows. Rows are selected by position, so the original row
    order is preserved and repeated index labels cannot duplicate rows.

    Parameters
    ----------
    df : DataFrame
        Must contain ``participant`` and ``filename`` columns.
    max_rows : int, optional
        Maximum number of rows kept per participant (default 300).

    Returns
    -------
    DataFrame
        ``df`` itself when no participant exceeds ``max_rows``; otherwise a
        filtered frame. In the filtered case rows without a participant value
        are dropped as well, since ``value_counts`` ignores them.
    """
    participant_counts = df['participant'].value_counts()
    participants_over_300 = participant_counts[participant_counts > max_rows]

    if participants_over_300.empty:
        return df

    print(f"Found {len(participants_over_300)} participants appearing in multiple files:")
    print(participants_over_300)

    for participant, n_rows in participants_over_300.items():
        participant_files = df.loc[df['participant'] == participant, 'filename']
        files = participant_files.unique()
        print(f"\nParticipant {participant} appears in {len(files)} files:")

        for file in files:
            count = int((participant_files == file).sum())
            print(f"  - {file}: {count} times")

        print(f"Keeping first {max_rows} rows for participant {participant}, dropping {n_rows - max_rows} rows")

    # 0-based position of every row within its participant, in current row order
    position_in_participant = df.groupby('participant').cumcount().to_numpy()
    has_participant = df['participant'].notna().to_numpy()

    # Plain boolean array: positional selection, independent of index labels
    keep = (position_in_participant < max_rows) & has_participant
    return df[keep]

# Handle participants with more than 300 entries
# Rebinds `df` to the truncated frame (or the same frame when nobody is over the limit).
df = handle_participants_over_300(df)
print(f"After handling participants with >300 entries: {len(df)} rows")

# Filter to main task only
# Keeps only rows whose taskPhase equals the literal 'mainTask'; rows with any
# other (or missing) taskPhase value are dropped.
df = df[df['taskPhase'] == 'mainTask']
print(f"After filtering to main task: {len(df)} rows")

def clean_dataframe_from_nonresp(df):
    """Filter out rows with missing mouse response data.

    A row is kept only when both ``processed_mouse.time`` and
    ``processed_mouse.click`` hold a non-empty collection whose entries are
    all truthy.

    Parameters
    ----------
    df : DataFrame
        Must contain the ``processed_mouse.time`` and
        ``processed_mouse.click`` columns.

    Returns
    -------
    DataFrame
        The rows of ``df`` that carry a response, in their original order.
    """
    def filter_ranges(numbers):
        # None and float NaN are the placeholders for "no data"
        if numbers is None or isinstance(numbers, float):
            return False
        values = list(numbers)
        # An empty collection is a missing response too (all([]) would be True).
        # NOTE(review): all() also rejects a recorded time of exactly 0 --
        # kept from the original behavior; confirm that this is intended.
        return len(values) > 0 and all(values)

    # astype(bool) keeps the mask boolean even when the frame has no rows
    time_ok = df['processed_mouse.time'].apply(filter_ranges).astype(bool)
    click_ok = df['processed_mouse.click'].apply(filter_ranges).astype(bool)

    filtered_df = df[(time_ok & click_ok).to_numpy()]

    return filtered_df

def df_with_threshold(df, numeric_columns, threshold=0.4):
    """Keep only participants whose mean accuracy is at or above ``threshold``.

    Parameters
    ----------
    df : DataFrame
        Must contain ``participant`` and ``resp_correct`` columns.
    numeric_columns : sequence of column names
        Accepted for backward compatibility and no longer consulted: the
        per-participant mean is computed from ``resp_correct`` directly, so
        ``participant`` does not have to be a numeric column.
    threshold : float, optional
        Minimum mean ``resp_correct`` (inclusive) a participant needs in order
        to be kept.

    Returns
    -------
    DataFrame
        Copy of the retained rows with ``resp_correct_within`` (produced by
        ``remove_unit_variance``) and ``Accuracy`` (a copy of that column)
        added.
    """
    # Calculate mean accuracy by participant
    participant_accuracy = df.groupby('participant')['resp_correct'].mean()

    # Find participants at or above threshold
    above_threshold_subs = participant_accuracy.index[participant_accuracy >= threshold]

    # Filter dataframe
    df_filtered = df[df['participant'].isin(above_threshold_subs)]

    # Remove between-participant variance in accuracy
    df_filtered = remove_unit_variance(df_filtered, 'resp_correct', 'participant')
    df_filtered['Accuracy'] = df_filtered['resp_correct_within']

    return df_filtered

# Apply cleaning operations
# NOTE(review): `df_nonresp` is only used for the row count printed below; the
# accuracy filter that follows is applied to `df`, not to `df_nonresp`, so
# non-response rows stay in the data -- confirm this is intended.
df_nonresp = clean_dataframe_from_nonresp(df)
print(f"After filtering non-responses: {len(df_nonresp)} rows")

# Keep participants whose mean resp_correct is at least 0.4
df = df_with_threshold(df, numeric_columns, 0.4)
print(f"After accuracy threshold filtering: {len(df)} rows with {df['participant'].nunique()} participants")

def process_mouse_data(df):
    """Parse the raw mouse columns and derive summary columns from them.

    ``mouse.time`` and ``mouse.clicked_name`` are parsed with
    ``process_values_time`` / ``process_values_click`` and stored as
    ``processed_mouse.time`` / ``processed_mouse.click``. From the parsed
    values the function then derives the number of time stamps, the first and
    last time stamp, and the first and last clicked name.

    Parameters
    ----------
    df : DataFrame
        Must contain ``mouse.time`` and ``mouse.clicked_name``.

    Returns
    -------
    DataFrame
        Copy of ``df`` with the seven new columns; ``df`` is not modified.
    """
    def entry_count(parsed):
        # Anything that is not a list counts as zero entries
        if isinstance(parsed, list):
            return len(parsed)
        return 0

    def element_at(position):
        # Build a picker that returns parsed[position], or None when the
        # value is not a non-empty list
        def picker(parsed):
            if isinstance(parsed, list) and len(parsed) > 0:
                return parsed[position]
            return None
        return picker

    first_of = element_at(0)
    last_of = element_at(-1)

    result = df.copy()

    # Parsed versions of the raw columns
    result['processed_mouse.time'] = df['mouse.time'].apply(process_values_time)
    result['processed_mouse.click'] = df['mouse.clicked_name'].apply(process_values_click)

    times = result['processed_mouse.time']
    clicks = result['processed_mouse.click']

    # Length, first and last values
    result['mouse.time_length'] = times.apply(entry_count)
    result['mouse.time_first'] = times.apply(first_of)
    result['mouse.time_last'] = times.apply(last_of)
    result['mouse.clicked_name_first'] = clicks.apply(first_of)
    result['mouse.clicked_name_last'] = clicks.apply(last_of)

    return result

# Process mouse data
# Adds the processed_mouse.* columns plus mouse.time_length/first/last and
# mouse.clicked_name_first/last.
df = process_mouse_data(df)
print(f"Processed mouse data. Total columns: {len(df.columns)}")

# Set reaction time for analysis
# `rt` is the last recorded mouse time stamp of the trial.
df['rt'] = df['mouse.time_last']
# remove_unit_variance adds 'mouse.time_last_within' (default "_within" suffix):
# the value de-meaned within participant, with the overall mean added back.
df = remove_unit_variance(df, 'mouse.time_last', 'participant')
df['analysis_rt'] = df['mouse.time_last_within']

print("Added reaction time analysis variables")

# Sign indicators: -1 where the source column is <= 0, +1 otherwise.
# Each entry pairs the indicator column with the column it is derived from.
pos_neg_columns = {
    indicator: np.where(df[source] <= 0, -1, 1)
    for indicator, source in (
        ('it_pos_neg', 'it_sim_dis_diff_test'),
        ('v2_pos_neg', 'v2_sim_dis_diff_test'),
        ('it_pos_neg_abs', 'it_sim_dis_test'),
        ('v2_pos_neg_abs', 'v2_sim_dis_test'),
    )
}

# Store every indicator together with its scaled ('_z') version
for col_name, values in pos_neg_columns.items():
    df[col_name] = values
    df[f'{col_name}_z'] = scaler.fit_transform(df[[col_name]])

# Interaction terms: sign indicator multiplied by the scaled flipped value
# (linear '_sign_z' and squared '_sign_sq_z' variants).
interaction_terms = {
    term: df[sign_column] * df[magnitude_column]
    for term, sign_column, magnitude_column in (
        ('it_int_rel', 'it_pos_neg', 'it_sim_dis_diff_test_sign_z'),
        ('v2_int_rel', 'v2_pos_neg', 'v2_sim_dis_diff_test_sign_z'),
        ('it_int_abs', 'it_pos_neg_abs', 'it_sim_dis_test_sign_z'),
        ('v2_int_abs', 'v2_pos_neg_abs', 'v2_sim_dis_test_sign_z'),
        ('it_int_rel_sq', 'it_pos_neg', 'it_sim_dis_diff_test_sign_sq_z'),
        ('v2_int_rel_sq', 'v2_pos_neg', 'v2_sim_dis_diff_test_sign_sq_z'),
        ('it_int_abs_sq', 'it_pos_neg_abs', 'it_sim_dis_test_sign_sq_z'),
        ('v2_int_abs_sq', 'v2_pos_neg_abs', 'v2_sim_dis_test_sign_sq_z'),
    )
}

# Attach the interaction terms in the order listed above
for col_name, values in interaction_terms.items():
    df[col_name] = values

print(f"Added interaction terms. Total columns: {len(df.columns)}")

def flip_z_sq_z(df, column_name):
    """
    Add flipped, scaled and squared variants of a column.

    Values of ``column_name`` that are <= 0 are multiplied by -1 and positive
    values are left as they are, so the flipped column holds the magnitude of
    the original. That column is scaled with the module-level ``scaler``, the
    scaled values are squared, and the squares are scaled again.

    Parameters:
    -----------
    df : DataFrame
        Input dataframe (not modified)
    column_name : str
        Column to transform

    Returns:
    --------
    DataFrame
        Copy of ``df`` with four new columns: ``<column_name>_sign``,
        ``<column_name>_sign_z``, ``<column_name>_sign_sq`` and
        ``<column_name>_sign_sq_z``
    """
    flipped_name = column_name + '_sign'
    squared_name = column_name + '_sign_sq'

    def scale(values):
        # The scaler expects 2-D input; flatten the single column back to 1-D
        return scaler.fit_transform(pd.DataFrame(values)).flatten()

    result = df.copy()

    source = df[column_name]
    multiplier = np.where(source <= 0, -1, 1)
    flipped = multiplier * source

    flipped_z = scale(flipped)
    squared = flipped_z ** 2
    squared_z = scale(squared)

    result[flipped_name] = flipped
    result[flipped_name + '_z'] = flipped_z
    result[squared_name] = squared
    result[squared_name + '_z'] = squared_z

    return result

# Apply the flip / scale / square transformations to the four test-item
# similarity columns. Each call adds `<column>_sign`, `<column>_sign_z`,
# `<column>_sign_sq` and `<column>_sign_sq_z`; the `_sign_z` and `_sign_sq_z`
# columns are the ones the interaction terms are built from.
for column in ['it_sim_dis_diff_test', 'v2_sim_dis_diff_test', 'it_sim_dis_test', 'v2_sim_dis_test']:
    df = flip_z_sq_z(df, column)

print(f"Added sign-preserving transformations. Total columns: {len(df.columns)}"){
 "cells": [
  {
   "cell_type": "markdown",
   "id": "intro-markdown",
   "metadata": {},
   "source": [
    "# Optimized Data Processing Notebook\n",
    "\n",
    "This notebook processes experimental data, calculates various metrics, and merges with questionnaire data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "import-cell",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import required libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "import os.path as op\n",
    "import glob\n",
    "from datetime import timedelta\n",
    "from sklearn.preprocessing import StandardScaler"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "setup-markdown",
   "metadata": {},
   "source": [
    "## 1. Setup and Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "setup-cell",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Setup directory and visualization style\n",
    "home_dir = op.abspath('./')\n",
    "data_files = glob.glob(op.join(home_dir, 'data', '*.csv'))\n",
    "sns.set_context('talk')\n",
    "\n",
    "# Define parameters for similarity categorization\n",
    "column_params = {\n",
    "    'v2': {'n_cats': 5, 'labels': ['Least Similar', '', '  ', '   ', 'Most Similar']},\n",
    "    'it': {'n_cats': 5, 'labels': ['Least Similar', '', '  ', '   ', 'Most Similar']}\n",
    "}\n",
    "\n",
    "# Create a global StandardScaler\n",
    "scaler = StandardScaler()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "helper-functions-markdown",
   "metadata": {},
   "source": [
    "## 2. Helper Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "helper-functions-cell",
   "metadata": {},
   "outputs": [],
   "source": [
    "def remove_unit_variance(df, col, unit, group=None, suffix=\"_within\"):\n",
    "    \"\"\"Remove variance between sampling units.\n",
    "\n",
    "    This is useful for plotting repeated-measures data using within-unit\n",
    "    error bars.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : DataFrame\n",
    "        Input data. Will have a new column added.\n",
    "    col : column name\n",
    "        Column in dataframe with quantitative measure to modify.\n",
    "    unit : column name\n",
    "        Column in dataframe defining sampling units (e.g., subjects).\n",
    "    group : column name(s), optional\n",
    "        Columns defining groups to remove unit variance within.\n",
    "    suffix : string, optional\n",
    "        Suffix appended to ``col`` name to create new column.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    df : DataFrame\n",
    "        Returns modified dataframe.\n",
    "    \"\"\"\n",
    "    new_col = col + suffix\n",
    "    df_copy = df.copy()\n",
    "\n",
    "    def demean(x):\n",
    "        return x - x.mean()\n",
    "\n",
    "    if group is None:\n",
    "        new = df_copy.groupby(unit)[col].transform(demean)\n",
    "        new += df_copy[col].mean()\n",
    "        df_copy[new_col] = new\n",
    "    else:\n",
    "        df_copy[new_col] = np.nan\n",
    "        for level, df_level in df_copy.groupby(group):\n",
    "            new = df_level.groupby(unit)[col].transform(demean)\n",
    "            new += df_level[col].mean()\n",
    "            df_copy.loc[new.index, new_col] = new\n",
    "\n",
    "    return df_copy\n",
    "\n",
    "def parse_dates(series):\n",
    "    \"\"\"Parse date strings, handling special cases like '24h'.\"\"\"\n",
    "    date_str = series.iloc[0]\n",
    "    if \"24h\" in date_str:\n",
    "        corrected_date_str = date_str.replace(\"24h\", \"00h\")\n",
    "        dt = pd.to_datetime(corrected_date_str, format='%Y-%m-%d_%Hh%M.%S.%f')\n",
    "        dt += timedelta(days=1)\n",
    "    else:\n",
    "        dt = pd.to_datetime(date_str, format='%Y-%m-%d_%Hh%M.%S.%f')\n",
    "    return dt\n",
    "\n",
    "def process_values_time(value):\n",
    "    \"\"\"Process mouse time values from string to list of numbers.\"\"\"\n",
    "    try:\n",
    "        if isinstance(value, str) and value.startswith('[') and value.endswith(']'):\n",
    "            cleaned = value.strip('[]').split(',')\n",
    "            if cleaned == ['']:  # Check if the list after stripping is empty\n",
    "                return None\n",
    "            numbers = [float(num) for num in cleaned if num.strip()]\n",
    "            return numbers\n",
    "        return None\n",
    "    except ValueError:\n",
    "        return None\n",
    "\n",
    "def process_values_click(value):\n",
    "    \"\"\"Process mouse click values from string to list of items.\"\"\"\n",
    "    try:\n",
    "        if isinstance(value, str) and value.startswith('[') and value.endswith(']'):\n",
    "            cleaned = value.strip('[]').split(',')\n",
    "            if cleaned == ['']:  # Check if the list after stripping is empty\n",
    "                return None\n",
    "            return [item for item in cleaned if item.strip()]\n",
    "        return None\n",
    "    except ValueError:\n",
    "        return None"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "data-loading-markdown",
   "metadata": {},
   "source": [
    "## 3. Data Loading and Initial Processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "data-loading-cell",
   "metadata": {},
   "outputs": [],
   "source": [
    "def df_creation(data_files, start_date, end_date):\n",
    "    \"\"\"Create a dataframe from the data files within a date range.\"\"\"\n",
    "    processed_dfs = []\n",
    "    \n",
    "    # Loop through files and try to read them\n",
    "    for file_path in data_files:\n",
    "        try:\n",
    "            temp_df = pd.read_csv(file_path)\n",
    "            temp_df['filename'] = file_path\n",
    "            processed_dfs.append(temp_df)\n",
    "        except Exception:\n",
    "            continue\n",
    "\n",
    "    # Process the loaded dataframes\n",
    "    if processed_dfs:\n",
    "        # Concatenate all dataframes\n",
    "        df = pd.concat(processed_dfs, ignore_index=True)\n",
    "        \n",
    "        # Convert date column and filter by date range\n",
    "        df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d_%Hh%M.%S.%f', errors='coerce')\n",
    "        df.dropna(subset=['date'], inplace=True)\n",
    "        df = df[(df['date'] >= pd.to_datetime(start_date)) & (df['date'] <= pd.to_datetime(end_date))]\n",
    "        \n",
    "        # Filter rows with non-null V2_diff values\n",
    "        df = df.loc[df['V2_diff'].notnull()].reset_index(drop=True)\n",
    "        \n",
    "        # Convert reliability to float and create Retrocue Reliability\n",
    "        df['reliability'] = df['reliability'].astype(float)\n",
    "        df['Retrocue Reliability'] = np.where(df['reliability'] > 0.75, 'high', 'low')\n",
    "        \n",
    "        # Get numeric columns for later use\n",
    "        numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns\n",
    "        \n",
    "        return df, numeric_columns\n",
    "    else:\n",
    "        # Return empty DataFrame if no files were processed\n",
    "        return pd.DataFrame(), []\n",
    "\n",
    "# Load data from both pilot periods\n",
    "df1, numeric_columns1 = df_creation(data_files, '2024-10-08', '2024-10-30')  # pilot5\n",
    "df2, numeric_columns2 = df_creation(data_files, '2024-11-22', '2025-01-30')  # pilot6\n",
    "\n",
    "# Concatenate the data from both pilots\n",
    "df = pd.concat([df1, df2], axis=0)\n",
    "numeric_columns = numeric_columns1  # Use columns from first dataset\n",
    "\n",
    "# Print basic information about the loaded data\n",
    "print(f\"Loaded {len(df)} rows of data\")\n",
    "print(f\"Data from pilot 1: {len(df1)} rows\")\n",
    "print(f\"Data from pilot 2: {len(df2)} rows\")\n",
    "print(f\"Total unique participants: {df['participant'].nunique()}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "add-trial-info-markdown",
   "metadata": {},
   "source": [
    "## 4. Add Trial Information"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "add-trial-info-cell",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add trial number and batch information\n",
    "def add_trial_info(df, participant_col, trials_per_batch=30):\n",
    "    \"\"\"Add trial number and batch information to the dataframe.\"\"\"\n",
    "    df['Trial_Number'] = df.groupby(participant_col).cumcount() + 1\n",
    "    df['Trial_Batch'] = ((df['Trial_Number'] - 1) // trials_per_batch) + 1\n",
    "    return df\n",
    "\n",
    "df = add_trial_info(df, participant_col='participant')\n",
    "\n",
    "# Add condition batch information\n",
    "# Extract batch info from trial 182\n",
    "df_trial_182 = df[df['trial'] == 182][['participant', 'trial', 'cond_file', 'root', 'IT_diff']].copy()\n",
    "df_trial_182['conditions_batch'] = df_trial_182.groupby(['cond_file', 'root', 'IT_diff']).ngroup() + 1\n",
    "\n",
    "# Count participants per batch\n",
    "batch_info = df_trial_182.groupby('conditions_batch').agg(\n",
    "    participants_count=('participant', 'nunique'),\n",
    "    participants_list=('participant', 'unique')\n",
    ").reset_index()\n",
    "\n",
    "# Merge batch info back to the main dataframe\n",
    "df = df.merge(\n",
    "    df_trial_182[['participant', 'conditions_batch', 'participants_count', 'participants_list']],\n",
    "    on='participant',\n",
    "    how='left'\n",
    ")\n",
    "\n",
    "print(f\"Added trial numbers and batch information. Max trial number: {df['Trial_Number'].max()}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "signal-diff-markdown",
   "metadata": {},
   "source": [
    "## 5. Calculate Signal Differences and Feature Engineering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "signal-diff-cell",
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_differences(df):\n",
    "    \"\"\"Calculate differences between conditions for IT and V2 signals.\"\"\"\n",
    "    # Create a copy to avoid fragmentation warnings\n",
    "    df_diff = df.copy()\n",
    "    \n",
    "    # Create all new columns in a single dictionary\n",
    "    new_columns = {}\n",
    "    \n",
    "    # Attended and unattended conditions\n",
    "    new_columns['it_sim_dis_attend'] = np.where(df['attend'] == 'img1', df['IT_root_im1'], df['IT_root_im2'])\n",
    "    new_columns['v2_sim_dis_attend'] = np.where(df['attend'] == 'img1', df['V2_root_im1'], df['V2_root_im2'])\n",
    "    new_columns['it_sim_dis_test'] = np.where(df['test_item'] == 'img1', df['IT_root_im1'], df['IT_root_im2'])\n",
    "    new_columns['v2_sim_dis_test'] = np.where(df['test_item'] == 'img1', df['V2_root_im1'], df['V2_root_im2'])\n",
    "    new_columns['it_sim_dis_unattend'] = np.where(df['attend'] != 'img1', df['IT_root_im1'], df['IT_root_im2'])\n",
    "    new_columns['v2_sim_dis_unattend'] = np.where(df['attend'] != 'img1', df['V2_root_im1'], df['V2_root_im2'])\n",
    "    new_columns['it_sim_dis_untest'] = np.where(df['test_item'] != 'img1', df['IT_root_im1'], df['IT_root_im2'])\n",
    "    new_columns['v2_sim_dis_untest'] = np.where(df['test_item'] != 'img1', df['V2_root_im1'], df['V2_root_im2'])\n",
    "    \n",
    "    # Calculate differences\n",
    "    new_columns['it_sim_dis_diff'] = np.where(\n",
    "        df['attend'] == 'img1', \n",
    "        df['IT_root_im1'] - df['IT_root_im2'], \n",
    "        df['IT_root_im2'] - df['IT_root_im1']\n",
    "    )\n",
    "    new_columns['v2_sim_dis_diff'] = np.where(\n",
    "        df['attend'] == 'img1', \n",
    "        df['V2_root_im1'] - df['V2_root_im2'], \n",
    "        df['V2_root_im2'] - df['V2_root_im1']\n",
    "    )\n",
    "    \n",
    "    new_columns['it_sim_dis_diff_test'] = np.where(\n",
    "        df['test_item'] == 'img1', \n",
    "        df['IT_root_im1'] - df['IT_root_im2'], \n",
    "        df['IT_root_im2'] - df['IT_root_im1']\n",
    "    )\n",
    "    new_columns['v2_sim_dis_diff_test'] = np.where(\n",
    "        df['test_item'] == 'img1', \n",
    "        df['V2_root_im1'] - df['V2_root_im2'], \n",
    "        df['V2_root_im2'] - df['V2_root_im1']\n",
    "    )\n",
    "    \n",
    "    # Copy image similarity values\n",
    "    new_columns['it_im1_im2'] = df['IT_im1_im2']\n",
    "    new_columns['v2_im1_im2'] = df['V2_im1_im2']\n",
    "    \n",
    "    # Add all new columns at once to avoid fragmentation\n",
    "    for col_name, values in new_columns.items():\n",
    "        df_diff[col_name] = values\n",
    "    \n",
    "    # Determine convergence and preferences\n",
    "    df_diff['v2_converges'] = np.where(\n",
    "        (df_diff['it_sim_dis_diff'] > 0) & (df_diff['v2_sim_dis_diff'] > 0) | \n",
    "        (df_diff['it_sim_dis_diff'] < 0) & (df_diff['v2_sim_dis_diff'] < 0), \n",
    "        'V2/IT agree', 'V2/IT disagree'\n",
    "    )\n",
    "    \n",
    "    # Determine preferences\n",
    "    df_diff['v2_prefers'] = np.where(df_diff['v2_sim_dis_diff'] > 0, 'Prioritized', 'Deprioritized')\n",
    "    df_diff['it_prefers'] = np.where(df_diff['it_sim_dis_diff'] > 0, 'Prioritized', 'Deprioritized')\n",
    "    df_diff['v2_prefers_test'] = np.where(df_diff['v2_sim_dis_diff_test'] > 0, 'Tested', 'Untested')\n",
    "    df_diff['it_prefers_test'] = np.where(df_diff['it_sim_dis_diff_test'] > 0, 'Tested', 'Untested')\n",
    "    \n",
    "    # Add preference columns with better names\n",
    "    df_diff['Distractor V2 Similarity Preference Tested'] = df_diff['v2_prefers_test']\n",
    "    df_diff['Distractor IT Similarity Preference Tested'] = df_diff['it_prefers_test']\n",
    "    df_diff['Distractor V2 Similarity Preference'] = df_diff['v2_prefers']\n",
    "    df_diff['Distractor IT Similarity Preference'] = df_diff['it_prefers']\n",
    "    \n",
    "    # Create binned versions of differences\n",
    "    df_diff['IT_diff_binned'] = pd.qcut(df_diff['it_sim_dis_diff'], 5, duplicates='drop')\n",
    "    df_diff['V2_diff_binned'] = pd.qcut(df_diff['v2_sim_dis_diff'], 5, duplicates='drop')\n",
    "    df_diff['IT_diff_binned_test'] = pd.qcut(df_diff['it_sim_dis_diff_test'], 5, duplicates='drop')\n",
    "    df_diff['V2_diff_binned_test'] = pd.qcut(df_diff['v2_sim_dis_diff_test'], 5, duplicates='drop')\n",
    "    \n",
    "    return df_diff\n",
    "\n",
    "# Apply signal difference calculations\n",
    "df = calculate_differences(df)\n",
    "print(f\"Calculated signal differences and added {len(df.columns) - len(numeric_columns)} new columns\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "categorize-markdown",
   "metadata": {},
   "source": [
    "## 6. Categorize Columns and Add User Interface Labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "categorize-cell",
   "metadata": {},
   "outputs": [],
   "source": [
    "def categorize_columns(df, column_params):\n",
    "    \"\"\"Categorize specified columns into discrete categories based on quantiles.\"\"\"\n",
    "    df_cat = df.copy()\n",
    "    \n",
    "    # Create all categorized columns at once\n",
    "    cat_columns = {}\n",
    "    \n",
    "    for label in [\n",
    "        'it_sim_dis_attend', 'v2_sim_dis_attend', 'it_sim_dis_unattend', 'v2_sim_dis_unattend',\n",
    "        'it_sim_dis_diff', 'v2_sim_dis_diff', 'it_im1_im2', 'v2_im1_im2',\n",
    "        'it_sim_dis_test', 'v2_sim_dis_test', 'it_sim_dis_untest', 'v2_sim_dis_untest',\n",
    "        'it_sim_dis_diff_test', 'v2_sim_dis_diff_test'\n",
    "    ]:\n",
    "        # Determine the column prefix\n",
    "        column_prefix = 'v2' if 'v2' in label else 'it'\n",
    "        \n",
    "        # Get parameters\n",
    "        n_cats = column_params[column_prefix]['n_cats']\n",
    "        labels = column_params[column_prefix]['labels']\n",
    "        \n",
    "        # Create categorized column\n",
    "        cat_columns[label + '_cat'] = pd.qcut(\n",
    "            df_cat[label], \n",
    "            q=n_cats, \n",
    "            labels=labels, \n",
    "            duplicates='drop'\n",
    "        )\n",
    "    \n",
    "    # Add all categorized columns at once\n",
    "    for col_name, values in cat_columns.items():\n",
    "        df_cat[col_name] = values\n",
    "    \n",
    "    return df_cat\n",
    "\n",
    "def df_column_addition(df):\n",
    "    \"\"\"Add user-friendly column names for plots and analyses.\"\"\"\n",
    "    df_add = df.copy()\n",
    "    \n",
    "    # Create a dictionary of all new columns\n",
    "    new_columns = {\n",
    "        'V2 Distractor Similarity\\nto Prioritized Item': df['v2_sim_dis_attend_cat'],\n",
    "        'IT Distractor Similarity\\nto Prioritized Item': df['it_sim_dis_attend_cat'],\n",
    "        'V2 Distractor Similarity\\nto Deprioritized Item': df['v2_sim_dis_unattend_cat'],\n",
    "        'IT Distractor Similarity\\nto Deprioritized Item': df['it_sim_dis_unattend_cat'],\n",
    "        'Prioritized - Deprioritized IT Distractor Similarity': df['it_sim_dis_diff_cat'],\n",
    "        'Prioritized - Deprioritized V2 Distractor Similarity': df['v2_sim_dis_diff_cat'],\n",
    "        'V2 Distractor Similarity\\nto Tested Item': df['v2_sim_dis_test_cat'],\n",
    "        'IT Distractor Similarity\\nto Tested Item': df['it_sim_dis_test_cat'],\n",
    "        'V2 Distractor Similarity\\nto Untested Item': df['v2_sim_dis_untest_cat'],\n",
    "        'IT Distractor Similarity\\nto Untested Item': df['it_sim_dis_untest_cat'],\n",
    "        'Tested - Untested IT Distractor Similarity': df['it_sim_dis_diff_test_cat'],\n",
    "        'Tested - Untested V2 Distractor Similarity': df['v2_sim_dis_diff_test_cat'],\n",
    "        'Prioritized - Deprioritized V2 Distractor Similarity Ranges': df['V2_diff_binned'],\n",
    "        'Prioritized - Deprioritized IT Distractor Similarity Ranges': df['IT_diff_binned'],\n",
    "        'Tested - Untested V2 Distractor Similarity Ranges': df['V2_diff_binned_test'],\n",
    "        'Tested - Untested IT Distractor Similarity Ranges': df['IT_diff_binned_test'],\n",
    "        'tested_item': df['Tested Item'],\n",
    "        'ret_rel': df['Retrocue Reliability'],\n",
    "        'validity_binary': df['Tested Item'].apply(lambda x: 1 if x == 'prioritized' else 0),\n",
    "        'reliability_binary': df['Retrocue Reliability'].apply(lambda x: 1 if x == 'high' else 0)\n",
    "    }\n",
    "    \n",
    "    # Add all new columns at once\n",
    "    for col_name, values in new_columns.items():\n",
    "        df_add[col_name] = values\n",
    "    \n",
    "    return df_add\n",
    "\n",
    "def validity_assignment(df):\n",
    "    \"\"\"Create 'Tested Item' column based on validity.\"\"\"\n",
    "    df_validity = df.copy()\n",
    "    df_validity['Tested Item'] = np.where(df_validity['validity'] == 'valid', 'prioritized', 'deprioritized')\n",
    "    return df_validity\n",
    "\n",
    "# Apply processing\n",
    "df = validity_assignment(df)\n",
    "df = categorize_columns(df, column_params)\n",
    "df = df_column_addition(df)\n",
    "\n",
    "print(f\"Added categorized columns and user interface labels. Total columns: {len(df.columns)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "standardize-markdown",
   "metadata": {},
   "source": [
    "## 7. Standardize Variables and Create Transformed Features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "standardize-cell",
   "metadata": {},
   "outputs": [],
   "source": [
    "def df_demean(df, list_of_variables):\n",
    "    \"\"\"Demean variables and create z-scored versions.\"\"\"\n",
    "    # Process all variables at once to avoid fragmentation\n",
    "    df_std = df.copy()\n",
    "    \n",
    "    # Dictionary to store all new columns\n",
    "    new_columns = {}\n",
    "    \n",
    "    for l in list_of_variables:\n",
    "        # Demean the variable\n",
    "        new_columns[l] = df_std[l] - np.mean(df_std[l])\n",
    "        # Create z-scored version\n",
    "        z_scored = f\"{l}_z\"\n",
    "        new_columns[z_scored] = scaler.fit_transform(pd.DataFrame(new_columns[squared_col])).flatten()\n",
    "    \n",
    "    # Add all new columns at once\n",
    "    for col_name, values in new_columns.items():\n",
    "        df_sq[col_name] = values\n",
    "        \n",
    "    return df_sq\n",
    "\n",
    "# Apply standardization to similarity metrics\n",
    "sim_variables = [\n",
    "    'it_sim_dis_diff', 'v2_sim_dis_diff', \n",
    "    'it_sim_dis_attend', 'v2_sim_dis_attend',\n",
    "    'it_sim_dis_unattend', 'v2_sim_dis_unattend', \n",
    "    'it_sim_dis_test', 'v2_sim_dis_test',\n",
    "    'it_sim_dis_untest', 'v2_sim_dis_untest', \n",
    "    'it_sim_dis_diff_test', 'v2_sim_dis_diff_test'\n",
    "]\n",
    "\n",
    "# Apply standardization\n",
    "df = df_demean(df, sim_variables)\n",
    "df = df_square_and_mean(df, sim_variables)\n",
    "\n",
    "# Update binary indicators to z-scored versions\n",
    "df['validity_binary'] = (df['validity'] == 'valid').astype(int)\n",
    "df['reliability_binary'] = (df['reliability'] > 0.7).astype(int)\n",
    "df['validity_binary_z'] = scaler.fit_transform(df[['validity_binary']])\n",
    "df['reliability_binary_z'] = scaler.fit_transform(df[['reliability_binary']])\n",
    "\n",
    "# Create additional UI columns with z-scored values\n",
    "ui_columns = {\n",
    "    'V2 Distractor Similarity to Tested Item': df['v2_sim_dis_test_z'],\n",
    "    'IT Distractor Similarity to Tested Item': df['it_sim_dis_test_z'],\n",
    "    'Tested - Untested V2 Distractor Similarity': df['v2_sim_dis_diff_test_z'],\n",
    "    'Tested - Untested IT Distractor Similarity': df['it_sim_dis_diff_test_z'],\n",
    "    'V2 Distractor Similarity\\nto Prioritized Item': df['v2_sim_dis_attend_z'],\n",
    "    'IT Distractor Similarity\\nto Prioritized Item': df['it_sim_dis_attend_z'],\n",
    "    'V2 Distractor Similarity\\nto Deprioritized Item': df['v2_sim_dis_unattend_z'],\n",
    "    'IT Distractor Similarity\\nto Deprioritized Item': df['it_sim_dis_unattend_z'],\n",
    "    'Prioritized - Deprioritized IT Distractor Similarity': df['it_sim_dis_diff_z'],\n",
    "    'Prioritized - Deprioritized V2 Distractor Similarity': df['v2_sim_dis_diff_z']\n",
    "}\n",
    "\n",
    "# Add UI columns at once\n",
    "for col_name, values in ui_columns.items():\n",
    "    df[col_name] = values\n",
    "\n",
    "print(f\"Added standardized variables and transformations. Total columns: {len(df.columns)}\")_scored] = scaler.fit_transform(df_std[[l]]).flatten()\n",
    "    \n",
    "    # Add all new columns at once\n",
    "    for col_name, values in new_columns.items():\n",
    "        df_std[col_name] = values\n",
    "        \n",
    "    return df_std\n",
    "\n",
    "def df_square_and_mean(df, list_of_variables):\n",
    "    \"\"\"Square variables, demean them, and create z-scored versions.\"\"\"\n",
    "    # Process all variables at once to avoid fragmentation\n",
    "    df_sq = df.copy()\n",
    "    \n",
    "    # Dictionary to store all new columns\n",
    "    new_columns = {}\n",
    "    \n",
    "    for l in list_of_variables:\n",
    "        # Square the variable\n",
    "        squared_col = f\"{l}_sq\"\n",
    "        new_columns[squared_col] = df_sq[l]**2\n",
    "        # Demean the squared variable\n",
    "        new_columns[squared_col] = new_columns[squared_col] - np.mean(new_columns[squared_col])\n",
    "        # Create z-scored version\n",
    "        z_scored = f\"{squared_col}_z\"\n",
    "        new_columns[z