# RSA Experiment Data Processing

This notebook processes individual participant CSV files from DataPipe/OSF 
and creates a single wide-format dataframe with one row per participant.

**Works with both pilot data and main study data.**

## Cell 1: Imports and Helper Functions

In [1]:
import pandas as pd
import numpy as np
import json
import os
from pathlib import Path
from datetime import datetime
from glob import glob


def parse_json_safe(s):
    """Safely parse JSON string, return None if fails."""
    if pd.isna(s):
        return None
    try:
        return json.loads(s)
    except:
        return None


def calculate_duration_minutes(start_time, end_time):
    """Calculate duration in minutes between two ISO timestamps."""
    try:
        start = datetime.fromisoformat(start_time.replace('Z', '+00:00'))
        end = datetime.fromisoformat(end_time.replace('Z', '+00:00'))
        return (end - start).total_seconds() / 60
    except:
        return None


def scenario_to_short(scenario):
    """Convert scenario name to short form."""
    mapping = {
        'informative': 'inf',
        'pers_plus': 'persp',
        'pers_minus': 'persm'
    }
    return mapping.get(scenario, scenario)

## Cell 2: Main Processing Function

In [2]:
def process_participant(filepath):
    """
    Process a single participant's CSV file and extract key data.
    
    Works with both pilot data (attention checks have stimulus info)
    and main study data (attention checks have no stimulus).
    
    Parameters
    ----------
    filepath : str or Path
        Path to the participant's CSV file
        
    Returns
    -------
    dict or None
        Dictionary with all relevant data for this participant
    """
    try:
        df = pd.read_csv(filepath)
    except Exception as e:
        print(f"  Error reading {filepath}: {e}")
        return None
    
    if len(df) == 0:
        print(f"  Empty file: {filepath}")
        return None
    
    result = {}
    first_row = df.iloc[0]
    
    # === METADATA ===
    result['subject_id'] = first_row.get('subject_id', '')
    result['prolific_pid'] = first_row.get('prolific_pid', '')
    result['study_id'] = first_row.get('study_id', '')
    result['session_id'] = first_row.get('session_id', '')
    result['experiment_version'] = first_row.get('experiment_version', '')
    result['start_time'] = first_row.get('start_time', '')
    result['completion_status'] = first_row.get('completion_status', '')
    result['completion_time'] = first_row.get('completion_time', '')
    result['terminated_early'] = first_row.get('terminated_early', False)
    result['termination_reason'] = first_row.get('termination_reason', '')
    result['duration_minutes'] = calculate_duration_minutes(
        str(result['start_time']), str(result['completion_time']))
    result['total_time_elapsed_ms'] = df['time_elapsed'].max()
    
    # === BLOCK ORDER ===
    scenarios_by_block = {}
    role_comp = df[df['task'] == 'role_comprehension'].sort_values('block')
    for _, row in role_comp.iterrows():
        block = int(row.get('block', -1))
        scenario = row.get('scenario', '')
        if block >= 0 and scenario:
            scenarios_by_block[block] = scenario
    
    speaker = df[df['task'] == 'speaker']
    if len(speaker) > 0:
        for block in speaker['block'].dropna().unique():
            block_int = int(block)
            if block_int not in scenarios_by_block:
                scenarios_by_block[block_int] = speaker[speaker['block'] == block]['scenario'].iloc[0]
    
    result['block_1_scenario'] = scenarios_by_block.get(0, '')
    result['block_2_scenario'] = scenarios_by_block.get(1, '')
    result['block_3_scenario'] = scenarios_by_block.get(2, '')
    block_order = [scenario_to_short(scenarios_by_block.get(i, '')) for i in range(3) if scenarios_by_block.get(i, '')]
    result['block_order'] = '_'.join(block_order)
    
    # === ATTENTION CHECKS ===
    attn = df[df['task'] == 'attention_check'].sort_values('block')
    result['attention_total_failures'] = 0
    for _, row in attn.iterrows():
        block = int(row.get('block', 0))
        result[f'attention_block_{block+1}_passed'] = row.get('attention_passed', None)
        result[f'attention_block_{block+1}_round'] = row.get('round', None)
        result[f'attention_block_{block+1}_time_elapsed'] = row.get('time_elapsed', None)
        # These will be real values for pilot, NaN for main study
        result[f'attention_block_{block+1}_num_effective'] = row.get('num_effective', None)
        result[f'attention_block_{block+1}_stimulus_variant'] = row.get('stimulus_variant', None)
        result[f'attention_block_{block+1}_required_description'] = row.get('required_description', '')
        failures = row.get('total_failures', 0)
        if pd.notna(failures):
            result['attention_total_failures'] = max(result['attention_total_failures'], int(failures))
    for i in range(1, 4):
        if f'attention_block_{i}_passed' not in result:
            result[f'attention_block_{i}_passed'] = None
            result[f'attention_block_{i}_round'] = None
            result[f'attention_block_{i}_time_elapsed'] = None
            result[f'attention_block_{i}_num_effective'] = None
            result[f'attention_block_{i}_stimulus_variant'] = None
            result[f'attention_block_{i}_required_description'] = ''
    
    # === COMPREHENSION MODULE 1 ===
    comp1_some = df[df['task'] == 'comp1_some']
    if len(comp1_some) > 0:
        row = comp1_some.iloc[0]
        result['comp1_some_correct'] = row.get('comp1_some_correct', None)
        result['comp1_some_rt'] = row.get('rt', None)
        resp = parse_json_safe(row.get('response', ''))
        result['comp1_some_response'] = resp.get('some_def', '') if resp else ''
    else:
        result['comp1_some_correct'] = result['comp1_some_rt'] = None
        result['comp1_some_response'] = ''
    
    comp1_most = df[df['task'] == 'comp1_most']
    if len(comp1_most) > 0:
        row = comp1_most.iloc[0]
        result['comp1_most_correct'] = row.get('comp1_most_correct', None)
        result['comp1_most_rt'] = row.get('rt', None)
        resp = parse_json_safe(row.get('response', ''))
        result['comp1_most_response'] = resp.get('most_def', '') if resp else ''
    else:
        result['comp1_most_correct'] = result['comp1_most_rt'] = None
        result['comp1_most_response'] = ''
    
    # === COMPREHENSION MODULE 2 ===
    comp2 = df[df['task'] == 'comp2'].sort_values('item_index')
    for i, (_, row) in enumerate(comp2.iterrows()):
        idx = int(row.get('item_index', i))
        result[f'comp2_{idx+1}_correct'] = row.get('comp2_correct', None)
        result[f'comp2_{idx+1}_time_elapsed'] = row.get('time_elapsed', None)
        item = parse_json_safe(row.get('item', ''))
        if item:
            result[f'comp2_{idx+1}_num_effective'] = item.get('numEffective', None)
            result[f'comp2_{idx+1}_statement'] = item.get('statementPlain', '')
    for i in range(1, 3):
        if f'comp2_{i}_correct' not in result:
            result[f'comp2_{i}_correct'] = None
            result[f'comp2_{i}_time_elapsed'] = None
            result[f'comp2_{i}_num_effective'] = None
            result[f'comp2_{i}_statement'] = ''
    
    # === COMPREHENSION MODULE 3 ===
    comp3 = df[df['task'] == 'comp3']
    if len(comp3) > 0:
        row = comp3.iloc[0]
        result['comp3_correct'] = row.get('comp3_correct', None)
        result['comp3_time_elapsed'] = row.get('time_elapsed', None)
        selected = row.get('selected', '')
        result['comp3_selected'] = selected if pd.notna(selected) else ''
    else:
        result['comp3_correct'] = None
        result['comp3_time_elapsed'] = None
        result['comp3_selected'] = ''
    
    # === ROLE COMPREHENSION ===
    role_comp = df[df['task'] == 'role_comprehension']
    for scenario in ['informative', 'pers_plus', 'pers_minus']:
        scenario_data = role_comp[role_comp['scenario'] == scenario]
        prefix = scenario_to_short(scenario)
        if len(scenario_data) > 0:
            row = scenario_data.iloc[0]
            result[f'{prefix}_role_comp_correct'] = row.get('role_comp_correct', None)
            result[f'{prefix}_role_comp_selected'] = row.get('selected_option', '')
            result[f'{prefix}_role_comp_time_elapsed'] = row.get('time_elapsed', None)
        else:
            result[f'{prefix}_role_comp_correct'] = None
            result[f'{prefix}_role_comp_selected'] = ''
            result[f'{prefix}_role_comp_time_elapsed'] = None
    
    # === SPEAKER TRIALS ===
    speaker = df[df['task'] == 'speaker']
    result['n_speaker_trials'] = len(speaker)
    
    for scenario in ['informative', 'pers_plus', 'pers_minus']:
        scenario_speaker = speaker[speaker['scenario'] == scenario].sort_values('round')
        prefix = scenario_to_short(scenario)
        result[f'{prefix}_n_trials'] = len(scenario_speaker)
        
        prev_time = None
        for _, row in scenario_speaker.iterrows():
            round_num = int(row.get('round', 1))
            col_prefix = f'{prefix}_r{round_num}'
            
            result[f'{col_prefix}_num_effective'] = row.get('num_effective', None)
            result[f'{col_prefix}_variant'] = row.get('stimulus_variant', None)
            result[f'{col_prefix}_positions'] = row.get('stimulus_positions', '')
            result[f'{col_prefix}_predicate'] = row.get('predicate', '')
            result[f'{col_prefix}_quantifier'] = row.get('quantifier', '')
            
            current_time = row.get('time_elapsed', None)
            result[f'{col_prefix}_time_elapsed'] = current_time
            
            if prev_time is not None and current_time is not None:
                result[f'{col_prefix}_rt_approx'] = current_time - prev_time
            else:
                result[f'{col_prefix}_rt_approx'] = None
            prev_time = current_time
        
        for r in range(1, 11):
            col_prefix = f'{prefix}_r{r}'
            if f'{col_prefix}_num_effective' not in result:
                result[f'{col_prefix}_num_effective'] = None
                result[f'{col_prefix}_variant'] = None
                result[f'{col_prefix}_positions'] = ''
                result[f'{col_prefix}_predicate'] = ''
                result[f'{col_prefix}_quantifier'] = ''
                result[f'{col_prefix}_time_elapsed'] = None
                result[f'{col_prefix}_rt_approx'] = None
    
    # === FEEDBACK ===
    feedback_rows = df[df['trial_type'] == 'survey-text']
    if len(feedback_rows) > 0:
        last_feedback = feedback_rows.iloc[-1]
        resp = parse_json_safe(last_feedback.get('response', ''))
        if resp and 'feedback' in resp:
            result['feedback_text'] = resp['feedback']
        else:
            result['feedback_text'] = ''
    else:
        result['feedback_text'] = ''
    
    return result

## Cell 3: Batch Processing Function

In [3]:
def process_all_participants(input_path, verbose=True):
    """
    Process all CSV files in a folder or a single CSV file.
    
    Parameters
    ----------
    input_path : str or Path
        Path to folder containing CSV files, or path to a single CSV file
    verbose : bool
        Whether to print progress
        
    Returns
    -------
    pd.DataFrame
        Wide-format dataframe with one row per participant
    """
    input_path = Path(input_path)
    
    if input_path.is_file():
        csv_files = [input_path]
    elif input_path.is_dir():
        csv_files = list(input_path.glob('*.csv'))
    else:
        raise ValueError(f"Path does not exist: {input_path}")
    
    if verbose:
        print(f"Found {len(csv_files)} CSV file(s)")
    
    all_data = []
    for i, filepath in enumerate(csv_files):
        if verbose:
            print(f"Processing {i+1}/{len(csv_files)}: {filepath.name}")
        
        result = process_participant(filepath)
        if result:
            result['source_file'] = filepath.name
            all_data.append(result)
    
    if verbose:
        print(f"\nSuccessfully processed {len(all_data)} participant(s)")
    
    return pd.DataFrame(all_data)

## Cell 4: Summary Statistics Function

In [4]:
def print_summary(df):
    """Print summary statistics for the processed data."""
    print("=" * 60)
    print("DATA SUMMARY")
    print("=" * 60)
    
    print(f"\nTotal participants: {len(df)}")
    
    print(f"\nCompletion status:")
    print(df['completion_status'].value_counts().to_string())
    
    completed = df[df['completion_status'] == 'completed']
    if len(completed) > 0:
        print(f"\nDuration (completed):")
        print(f"  Mean: {completed['duration_minutes'].mean():.1f} min")
        print(f"  Median: {completed['duration_minutes'].median():.1f} min")
        print(f"  Range: {completed['duration_minutes'].min():.1f} - {completed['duration_minutes'].max():.1f} min")
    
    print(f"\nBlock order distribution:")
    print(df['block_order'].value_counts().to_string())
    
    print(f"\nComprehension accuracy:")
    for col in ['comp1_some_correct', 'comp1_most_correct', 'comp2_1_correct', 'comp2_2_correct', 'comp3_correct']:
        if col in df.columns:
            valid = df[col].dropna()
            if len(valid) > 0:
                print(f"  {col}: {valid.mean()*100:.1f}% (n={len(valid)})")
    
    print(f"\nRole comprehension accuracy:")
    for col in ['inf_role_comp_correct', 'persp_role_comp_correct', 'persm_role_comp_correct']:
        if col in df.columns:
            valid = df[col].dropna()
            if len(valid) > 0:
                print(f"  {col}: {valid.mean()*100:.1f}% (n={len(valid)})")
    
    print(f"\nAttention check pass rate:")
    for col in ['attention_block_1_passed', 'attention_block_2_passed', 'attention_block_3_passed']:
        if col in df.columns:
            valid = df[col].dropna()
            if len(valid) > 0:
                print(f"  {col}: {valid.mean()*100:.1f}% (n={len(valid)})")

## Cell 5: Add Participant ID

In [None]:
def add_participant_id(df):
    """Add participant_id column (P001, P002, etc.) at the beginning."""
    df = df.copy()
    df.insert(0, 'participant_id', [f'P{i+1:03d}' for i in range(len(df))])
    return df

## Cell 6: Run Processing

**Option A: Process a folder of CSV files**
```python
df = process_all_participants('./raw_data/')
```

**Option B: Process a single CSV file**
```python
df = process_all_participants('./path/to/file.csv')
```

In [12]:
# Process the pilot study data
input_path_pilot = './raw_do_not_track/prag_net_speaker_n1_pilot/'  # Folder containing CSV files, or single CSV file path
df_raw_pilot = process_all_participants(input_path_pilot)

# process the main study data
input_path_main = './raw_do_not_track/prag_net_speaker_n1_main/'  # Folder containing CSV files, or single CSV file path
df_raw_main = process_all_participants(input_path_main)

# Combine pilot and main data
df_raw_pilot.insert(0, 'study', 'pilot')
df_raw_main.insert(0, 'study', 'main')
df_raw = pd.concat([df_raw_pilot, df_raw_main], ignore_index=True)

# Add participant_id
df = add_participant_id(df_raw)

# Print summary
print_summary(df)

# Display first few rows
print("\n--- Full data (first 5 columns) ---")
print(df[['participant_id', 'subject_id', 'prolific_pid', 'completion_status', 'duration_minutes']].head())

Found 10 CSV file(s)
Processing 1/10: 5f1c2ebf1d8b7c48ad3ec3db.csv
Processing 2/10: 64337cc5c12109dc730de8c6.csv
Processing 3/10: 696bb3922216926553e87730.csv
Processing 4/10: 58d4c4c287b2420001263b6a.csv
Processing 5/10: 650b03136ab3d4c832d98b71.csv
Processing 6/10: 671e5804035494e1d10359ca.csv
Processing 7/10: 59f71f797086f80001941493.csv
Processing 8/10: 695c063ad8348d9dbf6896cb.csv
Processing 9/10: 627ebc31883c7dd7c2220c7e.csv
Processing 10/10: 6713eeea64ca1fb59cbcbce4.csv

Successfully processed 10 participant(s)
Found 101 CSV file(s)
Processing 1/101: 56b8de29e1d0a200051517f8.csv
Processing 2/101: 66afcaa60f7d8f58dc21db8e.csv
Processing 3/101: 66720d50c394601b811a0e49.csv
Processing 4/101: 673feeafa80a279f5cc63e33.csv
Processing 5/101: 5cf804f0c5cf0e00010c02d5.csv
Processing 6/101: 66cc4ba25cc59f1c17f1a2b4.csv
Processing 7/101: 67722f8e3a4f08a288a1f640.csv
Processing 8/101: 6709224f70eacdb20761ae3c.csv
Processing 9/101: 66e9236f775eae67be04229d.csv
Processing 10/101: 63e55f08844f

## Cell 7: View Key Data

In [27]:
print("Checking uniqueness of prolific_pid: ", df_raw["prolific_pid"].is_unique)

Checking uniqueness of prolific_pid:  True


In [18]:
# View key columns (full version with IDs)
df.columns.tolist()

['participant_id',
 'study',
 'subject_id',
 'prolific_pid',
 'study_id',
 'session_id',
 'experiment_version',
 'start_time',
 'completion_status',
 'completion_time',
 'terminated_early',
 'termination_reason',
 'duration_minutes',
 'total_time_elapsed_ms',
 'block_1_scenario',
 'block_2_scenario',
 'block_3_scenario',
 'block_order',
 'attention_total_failures',
 'attention_block_1_passed',
 'attention_block_1_round',
 'attention_block_1_time_elapsed',
 'attention_block_1_num_effective',
 'attention_block_1_stimulus_variant',
 'attention_block_1_required_description',
 'attention_block_2_passed',
 'attention_block_2_round',
 'attention_block_2_time_elapsed',
 'attention_block_2_num_effective',
 'attention_block_2_stimulus_variant',
 'attention_block_2_required_description',
 'attention_block_3_passed',
 'attention_block_3_round',
 'attention_block_3_time_elapsed',
 'attention_block_3_num_effective',
 'attention_block_3_stimulus_variant',
 'attention_block_3_required_description',
 '

## Cell 8: Save Processed Data

In [28]:
# === EDIT THESE PATHS ===
OUTPUT_PATH_FULL = './raw_do_not_track/processed_speaker_n1_full.csv'

# Save full version (with all IDs)
df.to_csv(OUTPUT_PATH_FULL, index=False)
print(f"Saved full data to: {OUTPUT_PATH_FULL}")
print(f"  Columns: {len(df.columns)}")
print(f"  Includes: participant_id, subject_id, prolific_pid, timestamps, etc.")

Saved full data to: ./raw_do_not_track/processed_speaker_n1_full.csv
  Columns: 279
  Includes: participant_id, subject_id, prolific_pid, timestamps, etc.
