# Generate word key

In [None]:
import pandas as pd
import uuid
import os
import glob

# Folder (relative to the notebook) that holds the CSV files to be keyed
path = '../res'
# Collect every CSV that sits directly inside that folder
csv_pattern = os.path.join(path, '*.csv')
csv_files = glob.glob(csv_pattern)

for csv_file in csv_files:
    print(f"\nProcessing: {os.path.basename(csv_file)}")

    # Read the whole file into memory
    df = pd.read_csv(csv_file)
    row_count = len(df)
    print(f"  Loaded {row_count} rows")

    if 'word_key' not in df.columns:
        # Give every row its own random UUID4 string
        print("  Adding word_key column with UUID keys...")
        df['word_key'] = [str(uuid.uuid4()) for _ in df.index]

        # Overwrite the source file in place, without the pandas index column
        df.to_csv(csv_file, index=False)
        print(f"  ✓ Saved {row_count} rows with UUID keys")
    else:
        # Files that already carry keys are left untouched
        print("  ✓ word_key column already exists - skipping")



# Manage files for OSF

## Copy raw data

In [1]:
import os
import re
import shutil
# Copy each subject's raw .asc / .csv / .bdf files from the data drive into a
# per-subject eye/ log/ eeg/ layout under raw_data_dir.

# Define the root directory containing subject folders
# root_dir = "F:/MindlessReading/Data"
root_dir = r"/Volumes/GBLDrive/MindlessReading/Data"

# Number of files of each type a source (or destination) folder must hold
expected_files_count = 5

# Regular expression to match subject folder names (s followed by exactly 5 digits)
subject_pattern = re.compile(r"^s\d{5}$")

# Subject folders directly under root_dir whose names match the pattern
subjects = [
    d for d in os.listdir(root_dir)
    if os.path.isdir(os.path.join(root_dir, d)) and subject_pattern.match(d)
]

# Destination directory for raw data
raw_data_dir = r"/Users/hsun11/Desktop/raw_data"

# One entry per data type:
# (sub-folder name, file extension, error text when the source sub-folder is missing)
modalities = [
    ("eye", ".asc", "Eye folder does not exist"),
    ("log", ".csv", "Log folder does not exist"),
    ("eeg", ".bdf", "EEG folder does not exist"),
]


def _names_with_extension(folder, extension):
    """Return the entries of *folder* ending with *extension* ([] if *folder* is absent)."""
    if not os.path.exists(folder):
        return []
    return [name for name in os.listdir(folder) if name.endswith(extension)]


print(f"Found {len(subjects)} subjects matching pattern")
print(f"Raw data will be saved to: {raw_data_dir}")
print("-" * 50)

# Create the main raw_data directory
os.makedirs(raw_data_dir, exist_ok=True)

success_count = 0
skipped_count = 0
total_copied_files = {modality: 0 for modality, _, _ in modalities}

for subject in sorted(subjects):
    print(f"Processing subject: {subject}")

    source_subject_dir = os.path.join(root_dir, subject)
    dest_subject_dir = os.path.join(raw_data_dir, subject)

    # A subject is skipped only when every destination folder already holds
    # exactly the expected number of files of its type.
    existing_counts = {
        modality: len(_names_with_extension(os.path.join(dest_subject_dir, modality), extension))
        for modality, extension, _ in modalities
    }
    if all(count == expected_files_count for count in existing_counts.values()):
        print(f"  ⏭️  Skipping {subject} - already has complete set:")
        for modality, extension, _ in modalities:
            print(f"    - {existing_counts[modality]} {extension} files in {modality} folder")
        skipped_count += 1
        print()
        continue

    copied_files = {modality: [] for modality, _, _ in modalities}
    errors = []

    try:
        for modality, extension, missing_message in modalities:
            source_dir = os.path.join(source_subject_dir, modality)
            if not os.path.exists(source_dir):
                errors.append(missing_message)
                continue

            names = [f for f in os.listdir(source_dir) if f.endswith(extension)]
            if len(names) != expected_files_count:
                errors.append(f"Expected {expected_files_count} {extension} files, found {len(names)}")
                continue

            # The destination folder is created only after the source has been
            # validated, so a subject with nothing to copy does not leave empty
            # folders behind in raw_data_dir.
            dest_dir = os.path.join(dest_subject_dir, modality)
            os.makedirs(dest_dir, exist_ok=True)
            for name in names:
                shutil.copyfile(os.path.join(source_dir, name), os.path.join(dest_dir, name))
                copied_files[modality].append(name)
    except OSError as e:
        # A filesystem failure abandons the remaining data types for this subject
        errors.append(f"Error copying files: {str(e)}")

    # Report results for this subject; only error-free subjects count as
    # successes and contribute to the totals.
    if errors:
        print(f"  ❌ Errors for {subject}:")
        for error in errors:
            print(f"    - {error}")
    else:
        print("  ✅ Successfully copied:")
        for modality, extension, _ in modalities:
            print(f"    - {len(copied_files[modality])} {extension} files from {modality} folder")
            total_copied_files[modality] += len(copied_files[modality])
        success_count += 1

    print()

# Summary
print("=" * 50)
print("SUMMARY:")
print(f"Successfully processed: {success_count}/{len(subjects)} subjects")
print(f"Skipped (already had files): {skipped_count}/{len(subjects)} subjects")
for modality, extension, _ in modalities:
    print(f"Total {extension} files copied: {total_copied_files[modality]}")
print(f"Raw data directory: {raw_data_dir}")

Found 45 subjects matching pattern
Raw data will be saved to: /Users/hsun11/Desktop/raw_data
--------------------------------------------------
Processing subject: s10014
  ✅ Successfully copied:
    - 5 .asc files from eye folder
    - 5 .csv files from log folder
    - 5 .bdf files from eeg folder

Processing subject: s10052
  ✅ Successfully copied:
    - 5 .asc files from eye folder
    - 5 .csv files from log folder
    - 5 .bdf files from eeg folder

Processing subject: s10059
  ✅ Successfully copied:
    - 5 .asc files from eye folder
    - 5 .csv files from log folder
    - 5 .bdf files from eeg folder

Processing subject: s10073
  ✅ Successfully copied:
    - 5 .asc files from eye folder
    - 5 .csv files from log folder
    - 5 .bdf files from eeg folder

Processing subject: s10081
  ✅ Successfully copied:
    - 5 .asc files from eye folder
    - 5 .csv files from log folder
    - 5 .bdf files from eeg folder

Processing subject: s10084
  ✅ Successfully copied:
    - 5 .asc f

## Generate demographic information

In [8]:
import pandas as pd
import os

# Summarise age, gender and handedness for the subjects present in raw_data
# and save the filtered demographics table next to the raw data.

# `re` is used below; import it here so this cell also runs on a fresh kernel
import re

# Load the Excel file
path = r"/Volumes/GBLDrive/MindlessReading/Data/Files"
file_name = "demo_info.xlsx"  # Update path if needed
df = pd.read_excel(os.path.join(path, file_name))

# Clean column names: trimmed, lower-case, spaces replaced by underscores
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

# Find all subject folders from raw_data folder
raw_data_dir = r"/Users/hsun11/Desktop/raw_data"
subject_pattern = re.compile(r"^s\d{5}$")

# Get subject IDs from raw_data folder
subject_ids = [d for d in os.listdir(raw_data_dir)
               if os.path.isdir(os.path.join(raw_data_dir, d)) and subject_pattern.match(d)]

print(f"Found {len(subject_ids)} subjects in raw_data folder")
# Drop the leading "s" and keep the numeric part, e.g. "s10014" -> 10014
subject_nums = [int(s[1:]) for s in subject_ids]

# Filter by participant ID
df_filtered = df[df['participant_id'].isin(subject_nums)].copy()
n_subjects = len(df_filtered)

# Age: ensure numeric (unparseable values become NaN) and compute stats
df_filtered['age'] = pd.to_numeric(df_filtered['age'], errors='coerce')
age_min = df_filtered['age'].min()
age_max = df_filtered['age'].max()
age_median = df_filtered['age'].median()
age_mean = df_filtered['age'].mean()

# Gender: trim/lower-case, then merge spelling variants.
# Values not listed in the mapping (e.g. 'female', 'male') pass through unchanged.
gender_variants = {
    'nonbinary': 'non-binary',
    'gender-nonconforming': 'non-binary',
    'transgender male': 'non-binary',
}
df_filtered['gender_clean'] = (
    df_filtered['gender'].str.strip().str.lower().replace(gender_variants)
)
gender_counts = df_filtered['gender_clean'].value_counts()

# Handedness: trim/lower-case, then collapse the free-text variants to 'left'.
# Values not listed in the mapping (e.g. 'right', 'left') pass through unchanged.
handness_variants = {
    'left, but used right-handed setup': 'left',
    'left-handed setup': 'left',
    'left, but used right-handed setup for mouse and keyboard': 'left',
    'ambidextrous (left-hand dominant)': 'left',
}
df_filtered['handness_clean'] = (
    df_filtered['handness'].str.strip().str.lower().replace(handness_variants)
)
handness_counts = df_filtered['handness_clean'].value_counts()

# Print results
print("=" * 50)
print("DEMOGRAPHIC ANALYSIS")
print("=" * 50)
print(f"Total subjects analyzed: {n_subjects}")
print()
print(f"Age range: {age_min} - {age_max}")
print(f"Median age: {age_median}")
print(f"Mean age: {age_mean:.2f}")
print()
print("Gender distribution:")
for gender, count in gender_counts.items():
    percentage = (count / n_subjects) * 100
    print(f"  {gender}: {count} ({percentage:.1f}%)")
print()
print("Handedness distribution:")
for hand, count in handness_counts.items():
    percentage = (count / n_subjects) * 100
    print(f"  {hand}: {count} ({percentage:.1f}%)")

# Optional: Show subjects that are in raw_data but missing from demographics.
# The ID set is built once instead of scanning the column for every subject.
known_ids = set(df['participant_id'])
missing_subjects = [s for s in subject_nums if s not in known_ids]
if missing_subjects:
    print()
    print(f"Warning: {len(missing_subjects)} subjects in raw_data have no demographic data:")
    print(f"Missing: {sorted(missing_subjects)}")

# Save filtered demographics (including the *_clean helper columns) to CSV in raw_data folder
output_file = os.path.join(raw_data_dir, "subject_demographics_information.csv")
df_filtered.to_csv(output_file, index=False)
print()
print(f"✅ Saved filtered demographics to: {output_file}")
print(f"   Contains {n_subjects} subjects with {len(df_filtered.columns)} columns")

Found 45 subjects in raw_data folder
DEMOGRAPHIC ANALYSIS
Total subjects analyzed: 45

Age range: 18 - 64
Median age: 20.0
Mean age: 22.49

Gender distribution:
  female: 31 (68.9%)
  male: 9 (20.0%)
  non-binary: 5 (11.1%)

Handedness distribution:
  right: 38 (84.4%)
  left: 7 (15.6%)

✅ Saved filtered demographics to: /Users/hsun11/Desktop/raw_data/subject_demographics_information.csv
   Contains 45 subjects with 8 columns


## Copy processed data

In [None]:
import os
import re
import shutil
import glob

# Copy each subject's processed eye-tracking folders and feature CSVs from the
# data drive into processed_data_dir/<subject>/.

# Define the root directory containing subject folders
# root_dir = "F:/MindlessReading/Data"
root_dir = r"/Volumes/GBLDrive/MindlessReading/Data"

# Number of eye sub-folders copied for every subject
expected_files_count = 5

# Regular expression to match subject folder names (s followed by exactly 5 digits)
subject_pattern = re.compile(r"^s\d{5}$")

# Get the list of subject folders that match the pattern
subjects = [
    d for d in os.listdir(root_dir)
    if os.path.isdir(os.path.join(root_dir, d)) and subject_pattern.match(d)
]

# Define the destination directory for processed data
processed_data_dir = r"/Users/hsun11/Desktop/processed_data"

print(f"Found {len(subjects)} subjects matching pattern")
print(f"Processed data will be saved to: {processed_data_dir}")
print("-" * 50)

# Create the main processed_data directory
os.makedirs(processed_data_dir, exist_ok=True)

# Glob patterns (matched directly inside each subject folder) of the CSV files to copy
csv_patterns = [
    '*end2_sr*.csv',
    '*end2_tp*.csv',
    '*end5_sr*.csv',
    '*end5_tp*.csv',
    '*mw_fixed_sr*.csv',
    '*mw_fixed_tp*.csv',
    '*page_fixed_sr*.csv',
    '*page_fixed_tp*.csv',
    '*default_sr*.csv',
    '*slide2.0*.csv',
]

# Running total of CSV files copied across all subjects. It must exist before
# the loop: incrementing an undefined name raises NameError, which would be
# reported as a copy failure for every file.
csv_files_copied = 0

for subject in sorted(subjects):
    print(f"Processing subject: {subject}")

    subject_path = os.path.join(root_dir, subject)
    subject_processed_dir = os.path.join(processed_data_dir, subject)

    # Create subject directory in processed_data
    os.makedirs(subject_processed_dir, exist_ok=True)

    # Copy the first `expected_files_count` sub-folders (in sorted name order)
    # of the subject's eye directory.
    eye_dir = os.path.join(subject_path, "eye")
    if os.path.exists(eye_dir):
        eye_folders = sorted(
            d for d in os.listdir(eye_dir) if os.path.isdir(os.path.join(eye_dir, d))
        )

        if len(eye_folders) >= expected_files_count:
            for folder in eye_folders[:expected_files_count]:
                src_folder = os.path.join(eye_dir, folder)
                dst_folder = os.path.join(subject_processed_dir, 'eye', folder)

                try:
                    # dirs_exist_ok=True (Python 3.8+) lets an interrupted run
                    # be repeated: existing destination folders are filled in
                    # instead of raising FileExistsError.
                    shutil.copytree(src_folder, dst_folder, dirs_exist_ok=True)
                except OSError as e:
                    # shutil.Error is a subclass of OSError, so it is covered too
                    print(f"  Error copying eye folder {folder}: {e}")
        else:
            print(f"  Warning: Only found {len(eye_folders)} folders in eye directory, expected {expected_files_count}")
    else:
        print(f"  Warning: Eye directory not found for subject {subject}")

    # Copy CSV files with specified patterns
    for pattern in csv_patterns:
        for file_path in glob.glob(os.path.join(subject_path, pattern)):
            file_name = os.path.basename(file_path)
            dst_path = os.path.join(subject_processed_dir, file_name)

            try:
                shutil.copy2(file_path, dst_path)
            except OSError as e:
                print(f"  Error copying CSV {file_name}: {e}")
            else:
                # Count only copies that actually succeeded
                csv_files_copied += 1

    print("-" * 30)

print("Processing complete!")
print(f"Total CSV files copied: {csv_files_copied}")

Found 45 subjects matching pattern
Processed data will be saved to: /Users/hsun11/Desktop/processed_data
--------------------------------------------------
Processing subject: s10014
  Copied eye folder: s014_r1_2023_11_29_09_28_data
  Copied eye folder: s014_r2_2023_11_29_09_56_data
  Copied eye folder: s014_r3_2023_11_29_10_15_data
  Copied eye folder: s014_r4_2023_11_29_10_28_data
  Copied eye folder: s014_r5_2023_11_29_10_43_data
  Copied CSV: s10014_R_features_end2_sr.csv
  Copied CSV: s10014_L_features_end2_sr.csv
  Copied CSV: s10014_R_features_end2_tp.csv
  Copied CSV: s10014_L_features_end2_tp.csv
  Copied CSV: s10014_L_features_end5_sr.csv
  Copied CSV: s10014_R_features_end5_sr.csv
  Copied CSV: s10014_L_features_end5_tp.csv
  Copied CSV: s10014_R_features_end5_tp.csv
  Copied CSV: s10014_L_features_mw_fixed_sr.csv
  Copied CSV: s10014_R_features_mw_fixed_sr.csv
  Copied CSV: s10014_L_features_mw_fixed_tp.csv
  Copied CSV: s10014_R_features_mw_fixed_tp.csv
  Copied CSV: s100

KeyboardInterrupt: 

# Save EEG to CSV

In [None]:
import mne
import pandas as pd

# EEGLAB .set file to load (the ICAPruned file for s10014, r1)
file_path = '/Volumes/GBLDrive/MindlessReading/Data/s10014/eeg/ICAPruned/MR_s10014_r1_ICAPruned.set'
# Open it with MNE's EEGLAB reader; preload=True requests the data be loaded up front
raw = mne.io.read_raw_eeglab(input_fname=file_path, preload=True)

  from scipy.io.matlab.miobase import get_matfile_version
  if isinstance(data, scipy.io.matlab.mio5_params.MatlabOpaque):
  if isinstance(data, scipy.io.matlab.mio5_params.MatlabOpaque):
  if isinstance(data, scipy.io.matlab.mio5_params.MatlabOpaque):
  if isinstance(data, scipy.io.matlab.mio5_params.MatlabOpaque):
  if isinstance(data, scipy.io.matlab.mio5_params.MatlabOpaque):
  if isinstance(data, scipy.io.matlab.mio5_params.MatlabOpaque):
  if isinstance(data, scipy.io.matlab.mio5_params.MatlabOpaque):
  if isinstance(data, scipy.io.matlab.mio5_params.MatlabOpaque):
  if isinstance(data, scipy.io.matlab.mio5_params.MatlabOpaque):
  if isinstance(data, scipy.io.matlab.mio5_params.MatlabOpaque):
  if isinstance(data, scipy.io.matlab.mio5_params.MatlabOpaque):
  if isinstance(data, scipy.io.matlab.mio5_params.MatlabOpaque):
  if isinstance(data, scipy.io.matlab.mio5_params.MatlabOpaque):
  if isinstance(data, scipy.io.matlab.mio5_params.MatlabOpaque):
  if isinstance(data, scipy.io.m

In [13]:
# Get the data as a numpy array, together with the matching time vector
data, times = raw.get_data(return_times=True)
# Convert the annotations into an events array plus a description -> integer code mapping
events, event_id = mne.events_from_annotations(raw)
# Sampling frequency, used below to convert sample indices to seconds
fs = raw.info['sfreq']
print(f"Converted to {len(events)} events")
print("Event ID mapping:", event_id)


# Report the onset of every 'condition 10' event (one per page; exactly 10 are required)
if 'condition 10' in event_id:
    condition10_code = event_id['condition 10']

    # Keep ALL events carrying that code (column 2 of the events array holds the code)
    condition10_events = events[events[:, 2] == condition10_code]
    if len(condition10_events) != 10:
        raise ValueError(f'not 10 pages found (found {len(condition10_events)})')

    # Column 0 holds the sample index of each event; dividing by fs gives seconds
    for page_index, page_sample in enumerate(condition10_events[:, 0]):
        page_time = page_sample / fs
        print(page_index)
        print(page_time)
else:
    # Make the absence of the marker visible instead of silently doing nothing
    print("No 'condition 10' annotation found - no page onsets to report")
    


Used Annotations descriptions: ['30', '40', 'MW_offset', 'MW_onset', 'condition 1', 'condition 10', 'condition 5', 's10014_1_0_MW_onset', 's10014_1_0_self_report', 's10014_1_1_MW_onset', 's10014_1_1_self_report', 's10014_1_2_MW_onset', 's10014_1_2_self_report', 's10014_1_3_MW_onset', 's10014_1_3_self_report', 's10014_1_4_MW_onset', 's10014_1_4_self_report', 's10014_1_5_MW_onset', 's10014_1_5_self_report', 's10014_1_6_MW_onset', 's10014_1_6_self_report', 's10014_1_7_MW_onset', 's10014_1_7_self_report', 's10014_1_8_MW_onset', 's10014_1_8_self_report', 's10014_1_9_MW_onset', 's10014_1_9_self_report', 'self_report']
Converted to 93 events
Event ID mapping: {'30': 1, '40': 2, 'MW_offset': 3, 'MW_onset': 4, 'condition 1': 5, 'condition 10': 6, 'condition 5': 7, 's10014_1_0_MW_onset': 8, 's10014_1_0_self_report': 9, 's10014_1_1_MW_onset': 10, 's10014_1_1_self_report': 11, 's10014_1_2_MW_onset': 12, 's10014_1_2_self_report': 13, 's10014_1_3_MW_onset': 14, 's10014_1_3_self_report': 15, 's10014_