# File Integrity Check

In [None]:
import os
import pandas as pd
import re
# run this in terminal to delete hidden files generated by MacOS
# find /Volumes/GBLDrive/MindlessReading -name '._*' -type f -delete

# Define the root directory containing subject folders
# root_dir = "F:/MindlessReading/Data"
root_dir = r"/Volumes/GBLDrive/MindlessReading/Data"

# Define the expected number of files in each folder
expected_files_count = 5

# Regular expression to match subject folder names (s followed by exactly 5 digits)
subject_pattern = re.compile(r"^s\d{5}$")

# Get the list of subject folders that match the pattern
subjects = [d for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d)) and subject_pattern.match(d)]

# Initialize a list to store the file status data
data = []

# Function to check file count and extension in a given folder
def check_folder(folder_path, extension, expected_count, no_extension=False):
    if not os.path.exists(folder_path):
        return False
    
    if no_extension:
        # List only files that have no dot in their names
        files = [f for f in os.listdir(folder_path)]
    else:
        # List files that match the given extension
        files = [f for f in os.listdir(folder_path) if f.endswith(extension)]
    
    return len(files) == expected_count

# Check each subject folder
for subject in subjects:
    subject_path = os.path.join(root_dir, subject)

    # Define expected subdirectories
    eye_path = os.path.join(subject_path, "eye")
    log_path = os.path.join(subject_path, "log")
    eeg_path = os.path.join(subject_path, "eeg")
    page_path = os.path.join(subject_path, "page")

    # Validate folder contents
    eye_status = check_folder(eye_path, ".asc", expected_files_count)
    log_status = check_folder(log_path, ".csv", expected_files_count)
    eeg_status = check_folder(eeg_path, ".bdf", expected_files_count)
    page_status = check_folder(page_path, "", expected_files_count, no_extension=True)

    # Append the results to the data list
    data.append([subject, eye_status, log_status, eeg_status, page_status])

# Create a DataFrame and save it to a CSV file
df = pd.DataFrame(data, columns=["subject", "eye_complete", "log_complete", "eeg_complete", "page_objects"])

df.to_csv(os.path.join(root_dir, "Files", "file_integrity_report.csv"), index=False)

# Delete Files (Netfiles)

In [3]:
import os
import glob
import re

# Define your online directory
online_root = r"Z:\Mindless Reading\Data"

# Regular expression to match subject folder names like "sXXXXX" (s followed by exactly 5 digits)
subject_pattern = re.compile(r"^s\d{5}$")

# Patterns of CSV files to delete in the online folder
csv_patterns = ["*features_last.csv", "*features_same.csv", "*features_whole.csv"]

# Get the list of subject folders in the online directory
subjects = [d for d in os.listdir(online_root) if os.path.isdir(os.path.join(online_root, d)) and subject_pattern.match(d)]

for subject in subjects:
    sub_folder = os.path.join(online_root, subject)
    
    # Delete specific CSV files in the online ICAPruned folder
    if os.path.exists(sub_folder):
        for pattern in csv_patterns:
            for file in glob.glob(os.path.join(sub_folder, pattern)):
                os.remove(file)
                print(f"Deleted {file}")

FileNotFoundError: [Errno 2] No such file or directory: 'Z:\\Mindless Reading\\Data'

# Delete Files (Hard Drive)

In [None]:
import os
import glob
import re

# Define your online directory
root = r"F:\MindlessReading\Data"

# Regular expression to match subject folder names like "sXXXXX" (s followed by exactly 5 digits)
subject_pattern = re.compile(r"^s\d{5}$")

# Patterns of CSV files to delete in the online folder
csv_patterns = ["*features_last.csv", "*features_same.csv", "*features_whole.csv", "*PupilTrace.csv", "*slide_wlen*", "*default*", "*same-dur*"]

# Get the list of subject folders in the online directory
subjects = [d for d in os.listdir(root) if os.path.isdir(os.path.join(root, d)) and subject_pattern.match(d)]

for subject in subjects:
    sub_folder = os.path.join(root, subject)
    
    # Delete specific CSV files in the online ICAPruned folder
    if os.path.exists(sub_folder):
        for pattern in csv_patterns:
            for file in glob.glob(os.path.join(sub_folder, pattern)):
                os.remove(file)
                print(f"Deleted {file}")

Deleted F:\MindlessReading\Data\s10014\s10014_L_features_slide_wlen2.csv
Deleted F:\MindlessReading\Data\s10014\s10014_L_features_slide_wlen5.csv
Deleted F:\MindlessReading\Data\s10014\s10014_R_features_slide_wlen2.csv
Deleted F:\MindlessReading\Data\s10014\s10014_R_features_slide_wlen5.csv
Deleted F:\MindlessReading\Data\s10014\s10014_L_features_default.csv
Deleted F:\MindlessReading\Data\s10014\s10014_R_features_default.csv
Deleted F:\MindlessReading\Data\s10014\s10014_L_features_same-dur.csv
Deleted F:\MindlessReading\Data\s10014\s10014_R_features_same-dur.csv
Deleted F:\MindlessReading\Data\s10052\s10052_L_features_slide_wlen2.csv
Deleted F:\MindlessReading\Data\s10052\s10052_L_features_slide_wlen5.csv
Deleted F:\MindlessReading\Data\s10052\s10052_R_features_slide_wlen2.csv
Deleted F:\MindlessReading\Data\s10052\s10052_R_features_slide_wlen5.csv
Deleted F:\MindlessReading\Data\s10052\s10052_L_features_default.csv
Deleted F:\MindlessReading\Data\s10052\s10052_R_features_default.csv


# Copy Folders / Files

In [None]:
import os
import shutil
import re

# Define your local and online directories
local_root = r"F:\MindlessReading\Data"
online_root = r"Z:\Mindless Reading\Data"

# define the folder name
# folder = 'eeg/ICAPruned'
folder = 'eeg/Preprocessed'
# folder = 'page'

# Regular expression to match subject folder names (s followed by exactly 5 digits)
subject_pattern = re.compile(r"^s\d{5}$")

# Get the list of subject folders that match the pattern
subjects = [d for d in os.listdir(local_root) if os.path.isdir(os.path.join(local_root, d)) and subject_pattern.match(d)]

for subject in subjects:
    local_path = os.path.join(local_root, subject, folder)
    online_path = os.path.join(online_root, subject, folder)

    # Check if the local ICAPruned folder exists
    if os.path.exists(local_path):
        # Delete the existing ICAPruned folder in the online directory if it exists
        if os.path.exists(online_path):
            shutil.rmtree(online_path)
            print(f"Deleted existing folder: {online_path}")

        # Copy the ICAPruned folder from local to online
        shutil.copytree(local_path, online_path)
        print(f"Copied {local_path} to {online_path}")
    else:
        print(f"Skipping {subject}: {local_path} folder not found in local directory.")

Deleted existing folder: Z:\Mindless Reading\Data\s10014\eeg/Preprocessed
Copied F:\MindlessReading\Data\s10014\eeg/Preprocessed to Z:\Mindless Reading\Data\s10014\eeg/Preprocessed
Copied F:\MindlessReading\Data\s10052\eeg/Preprocessed to Z:\Mindless Reading\Data\s10052\eeg/Preprocessed
Copied F:\MindlessReading\Data\s10059\eeg/Preprocessed to Z:\Mindless Reading\Data\s10059\eeg/Preprocessed
Copied F:\MindlessReading\Data\s10073\eeg/Preprocessed to Z:\Mindless Reading\Data\s10073\eeg/Preprocessed
Copied F:\MindlessReading\Data\s10080\eeg/Preprocessed to Z:\Mindless Reading\Data\s10080\eeg/Preprocessed
Copied F:\MindlessReading\Data\s10081\eeg/Preprocessed to Z:\Mindless Reading\Data\s10081\eeg/Preprocessed
Copied F:\MindlessReading\Data\s10084\eeg/Preprocessed to Z:\Mindless Reading\Data\s10084\eeg/Preprocessed
Copied F:\MindlessReading\Data\s10085\eeg/Preprocessed to Z:\Mindless Reading\Data\s10085\eeg/Preprocessed
Copied F:\MindlessReading\Data\s10089\eeg/Preprocessed to Z:\Mindless 

KeyboardInterrupt: 

# Copy Eye-Tracking Results CSV Files

In [None]:
import os
import shutil
import re
import glob

# Define your local and online directories
local_root = r"F:\MindlessReading\Data"
online_root = r"Z:\Mindless Reading\Data"

# Regular expression to match subject folder names (s followed by exactly 5 digits)
subject_pattern = re.compile(r"^s\d{5}$")

# Define file patterns to match
file_patterns = ["*default.csv", "*same-dur.csv", "*slide_wlen[0-9].csv"]

# Get the list of subject folders that match the pattern
subjects = [d for d in os.listdir(local_root) if os.path.isdir(os.path.join(local_root, d)) and subject_pattern.match(d)]

for subject in subjects:
    local_page_path = os.path.join(local_root, subject)
    online_page_path = os.path.join(online_root, subject)

    # Check if the local "page" folder exists
    if os.path.exists(local_page_path):
        # Ensure the online "page" folder exists
        os.makedirs(online_page_path, exist_ok=True)

        # Copy only the CSV files matching the patterns
        for pattern in file_patterns:
            for file in glob.glob(os.path.join(local_page_path, pattern)):
                filename = os.path.basename(file)
                dest_path = os.path.join(online_page_path, filename)
                shutil.copy2(file, dest_path)
                print(f"Copied {file} to {dest_path}")

    else:
        print(f"Skipping {subject}: {local_page_path} not found in local directory.")

Copied F:\MindlessReading\Data\s10014\s10014_L_features_default.csv to Z:\Mindless Reading\Data\s10014\s10014_L_features_default.csv
Copied F:\MindlessReading\Data\s10014\s10014_R_features_default.csv to Z:\Mindless Reading\Data\s10014\s10014_R_features_default.csv
Copied F:\MindlessReading\Data\s10014\s10014_L_features_same-dur.csv to Z:\Mindless Reading\Data\s10014\s10014_L_features_same-dur.csv
Copied F:\MindlessReading\Data\s10014\s10014_R_features_same-dur.csv to Z:\Mindless Reading\Data\s10014\s10014_R_features_same-dur.csv
Copied F:\MindlessReading\Data\s10014\s10014_L_features_slide_wlen2.csv to Z:\Mindless Reading\Data\s10014\s10014_L_features_slide_wlen2.csv
Copied F:\MindlessReading\Data\s10014\s10014_L_features_slide_wlen5.csv to Z:\Mindless Reading\Data\s10014\s10014_L_features_slide_wlen5.csv
Copied F:\MindlessReading\Data\s10014\s10014_R_features_slide_wlen2.csv to Z:\Mindless Reading\Data\s10014\s10014_R_features_slide_wlen2.csv
Copied F:\MindlessReading\Data\s10014\s10

# Append Files

In [2]:
import os
import re
import pandas as pd
import numpy as np
from glob import glob
from scipy import stats

def append_files(path, file_pattern, file_name=None):
    '''
    Append subject files (eye/behavior) and save as a group .csv file

    Parameters
    ----------
    path : string
        DESCRIPTION. The relative path to the data folder
    file_pattern : string, optional
        DESCRIPTION. The pattern for files being appended. 
        The default is '*[mono|R]_features*' for eye feature files.
    file_name : string, optional
        DESCRIPTION. the group file name to save
        The default is 'group_R_features.csv'.

    Returns
    -------
    df_group : dataframe
        DESCRIPTION. The group dataframe 

    '''
    if file_name is None:
        file_name = f'group_features_{file_pattern}.csv'

    # Get all subdirectories in the given path
    sub_folders = [f.path for f in os.scandir(path) if f.is_dir()]

    # Initialize an empty DataFrame to store aggregated data
    df_group = pd.DataFrame()

    # Iterate over each subject folder
    for folder_path in sub_folders:
        
        # Extract subject ID (matching "s" followed by exactly 5 digits)
        subject_match = re.search(r's\d{5}', folder_path)
        if not subject_match:
            continue  # Skip if no valid subject ID is found
        subject_id = subject_match.group(0)

        # Find all matching CSV files (e.g., "sxxxxx_L_features_end2.csv" or "sxxxxx_R_features_end2.csv")
        file_paths = glob(os.path.join(folder_path, "*.csv"))
        matching_files = [f for f in file_paths if re.search(file_pattern, os.path.basename(f))]

        if not matching_files:
            print(f"Skipping {subject_id}: No matching file found.")
            continue  # Skip if no matching file is found

        # Prioritize R_* file if both L_* and R_* exist
        r_files = [f for f in matching_files if 'R_' in os.path.basename(f)]
        file_path = r_files[0] if r_files else matching_files[0]  # Use R_ if available, otherwise use any match

        # Read the selected subject CSV file
        df_ind = pd.read_csv(file_path)

        # Add a subject ID column
        df_ind['sub_id'] = subject_id

        # Append to the group DataFrame
        df_group = pd.concat([df_group, df_ind], ignore_index=True)
    
    
    # save and return the group dataframe
    df_group = df_group.loc[:, ~df_group.columns.str.match('Unnamed')]
    
    # calculate z-score of two correlation coefficient columns
    # zipf_duration_correlation and word_length_duration_correlation
    for col_name in ['zipf_fixdur_corr', 'word_length_fixdur_corr']:
        # get index
        index = df_group.columns.get_loc(col_name)
        # extract the column values
        col = df_group[col_name]
        # compute the z-score
        z_col = stats.zscore(col, nan_policy='omit')
        # insert into the dataframe
        df_group.insert(index, f'zscored_{col_name}', z_col)

    # normalize features using subject level median
    subjects = df_group['sub_id'].unique()
    for subject in subjects:
        idx = df_group['sub_id'] == subject
        # pupil size
        pupil_baseline = df_group.loc[idx, 'pupil_baseline'].median()
        df_group.loc[idx, 'norm_pupil'] = df_group.loc[idx, 'pupil'] / pupil_baseline
        # interblink_interval
        ibi_baseline = df_group.loc[idx, 'ibi_baseline'].median()
        df_group.loc[idx, 'norm_ibi'] = df_group.loc[idx, 'ibi'] / ibi_baseline

        # make sure every subject has MW episodes
        df_sub = df_group[idx]
        if np.sum(df_sub['is_MWreported']) == 0:
            raise ValueError(f'Subject {subject} has no MW epsidoes. Please exclude from analyses.')
    
    df_group.to_csv(os.path.join(path, 'Files', file_name))
    print(f"File saved successfully: {file_name}")


# call function to append individual dataset
# path = r"F:\MindlessReading\Data"
path = r"/Volumes/GBLDrive/MindlessReading/Data"

# end2
# append_files(path, file_pattern='end2_sr')
# append_files(path, file_pattern='end2_tp')

# # end5
# append_files(path, file_pattern='end5_sr')
# append_files(path, file_pattern='end5_tp')

# # # mw_fixed: same duration based on MW episodes
# append_files(path, file_pattern='mw_fixed_sr')
# append_files(path, file_pattern='mw_fixed_tp')

# # # page_fixed: same duration based on the whole MW pages
# append_files(path, file_pattern='page_fixed_sr')
# append_files(path, file_pattern='page_fixed_tp')

# default
append_files(path, file_pattern='default_sr')

# Slide Window (window duration 5 seconds)
# append_files(path, file_pattern='slide5.0')

# Slide Window (window duration 2 seconds)
# append_files(path, file_pattern='slide2.0')

# slide window 1:6:0.5
# for w in np.arange(1, 6.5, 0.5):
#     file_pattern = f"slide{w:.1f}"  # Ensures format like 'slide1.0'
#     append_files(path, file_pattern=file_pattern)

File saved successfully: group_features_default_sr.csv


# Generate Event Label File (for EEG)

In [10]:
import os
import pandas as pd

# load the sliding window dataset
path = r"/Volumes/GBLDrive/MindlessReading/Data/Files"
file_path = os.path.join(path, 'group_features_slide2.0.csv')
df = pd.read_csv(file_path)

df['event_time'] = (df['win_start'] + df['win_end']) / 2 - df['relative_time']
# Keep only relevant columns
df = df[['sub_id', 'run', 'page', 'task_start', 'page_start', 'page_end', 'mw_onset', 'mw_offset', 'label', 'event_time']]

# Remove duplicate 'self_report' entries per subject, run, and page
df = df.drop_duplicates(subset=['sub_id', 'run', 'page', 'label'])
# Sort the dataset for logical ordering
df = df.sort_values(by=['sub_id', 'run', 'page']).reset_index(drop=True)

# Create the key column
df['key'] = (
    df['sub_id'].astype(str) + "_" +
    df['run'].astype(str) + "_" +
    df['page'].astype(str) + "_" +
    df['label']
)

# compute MW duration
df['mw_dur'] = df['mw_offset'] - df['mw_onset']

# compute time interval between events and page boudaries
df['page_start2event'] = df['event_time'] - df['page_start']
df['event2page_end'] = df['page_end'] - df['event_time']

# Check if all event times fall within page boundaries
invalid_events = df[(df['page_start2event'] < 0) | (df['event2page_end'] < 0)]

if not invalid_events.empty:
    print("Warning: Some event times fall outside of page boundaries.")
    print(invalid_events[['sub_id', 'run', 'page', 'label', 'page_start2event', 'event2page_end']])
else:
    print("All event times are within valid page boundaries.")

# Save to CSV file
output_path = os.path.join(path, 'group_event_label.csv')
df.to_csv(output_path)


      sub_id  run  page        label  page_start2event  event2page_end
3404  s10181    1     5  self_report            18.199   -4.547474e-13


# Manage files for OSF

## Copy raw data

In [6]:
import os
import re
import shutil
# Define the root directory containing subject folders
# root_dir = "F:/MindlessReading/Data"
root_dir = r"/Volumes/GBLDrive/MindlessReading/Data"

# Define the expected number of files in each folder
expected_files_count = 5

# Regular expression to match subject folder names (s followed by exactly 5 digits)
subject_pattern = re.compile(r"^s\d{5}$")

# Get the list of subject folders that match the pattern
subjects = [d for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d)) and subject_pattern.match(d)]

# Define the destination directory for raw data
raw_data_dir = os.path.join(r"/Users/hsun11/Desktop/raw_data")

print(f"Found {len(subjects)} subjects matching pattern")
print(f"Raw data will be saved to: {raw_data_dir}")
print("-" * 50)

# Create the main raw_data directory
os.makedirs(raw_data_dir, exist_ok=True)

success_count = 0
skipped_count = 0
total_copied_files = {"eye": 0, "log": 0}

for subject in sorted(subjects):
    print(f"Processing subject: {subject}")
    
    # Source paths
    source_subject_dir = os.path.join(root_dir, subject)
    source_eye_dir = os.path.join(source_subject_dir, "eye")
    source_log_dir = os.path.join(source_subject_dir, "log")
    
    # Destination paths
    dest_subject_dir = os.path.join(raw_data_dir, subject)
    dest_eye_dir = os.path.join(dest_subject_dir, "eye")
    dest_log_dir = os.path.join(dest_subject_dir, "log")
    
    # Create destination directories
    os.makedirs(dest_eye_dir, exist_ok=True)
    os.makedirs(dest_log_dir, exist_ok=True)
    
    # Check if destination folders already have the complete set of files
    existing_asc_files = [f for f in os.listdir(dest_eye_dir) if f.endswith('.asc')] if os.path.exists(dest_eye_dir) else []
    existing_csv_files = [f for f in os.listdir(dest_log_dir) if f.endswith('.csv')] if os.path.exists(dest_log_dir) else []
    
    if len(existing_asc_files) == expected_files_count and len(existing_csv_files) == expected_files_count:
        print(f"  ⏭️  Skipping {subject} - already has complete set:")
        print(f"    - {len(existing_asc_files)} .asc files in eye folder")
        print(f"    - {len(existing_csv_files)} .csv files in log folder")
        skipped_count += 1
        print()
        continue
    
    copied_files = {"eye": [], "log": []}
    errors = []
    
    try:
        # Copy .asc files from eye folder
        if os.path.exists(source_eye_dir):
            asc_files = [f for f in os.listdir(source_eye_dir) if f.endswith('.asc')]
            
            if len(asc_files) == expected_files_count:
                for asc_file in asc_files:
                    source_file = os.path.join(source_eye_dir, asc_file)
                    dest_file = os.path.join(dest_eye_dir, asc_file)
                    shutil.copyfile(source_file, dest_file)
                    copied_files["eye"].append(asc_file)
            else:
                errors.append(f"Expected {expected_files_count} .asc files, found {len(asc_files)}")
        else:
            errors.append("Eye folder does not exist")
        
        # Copy .csv files from log folder
        if os.path.exists(source_log_dir):
            csv_files = [f for f in os.listdir(source_log_dir) if f.endswith('.csv')]
            
            if len(csv_files) == expected_files_count:
                for csv_file in csv_files:
                    source_file = os.path.join(source_log_dir, csv_file)
                    dest_file = os.path.join(dest_log_dir, csv_file)
                    shutil.copyfile(source_file, dest_file)
                    copied_files["log"].append(csv_file)
            else:
                errors.append(f"Expected {expected_files_count} .csv files, found {len(csv_files)}")
        else:
            errors.append("Log folder does not exist")
            
    except Exception as e:
        errors.append(f"Error copying files: {str(e)}")
    
    # Report results for this subject
    if errors:
        print(f"  ❌ Errors for {subject}:")
        for error in errors:
            print(f"    - {error}")
    else:
        print(f"  ✅ Successfully copied:")
        print(f"    - {len(copied_files['eye'])} .asc files from eye folder")
        print(f"    - {len(copied_files['log'])} .csv files from log folder")
        success_count += 1
        total_copied_files["eye"] += len(copied_files["eye"])
        total_copied_files["log"] += len(copied_files["log"])
    
    print()

# Summary
print("=" * 50)
print("SUMMARY:")
print(f"Successfully processed: {success_count}/{len(subjects)} subjects")
print(f"Skipped (already had files): {skipped_count}/{len(subjects)} subjects")
print(f"Total .asc files copied: {total_copied_files['eye']}")
print(f"Total .csv files copied: {total_copied_files['log']}")
print(f"Raw data directory: {raw_data_dir}")

Found 45 subjects matching pattern
Raw data will be saved to: /Users/hsun11/Desktop/raw_data
--------------------------------------------------
Processing subject: s10014
  ⏭️  Skipping s10014 - already has complete set:
    - 5 .asc files in eye folder
    - 5 .csv files in log folder

Processing subject: s10052
  ⏭️  Skipping s10052 - already has complete set:
    - 5 .asc files in eye folder
    - 5 .csv files in log folder

Processing subject: s10059
  ⏭️  Skipping s10059 - already has complete set:
    - 5 .asc files in eye folder
    - 5 .csv files in log folder

Processing subject: s10073
  ⏭️  Skipping s10073 - already has complete set:
    - 5 .asc files in eye folder
    - 5 .csv files in log folder

Processing subject: s10081
  ⏭️  Skipping s10081 - already has complete set:
    - 5 .asc files in eye folder
    - 5 .csv files in log folder

Processing subject: s10084
  ⏭️  Skipping s10084 - already has complete set:
    - 5 .asc files in eye folder
    - 5 .csv files in log 

## Generate demographic information

In [8]:
import pandas as pd
import os

# Load the Excel file
path = r"/Volumes/GBLDrive/MindlessReading/Data/Files"
file_name = "demo_info.xlsx"  # Update path if needed
df = pd.read_excel(os.path.join(path, file_name))

# Clean column names
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

# Find all subject folders from raw_data folder
raw_data_dir = r"/Users/hsun11/Desktop/raw_data"
subject_pattern = re.compile(r"^s\d{5}$")

# Get subject IDs from raw_data folder
subject_ids = [d for d in os.listdir(raw_data_dir) 
               if os.path.isdir(os.path.join(raw_data_dir, d)) and subject_pattern.match(d)]

print(f"Found {len(subject_ids)} subjects in raw_data folder")
subject_nums = [int(s[1:]) for s in subject_ids]

# Filter by participant ID
df_filtered = df[df['participant_id'].isin(subject_nums)].copy()

# Age: ensure numeric and compute stats
df_filtered['age'] = pd.to_numeric(df_filtered['age'], errors='coerce')
age_min = df_filtered['age'].min()
age_max = df_filtered['age'].max()
age_median = df_filtered['age'].median()
age_mean = df_filtered['age'].mean()

# Gender: clean and count
df_filtered['gender_clean'] = df_filtered['gender'].str.strip().str.lower()
df_filtered['gender_clean'] = df_filtered['gender_clean'].replace({
    'female': 'female',
    'male': 'male',
    'nonbinary': 'non-binary',
    'non-binary': 'non-binary',
    'gender-nonconforming': 'non-binary',
    'transgender male': 'non-binary'
})
gender_counts = df_filtered['gender_clean'].value_counts()

# Handedness: clean and count
df_filtered['handness_clean'] = df_filtered['handness'].str.strip().str.lower()
df_filtered['handness_clean'] = df_filtered['handness_clean'].replace({
    'right': 'right',
    'left': 'left',
    'left, but used right-handed setup': 'left',
    'left-handed setup': 'left',
    'left, but used right-handed setup for mouse and keyboard': 'left',
    'ambidextrous (left-hand dominant)': 'left'
})
handness_counts = df_filtered['handness_clean'].value_counts()

# Print results
print("=" * 50)
print("DEMOGRAPHIC ANALYSIS")
print("=" * 50)
print(f"Total subjects analyzed: {len(df_filtered)}")
print()
print(f"Age range: {age_min} - {age_max}")
print(f"Median age: {age_median}")
print(f"Mean age: {age_mean:.2f}")
print()
print("Gender distribution:")
for gender, count in gender_counts.items():
    percentage = (count / len(df_filtered)) * 100
    print(f"  {gender}: {count} ({percentage:.1f}%)")
print()
print("Handedness distribution:")
for hand, count in handness_counts.items():
    percentage = (count / len(df_filtered)) * 100
    print(f"  {hand}: {count} ({percentage:.1f}%)")

# Optional: Show subjects that are in raw_data but missing from demographics
missing_subjects = [s for s in subject_nums if s not in df['participant_id'].values]
if missing_subjects:
    print()
    print(f"Warning: {len(missing_subjects)} subjects in raw_data have no demographic data:")
    print(f"Missing: {sorted(missing_subjects)}")

# Save filtered demographics to CSV in raw_data folder
output_file = os.path.join(raw_data_dir, "subject_demographics_information.csv")
df_filtered.to_csv(output_file, index=False)
print()
print(f"✅ Saved filtered demographics to: {output_file}")
print(f"   Contains {len(df_filtered)} subjects with {len(df_filtered.columns)} columns")

Found 45 subjects in raw_data folder
DEMOGRAPHIC ANALYSIS
Total subjects analyzed: 45

Age range: 18 - 64
Median age: 20.0
Mean age: 22.49

Gender distribution:
  female: 31 (68.9%)
  male: 9 (20.0%)
  non-binary: 5 (11.1%)

Handedness distribution:
  right: 38 (84.4%)
  left: 7 (15.6%)

✅ Saved filtered demographics to: /Users/hsun11/Desktop/raw_data/subject_demographics_information.csv
   Contains 45 subjects with 8 columns


## Copy processed data

In [None]:
import os
import re
import shutil
import glob

# Define the root directory containing subject folders
# root_dir = "F:/MindlessReading/Data"
root_dir = r"/Volumes/GBLDrive/MindlessReading/Data"

# Define the expected number of files in each folder
expected_files_count = 5

# Regular expression to match subject folder names (s followed by exactly 5 digits)
subject_pattern = re.compile(r"^s\d{5}$")

# Get the list of subject folders that match the pattern
subjects = [d for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d)) and subject_pattern.match(d)]

# Define the destination directory for processed data
processed_data_dir = os.path.join(r"/Users/hsun11/Desktop/processed_data")

print(f"Found {len(subjects)} subjects matching pattern")
print(f"Processed data will be saved to: {processed_data_dir}")
print("-" * 50)

# Create the main processed_data directory
os.makedirs(processed_data_dir, exist_ok=True)

# Define CSV file patterns to copy
csv_patterns = [
    '*end2_sr*.csv',
    '*end2_tp*.csv', 
    '*end5_sr*.csv',
    '*end5_tp*.csv',
    '*mw_fixed_sr*.csv',
    '*mw_fixed_tp*.csv',
    '*page_fixed_sr*.csv',
    '*page_fixed_tp*.csv',
    '*default_sr*.csv',
    '*slide2.0*.csv'
]

for subject in sorted(subjects):
    print(f"Processing subject: {subject}")
    
    subject_path = os.path.join(root_dir, subject)
    subject_processed_dir = os.path.join(processed_data_dir, subject)
    
    # Create subject directory in processed_data
    os.makedirs(subject_processed_dir, exist_ok=True)
    
    # Copy 5 folders from eye subdirectory
    eye_dir = os.path.join(subject_path, "eye")
    if os.path.exists(eye_dir):
        eye_folders = [d for d in os.listdir(eye_dir) if os.path.isdir(os.path.join(eye_dir, d))]
        
        if len(eye_folders) >= 5:
            # Copy first 5 folders (or you can modify this logic to select specific folders)
            for i, folder in enumerate(sorted(eye_folders)[:5]):
                src_folder = os.path.join(eye_dir, folder)
                dst_folder = os.path.join(subject_processed_dir, 'eye', folder)
                
                try:
                    shutil.copytree(src_folder, dst_folder)
                    # print(f"  Copied eye folder: {folder}")
                except Exception as e:
                    print(f"  Error copying eye folder {folder}: {e}")
        else:
            print(f"  Warning: Only found {len(eye_folders)} folders in eye directory, expected 5")
    else:
        print(f"  Warning: Eye directory not found for subject {subject}")
    
    # Copy CSV files with specified patterns
    for pattern in csv_patterns:
        matching_files = glob.glob(os.path.join(subject_path, pattern))
        
        for file_path in matching_files:
            file_name = os.path.basename(file_path)
            dst_path = os.path.join(subject_processed_dir, file_name)
            
            try:
                shutil.copy2(file_path, dst_path)
                # print(f"  Copied CSV: {file_name}")
                csv_files_copied += 1
            except Exception as e:
                print(f"  Error copying CSV {file_name}: {e}")
    
    print("-" * 30)

print("Processing complete!")

Found 45 subjects matching pattern
Processed data will be saved to: /Users/hsun11/Desktop/processed_data
--------------------------------------------------
Processing subject: s10014
  Copied eye folder: s014_r1_2023_11_29_09_28_data
  Copied eye folder: s014_r2_2023_11_29_09_56_data
  Copied eye folder: s014_r3_2023_11_29_10_15_data
  Copied eye folder: s014_r4_2023_11_29_10_28_data
  Copied eye folder: s014_r5_2023_11_29_10_43_data
  Copied CSV: s10014_R_features_end2_sr.csv
  Copied CSV: s10014_L_features_end2_sr.csv
  Copied CSV: s10014_R_features_end2_tp.csv
  Copied CSV: s10014_L_features_end2_tp.csv
  Copied CSV: s10014_L_features_end5_sr.csv
  Copied CSV: s10014_R_features_end5_sr.csv
  Copied CSV: s10014_L_features_end5_tp.csv
  Copied CSV: s10014_R_features_end5_tp.csv
  Copied CSV: s10014_L_features_mw_fixed_sr.csv
  Copied CSV: s10014_R_features_mw_fixed_sr.csv
  Copied CSV: s10014_L_features_mw_fixed_tp.csv
  Copied CSV: s10014_R_features_mw_fixed_tp.csv
  Copied CSV: s100

KeyboardInterrupt: 