# Create window datasets

In [None]:
import os
import re
import numpy as np
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler

# Root folder of the dataset; every subject is expected in a sub-folder named s<digits>.
data_root = r"/gpfs1/pi/djangraw/mindless_reading/data"
# Seed handed to the undersampler so the balanced subset is reproducible.
random_seed = 42
# Window length in samples = sampling rate (samples per second) * window length (seconds).
sfreq = 256
window_seconds = 5
window_size = int(sfreq * window_seconds)

# Keep only directories whose name is exactly s<digits>. A looser startswith('s')
# test would also admit unrelated folders, and the processing loop parses the
# numeric id out of the name with this same pattern.
all_subjects = sorted(
    d for d in os.listdir(data_root)
    if re.match(r"^s(\d+)$", d) and os.path.isdir(os.path.join(data_root, d))
)

# Subjects whose numeric id is <= this value are skipped by the processing loop
# (handy for resuming an interrupted run).
min_sub_num = 0

# For every subject: load its runs, cut the samples into fixed-length windows with
# a single consistent 'is_mw' label, balance the two classes by random
# undersampling, and save the windows, labels and column names as .npy files.
for subject_id in all_subjects:
    # Folder names that are not exactly s<digits> are not subjects. Without this
    # guard re.match returns None and .group(1) raises AttributeError.
    m = re.match(r"^s(\d+)$", subject_id)
    if m is None:
        print(f"Subject {subject_id} is skipped")
        continue
    sub_num = int(m.group(1))
    if sub_num <= min_sub_num:
        print(f"Subject {subject_id} is skipped")
        continue

    subject_dir = os.path.join(data_root, subject_id, "ml_data")
    save_dir = os.path.join(subject_dir, f'{window_size}window_datasets')

    # os.listdir returns entries in arbitrary order; sort so the runs are always
    # concatenated (and therefore windowed) in the same order.
    # NOTE(review): assumes lexicographic file order is an acceptable run order - confirm.
    pkl_files = sorted(f for f in os.listdir(subject_dir) if f.endswith('.pkl'))
    # make sure each subject has 5 runs of data
    if len(pkl_files) != 5:
        raise ValueError(f"Subject {subject_id} has {len(pkl_files)} runs instead of 5")

    bool_cols = ['is_blink', 'is_sacc', 'is_fix', 'is_mw', 'first_pass_reading']
    run_frames = []
    for pkl_file in pkl_files:
        run_df = pd.read_pickle(os.path.join(subject_dir, pkl_file))
        # .copy() so the column assignments below write to an independent frame
        # instead of a slice of the loaded one (avoids SettingWithCopyWarning).
        run_df = run_df[run_df['first_pass_reading'] == 1].copy()
        # convert bool cols explicitly; anything that is not True becomes False
        for col in bool_cols:
            run_df[col] = run_df[col] == True

        # keep only samples whose 'time' is more than 2 before 'page_end'
        run_df = run_df[run_df['time'] < run_df['page_end'] - 2]
        run_frames.append(run_df)

    # Concatenate once instead of growing the frame inside the loop.
    df = pd.concat(run_frames)
    df['subject_id'] = subject_id
    print(f'Subject {subject_id} has been loaded.')

    # normalize pupil size features by the subject-level median
    df['blink_interp_LPupil_norm'] = df['blink_interp_LPupil'] / df['blink_interp_LPupil'].median()
    df['blink_interp_RPupil_norm'] = df['blink_interp_RPupil'] / df['blink_interp_RPupil'].median()

    windowed_data = []
    windowed_labels = []

    # Walk the frame in non-overlapping chunks of window_size rows; a trailing
    # chunk shorter than window_size is never produced by this range.
    # NOTE(review): rows are consecutive in the concatenated, filtered frame, so a
    # window may span a page or run boundary - confirm this is intended.
    for start in range(0, len(df) - window_size + 1, window_size):
        window = df.iloc[start:start + window_size]
        labels_in_window = window['is_mw'].unique()
        # Skip windows with mixed labels
        if len(labels_in_window) > 1:
            continue

        # Keep the window as a 2D array (window_size x n_columns), all columns included
        windowed_data.append(window.values)
        windowed_labels.append(labels_in_window[0])

    # Undersampling needs both classes; otherwise nothing is saved for this subject.
    if len(np.unique(windowed_labels)) < 2:
        print(f"Subject {subject_id} has only one class in windowed labels. Skipping undersampling.")
        continue

    # Use RandomUnderSampler on flattened data, then recover 3D structure
    windowed_data_flat = [w.flatten() for w in windowed_data]
    undersampler = RandomUnderSampler(random_state=random_seed)
    X_resampled_flat, y_resampled = undersampler.fit_resample(windowed_data_flat, windowed_labels)
    # Recover 3D array: (n_samples, window_size, n_features). Every kept window has
    # exactly window_size rows, so the module-level window_size is used directly
    # rather than being reassigned here.
    n_features = windowed_data[0].shape[1]
    X_resampled = np.array(X_resampled_flat).reshape(-1, window_size, n_features)
    X = np.transpose(X_resampled, (0, 2, 1))  # (N, n_features, window_size)
    y = np.array(y_resampled, dtype=int)

    # Column names, in the same order as the feature axis of X
    col_names = df.columns.tolist()

    # save windowed data and labels
    os.makedirs(save_dir, exist_ok=True)
    np.save(os.path.join(save_dir, f'{subject_id}_{window_size}windowed_data.npy'), X)
    np.save(os.path.join(save_dir, f'{subject_id}_{window_size}windowed_labels.npy'), y)

    # One column-name file per subject; it is overwritten on every run.
    file_path = os.path.join(save_dir, f'{subject_id}_col_names.npy')
    np.save(file_path, col_names)

    print(f'Windowed data and labels for subject {subject_id} have been saved.')

Subject s10014 has been loaded.
Windowed data and labels for subject s10014 have been saved.
Subject s10052 has been loaded.
Windowed data and labels for subject s10052 have been saved.
Subject s10059 has been loaded.
Windowed data and labels for subject s10059 have been saved.
Subject s10073 has been loaded.
Windowed data and labels for subject s10073 have been saved.
Subject s10081 has been loaded.
Windowed data and labels for subject s10081 have been saved.
Subject s10084 has been loaded.
Windowed data and labels for subject s10084 have been saved.
Subject s10085 has been loaded.
Windowed data and labels for subject s10085 have been saved.
Subject s10089 has been loaded.
Windowed data and labels for subject s10089 have been saved.
Subject s10094 has been loaded.
Windowed data and labels for subject s10094 have been saved.
Subject s10100 has been loaded.
Windowed data and labels for subject s10100 have been saved.
Subject s10103 has been loaded.
Subject s10103 has only one class in w

In [None]:
# Array view of the label list left behind by the windowing loop; it holds the
# labels of the last subject for which windows were built.
temp_labels = np.asarray(windowed_labels)

In [2]:
import pandas as pd

# Load the combined windowed-features CSV.
features_csv = "/gpfs1/pi/djangraw/mindless_reading/data/all_subjects_512windowed_features.csv"
df = pd.read_csv(features_csv)

# Print, for each subject in order of first appearance, the sum of its 'label' column.
for subj in df['subject_id'].unique():
    subject_rows = df['subject_id'] == subj
    mw_total = df.loc[subject_rows, 'label'].sum()
    print(f"{subj}: MW: {mw_total}")

s10014: MW: 130
s10052: MW: 119
s10059: MW: 26
s10073: MW: 70
s10081: MW: 35
s10084: MW: 31
s10085: MW: 35
s10089: MW: 24
s10094: MW: 7
s10100: MW: 67
s10103: MW: 106
s10110: MW: 9
s10111: MW: 57
s10115: MW: 87
s10117: MW: 184
s10121: MW: 80
s10125: MW: 232
s10138: MW: 69
s10139: MW: 81
s10141: MW: 37
s10144: MW: 18
s10145: MW: 22
s10148: MW: 42
s10153: MW: 130
s10156: MW: 95
s10158: MW: 100
s10159: MW: 48
s10160: MW: 33
s10165: MW: 91
s10173: MW: 14
s10177: MW: 30
s10178: MW: 32
s10180: MW: 220
s10181: MW: 42
s10183: MW: 128
s10185: MW: 129
s10186: MW: 45
s10188: MW: 28
s10192: MW: 180
s10195: MW: 58
s10196: MW: 5
s10197: MW: 6
s10200: MW: 33
s10202: MW: 18
