In [16]:
# import pandas as pd
# import dask.dataframe as dd
# import os

# dtypes = {'uid': 'str','dataset': 'str','city_section': 'str','ExperimentalCondition': 'str','EventName': 'object','HitObjectName': 'object','ObjectName_4': 'object','ObjectName_5': 'object', 'EventDuration': 'float64', 'TimeStamp': 'float64'}

# base_dir = os.getcwd()
# data_path = base_dir + '/Data/'

# df = dd.read_csv(data_path + "cleaned_windowed.csv", assume_missing=True, dtype=dtypes, blocksize="100MB", low_memory=False)  
# df = df.compute() 


# # Ensure sorted by uid and event sequence
# df = df.sort_values(['uid']).reset_index(drop=True)

# # Identify window within each participant: every 500 rows is one window
# df['row_within_uid'] = df.groupby('uid').cumcount()
# df['window_id'] = df['row_within_uid'] // 500
# df['within_window_idx'] = df['row_within_uid'] % 500

# # Determine the actual event name for each window (from second half)
# event_map = (
#     df[df['within_window_idx'] >= 250]
#     .groupby(['uid', 'window_id'])['EventName']
#     .first()
#     .reset_index()
#     .rename(columns={'EventName': 'TrueEvent'})
# )

# df = df.merge(event_map, on=['uid', 'window_id'])

# # For the first 250 rows of each window, average steering input across participants for the same TrueEvent and time index
# mask = df['within_window_idx'] < 250
# avg_si = (
#     df[mask]
#     .groupby(['TrueEvent', 'within_window_idx'])['SteeringInput']
#     .mean()
#     .reset_index()
#     .rename(columns={'SteeringInput': 'SI_avg'})
# )

# df = df.merge(avg_si, on=['TrueEvent', 'within_window_idx'], how='left')

# # Replace SteeringInput in the first half with the computed average
# df.loc[mask, 'SteeringInput'] = df.loc[mask, 'SI_avg']

# # Drop helper columns
# df.drop(['row_within_uid', 'window_id', 'within_window_idx', 'TrueEvent', 'SI_avg'], axis=1, inplace=True)

# # Save to new CSV
# df.to_csv('steering_replaced.csv', index=False)

# print("Written steering_replaced.csv with averaged first-half SteeringInput.")


In [17]:
# import pandas as pd
# import matplotlib.pyplot as plt
# import random
# import numpy as np

# # — reload original & replaced —
# base_dir = os.getcwd()  # or wherever your Data folder lives
# data_path = base_dir + '/Data/'

# orig = pd.read_csv(data_path + "cleaned_windowed.csv", dtype=dtypes)
# repl = pd.read_csv("steering_replaced.csv", dtype=dtypes)

# # — recompute window & row indices exactly as you did before —
# for dfv in (orig, repl):
#     dfv['row_within_uid']   = dfv.groupby('uid').cumcount()
#     dfv['window_id']        = dfv['row_within_uid'] // 500
#     dfv['within_window_idx']= dfv['row_within_uid'] % 500

# # — pick 4 random (uid, window_id) pairs to inspect —
# keys = orig[['uid','window_id']].drop_duplicates()
# keys = keys.sample(15, random_state=0).values.tolist()

# time_sec = np.linspace(-5, 5, 500)

# for uid, win in keys:
#     # masks
#     m0 = (orig.uid==uid)&(orig.window_id==win)
#     m1 = (repl.uid==uid)&(repl.window_id==win)

#     si0 = orig.loc[m0, 'SteeringInput'].values
#     si1 = repl.loc[m1, 'SteeringInput'].values

#     fig, ax = plt.subplots(figsize=(8,4))
#     ax.plot(time_sec, si0, '--', label='original', alpha=0.7)
#     ax.plot(time_sec, si1,  '-' , label='replaced', linewidth=2)
#     ax.axvline(0, color='k', linestyle=':')
#     ax.set_title(f"UID={uid} | window={win}")
#     ax.set_xlabel("Time (s)")
#     ax.set_ylabel("SteeringInput")
#     ax.legend()
#     ax.grid(True)
#     plt.ylim(-.5, .5)
#     plt.tight_layout()
#     plt.show()


In [22]:
import pandas as pd
import dask.dataframe as dd
import os

# --- User-defined Feature Combinations ---
feature_combinations = [
    ['EyeDirWorldCombined.x', 'NoseVector.x'],
    ['EyeDirWorldCombined.x', 'NoseVector.x', 'EyeDirWorldCombined.y', 'NoseVector.y'],
    ['EyeDirWorldCombined.x', 'NoseVector.x', 'EyeDirWorldCombined.y', 'NoseVector.y', 'SteeringInput'],
    ['EyeDirWorldCombined.x', 'NoseVector.x', 'EyeDirWorldCombined.y', 'NoseVector.y', 'EyeDirWorldCombined.z', 'NoseVector.z'],
    ['EyeDirWorldCombined.x', 'NoseVector.x', 'EyeDirWorldCombined.y', 'NoseVector.y', 'SteeringInput', 'EyeDirWorldCombined.z', 'NoseVector.z']
]

# --- Event Names to Process ---
event_names = ['StagEventNew', 'FallingRocksEventNew', 'FogEventNew']

# --- Shared Configuration ---
alg = "nearest_neighbor"
steering_var = 'SteeringInput'
base_dir = os.getcwd()
data_root = os.path.join(base_dir, 'data', 'cleaned_data', 'data_segment')

# --- Loop ---
for features in feature_combinations:
    feature_count = len(features)
    folder_name = f"outliers_removed_{feature_count}features"
    data_path = os.path.join(data_root, folder_name)

    for event_name in event_names:
        filename = f"segment_around_{event_name}_{alg}.csv"
        filepath = os.path.join(data_path, filename)

        if not os.path.exists(filepath):
            print(f"File not found, skipping: {filepath}")
            continue

        print(f"Processing: {filepath}")

        # Load with Dask
        dtypes = {
            'uid': 'str', 'dataset': 'str', 'city_section': 'str', 'ExperimentalCondition': 'str',
            'EventName': 'object', 'HitObjectName': 'object', 'ObjectName_4': 'object', 'ObjectName_5': 'object',
            'EventDuration': 'float64', 'TimeStamp': 'float64'
        }

        df = dd.read_csv(filepath, assume_missing=True, dtype=dtypes, blocksize="100MB", low_memory=False).compute()

        # Sort & assign within-uid indices
        df = df.sort_values(['uid']).reset_index(drop=True)
        df['row_within_uid'] = df.groupby('uid').cumcount()
        df['window_id'] = 0
        df['within_window_idx'] = df['row_within_uid']

        # Determine true event from second half
        event_map = (
            df[df['within_window_idx'] >= 250]
            .groupby(['uid', 'window_id'])['EventName']
            .first()
            .reset_index()
            .rename(columns={'EventName': 'TrueEvent'})
        )
        df = df.merge(event_map, on=['uid', 'window_id'], how='left')

        # Set SteeringInput to 0 before onset
        if steering_var in features:
            df.loc[df['within_window_idx'] < 250, steering_var] = 0

        # Drop helper columns
        df.drop(['row_within_uid', 'window_id', 'within_window_idx', 'TrueEvent'], axis=1, inplace=True)

        # Create subdirectory inside the correct folder
        mdir = os.path.join(data_path, 'steeringRemoved')
        os.makedirs(mdir, exist_ok=True)  # Make sure it exists

        # Create the filename and output path
        output_filename = f"segment_around_{event_name}_{alg}_steeringRemoved.csv"
        output_path = os.path.join(mdir, output_filename)

        df.to_csv(output_path, index=False)
        print(f"Saved cleaned file to:\n{output_path}\n")


Processing: c:\Users\erene\OneDrive\Desktop\fpca-takeOverRequests\data\cleaned_data\data_segment\outliers_removed_2features\segment_around_StagEventNew_nearest_neighbor.csv
Saved cleaned file to:
c:\Users\erene\OneDrive\Desktop\fpca-takeOverRequests\data\cleaned_data\data_segment\outliers_removed_2features\steeringRemoved\segment_around_StagEventNew_nearest_neighbor_steeringRemoved.csv

Processing: c:\Users\erene\OneDrive\Desktop\fpca-takeOverRequests\data\cleaned_data\data_segment\outliers_removed_2features\segment_around_FallingRocksEventNew_nearest_neighbor.csv
Saved cleaned file to:
c:\Users\erene\OneDrive\Desktop\fpca-takeOverRequests\data\cleaned_data\data_segment\outliers_removed_2features\steeringRemoved\segment_around_FallingRocksEventNew_nearest_neighbor_steeringRemoved.csv

Processing: c:\Users\erene\OneDrive\Desktop\fpca-takeOverRequests\data\cleaned_data\data_segment\outliers_removed_2features\segment_around_FogEventNew_nearest_neighbor.csv
Saved cleaned file to:
c:\Users\

In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

# — reload original & replaced —
base_dir = os.getcwd()  # or wherever your Data folder lives

orig = pd.read_csv(data_path + "segment_around_FallingRocksEventNew_nearest_neighbor.csv", dtype=dtypes)
repl = pd.read_csv(data_path + "segment_around_FallingRocksEventNew_nearest_neighbor_steeringRemoved.csv", dtype=dtypes)

print(f"Original data dimensions: {orig.shape}")
print(f"Replaced data dimensions: {repl.shape}")

# — recompute row indices within each uid —
for dfv in (orig, repl):
    dfv['row_within_uid'] = dfv.groupby('uid').cumcount()

# — pick 15 random uid to inspect (because no windows anymore) —
unique_uids = orig['uid'].unique()
selected_uids = np.random.RandomState(seed=0).choice(unique_uids, size=15, replace=False)

# Time array matches full length of largest UID (change if needed)
# Otherwise, for each uid, time axis matches length per uid
# Here we will create time axis from -5 to 5 seconds, spacing depends on data length

for uid in selected_uids:
    # mask per uid
    m0 = orig['uid'] == uid
    m1 = repl['uid'] == uid

    si0 = orig.loc[m0, 'SteeringInput'].values
    si1 = repl.loc[m1, 'SteeringInput'].values

    len0 = len(si0)
    len1 = len(si1)

    # Use the length of original or replaced data
    length = min(len0, len1)

    # Create time axis for current UID
    time_sec = np.linspace(-5, 5, length)

    # In case lengths differ, trim to common length
    si0 = si0[:length]
    si1 = si1[:length]

    fig, ax = plt.subplots(figsize=(8,4))
    ax.plot(time_sec, si0, '--', label='original', alpha=0.7)
    ax.plot(time_sec, si1,  '-' , label='replaced', linewidth=2)
    ax.axvline(0, color='k', linestyle=':')
    ax.set_title(f"UID={uid}")
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("SteeringInput")
    ax.legend()
    ax.grid(True)
    plt.ylim(-0.5, 0.5)
    plt.tight_layout()
    plt.show()

FileNotFoundError: [Errno 2] No such file or directory: 'c:\\Users\\erene\\OneDrive\\Desktop\\fpca-takeOverRequests\\data\\cleaned_data\\data_segment\\outliers_removed_7featuressegment_around_FallingRocksEventNew_nearest_neighbor.csv'

In [None]:


# Assign all rows for each UID to a single window (window_id = 0)
df['row_within_uid'] = df.groupby('uid').cumcount()
df['window_id'] = 0  # all rows per uid belong to one window
df['within_window_idx'] = df['row_within_uid']

# Now grouping works without KeyError
group_sizes = df.groupby(['uid', 'window_id']).size().reset_index(name='counts')
print(group_sizes)

                                  uid  window_id  counts
0    0037b2329de444c18d751b4e79901b39          0     501
1    0117810eb9634c4f98f842021ee6a595          0     501
2    0121f5b2f59d434f8beb17bf3e2a80b9          0     501
3    0956f0cca5f546d79a0cf4fbae23d496          0     501
4    09a23914cf354ea39444511406d16722          0     501
..                                ...        ...     ...
155  f9c6ff61370141c89ea9bbc536d796e1          0     501
156  fa2e2604ec6a4820851f032e80f09ba1          0     501
157  fa4ca90c5b80445b9af0b7ec4fbcc124          0     501
158  fd19a21a4a8846ca82bd127e4e1933f5          0     501
159  ff4288f304e74bbf93aa6508c7df8145          0     501

[160 rows x 3 columns]
