# LoopAR project:
## 1. Data preprocessing

In [209]:
import warnings

import pathos.pools

warnings.simplefilter(action='ignore', category=FutureWarning)

import json
import itertools
import os
import math
from functools import partial
from ast import literal_eval
import pandas as pd
import dask.dataframe as dd
# import dataframe_image as dfi
import numpy as np
from numpy import dtype

import seaborn as sns
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
from matplotlib import patches
from scipy.signal import find_peaks
from IPython.display import display
from tqdm.notebook import tqdm

## 1.1 Useful functions
To open, read, and save from .txt format to .csv

In [210]:
def get_filename_as_parts(filename):
    """
    Extract the parts of file name without .txt ending
    :param filename: 
    :return: 
    """
    return filename.split(".")[0].split("_")

def get_uid_calibration_data(folder_path):
    folder_path = os.path.join(folder_path, '..', 'ParticipantCalibrationData')
    _, matching_files_list = analyze_folder(folder_path)
    df_cal = pd.DataFrame()
    for txt_file in matching_files_list:
        filename = os.path.join(folder_path, txt_file)
        with open(filename, "r") as file:
            data_list_cal = json.load(file)
        df = pd.json_normalize(data_list_cal, max_level=1)
        df_cal = pd.concat([df, df_cal], ignore_index=True)

    df_cal['included'] = df_cal.apply(lambda row: ((row['EyeValidationError.x']<1.5) or (row['EyeValidationError.y']<1.5)) and
                                                  (not 'EyeCalibrationSkipped' in row['SpecialNotes']) and
                                                  (not 'EyeValidationSkipped' in row['SpecialNotes']) and
                                                  (not 'Keyboard' in row['SteeringInputDevice']), axis = 1)
    df_cal.rename(columns={'ParticipantUuid': 'uid'}, inplace=True)

    return df_cal[['uid', 'included', 'ExperimentalCondition']]

def get_events_data(folder_path):
    folder_path = os.path.join(folder_path, '..', 'SceneData')
    _, matching_files_list = analyze_folder(folder_path)
    df_scene = pd.DataFrame(columns=['uid'])
    for txt_file in matching_files_list: 
        filename = os.path.join(folder_path, txt_file)
        with open(filename, "r") as file:
            data_list = json.load(file)
        df = pd.json_normalize(data_list,'EventBehavior',max_level=1)

        # Append file name information to dataframe
        parts = get_filename_as_parts(txt_file)
        df['uid'] = parts[0]
        
        df_scene = pd.concat([df_scene, df], ignore_index=True)

    return df_scene
    
def analyze_folder(folder_path, target_id = '', file_type='txt'):
    """
    Given a folder path, identifies all .txt files, splits their names by '_',
    and returns a set of unique IDs extracted from the first part of each file name.
    """
    unique_ids = set() # Initialize an empty set to store unique IDs
    files_of_interest = [f for f in os.listdir(folder_path) if f.lower().endswith('.'+file_type)]
    matching_files = []
    
    # List all .txt files in the specified folder
    if target_id != '':
        print("Matching files:")
    for filename in files_of_interest:
        parts = get_filename_as_parts(filename)
        if len(parts) > 0:
            unique_ids.add(parts[0])  # Add the first part to the set
        if target_id in parts:
            print(parts)
            matching_files.append(filename)
        
    return unique_ids, matching_files if target_id != '' else files_of_interest

In [211]:
get_events_data(file_path_calibration)

Unnamed: 0,uid,EventName,StartofEventTimeStamp,EndOfEventTimeStamp,EventDuration,SuccessfulCompletionState,HitObjectName
0,beaea101b49242a183285032cc6357ea,StagEventNew,1.603289e+09,1.603289e+09,4.660249,False,StagWithRB
1,beaea101b49242a183285032cc6357ea,FallingRocksEventNew,1.603289e+09,1.603289e+09,18.133327,True,
2,beaea101b49242a183285032cc6357ea,FogEventNew,1.603290e+09,1.603290e+09,20.050239,False,Cube (42)
3,861f9d156a7a4372adc36d4cba454abb,StagEventNew,1.600601e+09,1.600601e+09,6.277663,True,
4,861f9d156a7a4372adc36d4cba454abb,FallingRocksEventNew,1.600601e+09,1.600601e+09,9.693997,False,Cube (25)
...,...,...,...,...,...,...,...
2523,71bc29ba9b3c415f8575dcfb484a5fe2,GhostDriver,1.601125e+09,1.601125e+09,6.229658,False,Cube (19)
2524,71bc29ba9b3c415f8575dcfb484a5fe2,Baustelle,1.601125e+09,1.601125e+09,15.266277,True,
2525,de17edda8c414228b075ee25c75c6c69,MarketPlaceEvent,1.600610e+09,1.600610e+09,5.597554,False,024 Variant
2526,de17edda8c414228b075ee25c75c6c69,CyclistEvent,1.600610e+09,1.600610e+09,7.428447,False,Accidentcyclist


Count the amount of unique ids in eyeTracking data folder

In [212]:
# Set folders
raw_folder_path = "Data/0_Raw/Eyetracking/"  # Replace with your actual folder path
processed_folder_path = "Data/1_Preprocessed/Eyetracking/"
# extracted_folder_path = "Data/2_Extracted/"

In [213]:
# Example usage:
ids, matching_files_list = analyze_folder(raw_folder_path)

print(f"Total unique IDs found in {raw_folder_path} ending with .txt files: {len(ids)} with a total of {len(matching_files_list)} files")
list(ids)

Total unique IDs found in Data/0_Raw/Eyetracking/ ending with .txt files: 255 with a total of 1106 files


['7c4a213768e645e4a46f4317ad87d76d',
 '973efd2d86b64b7db7d5cc3f3f3725b2',
 '0c2734d5a8664e1abd2ffcf8aa6fa5d6',
 'f767840dc9b94eeabb8d0cfb2a5da57c',
 '851861aa8d4748f1a179aa60b482b505',
 '40ffa04f4781418c946b01206562b124',
 '7824cc42ee0447a4a2fa56b9c93bbabb',
 '951592742a88497989fbb71a65781639',
 'e65adf07df65405faff00b395537018b',
 '26f82ce33a204cdc90464729cabc4eda',
 'd9cd9704a0064fed9993333363608346',
 '1deaa8e567f1435da07c0817717ef4fe',
 'e278f318559a49c09266bd5a19d2f41c',
 '472bf3fb08d24c6db3d76964534665ab',
 'cc9284dfb5f645858196537e05c1959b',
 '8b27ff8591af4cd79067cf156d23ab95',
 '2fe73e7e2534479aa59aeb91635906aa',
 '50bc84deece34316bc3b8e5bb5829a68',
 'e02a32edf7494874b489598a611bd443',
 '8f9b8786312349639e8811f451054df0',
 'ff5026837ead4091afc0026b9e2142ac',
 '81b4ef081c7f4f70b90f414b5d0144bf',
 '33380a28635d455ebcd6c6f011329554',
 'a0cd3c82306a47a68d7d048f1fe05286',
 '8d197adcbe654aebb414dd9650b8f622',
 'fa1e10f3495d49f69ee05224e4eaec4f',
 'e113171c2c8c48679e92a0e241b31060',
 

Flatten 

In [214]:
# Flatten the nested dictionaries
def flatten_dict(d, parent_key='', sep='_'):
    """
    Flattens a given nested list to a dictionary. 
    This function is similar to json.normalize() to flatten nested dictionaries.But it is specific to json files  that contain nested lists of dictionaries. In our case, we need to flatten nested dictionaries in 'hitObject' to 'hitObject'
    https: //pandas.pydata.org/pandas-docs/stable/reference/api/pandas.json_normalize.html
    :param d: list of dictionaries
    :param parent_key: first level key
    :param sep: _ or . to separate keys names when combining them into the df column name
    :return: 
    """
    items = []
    for k, v in d.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten_dict(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)

Calculate distance to determine closest objects hit

In [215]:
# Set the default distance between player and object if none can be calculated
max_distance = 9999999
number_of_closest_hit_objects = 5

# Calculates the distance between the player and an object
def distance(playerX, playerY, playerZ, objectX, objectY, objectZ):
    global max_distance
    if (objectX is None or objectY is None or objectZ is None or 
        playerX is None or playerY is None or playerZ is None):
        return pd.Series([max_distance])
    else:
        return pd.Series([math.sqrt((objectX - playerX)**2 + (objectY - playerY)**2 + (objectZ - playerZ)**2)])

column_names = []
# Initialize column names -- was a try for using apply with Dask, but does not work
# column_names = ['uid', 'UnixTimeStamp', 'ObjectName_0', 'HitObjectPosition_x_0', 'HitObjectPosition_y_0', 'HitObjectPosition_z_0', 'HitPointOnObject_x_0', 'HitPointOnObject_y_0', 'HitPointOnObject_z_0', 'distanceToPlayer_0', 'ObjectName_1', 'HitObjectPosition_x_1', 'HitObjectPosition_y_1', 'HitObjectPosition_z_1', 'HitPointOnObject_x_1', 'HitPointOnObject_y_1', 'HitPointOnObject_z_1', 'distanceToPlayer_1', 'ObjectName_2', 'HitObjectPosition_x_2', 'HitObjectPosition_y_2', 'HitObjectPosition_z_2', 'HitPointOnObject_x_2', 'HitPointOnObject_y_2', 'HitPointOnObject_z_2', 'distanceToPlayer_2', 'ObjectName_3', 'HitObjectPosition_x_3', 'HitObjectPosition_y_3', 'HitObjectPosition_z_3', 'HitPointOnObject_x_3', 'HitPointOnObject_y_3', 'HitPointOnObject_z_3', 'distanceToPlayer_3', 'ObjectName_4', 'HitObjectPosition_x_4', 'HitObjectPosition_y_4', 'HitObjectPosition_z_4', 'HitPointOnObject_x_4', 'HitPointOnObject_y_4', 'HitPointOnObject_z_4', 'distanceToPlayer_4']
# 
# column_dict = {'uid': dtype('O'),
#  'UnixTimeStamp': dtype('float64'),
#  'ObjectName_0': dtype('O'),
#  'HitObjectPosition_x_0': dtype('float64'),
#  'HitObjectPosition_y_0': dtype('float64'),
#  'HitObjectPosition_z_0': dtype('float64'),
#  'HitPointOnObject_x_0': dtype('float64'),
#  'HitPointOnObject_y_0': dtype('float64'),
#  'HitPointOnObject_z_0': dtype('float64'),
#  'distanceToPlayer_0': dtype('float64'),
#  'ObjectName_1': dtype('O'),
#  'HitObjectPosition_x_1': dtype('float64'),
#  'HitObjectPosition_y_1': dtype('float64'),
#  'HitObjectPosition_z_1': dtype('float64'),
#  'HitPointOnObject_x_1': dtype('float64'),
#  'HitPointOnObject_y_1': dtype('float64'),
#  'HitPointOnObject_z_1': dtype('float64'),
#  'distanceToPlayer_1': dtype('float64'),
#  'ObjectName_2': dtype('O'),
#  'HitObjectPosition_x_2': dtype('float64'),
#  'HitObjectPosition_y_2': dtype('float64'),
#  'HitObjectPosition_z_2': dtype('float64'),
#  'HitPointOnObject_x_2': dtype('float64'),
#  'HitPointOnObject_y_2': dtype('float64'),
#  'HitPointOnObject_z_2': dtype('float64'),
#  'distanceToPlayer_2': dtype('float64'),
#  'ObjectName_3': dtype('O'),
#  'HitObjectPosition_x_3': dtype('float64'),
#  'HitObjectPosition_y_3': dtype('float64'),
#  'HitObjectPosition_z_3': dtype('float64'),
#  'HitPointOnObject_x_3': dtype('float64'),
#  'HitPointOnObject_y_3': dtype('float64'),
#  'HitPointOnObject_z_3': dtype('float64'),
#  'distanceToPlayer_3': dtype('float64'),
#  'ObjectName_4': dtype('O'),
#  'HitObjectPosition_x_4': dtype('float64'),
#  'HitObjectPosition_y_4': dtype('float64'),
#  'HitObjectPosition_z_4': dtype('float64'),
#  'HitPointOnObject_x_4': dtype('float64'),
#  'HitPointOnObject_y_4': dtype('float64'),
#  'HitPointOnObject_z_4': dtype('float64'),
#  'distanceToPlayer_4': dtype('float64')}

# def all_hit_objects(uid, time, hitObjectList, posX, posY, posZ):
#     global column_names, max_distance
# 
#     # If no hit objects exist, return an empty dataframe
#     if len(hitObjectList) == 0:
#         return pd.DataFrame()
#     
#     # Create a dataframe from the hit object list JSON, if it is of type string, evaluate first
#     hitObjectList = literal_eval(hitObjectList) if isinstance(hitObjectList, str) else hitObjectList
#     allHitObjectDataFrame = pd.concat([pd.DataFrame(columns=['uid', 'UnixTimeStamp']), pd.DataFrame([flatten_dict(d) for d in hitObjectList])], axis=1)
#     allHitObjectDataFrame['uid'] = uid # add column of player UID for later merge back
#     allHitObjectDataFrame['UnixTimeStamp'] = time # add column of player time for later merge back
#     
#     # Calculate the distance to the player for each hit object, if JSON is empty, return max distance
#     allHitObjectDataFrame['distanceToPlayer'] = allHitObjectDataFrame.apply(lambda row: distance(posX, posY, posZ, row['HitObjectPosition_x'], row['HitObjectPosition_y'], row['HitObjectPosition_z']), axis=1) if len(hitObjectList) != 0 else pd.Series([max_distance])
#     
#     return allHitObjectDataFrame
# 
# def get_closest_hit_objects(uid, time, allHitObjectDataFrame, posX, posY, posZ):
#     global column_names, max_distance, number_of_closest_hit_objects
#     # Create the hit objects helper dataframe with known column names
#     clostestHitObjectsDF = pd.DataFrame(columns=column_names)
# 
#     # Create start of the dictionary row being created by this function
#     clostestHitObjectsDict = {}
#     clostestHitObjectsDict['uid'] = uid # add column of player UID for later merge back
#     clostestHitObjectsDict['UnixTimeStamp'] = time # add column of player time for later merge back
#     
#     # Sort dataframe by distance from player and only select top 5 rows
#     top5HitObjectDataFrame = allHitObjectDataFrame.drop(['uid', 'UnixTimeStamp'], axis=1).sort_values(by=['distanceToPlayer'], ascending=True).head(number_of_closest_hit_objects).reset_index(drop=True)
#     
#     # Create returning dataframe
#     for i, row in top5HitObjectDataFrame.iterrows():
#         for col in top5HitObjectDataFrame.columns:
#             clostestHitObjectsDict[f"{col}_{i}"] = row[col]
#     
#     clostestHitObjectsDF = pd.concat([clostestHitObjectsDF, pd.DataFrame([clostestHitObjectsDict])], ignore_index=True)
#     
#     return clostestHitObjectsDF

def closest_objects(uid, time, hitObjectList, posX, posY, posZ):
    """
    Function to calculate distance between player and a  given hitObject name 
    :param uid: uid
    :param time: time reference column
    :param hitObjectList: list of hitObjects names
    :param posX: 
    :param posY: 
    :param posZ: 
    :return: 
    """
    global column_names, max_distance, number_of_closest_hit_objects
    # Create the hit objects helper dataframe with known column names
    clostestHitObjectsDF = pd.DataFrame(columns=column_names)

    # Create start of the dictionary row being created by this function
    clostestHitObjectsDict = {}
    clostestHitObjectsDict['uid'] = uid # add column of player UID for later merge back
    clostestHitObjectsDict['UnixTimeStamp'] = time # add column of player time for later merge back

    # If no hit objects exist, return an empty dataframe
    if len(hitObjectList) == 0:
        return pd.concat([clostestHitObjectsDF, pd.DataFrame([clostestHitObjectsDict])], ignore_index=True), pd.DataFrame()
    
    # Create a dataframe from the hit object list JSON, if it is of type string, evaluate first
    hitObjectList = literal_eval(hitObjectList) if isinstance(hitObjectList, str) else hitObjectList
    allHitObjectDataFrame = pd.concat([pd.DataFrame(columns=['uid', 'UnixTimeStamp']), pd.DataFrame([flatten_dict(d) for d in hitObjectList])], axis=1)
    allHitObjectDataFrame['uid'] = uid # add column of player UID for later merge back
    allHitObjectDataFrame['UnixTimeStamp'] = time # add column of player time for later merge back
    
    # Calculate the distance to the player for each hit object, if JSON is empty, return max distance
    allHitObjectDataFrame['distanceToPlayer'] = allHitObjectDataFrame.apply(lambda row: distance(posX, posY, posZ, row['HitObjectPosition_x'], row['HitObjectPosition_y'], row['HitObjectPosition_z']), axis=1) if len(hitObjectList) != 0 else pd.Series([max_distance])

    # Calculate all hit point on object to player distances to be able to chose only the closest one, e.g., in case of Terrain multiple hits may happen per frame
    allHitObjectDataFrame['hitPointDistanceToPlayer'] = allHitObjectDataFrame.apply(lambda row: distance(posX, posY, posZ, row['HitPointOnObject_x'], row['HitPointOnObject_y'], row['HitPointOnObject_z']), axis=1) if len(hitObjectList) != 0 else pd.Series([max_distance])
    
    # Sort dataframe by distance from player and only select top 5 rows
    top5HitObjectDataFrame = allHitObjectDataFrame.drop(['uid', 'UnixTimeStamp'], axis=1).sort_values(by=['distanceToPlayer', 'hitPointDistanceToPlayer'], ascending=True).drop_duplicates(subset=['ObjectName']).drop('hitPointDistanceToPlayer', axis=1).head(number_of_closest_hit_objects).reset_index(drop=True)
    
    # Create returning dataframe
    for i, row in top5HitObjectDataFrame.iterrows():
        for col in top5HitObjectDataFrame.columns:
            clostestHitObjectsDict[f"{col}_{i}"] = row[col]
    
    clostestHitObjectsDF = pd.concat([clostestHitObjectsDF, pd.DataFrame([clostestHitObjectsDict])], ignore_index=True)
    
    # Update global column names list for dynamic discovery of column names
    column_names = clostestHitObjectsDF.columns.tolist()
    
    return clostestHitObjectsDF, allHitObjectDataFrame
    # OLD CODE WITH UNSTACK THAT DID NOT WORK DUE TO MULTI-LEVEL INDICES
    #helperDF = hitObjectDataFrameTop5.unstack(level=-2).reset_index().rename(columns={0: '', 1: 'level_1'})
    #return helperDF.pivot(index=['uid', 'time'], columns=['level_0','level_1']).reset_index(drop=True)


In [216]:
# def preprocess_txt_files_multi_step(raw_folder_path, preprocessed_folder_path):
#     """
#     Reads all .txt files in the specified folder.
#     """
# 
#     # Get all participant uids in the given raw data folder
#     ids, _ = analyze_folder(raw_folder_path)
#     
#     # Run extraction per uid
#     for target_id in ids:
#         # Do not re-process uids that were already done
#         if os.path.isfile(os.path.join(preprocessed_folder_path + target_id + '.csv')):
#             continue
#         
#         # Get list of files for given uid
#         print(f"\nWorking on uid {target_id}...")
#         _, matching_files_list = analyze_folder(raw_folder_path, target_id)
#     
#         # Create output data frame and set most important columns to be the first ones
#         df = pd.DataFrame(columns=['uid', 'dataset', 'city_section'])
#         for txt_file in matching_files_list:
#             # Read file from disk
#             file_path = os.path.join(raw_folder_path, txt_file)
#             with open(file_path, "r") as file:
#                 data_list = json.load(file)
#             
#             # If data file is empty, continue to next file
#             if len(data_list) == 0:
#                 print(f"INFO: No data found in {txt_file}")
#                 continue
#                 
#             # Create a DataFrame from the flattened data
#             flat_data_list = [flatten_dict(d) for d in data_list]
#             df1 = pd.DataFrame(flat_data_list)
#             
#             # Append file name information to dataframe
#             parts = get_filename_as_parts(txt_file)
#             df1['uid'] = parts[0]
#             df1['dataset'] = parts[1]
#             df1['city_section'] = parts[2]
#             
#             # Merge into master datafram
#             df = pd.concat([df, df1], ignore_index=True)
#     
#         # Only continue if there was data found for a given uid
#         if len(df) > 0:
#             # Process Hit Objects JSON column -> top 5 HitObjects with distances
#             allHitObjectsDataFrame = pd.DataFrame()
#             for i, _ in df.iterrows():
#                 if i % 5000 == 0:
#                     print(f"Processed {i} out of {len(df.index)} hit objects...")
#                 all = all_hit_objects(df['uid'][i], df['UnixTimeStamp'][i], df['hitObjects'][i], df['HmdPosition_x'][i], df['HmdPosition_y'][i], df['HmdPosition_z'][i])
#                 allHitObjectsDataFrame = pd.concat([allHitObjectsDataFrame, all], ignore_index=True)
#                         
#             # Sort and save proprocessed data per UID
#             df.drop(['hitObjects'], axis=1, inplace=True)
#             df.sort_values(by=['uid', 'UnixTimeStamp'], inplace=True)
#             df.to_csv(os.path.join(preprocessed_folder_path, target_id + '.csv'), index=False)   
# 
#             allHitObjectsDataFrame.sort_values(by=['uid', 'UnixTimeStamp'], inplace=True)
#             allHitObjectsDataFrame.to_csv(os.path.join(preprocessed_folder_path, target_id + '_hitobjects.csv'), index=False)

**Threaded processing for eye tracking folder:**

In [217]:
def get_input_data_per_uid(target_id, raw_folder_path):
    # Get list of files for given uid
    folder_path = os.path.join(raw_folder_path, '..', 'Input')
    _, matching_files_list = analyze_folder(folder_path, target_id)

    # Create output data frame and set most important columns to be the first ones
    df = pd.DataFrame(columns=['uid'])
    for txt_file in matching_files_list:
        # Read file from disk
        file_path = os.path.join(folder_path, txt_file)
        with open(file_path, "r") as file:
            data_list = json.load(file)
        
        # Create a DataFrame from the flattened data
        df1 = pd.json_normalize(data_list, max_level=1)
        
        # Append file name information to dataframe
        parts = get_filename_as_parts(txt_file)
        df1['uid'] = parts[0]
    
        # # Merge into master datafram
        df = pd.concat([df, df1], ignore_index=True)

    # Sort and save preprocessed data per UID
    df.sort_values(by=['uid', 'TimeStamp'], inplace=True)

    return df

In [218]:
def add_event_data_to_eye_tracking_data(eye_tracking_data, events):
    result = eye_tracking_data.merge(events, how='left', left_on='uid', right_on='uid')
    result = result[result['EventName'].isna() | ((result['UnixTimeStamp'] >= result['StartofEventTimeStamp']) & (result['UnixTimeStamp'] <= result['EndOfEventTimeStamp']))]
    #result.drop(columns=['StartofEventTimeStamp', 'EndOfEventTimeStamp'], inplace = True)
    result = pd.concat([result, eye_tracking_data[~eye_tracking_data['UnixTimeStamp'].isin(result['UnixTimeStamp'])]], ignore_index=True)
    result.sort_values(by=['uid', 'UnixTimeStamp'], inplace=True)
    
    return result

In [219]:
def add_input_data_to_eye_tracking_data(eye_tracking_data, raw_folder_path, target_id):
    input_data = get_input_data_per_uid(target_id, raw_folder_path)

    result = eye_tracking_data.merge(input_data, how='left', left_on=['uid', 'UnixTimeStamp'], right_on=['uid', 'TimeStamp'])
    result.drop(columns=['TimeStamp'], inplace = True)
    result.sort_values(by=['uid', 'UnixTimeStamp'], inplace=True)
    
    return result

In [225]:
def threaded_processing_eyetracking(target_id, raw_folder_path, preprocessed_folder_path, excluded_uids, uid_conditions, events):
    target_file_name = os.path.join(preprocessed_folder_path + target_id + '.csv')
    if os.path.isfile(target_file_name):
        existing_eye_tracking_data = pd.read_csv(target_file_name)
        print(f'File {target_id}.csv already exists...')
        changed = False
        if not set(['EventName']).issubset(set(existing_eye_tracking_data.columns)):
            print(f'...but is missing EVENT data. Adding...')
            existing_eye_tracking_data = add_event_data_to_eye_tracking_data(existing_eye_tracking_data, events)
            changed = True
        if not set(['SteeringInput']).issubset(set(existing_eye_tracking_data.columns)):
            print(f'...but is missing INPUT data. Adding...')
            existing_eye_tracking_data = add_input_data_to_eye_tracking_data(existing_eye_tracking_data, raw_folder_path, target_id)
            changed = True
        if not set(['ExperimentalCondition']).issubset(set(existing_eye_tracking_data.columns)):
            print(f'...but is missing CONDITION data. Adding...')
            existing_eye_tracking_data['ExperimentalCondition'] = uid_conditions.loc[uid_conditions['uid'] == target_id, 'ExperimentalCondition'].item()
            changed = True
        
        # Either save updated eyetracking file or skip
        if changed:
            existing_eye_tracking_data.to_csv(os.path.join(preprocessed_folder_path, 'updated', target_id + '.csv'), index=False)
        else:
            print(f'...SKIPPED!')
        return excluded_uids
    
    # Get list of files for given uid
    print(f"\nWorking on uid {target_id}...")
    print(f"...using output folder {preprocessed_folder_path}")
    _, matching_files_list = analyze_folder(raw_folder_path, target_id)
    
    # Do not process uid if not all 5 files are present
    if len(matching_files_list) < 5:
        excluded_uids.append([target_id, 'Incomplete drive'])
        print(f"INFO: Excluded due to incomplete drive")
        return excluded_uids

    # Create output data frame and set most important columns to be the first ones
    df = pd.DataFrame(columns=['uid', 'dataset', 'city_section'])
    stop_flag = False
    for txt_file in matching_files_list:
        if stop_flag:
            continue
            
        # Read file from disk
        file_path = os.path.join(raw_folder_path, txt_file)
        with open(file_path, "r") as file:
            data_list = json.load(file)
        
        # If data file is empty, continue to next file
        if len(data_list) == 0:            
            excluded_uids.append([target_id, 'Empty files found'])
            print(f"INFO: Excluded due to no data found in {txt_file}")
            stop_flag = True
            continue
            
        # Create a DataFrame from the flattened data
        flat_data_list = [flatten_dict(d) for d in data_list]
        df1 = pd.DataFrame(flat_data_list)
        
        # Append file name information to dataframe
        parts = get_filename_as_parts(txt_file)
        df1['uid'] = target_id
        df1['dataset'] = parts[1]
        df1['city_section'] = parts[2]
        ### TO BE UPDATED
        
        # Merge into master datafram
        df = pd.concat([df, df1], ignore_index=True)

    # Only continue if there was data found for a given uid
    if len(df) > 0 and not stop_flag:
        # Process Hit Objects JSON column -> top 5 HitObjects with distances
        allHitObjectsDataFrame = pd.DataFrame(columns=['uid', 'dataset', 'UnixTimeStamp'])
        allHitObjectsDataFrame['uid'] = target_id
        allHitObjectsDataFrame['dataset'] = parts[1]
        allHitObjectsDataFrame['ExperimentalCondition'] = uid_conditions.loc[uid_conditions['uid'] == target_id, 'ExperimentalCondition'].item()
        ### TO BE UPDATED

        topHitObjectsDataFrame = pd.DataFrame()
        for i, _ in df.iterrows():
            if i % 5000 == 0:
                print(f"Processed {i} out of {len(df.index)} hit objects...")
            co, all = closest_objects(df['uid'][i], df['UnixTimeStamp'][i], df['hitObjects'][i], df['HmdPosition_x'][i], df['HmdPosition_y'][i], df['HmdPosition_z'][i])
            topHitObjectsDataFrame = pd.concat([topHitObjectsDataFrame, co], ignore_index=True)
            allHitObjectsDataFrame = pd.concat([allHitObjectsDataFrame, all], ignore_index=True)
        
        # Merge hit objects DF back to data DF and drop hitObjects column with JSON data
        df.set_index(['uid','UnixTimeStamp'], inplace=True)
        topHitObjectsDataFrame.set_index(['uid','UnixTimeStamp'], inplace=True)
        df = df.join(topHitObjectsDataFrame, on=['uid','UnixTimeStamp'], how='left')
        df.drop(['hitObjects'], axis=1, inplace=True)
        
        # Sort and save preprocessed data per UID
        df.sort_values(by=['uid', 'UnixTimeStamp'], inplace=True)
        df.to_csv(target_file_name, index=True)
        allHitObjectsDataFrame.to_csv(os.path.join(preprocessed_folder_path, 'allHitObjects', target_id + '_hitobjects.csv'), index=False)
    
    return excluded_uids


**Threaded processing for Input data folder:**

In [226]:
def threaded_processing_input(target_id, raw_folder_path, preprocessed_folder_path, excluded_uids, uid_conditions):
    # Do not re-process uids that were already done
    if os.path.isfile(os.path.join(preprocessed_folder_path + target_id + '.csv')):
        print(f'File {target_id}.csv already exists. Skipping...')
        return excluded_uids
    
    # Get list of files for given uid
    print(f"\nWorking on uid {target_id}...")
    print(f"...using output folder {preprocessed_folder_path}")
    _, matching_files_list = analyze_folder(raw_folder_path, target_id)

    # Create output data frame and set most important columns to be the first ones
    df = pd.DataFrame(columns=['uid', 'dataset', 'city_section', 'ExperimentalCondition'])
    for txt_file in matching_files_list:
        # Read file from disk
        file_path = os.path.join(raw_folder_path, txt_file)
        with open(file_path, "r") as file:
            data_list = json.load(file)
        
        # Create a DataFrame from the flattened data
        df1 = pd.json_normalize(data_list, max_level=1)
                
        # Append file name information to dataframe
        parts = get_filename_as_parts(txt_file)
        df1['uid'] = parts[0]
        df1['dataset'] = parts[1]
        df1['city_section'] = parts[2]
        df1['ExperimentalCondition'] = uid_conditions.loc[uid_conditions['uid'] == parts[0], 'ExperimentalCondition'].item()
    
        # # Merge into master datafram
        df = pd.concat([df, df1], ignore_index=True)

    # Sort and save preprocessed data per UID
    df.sort_values(by=['uid', 'TimeStamp'], inplace=True)
    df.to_csv(os.path.join(preprocessed_folder_path, target_id + '.csv'), index=False)
    
    return excluded_uids

**General pipeline to process .txt files into .csv format**

In [227]:
raw_folder_path = "Data/0_Raw/Eyetracking/"
preprocessed_folder_path = "Data/1_Preprocessed/Eyetracking/"
id = '0a68f111e4f448d3b8279db69cd9df5e'
eyetracking = pd.read_csv(os.path.join(preprocessed_folder_path, id + '.csv'))

#print(len(eyetracking))
add_input_data_to_eye_tracking_data(eyetracking, raw_folder_path, id)

Matching files:
['0a68f111e4f448d3b8279db69cd9df5e', 'Input', 'Autobahn']
['0a68f111e4f448d3b8279db69cd9df5e', 'Input', 'TrainingScene']
['0a68f111e4f448d3b8279db69cd9df5e', 'Input', 'MountainRoad']
['0a68f111e4f448d3b8279db69cd9df5e', 'Input', 'Westbrueck']
['0a68f111e4f448d3b8279db69cd9df5e', 'Input', 'CountryRoad']


Unnamed: 0,uid,UnixTimeStamp,dataset,city_section,TobiiTimeStamp,FPS,HmdPosition_x,HmdPosition_y,HmdPosition_z,NoseVector_x,...,HitObjectPosition_y_4,HitObjectPosition_z_4,HitPointOnObject_x_4,HitPointOnObject_y_4,HitPointOnObject_z_4,distanceToPlayer_4,ReceivedInput,SteeringInput,AcellerationInput,BrakeInput
0,0a68f111e4f448d3b8279db69cd9df5e,1.602674e+09,EyeTracking,TrainingScene,153.495712,89.967888,-0.307228,1.022527,-103.842102,0.445395,...,,,,,,,,,,
1,0a68f111e4f448d3b8279db69cd9df5e,1.602674e+09,EyeTracking,TrainingScene,153.517975,89.919945,-0.307410,1.022367,-103.841972,0.443813,...,,,,,,,True,0.000389,0.0,0.0
2,0a68f111e4f448d3b8279db69cd9df5e,1.602674e+09,EyeTracking,TrainingScene,153.540146,89.968414,-0.308085,1.021776,-103.842018,0.439297,...,,,,,,,True,0.000389,0.0,0.0
3,0a68f111e4f448d3b8279db69cd9df5e,1.602674e+09,EyeTracking,TrainingScene,153.562454,89.890388,-0.309161,1.020753,-103.841881,0.430322,...,,,,,,,,,,
4,0a68f111e4f448d3b8279db69cd9df5e,1.602674e+09,EyeTracking,TrainingScene,153.584579,89.982567,-0.310638,1.019290,-103.841515,0.416664,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37187,0a68f111e4f448d3b8279db69cd9df5e,1.602675e+09,EyeTracking,Autobahn,875.579163,84.273964,-604.667480,64.104576,5073.650391,0.960676,...,,,,,,,True,-0.006865,0.0,0.0
37188,0a68f111e4f448d3b8279db69cd9df5e,1.602675e+09,EyeTracking,Autobahn,875.601379,85.342766,-604.410767,64.113228,5073.747070,0.961294,...,,,,,,,True,-0.006865,0.0,0.0
37189,0a68f111e4f448d3b8279db69cd9df5e,1.602675e+09,EyeTracking,Autobahn,875.624939,85.193512,-604.153320,64.121864,5073.845215,0.961644,...,,,,,,,True,-0.006865,0.0,0.0
37190,0a68f111e4f448d3b8279db69cd9df5e,1.602675e+09,EyeTracking,Autobahn,875.636169,85.541122,-603.894714,64.130600,5073.943848,0.961842,...,,,,,,,True,-0.006865,0.0,0.0


In [228]:
def preprocess_txt_files(raw_folder_path, preprocessed_folder_path, type, multi_processing = False):
    """
    Reads all .txt files in the specified folder.
    :param raw_folder_path:
    :param preprocessed_folder_path:
    :param multi_processing: Put false to save all excluded UIDs; in multiprocessing = True it will not work
    """

    # Get conditions per UID
    uid_conditions = get_uid_calibration_data(raw_folder_path)
    
    # Get event data per UID
    events = get_events_data(raw_folder_path)
    
    # Load already excluded UIDs from previous
    excluded_uids_path = os.path.join(preprocessed_folder_path, '..', '_excluded_uids.csv')
    print(f"...using ALREADY EXCLUDED UID output folder {excluded_uids_path}")
    excluded_uids = pd.DataFrame(columns=['uid', 'reason']) if not os.path.isfile(excluded_uids_path) else pd.read_csv(excluded_uids_path)

    # Get all participant uids in the given raw data folder
    ids, _ = analyze_folder(raw_folder_path)
    
    # Select only valid uids to process
    ids_to_process = [id for id in ids if uid_conditions.loc[uid_conditions['uid'] == id, 'included'].item()]
    ids_to_process = [id for id in ids_to_process if id not in excluded_uids['uid'].values.tolist()]

    # Convert to list to be able to use .append()
    excluded_uids = excluded_uids.values.tolist()

    # Note down failed calibration reasons
    for id in ids:
        if not uid_conditions.loc[uid_conditions['uid'] == id, 'included'].bool():
            excluded_uids.append([id, 'Failed calibration'])
    
    if multi_processing:
        # Create a process pool, using the pathos fork of multiprocessing
        pool = pathos.pools.ProcessPool()
        # Use partial if the resulting function has more than one input variable, here: processing path, excluded UIDs
        if type == 'eyetracking':
            function = partial(threaded_processing_eyetracking, raw_folder_path=raw_folder_path, preprocessed_folder_path=preprocessed_folder_path, excluded_uids=excluded_uids, uid_conditions=uid_conditions, events=events)
        elif type == 'input':
            function = partial(threaded_processing_input, raw_folder_path=raw_folder_path, preprocessed_folder_path=preprocessed_folder_path, excluded_uids=excluded_uids, uid_conditions=uid_conditions)
        else:
            print('Nothing to do. Choose a type.')
        # Execute pool across list of independent inputs
        pool.map(function, list(ids_to_process))
    else:
        for id in ids_to_process:
            if type == 'eyetracking':
                excluded_uids = threaded_processing_eyetracking(id, raw_folder_path, preprocessed_folder_path, excluded_uids, uid_conditions, events)
            elif type == 'input':
                excluded_uids = threaded_processing_input(id, raw_folder_path, preprocessed_folder_path, excluded_uids, uid_conditions)
            else:
                print('Nothing to do. Choose a type.')
    
    excluded_uids = pd.DataFrame(excluded_uids, columns=['uid', 'reason'])        
    excluded_uids.drop_duplicates(inplace=True)
    excluded_uids.to_csv(excluded_uids_path, index=False)


In [229]:
# def extracted_preprocessed_files_multi_step(preprocessed_folder_path, extracted_folder_path):
#     ids, _ = analyze_folder(preprocessed_folder_path, file_type='csv')
#     
#     for target_id in ids:
#         file_path = os.path.join(preprocessed_folder_path, target_id + '.csv')
#         with open(file_path, "r") as file:
#             uid_data = pd.read_csv(file)
# 
#         file_path = os.path.join(preprocessed_folder_path, target_id + '_hitobjects.csv')
#         with open(file_path, "r") as file:
#             uid_hit_objects = pd.read_csv(file)
#         
#         timestamps = uid_data['UnixTimeStamp'].unique()
#         for i, timestamp in enumerate(timestamps):
#             if i % 5000 == 0:
#                 print(f"Processed {i} out of {len(timestamps)} time stamps...")
#             player_pos = uid_data[(uid_data['uid'] == target_id) & (uid_data['UnixTimeStamp'] == timestamp)][['HmdPosition_x', 'HmdPosition_y','HmdPosition_z']]
#             topHitObjectsDataFrame = get_closest_hit_objects(target_id, timestamp, uid_hit_objects, player_pos['HmdPosition_x'], player_pos['HmdPosition_y'], player_pos['HmdPosition_z'])
#         
#         uid_data.set_index(['uid','UnixTimeStamp'], inplace=True)
#         topHitObjectsDataFrame.set_index(['uid','UnixTimeStamp'], inplace=True)
#         uid_data = uid_data.join(topHitObjectsDataFrame, on=['uid','UnixTimeStamp'], how='left')
#         
#         df.to_csv(os.path.join(extracted_folder_path, target_id + '.csv'), index=True)


## 2. Applying pipeline to eye and input data

In [230]:
# Preprocess all files in the given raw folder (currently only EyeTracking input data)
raw_folder_path = "Data/0_Raw/Eyetracking/"  # Replace with your actual folder path
processed_folder_path = "Data/1_Preprocessed/Eyetracking/"
preprocess_txt_files(raw_folder_path, processed_folder_path, 'eyetracking', multi_processing = False)

...using ALREADY EXCLUDED UID output folder Data/1_Preprocessed/Eyetracking/../_excluded_uids.csv
File 7c4a213768e645e4a46f4317ad87d76d.csv already exists...
...but is missing EVENT data. Adding...
...but is missing INPUT data. Adding...
Matching files:
['7c4a213768e645e4a46f4317ad87d76d', 'Input', 'Westbrueck']
['7c4a213768e645e4a46f4317ad87d76d', 'Input', 'MountainRoad']
['7c4a213768e645e4a46f4317ad87d76d', 'Input', 'CountryRoad']
['7c4a213768e645e4a46f4317ad87d76d', 'Input', 'TrainingScene']
['7c4a213768e645e4a46f4317ad87d76d', 'Input', 'Autobahn']
...but is missing CONDITION data. Adding...
File 973efd2d86b64b7db7d5cc3f3f3725b2.csv already exists...
...but is missing EVENT data. Adding...
...but is missing INPUT data. Adding...
Matching files:
['973efd2d86b64b7db7d5cc3f3f3725b2', 'Input', 'MountainRoad']
['973efd2d86b64b7db7d5cc3f3f3725b2', 'Input', 'CountryRoad']
['973efd2d86b64b7db7d5cc3f3f3725b2', 'Input', 'Autobahn']
['973efd2d86b64b7db7d5cc3f3f3725b2', 'Input', 'Westbrueck']
[

  existing_eye_tracking_data = pd.read_csv(target_file_name)


File 4a10fec1583843e5b1c82fb74fa7c775.csv already exists...
...but is missing EVENT data. Adding...
...but is missing INPUT data. Adding...
Matching files:
['4a10fec1583843e5b1c82fb74fa7c775', 'Input', 'CountryRoad']
['4a10fec1583843e5b1c82fb74fa7c775', 'Input', 'TrainingScene']
['4a10fec1583843e5b1c82fb74fa7c775', 'Input', 'Autobahn']
['4a10fec1583843e5b1c82fb74fa7c775', 'Input', 'Westbrueck']
['4a10fec1583843e5b1c82fb74fa7c775', 'Input', 'MountainRoad']
...but is missing CONDITION data. Adding...
File 4ed7e605b1614ccc98269519940f6965.csv already exists...
...but is missing EVENT data. Adding...
...but is missing INPUT data. Adding...
Matching files:
['4ed7e605b1614ccc98269519940f6965', 'Input', 'TrainingScene']
['4ed7e605b1614ccc98269519940f6965', 'Input', 'Westbrueck']
['4ed7e605b1614ccc98269519940f6965', 'Input', 'Autobahn']
['4ed7e605b1614ccc98269519940f6965', 'Input', 'CountryRoad']
['4ed7e605b1614ccc98269519940f6965', 'Input', 'MountainRoad']
...but is missing CONDITION data. Ad

  existing_eye_tracking_data = pd.read_csv(target_file_name)


File 5fe54d902792467181089d8e4a73b80c.csv already exists...
...but is missing EVENT data. Adding...
...but is missing INPUT data. Adding...
Matching files:
['5fe54d902792467181089d8e4a73b80c', 'Input', 'TrainingScene']
['5fe54d902792467181089d8e4a73b80c', 'Input', 'Westbrueck']
['5fe54d902792467181089d8e4a73b80c', 'Input', 'CountryRoad']
['5fe54d902792467181089d8e4a73b80c', 'Input', 'MountainRoad']
['5fe54d902792467181089d8e4a73b80c', 'Input', 'Autobahn']
...but is missing CONDITION data. Adding...
File 8753a7212b79463e9b80143314a63a39.csv already exists...
...but is missing EVENT data. Adding...
...but is missing INPUT data. Adding...
Matching files:
['8753a7212b79463e9b80143314a63a39', 'Input', 'CountryRoad']
['8753a7212b79463e9b80143314a63a39', 'Input', 'MountainRoad']
['8753a7212b79463e9b80143314a63a39', 'Input', 'TrainingScene']
['8753a7212b79463e9b80143314a63a39', 'Input', 'Autobahn']
['8753a7212b79463e9b80143314a63a39', 'Input', 'Westbrueck']
...but is missing CONDITION data. Ad

  existing_eye_tracking_data = pd.read_csv(target_file_name)


File 81957954cd71450cacc5fd738dd9ebd2.csv already exists...
...but is missing EVENT data. Adding...
...but is missing INPUT data. Adding...
Matching files:
['81957954cd71450cacc5fd738dd9ebd2', 'Input', 'Autobahn']
['81957954cd71450cacc5fd738dd9ebd2', 'Input', 'TrainingScene']
['81957954cd71450cacc5fd738dd9ebd2', 'Input', 'Westbrueck']
['81957954cd71450cacc5fd738dd9ebd2', 'Input', 'MountainRoad']
['81957954cd71450cacc5fd738dd9ebd2', 'Input', 'CountryRoad']
...but is missing CONDITION data. Adding...
File da49a4ef8443428192674cb184c1eb19.csv already exists...
...but is missing EVENT data. Adding...
...but is missing INPUT data. Adding...
Matching files:
['da49a4ef8443428192674cb184c1eb19', 'Input', 'Westbrueck']
['da49a4ef8443428192674cb184c1eb19', 'Input', 'CountryRoad']
['da49a4ef8443428192674cb184c1eb19', 'Input', 'TrainingScene']
['da49a4ef8443428192674cb184c1eb19', 'Input', 'Autobahn']
['da49a4ef8443428192674cb184c1eb19', 'Input', 'MountainRoad']
...but is missing CONDITION data. Ad

  existing_eye_tracking_data = pd.read_csv(target_file_name)


File 22d6fdd77f704e00aa350bf02adc9bc3.csv already exists...
...but is missing EVENT data. Adding...
...but is missing INPUT data. Adding...
Matching files:
['22d6fdd77f704e00aa350bf02adc9bc3', 'Input', 'Westbrueck']
['22d6fdd77f704e00aa350bf02adc9bc3', 'Input', 'TrainingScene']
['22d6fdd77f704e00aa350bf02adc9bc3', 'Input', 'CountryRoad']
['22d6fdd77f704e00aa350bf02adc9bc3', 'Input', 'MountainRoad']
['22d6fdd77f704e00aa350bf02adc9bc3', 'Input', 'Autobahn']
...but is missing CONDITION data. Adding...
File f9c6ff61370141c89ea9bbc536d796e1.csv already exists...
...but is missing EVENT data. Adding...
...but is missing INPUT data. Adding...
Matching files:
['f9c6ff61370141c89ea9bbc536d796e1', 'Input', 'Autobahn']
['f9c6ff61370141c89ea9bbc536d796e1', 'Input', 'Westbrueck']
['f9c6ff61370141c89ea9bbc536d796e1', 'Input', 'CountryRoad']
['f9c6ff61370141c89ea9bbc536d796e1', 'Input', 'TrainingScene']
['f9c6ff61370141c89ea9bbc536d796e1', 'Input', 'MountainRoad']
...but is missing CONDITION data. Ad

  existing_eye_tracking_data = pd.read_csv(target_file_name)


File 2763d041b30741d583498734a8ed3361.csv already exists...
...but is missing EVENT data. Adding...
...but is missing INPUT data. Adding...
Matching files:
['2763d041b30741d583498734a8ed3361', 'Input', 'Westbrueck']
['2763d041b30741d583498734a8ed3361', 'Input', 'CountryRoad']
['2763d041b30741d583498734a8ed3361', 'Input', 'Autobahn']
['2763d041b30741d583498734a8ed3361', 'Input', 'MountainRoad']
['2763d041b30741d583498734a8ed3361', 'Input', 'TrainingScene']
...but is missing CONDITION data. Adding...
File 97fa63f17b4f4fd98355731cb513f5d4.csv already exists...
...but is missing EVENT data. Adding...
...but is missing INPUT data. Adding...
Matching files:
['97fa63f17b4f4fd98355731cb513f5d4', 'Input', 'MountainRoad']
['97fa63f17b4f4fd98355731cb513f5d4', 'Input', 'TrainingScene']
['97fa63f17b4f4fd98355731cb513f5d4', 'Input', 'Westbrueck']
['97fa63f17b4f4fd98355731cb513f5d4', 'Input', 'CountryRoad']
['97fa63f17b4f4fd98355731cb513f5d4', 'Input', 'Autobahn']
...but is missing CONDITION data. Ad

  existing_eye_tracking_data = pd.read_csv(target_file_name)


File 5da827b55d1e429daa583641b19e4951.csv already exists...
...but is missing EVENT data. Adding...
...but is missing INPUT data. Adding...
Matching files:
['5da827b55d1e429daa583641b19e4951', 'Input', 'Autobahn']
['5da827b55d1e429daa583641b19e4951', 'Input', 'CountryRoad']
['5da827b55d1e429daa583641b19e4951', 'Input', 'MountainRoad']
['5da827b55d1e429daa583641b19e4951', 'Input', 'Westbrueck']
['5da827b55d1e429daa583641b19e4951', 'Input', 'TrainingScene']
...but is missing CONDITION data. Adding...
File f0d19dd21d9c4d108419ff0982fb8a00.csv already exists...
...but is missing EVENT data. Adding...
...but is missing INPUT data. Adding...
Matching files:
['f0d19dd21d9c4d108419ff0982fb8a00', 'Input', 'Westbrueck']
['f0d19dd21d9c4d108419ff0982fb8a00', 'Input', 'CountryRoad']
['f0d19dd21d9c4d108419ff0982fb8a00', 'Input', 'TrainingScene']
['f0d19dd21d9c4d108419ff0982fb8a00', 'Input', 'MountainRoad']
['f0d19dd21d9c4d108419ff0982fb8a00', 'Input', 'Autobahn']
...but is missing CONDITION data. Ad

  existing_eye_tracking_data = pd.read_csv(target_file_name)


File 8e03eb1671774d3d9a35f97178902a45.csv already exists...
...but is missing EVENT data. Adding...
...but is missing INPUT data. Adding...
Matching files:
['8e03eb1671774d3d9a35f97178902a45', 'Input', 'CountryRoad']
['8e03eb1671774d3d9a35f97178902a45', 'Input', 'TrainingScene']
['8e03eb1671774d3d9a35f97178902a45', 'Input', 'MountainRoad']
['8e03eb1671774d3d9a35f97178902a45', 'Input', 'Autobahn']
['8e03eb1671774d3d9a35f97178902a45', 'Input', 'Westbrueck']
...but is missing CONDITION data. Adding...
File ef0636bf75a345fd937f66a883b23c4d.csv already exists...
...but is missing EVENT data. Adding...
...but is missing INPUT data. Adding...
Matching files:
['ef0636bf75a345fd937f66a883b23c4d', 'Input', 'Westbrueck']
['ef0636bf75a345fd937f66a883b23c4d', 'Input', 'Autobahn']
['ef0636bf75a345fd937f66a883b23c4d', 'Input', 'CountryRoad']
['ef0636bf75a345fd937f66a883b23c4d', 'Input', 'MountainRoad']
['ef0636bf75a345fd937f66a883b23c4d', 'Input', 'TrainingScene']
...but is missing CONDITION data. Ad

In [102]:
raw_folder_path = "Data/0_Raw/Input/"  # Replace with your actual folder path
processed_folder_path_input = "Data/1_Preprocessed/Input/"
preprocess_txt_files(raw_folder_path, processed_folder_path_input, 'input', multi_processing = True)

...using ALREADY EXCLUDED UID output folder Data/1_Preprocessed/Input/../_excluded_uids.csv


In [32]:
|# Sample file processing
# file_path = '/Users/johnmadrid/GitHub/WestdriveLoopARData/Data/0_Raw/EyeTracking/66bdb97a653b48f98c02d5e764089c00_EyeTracking_Westbrueck.txt'  # Replace with the actual file path
# 
# with open(file_path, 'r') as f:
#     data_list = json.load(f)
# df = pd.DataFrame([flatten_dict(d) for d in data_list])

In [509]:
# # Processing of a single file's hit objects using iteration
# allHitObjectsDataFrame = pd.DataFrame()
# for i, _ in df.iterrows():
#     co, _ = closest_objects('john', df['UnixTimeStamp'][i], df['hitObjects'][i], df['HmdPosition_x'][i], df['HmdPosition_y'][i], df['HmdPosition_z'][i])
#     allHitObjectsDataFrame = pd.concat([allHitObjectsDataFrame, co], ignore_index=True)
# 
# allHitObjectsDataFrame

In [178]:
# file_path = os.path.join(raw_folder_input_path, txt_file)
# with open('/Users/johnmadrid/GitHub/WestdriveLoopARData/Data/0_Raw/EyeTracking/0a68f111e4f448d3b8279db69cd9df5e_EyeTracking_Autobahn.txt', "r") as file:
#     data_list = json.load(file)
#     # Assuming the nested dictionary is in the 'acList' column
# normalized_data = pd.json_normalize(data_list,max_level=1)
# normalized_data
# ############
# # from ast import literal_eval
# # normalized_data["hitObjects"] = hitObjectList = literal_eval(normalized_data["hitObjects"]) if isinstance(normalized_data["hitObjects"], str) else normalized_data["hitObjects"]
# # normalized_data.head()

# 3. Extracting calibration data

In [13]:
file_path_calibration = 'Data/0_Raw/ParticipantCalibrationData/'
ids, matching_files_list = analyze_folder(file_path_calibration, '0a68f111e4f448d3b8279db69cd9df5e')
matching_files_list

Matching files:
['0a68f111e4f448d3b8279db69cd9df5e', 'ParticipantCalibrationData']


['0a68f111e4f448d3b8279db69cd9df5e_ParticipantCalibrationData.txt']

In [385]:
# Read calibration data and reject 
def reject_based_on_calibration_data(file_path_calibration):
    df_cal = pd.DataFrame()
    for filename in os.listdir(file_path_calibration):
        with open(os.path.join(file_path_calibration, filename), "r") as file:
            data_list_cal = json.load(file)
            df1_cal = pd.json_normalize(data_list_cal, max_level=1)
            df_cal = pd.concat([df1_cal, df_cal], ignore_index=True)
            # Reject uid when eyeValidation error > 1.5
            df_cal = df_cal[(df_cal['EyeValidationError.x']<1.5) | (df_cal['EyeValidationError.y']<1.5)]
            # Reject uid that skipped eye calibration and/or validation
            df_cal = df_cal[~df_cal['SpecialNotes'].str.contains('EyeCalibrationSkipped|EyeValidationSkipped')]
            # Reject uid that contain keyboard as input device
            df_cal= df_cal[~df_cal['SteeringInputDevice'].str.contains('Keyboard')]
            # Drop unwanted columns
            final_df = df_cal[['ParticipantUuid', 'ExperimentalCondition']]
            
    return final_df.rename(columns={'ParticipantUuid':'uid'})

In [9]:
df_cal = reject_based_on_calibration_data(file_path_calibration)
df_cal[df_cal['uid']=='f8199c55b531448aa14b053f0b1c099b']['ExperimentalCondition'].iloc[0]

'FullLoopAR'

In [507]:
print("Total number of uid without calibration issues: ", len(df_cal))
df_cal.groupby('ExperimentalCondition').count()

Total number of uid without calibration issues:  184


Unnamed: 0_level_0,uid
ExperimentalCondition,Unnamed: 1_level_1
AudioOnly,34
BaseCondition,51
FullLoopAR,54
HUDOnly,45


## Scene DATA

In [575]:
file_path_scene = '/Users/johnmadrid/GitHub/WestdriveLoopARData/Data/0_Raw/SceneData/'

In [576]:
uids, matching_files_list = analyze_folder(file_path_scene,target_id='e02a32edf7494874b489598a611bd443')

Matching files:
['e02a32edf7494874b489598a611bd443', 'SceneData', 'CountryRoad']
['e02a32edf7494874b489598a611bd443', 'SceneData', 'Westbrueck']
['e02a32edf7494874b489598a611bd443', 'SceneData', 'Autobahn']
['e02a32edf7494874b489598a611bd443', 'SceneData', 'MountainRoad']
['e02a32edf7494874b489598a611bd443', 'SceneData', 'TrainingScene']


Unnamed: 0,uid,dataset,city_section,EventName,StartofEventTimeStamp,EndOfEventTimeStamp,EventDuration,SuccessfulCompletionState,HitObjectName
0,e02a32edf7494874b489598a611bd443,SceneData,CountryRoad,MarketPlaceEvent,1607341000.0,1607341000.0,18.693071,True,
1,e02a32edf7494874b489598a611bd443,SceneData,CountryRoad,CyclistEvent,1607341000.0,1607341000.0,15.789046,True,
2,e02a32edf7494874b489598a611bd443,SceneData,CountryRoad,MotorcyclistEvent,1607341000.0,1607341000.0,4.867074,False,Motorcyclist2
3,e02a32edf7494874b489598a611bd443,SceneData,Westbrueck,I_variant,1607341000.0,1607341000.0,2.936358,True,
4,e02a32edf7494874b489598a611bd443,SceneData,Westbrueck,II_variant,1607341000.0,1607341000.0,7.384221,False,Cube
5,e02a32edf7494874b489598a611bd443,SceneData,Westbrueck,III_variant,1607341000.0,1607341000.0,9.097585,True,
6,e02a32edf7494874b489598a611bd443,SceneData,Autobahn,Panne,1607341000.0,1607341000.0,4.64764,True,
7,e02a32edf7494874b489598a611bd443,SceneData,Autobahn,GhostDriver,1607341000.0,1607341000.0,14.337612,False,Carbody
8,e02a32edf7494874b489598a611bd443,SceneData,Autobahn,Baustelle,1607341000.0,1607341000.0,3.78717,False,Cube (9)
9,e02a32edf7494874b489598a611bd443,SceneData,MountainRoad,StagEventNew,1607340000.0,1607340000.0,13.0355,True,
