# LoopAR project:
## 1. Data preprocessing
## 1. Data preprocessing

In [1]:
import warnings

import cap
import pathos.pools

warnings.simplefilter(action='ignore', category=FutureWarning)

import time
import json
import itertools
import os
import math
from functools import partial
from ast import literal_eval
import pandas as pd
pd.set_option('display.float_format', '{:.9f}'.format)

import dask.dataframe as dd
# import dataframe_image as dfi
import numpy as np
from numpy import dtype

import seaborn as sns
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
from matplotlib import patches
from scipy.signal import find_peaks
from IPython.display import display
from tqdm.notebook import tqdm

In a future release, Dask DataFrame will use a new implementation that
contains several improvements including a logical query planning.
The user-facing DataFrame API will remain unchanged.

The new implementation is already available and can be enabled by
installing the dask-expr library:

    $ pip install dask-expr

and turning the query planning option on:

    >>> import dask
    >>> dask.config.set({'dataframe.query-planning': True})
    >>> import dask.dataframe as dd

API documentation for the new implementation is available at
https://docs.dask.org/en/stable/dask-expr-api.html

Any feedback can be reported on the Dask issue tracker
https://github.com/dask/dask/issues 


    # via Python

    # via CLI


  import dask.dataframe as dd


## 1.1 Useful functions
To open, read, and save from .txt format to .csv

In [17]:
def get_filename_as_parts(filename):
    """
    Extract the parts of file name without .txt ending
    :param filename: 
    :return: 
    """
    return filename.split(".")[0].split("_")
    
def analyze_folder(folder_path, target_id = '', file_type='txt'):
    """
    Given a folder path, identifies all .txt files, splits their names by '_',
    and returns a set of unique IDs extracted from the first part of each file name.
    """
    unique_ids = set() # Initialize an empty set to store unique IDs
    files_of_interest = [f for f in os.listdir(folder_path) if f.lower().endswith('.'+file_type)]
    matching_files = []
    
    # List all .txt files in the specified folder
    if target_id != '':
        print("Matching files:")
    for filename in files_of_interest:
        parts = get_filename_as_parts(filename)
        if len(parts) > 0:
            unique_ids.add(parts[0])  # Add the first part to the set
        if target_id in parts:
            print(parts)
            matching_files.append(filename)
        
    return unique_ids, matching_files if target_id != '' else files_of_interest

Count the amount of unique ids in eyeTracking data folder

In [18]:
# Set folders
raw_folder_path = "Data/0_Raw/Eyetracking/"  # Replace with your actual folder path
processed_folder_path = "Data/1_Preprocessed/Eyetracking/"
# extracted_folder_path = "Data/2_Extracted/"

In [19]:
# Example usage:
ids, matching_files_list = analyze_folder(raw_folder_path)

print(f"Total unique IDs found in {raw_folder_path} ending with .txt files: {len(ids)} with a total of {len(matching_files_list)} files")
list(ids)

Total unique IDs found in Data/0_Raw/Eyetracking/ ending with .txt files: 255 with a total of 1106 files


['d093379ce1224f1bb24b9730da105927',
 '7c4a213768e645e4a46f4317ad87d76d',
 '2062aedb89224bb8a3f6982b487b4d27',
 'f4db5c989fa64b90b31d396943964c12',
 '2462e5d82bc44a328934442cae03d83f',
 'a9166f363d53462ea23ed7f26e798f63',
 '09a23914cf354ea39444511406d16722',
 '9b9220b8bf8c4e61adde8bc7571540ef',
 '6f7a67231a16454dbd657d2f48443aac',
 '2f3b9268b6dc4c04a0c77ef79ae52b36',
 '4ed7e605b1614ccc98269519940f6965',
 '2fe73e7e2534479aa59aeb91635906aa',
 'ac50cb2dbe784eca9f86d1a424a3cf04',
 'b298967064b144a4bbe4507238e66a80',
 'f4c4e380b2b941e4a963fe12f4038865',
 '22d6fdd77f704e00aa350bf02adc9bc3',
 'ce5d4e8191c541ae89395fe5e7920a02',
 '9c657fb86e9847bfbcd377a19157a71d',
 '5a99db5eeb8748db8a3a86f9015f78bd',
 'ccb6fbce179d456c892ce2e029dd7fd1',
 'a5c5d2051bd1493ea79e1d565524610f',
 '4702745fe0df4a778149aee6882a31de',
 'ef2e9fc68dc5486084a0cc170db5462b',
 '7e732acc694248ceaa3547de5fc77639',
 'b041b78d359c4763a53246d5ede93ffd',
 '95228b7595954054ae2d11cfbd7623f8',
 'a5e665fb970b4b03a810698a63b3b635',
 

Calculate distance to determine closest objects hit

In [20]:
# Set the default distance between player and object if none can be calculated
max_distance = 9999999
number_of_closest_hit_objects = 5

def distance(playerX, playerY, playerZ, objectX, objectY, objectZ):
    global max_distance
    if (objectX is None or objectY is None or objectZ is None or 
        playerX is None or playerY is None or playerZ is None):
        return pd.Series([max_distance])
    else:
        return pd.Series([math.sqrt((objectX - playerX)**2 + (objectY - playerY)**2 + (objectZ - playerZ)**2)])

**Threaded processing for eye tracking folder:**

In [21]:
def get_uid_calibration_data(folder_path):
    folder_path = os.path.join(folder_path, '..', 'ParticipantCalibrationData')
    _, matching_files_list = analyze_folder(folder_path)
    df_cal = pd.DataFrame()
    for txt_file in matching_files_list:
        filename = os.path.join(folder_path, txt_file)
        with open(filename, "r") as file:
            data_list_cal = json.load(file)
        df = pd.json_normalize(data_list_cal, max_level=1)
        df_cal = pd.concat([df, df_cal], ignore_index=True)

    df_cal['included'] = df_cal.apply(lambda row: ((row['EyeValidationError.x']<1.5) or (row['EyeValidationError.y']<1.5)) and
                                                  (not 'EyeCalibrationSkipped' in row['SpecialNotes']) and
                                                  (not 'EyeValidationSkipped' in row['SpecialNotes']) and
                                                  (not 'Keyboard' in row['SteeringInputDevice']), axis = 1)
    df_cal.rename(columns={'ParticipantUuid': 'uid'}, inplace=True)

    return df_cal[['uid', 'included', 'ExperimentalCondition']]

In [22]:
def get_events_data(folder_path):
    folder_path = os.path.join(folder_path, '..', 'SceneData')
    _, matching_files_list = analyze_folder(folder_path)
    df_scene = pd.DataFrame(columns=['uid'])
    for txt_file in matching_files_list: 
        filename = os.path.join(folder_path, txt_file)
        with open(filename, "r") as file:
            data_list = json.load(file)
        df = pd.json_normalize(data_list,'EventBehavior',max_level=1)

        # Append file name information to dataframe
        parts = get_filename_as_parts(txt_file)
        df['uid'] = parts[0]
        
        df_scene = pd.concat([df_scene, df], ignore_index=True)

    return df_scene

In [23]:
def get_input_data_per_uid(target_id, raw_folder_path):
    # Get list of files for given uid
    folder_path = os.path.join(raw_folder_path, '..', 'Input')
    _, matching_files_list = analyze_folder(folder_path, target_id)

    # Create output data frame and set most important columns to be the first ones
    df = pd.DataFrame(columns=['uid'])
    for txt_file in matching_files_list:
        # Read file from disk
        file_path = os.path.join(folder_path, txt_file)
        with open(file_path, "r") as file:
            data_list = json.load(file)
        
        # Create a DataFrame from the flattened data
        df1 = pd.json_normalize(data_list, max_level=1)
        
        # Append file name information to dataframe
        parts = get_filename_as_parts(txt_file)
        df1['uid'] = parts[0]
    
        # # Merge into master datafram
        df = pd.concat([df, df1], ignore_index=True)

    # Sort and save preprocessed data per UID
    df.sort_values(by=['uid', 'TimeStamp'], inplace=True)
    df.drop_duplicates(inplace=True)

    return df

In [24]:
def add_input_data_to_eye_tracking_data(eye_tracking_data, raw_folder_path, target_id, errors_per_uid):
    input_data = get_input_data_per_uid(target_id, raw_folder_path)
    input_data.sort_values(by=['uid', 'TimeStamp'], inplace=True)
    
    result = pd.merge_asof(eye_tracking_data, input_data, left_on='UnixTimeStamp', right_on='TimeStamp', suffixes=('', '_input'), direction='nearest', tolerance=0.001)
    
    result.drop(columns=['uid_input'], inplace = True)
    count_delta = result.count().loc['UnixTimeStamp'] - result.count().loc['TimeStamp']
    if count_delta > 0:
        print(f"There are {count_delta} missing rows from 'input' data")
        errors_per_uid.append([target_id, f'Eye tracking data count vs. input data count mismatch: {count_delta}', result.count().loc['UnixTimeStamp'], result.count().loc['TimeStamp']])

    result.sort_values(by=['uid', 'UnixTimeStamp'], inplace=True)
    
    return result, errors_per_uid

In [25]:
def add_event_data_to_eye_tracking_data(eye_tracking_data, events):
    result = eye_tracking_data.merge(events, how='left', left_on='uid', right_on='uid')
    result = result[result['EventName'].isna() | ((result['UnixTimeStamp'] >= result['StartofEventTimeStamp']) & (result['UnixTimeStamp'] <= result['EndOfEventTimeStamp']))]
    #result.drop(columns=['StartofEventTimeStamp', 'EndOfEventTimeStamp'], inplace = True)
    result = pd.concat([result, eye_tracking_data[~eye_tracking_data['UnixTimeStamp'].isin(result['UnixTimeStamp'])]], ignore_index=True)
    result.sort_values(by=['uid', 'UnixTimeStamp'], inplace=True)
    
    return result

In [26]:
def add_additional_data_sources_and_save(eye_tracking_data, target_id, raw_folder_path, target_file_name, events, uid_conditions, errors_per_uid):
    changed = False
    if not set(['EventName']).issubset(set(eye_tracking_data.columns)):
        print(f'...Adding EVENT data.')
        eye_tracking_data = add_event_data_to_eye_tracking_data(eye_tracking_data, events)
        changed = True
    if not set(['SteeringInput']).issubset(set(eye_tracking_data.columns)):
        print(f'...Adding INPUT data.')
        eye_tracking_data, errors_per_uid = add_input_data_to_eye_tracking_data(eye_tracking_data, raw_folder_path, target_id, errors_per_uid)
        changed = True
    if not set(['ExperimentalCondition']).issubset(set(eye_tracking_data.columns)):
        print(f'...Adding CONDITION data.')
        eye_tracking_data['ExperimentalCondition'] = uid_conditions.loc[uid_conditions['uid'] == target_id, 'ExperimentalCondition'].item()
        changed = True

    if changed:
        eye_tracking_data.to_csv(target_file_name, index = False)
    else:
        print(f'...SKIPPED!')
    
    return errors_per_uid


In [27]:
def add_top_hitobjects(eye_tracking_dataframe, hitObjectDataFrame, target_file_name_all_hitobjects):
    hitObjectDataFrame2 = hitObjectDataFrame.copy()

    # Calculate object distaince to player
    hitObjectDataFrame2['distanceToPlayer'] = hitObjectDataFrame.apply(lambda row: distance(row['HmdPosition.x'], row['HmdPosition.y'], row['HmdPosition.z'], row['HitObjectPosition.x'], row['HitObjectPosition.y'], row['HitObjectPosition.z']), axis=1)

    # Calculate hit point distance to player to chose only the closest one later, e.g., in case of Terrain multiple hits may happen per frame
    hitObjectDataFrame2['hitPointDistanceToPlayer'] = hitObjectDataFrame.apply(lambda row: distance(row['HmdPosition.x'], row['HmdPosition.y'], row['HmdPosition.z'], row['HitPointOnObject.x'], row['HitPointOnObject.y'], row['HitPointOnObject.z']), axis=1)

    # Order the hit piont data frame by: timestamp, distance to player, and distance to hit point, clean up dataframe after
    hitObjectDataFrame_ordered = hitObjectDataFrame2.sort_values(by=['UnixTimeStamp', 'distanceToPlayer', 'hitPointDistanceToPlayer'], ascending=True).drop_duplicates(subset=['UnixTimeStamp', 'ObjectName']).drop(['hitPointDistanceToPlayer', 'HmdPosition.x', 'HmdPosition.y', 'HmdPosition.z'], axis=1).reset_index(drop=True)
    
    # Create a rank column for all hit points in order and save hit object dataframe
    hitObjectDataFrame_ordered['rank'] = hitObjectDataFrame_ordered.groupby('UnixTimeStamp').cumcount(ascending=True) + 1
    hitObjectDataFrame_ordered.to_csv(target_file_name_all_hitobjects, index=False)

    # Only kepp closest hit objects
    hitObjectDataFrame_ordered = hitObjectDataFrame_ordered[hitObjectDataFrame_ordered['rank'] <= number_of_closest_hit_objects].reset_index(drop=True)

    # Prepare for pivot: create the output list column, exclude rank and UnixTimstamp as they are used for pivoting
    value_columns = hitObjectDataFrame_ordered.loc[:, (hitObjectDataFrame_ordered.columns != 'rank') & (hitObjectDataFrame_ordered.columns != 'UnixTimeStamp')].columns.tolist()
    
    # Pivot data frame around UnixTimeStamp and rank, then rename columns and drop the multi-level index
    hitObjectDataFrameFinal = hitObjectDataFrame_ordered.pivot_table(index='UnixTimeStamp', columns='rank', values=value_columns, aggfunc='first', sort=False)
    hitObjectDataFrameFinal.columns = [f'{a}_{b}' for a, b in hitObjectDataFrameFinal.columns]
    hitObjectDataFrameFinal = hitObjectDataFrameFinal.reset_index()

    df = eye_tracking_dataframe.merge(hitObjectDataFrameFinal, how='left', left_on=['UnixTimeStamp'], right_on=['UnixTimeStamp'])
    df.sort_values(by=['uid', 'UnixTimeStamp'], inplace=True)

    return df

In [28]:
def threaded_processing_eyetracking(target_id, raw_folder_path, preprocessed_folder_path, excluded_uids, errors_per_uid, uid_conditions, events, redo = False):
    # Create output folder and set file names for data output
    os.makedirs(os.path.join(preprocessed_folder_path, 'allHitObjects'), exist_ok=True)
    target_file_name = os.path.join(preprocessed_folder_path, target_id + '.csv')
    target_file_name_all_hitobjects = os.path.join(preprocessed_folder_path, 'allHitObjects', target_id + '_hitobjects.csv')

    # Check if file already exists, if yes, check if any columns are missing
    if os.path.isfile(target_file_name) and not redo:
        existing_eye_tracking_data = pd.read_csv(target_file_name, low_memory=False)
        print(f'File {target_id}.csv already exists...')
        errors_per_uid = add_additional_data_sources_and_save(existing_eye_tracking_data, target_id, raw_folder_path, target_file_name, events, uid_conditions, errors_per_uid)
        return excluded_uids, errors_per_uid
    
    # Otherwise: Get to work on specific UID starting with getting the list of all files for that UID
    print(f"\nWorking on uid {target_id}...")
    print(f"...using output folder {preprocessed_folder_path}")
    _, matching_files_list = analyze_folder(raw_folder_path, target_id)
    
    # Check for completeness, i.e., do not process UID if not all 4 major parts, excluding TrainingScene, are present
    if len([e for e in matching_files_list if get_filename_as_parts(e)[1] != 'TrainingScene']) < 4:
        excluded_uids.append([target_id, 'Incomplete drive'])
        print(f"[INFO] Excluded due to incomplete drive")
        return excluded_uids, errors_per_uid

    # Create output data frames and set most important columns to be the first ones
    df = pd.DataFrame(columns=['uid', 'dataset', 'city_section', 'ExperimentalCondition'])
    df_hitObjects = pd.DataFrame(columns=['uid'])

    # Stop flag helps to stop execution if empty files are found during processing
    stop_flag = False
    for txt_file in matching_files_list:
        if stop_flag:
            continue
            
        # Read file from disk
        file_path = os.path.join(raw_folder_path, txt_file)
        with open(file_path, "r") as file:
            data_list = json.load(file)
        
        # If data file is empty, continue to next file
        if len(data_list) == 0:            
            excluded_uids.append([target_id, 'Empty files found'])
            print(f"[INFO] Excluded due to no data found in {txt_file}")
            stop_flag = True
            continue
            
        # Create eye data DataFrame from the flattened data AND DROP DUPLICATES, which do exist in raw data
        df1 = pd.json_normalize(data_list, max_level=1)
        df1.drop(['hitObjects'], axis=1, inplace=True)
        df1.drop_duplicates(inplace=True)
        
        # Create hit object data DataFrame from flattened hitObjects column
        df1_hitObjects = pd.json_normalize(data_list, record_path=['hitObjects'], max_level = 1, meta=['UnixTimeStamp', 'HmdPosition'])
        df1_hitObjects_head = pd.json_normalize(df1_hitObjects['HmdPosition'])
        df1_hitObjects_head.rename(columns={'x': 'HmdPosition.x', 'y': 'HmdPosition.y', 'z': 'HmdPosition.z'}, inplace=True)
        df1_hitObjects = pd.concat([df1_hitObjects.drop('HmdPosition', axis=1), df1_hitObjects_head], axis=1)
        
        # Append file name information to dataframe
        parts = get_filename_as_parts(txt_file)
        df1['uid'] = target_id
        df1['dataset'] = parts[1]
        df1['city_section'] = parts[2]
        df1['ExperimentalCondition'] = uid_conditions.loc[uid_conditions['uid'] == target_id, 'ExperimentalCondition'].item()
        
        # Merge into master dataframe
        df = pd.concat([df, df1], ignore_index=True)
        df_hitObjects = pd.concat([df_hitObjects, df1_hitObjects], ignore_index=True)
        
    # Only continue if there was data found for a given uid
    if len(df) > 0 and not stop_flag:
        # Add hit object columns
        df.sort_values(['uid', 'UnixTimeStamp'], inplace=True)
        df_hitObjects.sort_values(['uid', 'UnixTimeStamp'], inplace=True)
        df = add_top_hitobjects(df, df_hitObjects, target_file_name_all_hitobjects)

        # Add input, event data and save file
        errors_per_uid = add_additional_data_sources_and_save(df, target_id, raw_folder_path, target_file_name, events, uid_conditions, errors_per_uid)
    
    return excluded_uids, errors_per_uid

**Threaded processing for Input data folder:**

**General pipeline to process .txt files into .csv format**

In [29]:
def preprocess_txt_files(raw_folder_path, preprocessed_folder_path, multi_processing = False, specific_uids = []):
    """
    Reads all .txt files in the specified folder.
    :param raw_folder_path:
    :param preprocessed_folder_path:
    :param multi_processing: Put false to save all excluded UIDs; in multiprocessing = True it will not work
    """
    start_time = time.time()

    # Get conditions per UID
    uid_conditions = get_uid_calibration_data(raw_folder_path)
    
    # Get event data per UID
    events = get_events_data(raw_folder_path)
    
    # Load already excluded UIDs from previous
    os.makedirs(preprocessed_folder_path, exist_ok=True)
    excluded_uids_path = os.path.join(preprocessed_folder_path, '_excluded_uids.csv')
    excluded_uids_columns = ['uid', 'reason']
    print(f"...using ALREADY EXCLUDED UID file {excluded_uids_path}")
    excluded_uids_df = pd.DataFrame(columns=excluded_uids_columns) if not os.path.isfile(excluded_uids_path) else pd.read_csv(excluded_uids_path)

    errors_per_uid_path = os.path.join(preprocessed_folder_path, '_errors_per_uid.csv')
    print(f"...using ERRORS PER UID File {errors_per_uid_path}")
    errors_per_uid_columns = ['uid', 'error_type', 'number_1', 'number_2']
    errors_per_uid_df = pd.DataFrame(columns=errors_per_uid_columns) if not os.path.isfile(errors_per_uid_path) else pd.read_csv(errors_per_uid_path)
    
    # Convert to list to be able to use .append()
    excluded_uids = excluded_uids_df.values.tolist()
    errors_per_uid = errors_per_uid_df.values.tolist()

    # If specific UIDs are given, only process those
    if len(specific_uids) > 0:
        print(f"Working on only SPECIFIC UIDs: {specific_uids}")
        ids_to_process = specific_uids
    else:
        # Get all participant uids in the given raw data folder
        ids, _ = analyze_folder(raw_folder_path)
        
        # Select only valid uids to process
        ids_to_process = [id for id in ids if uid_conditions.loc[uid_conditions['uid'] == id, 'included'].item()]
        ids_to_process = [id for id in ids_to_process if id not in excluded_uids_df['uid'].values.tolist()]
        
        print(ids_to_process)

        # Note down failed calibration reasons
        for id in ids:
            if not uid_conditions.loc[uid_conditions['uid'] == id, 'included'].bool():
                excluded_uids.append([id, 'Failed calibration'])
    
    if multi_processing:
        # Create a process pool, using the pathos fork of multiprocessing
        pool = pathos.pools.ProcessPool()

        # Use partial if the resulting function has more than one input variable, here: processing path, excluded UIDs
        function = partial(threaded_processing_eyetracking, raw_folder_path=raw_folder_path, preprocessed_folder_path=preprocessed_folder_path, excluded_uids=excluded_uids, errors_per_uid=errors_per_uid, uid_conditions=uid_conditions, events=events)

        # Execute pool across list of independent inputs
        results = pool.map(function, list(ids_to_process))
        for result in results:
            for excluded_uid in result[0]:
                excluded_uids.append(excluded_uid)
            for error in result[1]:
                errors_per_uid.append(error)
    else:
        for id in ids_to_process:
            excluded_uids, errors_per_uid = threaded_processing_eyetracking(id, raw_folder_path, preprocessed_folder_path, excluded_uids, errors_per_uid, uid_conditions, events, redo=len(specific_uids) > 0)
    
    # Do not save state when specific UIDs are given
    if len(specific_uids) == 0:
        excluded_uids_df = pd.DataFrame(excluded_uids, columns=excluded_uids_columns)        
        excluded_uids_df.drop_duplicates(inplace=True)
        excluded_uids_df.to_csv(excluded_uids_path, index=False)

        errors_per_uid_df = pd.DataFrame(errors_per_uid, columns=errors_per_uid_columns)        
        errors_per_uid_df.drop_duplicates(inplace=True)
        errors_per_uid_df.to_csv(errors_per_uid_path, index=False)
    
    end_time = time.time()

    print(f"\nElapsed time to run this code: {end_time - start_time:.6f} seconds")
    print('DONE!')

## 2. Applying pipeline to eye and input data

In [31]:
specific_id = '8e03eb1671774d3d9a35f97178902a45'
raw_folder_path = "Data/0_Raw/Eyetracking/"  # Replace with your actual folder path
processed_folder_path = "Data/1_Preprocessed/"

In [32]:
# Preprocess all files in the given raw folder (currently only EyeTracking input data)
#preprocess_txt_files(raw_folder_path, processed_folder_path, multi_processing = False, specific_uids = [specific_id])
preprocess_txt_files(raw_folder_path, processed_folder_path, multi_processing = True)

...using ALREADY EXCLUDED UID file Data/1_Preprocessed/_excluded_uids.csv
...using ERRORS PER UID File Data/1_Preprocessed/_errors_per_uid.csv
['7c4a213768e645e4a46f4317ad87d76d', '2062aedb89224bb8a3f6982b487b4d27', '2f3b9268b6dc4c04a0c77ef79ae52b36', '4ed7e605b1614ccc98269519940f6965', '2fe73e7e2534479aa59aeb91635906aa', 'ac50cb2dbe784eca9f86d1a424a3cf04', '22d6fdd77f704e00aa350bf02adc9bc3', '9c657fb86e9847bfbcd377a19157a71d', '5a99db5eeb8748db8a3a86f9015f78bd', 'ccb6fbce179d456c892ce2e029dd7fd1', 'ef2e9fc68dc5486084a0cc170db5462b', '7e732acc694248ceaa3547de5fc77639', 'b041b78d359c4763a53246d5ede93ffd', 'a5e665fb970b4b03a810698a63b3b635', '29006bddb3e8430582165c73bc7af864', 'f6bf93fa2bb14083849d203d285bb594', 'f78cb865f76b4ce4879083715f112d91', '50bc84deece34316bc3b8e5bb5829a68', '76add0e05beb4e7ea99bda997efd7622', 'e51c62a636934699b9a2b304635ba0fe', 'e5a628df7e514b288fb6c6024f32e6d8', '7de8141926e0497396bddddf37a1b115', 'f820d1c1117c4255a529f527c6c82d1f', '09e9bc26ef584ef29e0e4512f3a

In [51]:
existing_eye_tracking_data = pd.read_csv(os.path.join(processed_folder_path, specific_id + '.csv'), low_memory=False)
existing_input_data = get_input_data_per_uid(specific_id, raw_folder_path)

Matching files:
['97fa63f17b4f4fd98355731cb513f5d4', 'Input', 'MountainRoad']
['97fa63f17b4f4fd98355731cb513f5d4', 'Input', 'TrainingScene']
['97fa63f17b4f4fd98355731cb513f5d4', 'Input', 'Westbrueck']
['97fa63f17b4f4fd98355731cb513f5d4', 'Input', 'CountryRoad']
['97fa63f17b4f4fd98355731cb513f5d4', 'Input', 'Autobahn']


In [52]:
existing_eye_tracking_data.count()

uid                      37348
dataset                  37348
city_section             37348
ExperimentalCondition    37348
UnixTimeStamp            37348
                         ...  
TimeStamp                37347
ReceivedInput            37347
SteeringInput            37347
AcellerationInput        37347
BrakeInput               37347
Length: 80, dtype: int64

In [53]:
existing_input_data.count()

uid                  37347
TimeStamp            37347
ReceivedInput        37347
SteeringInput        37347
AcellerationInput    37347
BrakeInput           37347
dtype: int64

In [27]:
existing_input_data.count().loc['UnixTimeStamp']-existing_input_data.count().loc['TimeStamp']

uid                  37347
TimeStamp            37347
ReceivedInput        37347
SteeringInput        37347
AcellerationInput    37347
BrakeInput           37347
dtype: int64

In [ ]:
tscomp = pd.concat([existing_eye_tracking_data.sort_values(['UnixTimeStamp']).reset_index()['UnixTimeStamp'], existing_input_data.sort_values(['TimeStamp']).reset_index()['TimeStamp']], axis=1, sort=False)
tscomp['delta'] = tscomp['UnixTimeStamp'] - tscomp['TimeStamp']
tscomp

In [23]:
tscomp = pd.concat([existing_eye_tracking_data.sort_values(['UnixTimeStamp']).reset_index()['UnixTimeStamp'], existing_input_data.sort_values(['TimeStamp']).reset_index()['TimeStamp']], axis=1, sort=False)
tscomp['delta'] = tscomp['UnixTimeStamp'] - tscomp['TimeStamp']
tscomp

Unnamed: 0,UnixTimeStamp,TimeStamp,delta
0,1603188768.786129713,1603188768.784134626,0.001995087
1,1603188768.800091505,1603188768.800091743,-0.000000238
2,1603188768.821036100,1603188768.821035862,0.000000238
3,1603188768.842976093,1603188768.842976093,0.000000000
4,1603188768.865444422,1603188768.865444422,0.000000000
...,...,...,...
37343,1603189491.063649416,1603189491.085590839,-0.021941423
37344,1603189491.085590839,1603189491.107532263,-0.021941423
37345,1603189491.107532501,1603189491.130339146,-0.022806644
37346,1603189491.130339146,1603189491.141309977,-0.010970831


In [61]:
import pandas as pd
import difflib

# Sample dataframes with float indexes (replace with your actual data)
df1 = pd.DataFrame([[1], [2], [3], [4], [5]],
                   index=[1.1, 2.2, 3.3, 4.4, 5.5],
                   columns=['number'])
df1['index'] = df1.index.values

df2 = pd.DataFrame([['a'], ['b'], ['c'], ['d'], ['e']],
                   index=[1.2, 2.3, 3.4, 4.5, 5.6],
                   columns=['letter'])
df2['index'] = df2.index.values

res = pd.merge(df1, df2, how='cross')
res['delta'] = res['index_x'] - res['index_y']
res = res[round(abs(res['delta']),1) <= 0.1].drop(['index_x', 'index_y', 'delta'], axis=1).reset_index(drop=True)
res

# Convert float indexes to a list
# df1_index_list = df1.index.tolist()
# 
# # Apply fuzzy matching to df2's index
# df2.index = df2.index.map(lambda x: difflib.get_close_matches(x, df1_index_list)[0])
# 
# # Merge the dataframes
# result_df = df1.join(df2)
# 
# print(result_df)



Unnamed: 0,number,letter
0,1,a
1,2,b
2,3,c
3,4,d
4,5,e


In [None]:
|# Sample file processing
# file_path = '/Users/johnmadrid/GitHub/WestdriveLoopARData/Data/0_Raw/EyeTracking/66bdb97a653b48f98c02d5e764089c00_EyeTracking_Westbrueck.txt'  # Replace with the actual file path
# 
# with open(file_path, 'r') as f:
#     data_list = json.load(f)
# df = pd.DataFrame([flatten_dict(d) for d in data_list])

In [None]:
# # Processing of a single file's hit objects using iteration
# allHitObjectsDataFrame = pd.DataFrame()
# for i, _ in df.iterrows():
#     co, _ = closest_objects('john', df['UnixTimeStamp'][i], df['hitObjects'][i], df['HmdPosition_x'][i], df['HmdPosition_y'][i], df['HmdPosition_z'][i])
#     allHitObjectsDataFrame = pd.concat([allHitObjectsDataFrame, co], ignore_index=True)
# 
# allHitObjectsDataFrame

In [91]:
#file_path = os.path.join(raw_folder_input_path, txt_file)
with open('/Users/johnmadrid/GitHub/WestdriveLoopARData/Data/97fa63f17b4f4fd98355731cb513f5d4_EyeTracking_Autobahn.txt', "r") as file:
    data_list = json.load(file)
    # Assuming the nested dictionary is in the 'acList' column
normalized_data = pd.json_normalize(data_list, max_level = 1)
normalized_data = pd.json_normalize(data_list, record_path=['hitObjects'], max_level = 1, meta=['UnixTimeStamp', 'HmdPosition'])
#normalized_data[normalized_data['UnixTimeStamp'] == 1603189414.9103609]
normalized_head_data = pd.json_normalize(normalized_data['HmdPosition'], record_prefix=['HmdPosition'])
normalized_head_data.rename(columns={'x': 'HmdPosition.x', 'y': 'HmdPosition.y', 'z': 'HmdPosition.z'}, inplace=True)
normalized_data = pd.concat([normalized_data.drop('HmdPosition', axis=1), normalized_head_data], axis=1)
normalized_data

# ############
# # from ast import literal_eval
# # normalized_data["hitObjects"] = hitObjectList = literal_eval(normalized_data["hitObjects"]) if isinstance(normalized_data["hitObjects"], str) else normalized_data["hitObjects"]
# # normalized_data.head()

Unnamed: 0,ObjectName,HitObjectPosition.x,HitObjectPosition.y,HitObjectPosition.z,HitPointOnObject.x,HitPointOnObject.y,HitPointOnObject.z,UnixTimeStamp,HmdPosition.x,HmdPosition.y,HmdPosition.z
0,ForestRightMiddle (9),-1429.0,36.0,4020.0,-1431.610107422,20.807725906,3995.349609375,1603189414.9103608,-1572.702270508,32.418270111,3680.587890625
1,ForestRightMiddle (11),-1359.0,36.0,4110.0,-1384.790039062,16.952075958,4099.828125,1603189414.9103608,-1572.702270508,32.418270111,3680.587890625
2,Autobahn_Terrain,-3527.879882812,-16.399993896,2682.0,-1560.227539062,31.399425507,3708.33984375,1603189414.9103608,-1572.702270508,32.418270111,3680.587890625
3,ForestRightMiddle (9),-1429.0,36.0,4020.0,-1431.610107422,20.807725906,3995.349609375,1603189414.9103608,-1572.702270508,32.418270111,3680.587890625
4,ForestRightMiddle (11),-1359.0,36.0,4110.0,-1384.790039062,16.952075958,4099.828125,1603189414.9103608,-1572.702270508,32.418270111,3680.587890625
5,Autobahn_Terrain,-3527.879882812,-16.399993896,2682.0,-1560.227539062,31.399425507,3708.33984375,1603189414.9103608,-1572.702270508,32.418270111,3680.587890625


|# 3. Extracting calibration data

In [13]:
file_path_calibration = 'Data/0_Raw/ParticipantCalibrationData/'
ids, matching_files_list = analyze_folder(file_path_calibration, '0a68f111e4f448d3b8279db69cd9df5e')
matching_files_list

Matching files:
['0a68f111e4f448d3b8279db69cd9df5e', 'ParticipantCalibrationData']


['0a68f111e4f448d3b8279db69cd9df5e_ParticipantCalibrationData.txt']

In [385]:
# Read calibration data and reject 
def reject_based_on_calibration_data(file_path_calibration):
    df_cal = pd.DataFrame()
    for filename in os.listdir(file_path_calibration):
        with open(os.path.join(file_path_calibration, filename), "r") as file:
            data_list_cal = json.load(file)
            df1_cal = pd.json_normalize(data_list_cal, max_level=1)
            df_cal = pd.concat([df1_cal, df_cal], ignore_index=True)
            # Reject uid when eyeValidation error > 1.5
            df_cal = df_cal[(df_cal['EyeValidationError.x']<1.5) | (df_cal['EyeValidationError.y']<1.5)]
            # Reject uid that skipped eye calibration and/or validation
            df_cal = df_cal[~df_cal['SpecialNotes'].str.contains('EyeCalibrationSkipped|EyeValidationSkipped')]
            # Reject uid that contain keyboard as input device
            df_cal= df_cal[~df_cal['SteeringInputDevice'].str.contains('Keyboard')]
            # Drop unwanted columns
            final_df = df_cal[['ParticipantUuid', 'ExperimentalCondition']]
            
    return final_df.rename(columns={'ParticipantUuid':'uid'})

In [9]:
df_cal = reject_based_on_calibration_data(file_path_calibration)
df_cal[df_cal['uid']=='f8199c55b531448aa14b053f0b1c099b']['ExperimentalCondition'].iloc[0]

'FullLoopAR'

In [507]:
print("Total number of uid without calibration issues: ", len(df_cal))
df_cal.groupby('ExperimentalCondition').count()

Total number of uid without calibration issues:  184


Unnamed: 0_level_0,uid
ExperimentalCondition,Unnamed: 1_level_1
AudioOnly,34
BaseCondition,51
FullLoopAR,54
HUDOnly,45


## Scene DATA

In [575]:
file_path_scene = '/Users/johnmadrid/GitHub/WestdriveLoopARData/Data/0_Raw/SceneData/'

In [576]:
uids, matching_files_list = analyze_folder(file_path_scene,target_id='e02a32edf7494874b489598a611bd443')

Matching files:
['e02a32edf7494874b489598a611bd443', 'SceneData', 'CountryRoad']
['e02a32edf7494874b489598a611bd443', 'SceneData', 'Westbrueck']
['e02a32edf7494874b489598a611bd443', 'SceneData', 'Autobahn']
['e02a32edf7494874b489598a611bd443', 'SceneData', 'MountainRoad']
['e02a32edf7494874b489598a611bd443', 'SceneData', 'TrainingScene']


Unnamed: 0,uid,dataset,city_section,EventName,StartofEventTimeStamp,EndOfEventTimeStamp,EventDuration,SuccessfulCompletionState,HitObjectName
0,e02a32edf7494874b489598a611bd443,SceneData,CountryRoad,MarketPlaceEvent,1607341000.0,1607341000.0,18.693071,True,
1,e02a32edf7494874b489598a611bd443,SceneData,CountryRoad,CyclistEvent,1607341000.0,1607341000.0,15.789046,True,
2,e02a32edf7494874b489598a611bd443,SceneData,CountryRoad,MotorcyclistEvent,1607341000.0,1607341000.0,4.867074,False,Motorcyclist2
3,e02a32edf7494874b489598a611bd443,SceneData,Westbrueck,I_variant,1607341000.0,1607341000.0,2.936358,True,
4,e02a32edf7494874b489598a611bd443,SceneData,Westbrueck,II_variant,1607341000.0,1607341000.0,7.384221,False,Cube
5,e02a32edf7494874b489598a611bd443,SceneData,Westbrueck,III_variant,1607341000.0,1607341000.0,9.097585,True,
6,e02a32edf7494874b489598a611bd443,SceneData,Autobahn,Panne,1607341000.0,1607341000.0,4.64764,True,
7,e02a32edf7494874b489598a611bd443,SceneData,Autobahn,GhostDriver,1607341000.0,1607341000.0,14.337612,False,Carbody
8,e02a32edf7494874b489598a611bd443,SceneData,Autobahn,Baustelle,1607341000.0,1607341000.0,3.78717,False,Cube (9)
9,e02a32edf7494874b489598a611bd443,SceneData,MountainRoad,StagEventNew,1607340000.0,1607340000.0,13.0355,True,
