# LoopAR project:
## 1. Data preprocessing

In [32]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import json
import itertools
import os
import math
import multiprocessing
import threading
from functools import partial
from ast import literal_eval
from multiprocessing.pool import ThreadPool
import pandas as pd
import dask.dataframe as dd
# import dataframe_image as dfi
import numpy as np
from numpy import dtype

import seaborn as sns
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
from matplotlib import patches
from scipy.signal import find_peaks
from IPython.display import display
from tqdm.notebook import tqdm

## 1.1 Load data

In [14]:
def get_filename_as_parts(filename):
    """Return the underscore-separated fields of a file name.

    Everything from the first '.' onwards (e.g. the '.txt' ending) is
    discarded before the remaining text is split on '_'.
    """
    stem, _, _ = filename.partition(".")
    return stem.split("_")
    
def analyze_folder(folder_path, target_id = '', file_type='txt'):
    """
    List the files of one type in a folder and extract the IDs encoded in their names.

    File names are split with get_filename_as_parts(); the first field is taken
    as the ID of the file.

    Parameters:
        folder_path: folder to scan (not recursive).
        target_id: when non-empty, only files having this value as one of
            their name fields are returned (and their fields are printed).
        file_type: file extension without the dot; compared case-insensitively.

    Returns:
        (unique_ids, files): the set of non-empty IDs found in the folder, and
        either the files matching target_id or, when target_id is '', all
        files of the requested type. Files are returned in sorted order so
        that repeated runs see them in the same sequence.
    """
    suffix = '.' + file_type.lower()
    # os.listdir() order is arbitrary; sort to make downstream processing reproducible
    files_of_interest = sorted(f for f in os.listdir(folder_path) if f.lower().endswith(suffix))

    filter_by_id = target_id != ''
    unique_ids = set()
    matching_files = []

    if filter_by_id:
        print("Matching files:")
    for filename in files_of_interest:
        parts = get_filename_as_parts(filename)
        # Names starting with '_' or '.' yield an empty first field, which is not an ID
        if parts[0]:
            unique_ids.add(parts[0])
        # Only match when an ID was actually requested; otherwise '' would match
        # any file name that happens to contain an empty field (e.g. 'a__b.txt')
        if filter_by_id and target_id in parts:
            print(parts)
            matching_files.append(filename)

    return unique_ids, (matching_files if filter_by_id else files_of_interest)

In [15]:
# Set folders
# Both paths are relative, so they resolve against the notebook's working directory.
raw_folder_path = "Data/0_Raw/Eyetracking/"  # Replace with your actual folder path
# Destination folder handed to preprocess_txt_files() for the per-uid output
processed_folder_path = "Data/1_Preprocessed/Eyetracking/"
# extracted_folder_path = "Data/2_Extracted/"

In [16]:
# Example usage:
# Without a target id, analyze_folder returns every file of the requested type
ids, matching_files_list = analyze_folder(raw_folder_path)

print(f"Total unique IDs found in {raw_folder_path} ending with .txt files: {len(ids)} with a total of {len(matching_files_list)} files")
# ids is a set, so which element is shown here is arbitrary
list(ids)[0]

Total unique IDs found in Data/0_Raw/Eyetracking/ ending with .txt files: 255 with a total of 1106 files


'a5a31c4af00b4199ac4c8760bfaa469e'

In [17]:
target_id = '39e5235b13274feb88430f08f3cd5369'  # Replace with the desired ID
# With a target id, only the files carrying that id in their name are returned;
# the set of all ids is not needed here and is discarded
_, matching_files_list = analyze_folder(raw_folder_path, target_id)

matching_files_list

Matching files:
['39e5235b13274feb88430f08f3cd5369', 'EyeTracking', 'MountainRoad']
['39e5235b13274feb88430f08f3cd5369', 'EyeTracking', 'Westbrueck']


['39e5235b13274feb88430f08f3cd5369_EyeTracking_MountainRoad.txt',
 '39e5235b13274feb88430f08f3cd5369_EyeTracking_Westbrueck.txt']

In [18]:
# Flatten the nested dictionaries
def flatten_dict(d, parent_key='', sep='_'):
    """Collapse a nested dictionary into a single level.

    Keys of nested dictionaries are prefixed with the key path leading to
    them, joined by `sep` (e.g. {'a': {'x': 1}} -> {'a_x': 1}). Values that are
    not dictionaries are copied unchanged; empty nested dictionaries vanish.
    """
    flat = {}
    for key, value in d.items():
        path = key if not parent_key else f"{parent_key}{sep}{key}"
        if not isinstance(value, dict):
            flat[path] = value
        else:
            # Recurse with the accumulated key path as the new prefix
            flat.update(flatten_dict(value, path, sep=sep))
    return flat

In [19]:
# Set the default distance between player and object if none can be calculated
max_distance = 9999999
number_of_closest_hit_objects = 5

# Calculates the distance between the player and an object
def distance(playerX, playerY, playerZ, objectX, objectY, objectZ):
    """Euclidean distance between the player position and an object position.

    Parameters:
        playerX, playerY, playerZ: player coordinates.
        objectX, objectY, objectZ: object coordinates.

    Returns:
        A one-element pd.Series holding the distance, or max_distance when any
        of the six coordinates is missing.
    """
    coordinates = (playerX, playerY, playerZ, objectX, objectY, objectZ)
    # Missing values read back from a DataFrame are NaN rather than None, so
    # test for both; otherwise NaN would silently propagate into the result
    if any(pd.isna(value) for value in coordinates):
        return pd.Series([max_distance])
    return pd.Series([math.sqrt((objectX - playerX)**2 + (objectY - playerY)**2 + (objectZ - playerZ)**2)])

# Module-level list of the wide result columns: closest_objects() uses it to pre-shape
# its result frame and overwrites it with the columns of each non-empty result.
column_names = []
# Initialize column names -- this was an attempt at using apply with Dask, which did not work
# column_names = ['uid', 'UnixTimeStamp', 'ObjectName_0', 'HitObjectPosition_x_0', 'HitObjectPosition_y_0', 'HitObjectPosition_z_0', 'HitPointOnObject_x_0', 'HitPointOnObject_y_0', 'HitPointOnObject_z_0', 'distanceToPlayer_0', 'ObjectName_1', 'HitObjectPosition_x_1', 'HitObjectPosition_y_1', 'HitObjectPosition_z_1', 'HitPointOnObject_x_1', 'HitPointOnObject_y_1', 'HitPointOnObject_z_1', 'distanceToPlayer_1', 'ObjectName_2', 'HitObjectPosition_x_2', 'HitObjectPosition_y_2', 'HitObjectPosition_z_2', 'HitPointOnObject_x_2', 'HitPointOnObject_y_2', 'HitPointOnObject_z_2', 'distanceToPlayer_2', 'ObjectName_3', 'HitObjectPosition_x_3', 'HitObjectPosition_y_3', 'HitObjectPosition_z_3', 'HitPointOnObject_x_3', 'HitPointOnObject_y_3', 'HitPointOnObject_z_3', 'distanceToPlayer_3', 'ObjectName_4', 'HitObjectPosition_x_4', 'HitObjectPosition_y_4', 'HitObjectPosition_z_4', 'HitPointOnObject_x_4', 'HitPointOnObject_y_4', 'HitPointOnObject_z_4', 'distanceToPlayer_4']
# 
# column_dict = {'uid': dtype('O'),
#  'UnixTimeStamp': dtype('float64'),
#  'ObjectName_0': dtype('O'),
#  'HitObjectPosition_x_0': dtype('float64'),
#  'HitObjectPosition_y_0': dtype('float64'),
#  'HitObjectPosition_z_0': dtype('float64'),
#  'HitPointOnObject_x_0': dtype('float64'),
#  'HitPointOnObject_y_0': dtype('float64'),
#  'HitPointOnObject_z_0': dtype('float64'),
#  'distanceToPlayer_0': dtype('float64'),
#  'ObjectName_1': dtype('O'),
#  'HitObjectPosition_x_1': dtype('float64'),
#  'HitObjectPosition_y_1': dtype('float64'),
#  'HitObjectPosition_z_1': dtype('float64'),
#  'HitPointOnObject_x_1': dtype('float64'),
#  'HitPointOnObject_y_1': dtype('float64'),
#  'HitPointOnObject_z_1': dtype('float64'),
#  'distanceToPlayer_1': dtype('float64'),
#  'ObjectName_2': dtype('O'),
#  'HitObjectPosition_x_2': dtype('float64'),
#  'HitObjectPosition_y_2': dtype('float64'),
#  'HitObjectPosition_z_2': dtype('float64'),
#  'HitPointOnObject_x_2': dtype('float64'),
#  'HitPointOnObject_y_2': dtype('float64'),
#  'HitPointOnObject_z_2': dtype('float64'),
#  'distanceToPlayer_2': dtype('float64'),
#  'ObjectName_3': dtype('O'),
#  'HitObjectPosition_x_3': dtype('float64'),
#  'HitObjectPosition_y_3': dtype('float64'),
#  'HitObjectPosition_z_3': dtype('float64'),
#  'HitPointOnObject_x_3': dtype('float64'),
#  'HitPointOnObject_y_3': dtype('float64'),
#  'HitPointOnObject_z_3': dtype('float64'),
#  'distanceToPlayer_3': dtype('float64'),
#  'ObjectName_4': dtype('O'),
#  'HitObjectPosition_x_4': dtype('float64'),
#  'HitObjectPosition_y_4': dtype('float64'),
#  'HitObjectPosition_z_4': dtype('float64'),
#  'HitPointOnObject_x_4': dtype('float64'),
#  'HitPointOnObject_y_4': dtype('float64'),
#  'HitPointOnObject_z_4': dtype('float64'),
#  'distanceToPlayer_4': dtype('float64')}

# def all_hit_objects(uid, time, hitObjectList, posX, posY, posZ):
#     global column_names, max_distance
# 
#     # If no hit objects exist, return an empty dataframe
#     if len(hitObjectList) == 0:
#         return pd.DataFrame()
#     
#     # Create a dataframe from the hit object list JSON, if it is of type string, evaluate first
#     hitObjectList = literal_eval(hitObjectList) if isinstance(hitObjectList, str) else hitObjectList
#     allHitObjectDataFrame = pd.concat([pd.DataFrame(columns=['uid', 'UnixTimeStamp']), pd.DataFrame([flatten_dict(d) for d in hitObjectList])], axis=1)
#     allHitObjectDataFrame['uid'] = uid # add column of player UID for later merge back
#     allHitObjectDataFrame['UnixTimeStamp'] = time # add column of player time for later merge back
#     
#     # Calculate the distance to the player for each hit object, if JSON is empty, return max distance
#     allHitObjectDataFrame['distanceToPlayer'] = allHitObjectDataFrame.apply(lambda row: distance(posX, posY, posZ, row['HitObjectPosition_x'], row['HitObjectPosition_y'], row['HitObjectPosition_z']), axis=1) if len(hitObjectList) != 0 else pd.Series([max_distance])
#     
#     return allHitObjectDataFrame
# 
# def get_closest_hit_objects(uid, time, allHitObjectDataFrame, posX, posY, posZ):
#     global column_names, max_distance, number_of_closest_hit_objects
#     # Create the hit objects helper dataframe with known column names
#     clostestHitObjectsDF = pd.DataFrame(columns=column_names)
# 
#     # Create start of the dictionary row being created by this function
#     clostestHitObjectsDict = {}
#     clostestHitObjectsDict['uid'] = uid # add column of player UID for later merge back
#     clostestHitObjectsDict['UnixTimeStamp'] = time # add column of player time for later merge back
#     
#     # Sort dataframe by distance from player and only select top 5 rows
#     top5HitObjectDataFrame = allHitObjectDataFrame.drop(['uid', 'UnixTimeStamp'], axis=1).sort_values(by=['distanceToPlayer'], ascending=True).head(number_of_closest_hit_objects).reset_index(drop=True)
#     
#     # Create returning dataframe
#     for i, row in top5HitObjectDataFrame.iterrows():
#         for col in top5HitObjectDataFrame.columns:
#             clostestHitObjectsDict[f"{col}_{i}"] = row[col]
#     
#     clostestHitObjectsDF = pd.concat([clostestHitObjectsDF, pd.DataFrame([clostestHitObjectsDict])], ignore_index=True)
#     
#     return clostestHitObjectsDF

def closest_objects(uid, time, hitObjectList, posX, posY, posZ):
    """Summarise the hit objects of one sample as a single wide row.

    Parameters:
        uid: participant id, copied into the result for merging back later.
        time: timestamp of the sample, copied into the result as 'UnixTimeStamp'.
        hitObjectList: list of (nested) hit-object dictionaries, or the string
            representation of such a list. None, NaN, '' and empty lists are
            all treated as "no hit objects".
        posX, posY, posZ: player position used for the distance calculation.

    Returns:
        (closest, all_hits):
        closest  - one-row DataFrame with 'uid', 'UnixTimeStamp' and, for the
                   number_of_closest_hit_objects nearest distinct objects, their
                   columns suffixed with the rank ('ObjectName_0', ...).
        all_hits - DataFrame with one row per hit object including both
                   distance columns; an empty DataFrame when there are none.

    Side effect: the global column_names is replaced by the columns of the
    returned wide row, so later calls start from the widest layout seen so far.
    NOTE(review): this global is updated without synchronisation; concurrent
    callers may interleave updates -- confirm that this is acceptable.
    """
    global column_names
    # Create the hit objects helper dataframe with known column names
    clostestHitObjectsDF = pd.DataFrame(columns=column_names)

    # Create start of the dictionary row being created by this function
    clostestHitObjectsDict = {
        'uid': uid,              # player UID for later merge back
        'UnixTimeStamp': time,   # player time for later merge back
    }

    # Normalise the input BEFORE testing for emptiness: a serialised empty list
    # ("[]") has a non-zero string length, and a missing cell is NaN/None
    if hitObjectList is None or (isinstance(hitObjectList, float) and math.isnan(hitObjectList)):
        hitObjectList = []
    elif isinstance(hitObjectList, str):
        hitObjectList = literal_eval(hitObjectList) if hitObjectList.strip() else []

    # If no hit objects exist, return only the uid/time row and an empty dataframe
    if len(hitObjectList) == 0:
        return pd.concat([clostestHitObjectsDF, pd.DataFrame([clostestHitObjectsDict])], ignore_index=True), pd.DataFrame()

    # Create a dataframe from the hit object list with uid and time as leading columns
    allHitObjectDataFrame = pd.DataFrame([flatten_dict(d) for d in hitObjectList])
    allHitObjectDataFrame.insert(0, 'uid', uid)
    allHitObjectDataFrame.insert(1, 'UnixTimeStamp', time)

    # Distance from the player to each hit object's position
    allHitObjectDataFrame['distanceToPlayer'] = [
        distance(posX, posY, posZ, objectX, objectY, objectZ).iloc[0]
        for objectX, objectY, objectZ in zip(
            allHitObjectDataFrame['HitObjectPosition_x'],
            allHitObjectDataFrame['HitObjectPosition_y'],
            allHitObjectDataFrame['HitObjectPosition_z'],
        )
    ]

    # Distance from the player to each hit point, to be able to choose only the closest one,
    # e.g., in case of Terrain multiple hits may happen per frame
    allHitObjectDataFrame['hitPointDistanceToPlayer'] = [
        distance(posX, posY, posZ, pointX, pointY, pointZ).iloc[0]
        for pointX, pointY, pointZ in zip(
            allHitObjectDataFrame['HitPointOnObject_x'],
            allHitObjectDataFrame['HitPointOnObject_y'],
            allHitObjectDataFrame['HitPointOnObject_z'],
        )
    ]

    # Sort by distance from player, keep the nearest hit per object name and select the top rows
    top5HitObjectDataFrame = (
        allHitObjectDataFrame
        .drop(['uid', 'UnixTimeStamp'], axis=1)
        .sort_values(by=['distanceToPlayer', 'hitPointDistanceToPlayer'], ascending=True)
        .drop_duplicates(subset=['ObjectName'])
        .drop('hitPointDistanceToPlayer', axis=1)
        .head(number_of_closest_hit_objects)
        .reset_index(drop=True)
    )

    # Spread the selected rows over rank-suffixed columns of the single result row
    for i, row in top5HitObjectDataFrame.iterrows():
        for col in top5HitObjectDataFrame.columns:
            clostestHitObjectsDict[f"{col}_{i}"] = row[col]

    clostestHitObjectsDF = pd.concat([clostestHitObjectsDF, pd.DataFrame([clostestHitObjectsDict])], ignore_index=True)

    # Update global column names list for dynamic discovery of column names
    column_names = clostestHitObjectsDF.columns.tolist()

    return clostestHitObjectsDF, allHitObjectDataFrame

    # OLD CODE WITH UNSTACK THAT DID NOT WORK DUE TO MULTI-LEVEL INDICES
    #helperDF = hitObjectDataFrameTop5.unstack(level=-2).reset_index().rename(columns={0: '', 1: 'level_1'})
    #return helperDF.pivot(index=['uid', 'time'], columns=['level_0','level_1']).reset_index(drop=True)


In [20]:
# def preprocess_txt_files_multi_step(raw_folder_path, preprocessed_folder_path):
#     """
#     Reads all .txt files in the specified folder.
#     """
# 
#     # Get all participant uids in the given raw data folder
#     ids, _ = analyze_folder(raw_folder_path)
#     
#     # Run extraction per uid
#     for target_id in ids:
#         # Do not re-process uids that were already done
#         if os.path.isfile(os.path.join(preprocessed_folder_path + target_id + '.csv')):
#             continue
#         
#         # Get list of files for given uid
#         print(f"\nWorking on uid {target_id}...")
#         _, matching_files_list = analyze_folder(raw_folder_path, target_id)
#     
#         # Create output data frame and set most important columns to be the first ones
#         df = pd.DataFrame(columns=['uid', 'dataset', 'city_section'])
#         for txt_file in matching_files_list:
#             # Read file from disk
#             file_path = os.path.join(raw_folder_path, txt_file)
#             with open(file_path, "r") as file:
#                 data_list = json.load(file)
#             
#             # If data file is empty, continue to next file
#             if len(data_list) == 0:
#                 print(f"INFO: No data found in {txt_file}")
#                 continue
#                 
#             # Create a DataFrame from the flattened data
#             flat_data_list = [flatten_dict(d) for d in data_list]
#             df1 = pd.DataFrame(flat_data_list)
#             
#             # Append file name information to dataframe
#             parts = get_filename_as_parts(txt_file)
#             df1['uid'] = parts[0]
#             df1['dataset'] = parts[1]
#             df1['city_section'] = parts[2]
#             
#             # Merge into master datafram
#             df = pd.concat([df, df1], ignore_index=True)
#     
#         # Only continue if there was data found for a given uid
#         if len(df) > 0:
#             # Process Hit Objects JSON column -> top 5 HitObjects with distances
#             allHitObjectsDataFrame = pd.DataFrame()
#             for i, _ in df.iterrows():
#                 if i % 5000 == 0:
#                     print(f"Processed {i} out of {len(df.index)} hit objects...")
#                 all = all_hit_objects(df['uid'][i], df['UnixTimeStamp'][i], df['hitObjects'][i], df['HmdPosition_x'][i], df['HmdPosition_y'][i], df['HmdPosition_z'][i])
#                 allHitObjectsDataFrame = pd.concat([allHitObjectsDataFrame, all], ignore_index=True)
#                         
#             # Sort and save proprocessed data per UID
#             df.drop(['hitObjects'], axis=1, inplace=True)
#             df.sort_values(by=['uid', 'UnixTimeStamp'], inplace=True)
#             df.to_csv(os.path.join(preprocessed_folder_path, target_id + '.csv'), index=False)   
# 
#             allHitObjectsDataFrame.sort_values(by=['uid', 'UnixTimeStamp'], inplace=True)
#             allHitObjectsDataFrame.to_csv(os.path.join(preprocessed_folder_path, target_id + '_hitobjects.csv'), index=False)

In [62]:
num_threads = 30
# Number of raw files a participant needs for the drive to count as complete
expected_files_per_uid = 5
# Serialises updates of the shared exclusion table (and its CSV) across worker threads
_excluded_uids_lock = threading.Lock()


def _record_excluded_uid(excluded_uids, target_id, reason, preprocessed_folder_path):
    """Add (uid, reason) to the shared exclusion table in place and persist it to CSV."""
    with _excluded_uids_lock:
        known = ((excluded_uids['uid'] == target_id) & (excluded_uids['reason'] == reason)).any()
        if not known:
            # In-place append: every worker holds a reference to this same frame
            excluded_uids.loc[len(excluded_uids)] = pd.Series({'uid': target_id, 'reason': reason})
        os.makedirs(preprocessed_folder_path, exist_ok=True)
        excluded_uids.to_csv(os.path.join(preprocessed_folder_path, '_excluded_uids.csv'), index=False)


def threaded_processing(target_id, preprocessed_folder_path, excluded_uids, raw_folder=None):
    """Preprocess all raw files of one participant and write the result to disk.

    Parameters:
        target_id: participant uid to process.
        preprocessed_folder_path: output folder; receives '<uid>.csv' and
            'allHitObjects/<uid>_hitobjects.csv'.
        excluded_uids: DataFrame with columns 'uid' and 'reason', shared
            between workers; updated in place when the uid is excluded.
        raw_folder: folder holding the raw files; defaults to the
            notebook-level raw_folder_path.

    Returns:
        None. A uid is skipped when its output CSV already exists, and excluded
        (recorded in excluded_uids) when files are missing or empty.
    """
    if raw_folder is None:
        raw_folder = raw_folder_path

    # Do not re-process uids that were already done
    output_path = os.path.join(preprocessed_folder_path, target_id + '.csv')
    if os.path.isfile(output_path):
        return

    # Get list of files for given uid
    print(f"\nWorking on uid {target_id}...")
    _, matching_files_list = analyze_folder(raw_folder, target_id)

    # Do not process uid if not all expected files are present
    if len(matching_files_list) < expected_files_per_uid:
        _record_excluded_uid(excluded_uids, target_id, 'Incomplete drive', preprocessed_folder_path)
        print("INFO: Excluded due to incomplete drive")
        return

    file_frames = []
    for txt_file in matching_files_list:
        # Read file from disk
        with open(os.path.join(raw_folder, txt_file), "r") as file:
            data_list = json.load(file)

        # A single empty file excludes the whole uid
        if len(data_list) == 0:
            _record_excluded_uid(excluded_uids, target_id, 'Empty files found', preprocessed_folder_path)
            print(f"INFO: Excluded due to no data found in {txt_file}")
            return

        # Create a DataFrame from the flattened data
        file_frame = pd.DataFrame([flatten_dict(d) for d in data_list])

        # Append file name information to dataframe
        parts = get_filename_as_parts(txt_file)
        file_frame['uid'] = parts[0]
        file_frame['dataset'] = parts[1]
        file_frame['city_section'] = parts[2]
        file_frames.append(file_frame)

    # Only continue if there was data found for the given uid
    if not file_frames:
        return

    # Merge into one master dataframe with the most important columns first
    leading_columns = ['uid', 'dataset', 'city_section']
    df = pd.concat(file_frames, ignore_index=True)
    df = df[leading_columns + [c for c in df.columns if c not in leading_columns]]
    if df.empty:
        return

    # Process Hit Objects JSON column -> top HitObjects with distances.
    # Results are collected in lists and concatenated once; growing a frame
    # with concat inside the loop is quadratic in the number of rows.
    closest_rows = []
    all_hit_frames = []
    samples = zip(df['uid'], df['dataset'], df['UnixTimeStamp'], df['hitObjects'],
                  df['HmdPosition_x'], df['HmdPosition_y'], df['HmdPosition_z'])
    for i, (uid, dataset, timestamp, hit_objects, pos_x, pos_y, pos_z) in enumerate(samples):
        if i % 5000 == 0:
            print(f"Processed {i} out of {len(df.index)} hit objects...")
        closest, all_hits = closest_objects(uid, timestamp, hit_objects, pos_x, pos_y, pos_z)
        closest_rows.append(closest)
        if not all_hits.empty:
            # Tag each hit object with the dataset of the sample it belongs to
            all_hit_frames.append(all_hits.assign(dataset=dataset))

    topHitObjectsDataFrame = pd.concat(closest_rows, ignore_index=True)

    hit_leading_columns = ['uid', 'dataset', 'UnixTimeStamp']
    if all_hit_frames:
        allHitObjectsDataFrame = pd.concat(all_hit_frames, ignore_index=True)
        allHitObjectsDataFrame = allHitObjectsDataFrame[
            hit_leading_columns + [c for c in allHitObjectsDataFrame.columns if c not in hit_leading_columns]
        ]
    else:
        allHitObjectsDataFrame = pd.DataFrame(columns=hit_leading_columns)

    # Merge hit objects DF back to data DF and drop hitObjects column with JSON data
    df = df.set_index(['uid', 'UnixTimeStamp'])
    topHitObjectsDataFrame = topHitObjectsDataFrame.set_index(['uid', 'UnixTimeStamp'])
    df = df.join(topHitObjectsDataFrame, on=['uid', 'UnixTimeStamp'], how='left')
    df = df.drop(['hitObjects'], axis=1)

    # Sort and save preprocessed data per UID
    df = df.sort_values(by=['uid', 'UnixTimeStamp'])
    hit_objects_folder = os.path.join(preprocessed_folder_path, 'allHitObjects')
    os.makedirs(hit_objects_folder, exist_ok=True)
    df.to_csv(output_path, index=True)
    allHitObjectsDataFrame.to_csv(os.path.join(hit_objects_folder, target_id + '_hitobjects.csv'), index=False)


def preprocess_txt_files(raw_folder_path, preprocessed_folder_path):
    """
    Preprocess every participant found in the raw folder using a thread pool.

    Parameters:
        raw_folder_path: folder containing the raw .txt recordings.
        preprocessed_folder_path: folder receiving the per-uid CSV files and
            '_excluded_uids.csv'.

    NOTE(review): the per-row work is pandas/Python code, so threads may give
    little speed-up -- consider measuring against a process pool.
    """
    os.makedirs(preprocessed_folder_path, exist_ok=True)

    # Load previously excluded UIDs, or start a new table
    excluded_uids_path = os.path.join(preprocessed_folder_path, '_excluded_uids.csv')
    if os.path.isfile(excluded_uids_path):
        excluded_uids = pd.read_csv(excluded_uids_path)
    else:
        excluded_uids = pd.DataFrame(columns=['uid', 'reason'])

    # Get all participant uids in the given raw data folder
    ids, _ = analyze_folder(raw_folder_path)

    worker = partial(
        threaded_processing,
        preprocessed_folder_path=preprocessed_folder_path,
        excluded_uids=excluded_uids,
        raw_folder=raw_folder_path,
    )
    # map() blocks until every uid is done, so the pool is idle when the
    # context manager shuts it down
    with ThreadPool(num_threads) as pool:
        pool.map(worker, list(ids))
       

In [63]:
# def extracted_preprocessed_files_multi_step(preprocessed_folder_path, extracted_folder_path):
#     ids, _ = analyze_folder(preprocessed_folder_path, file_type='csv')
#     
#     for target_id in ids:
#         file_path = os.path.join(preprocessed_folder_path, target_id + '.csv')
#         with open(file_path, "r") as file:
#             uid_data = pd.read_csv(file)
# 
#         file_path = os.path.join(preprocessed_folder_path, target_id + '_hitobjects.csv')
#         with open(file_path, "r") as file:
#             uid_hit_objects = pd.read_csv(file)
#         
#         timestamps = uid_data['UnixTimeStamp'].unique()
#         for i, timestamp in enumerate(timestamps):
#             if i % 5000 == 0:
#                 print(f"Processed {i} out of {len(timestamps)} time stamps...")
#             player_pos = uid_data[(uid_data['uid'] == target_id) & (uid_data['UnixTimeStamp'] == timestamp)][['HmdPosition_x', 'HmdPosition_y','HmdPosition_z']]
#             topHitObjectsDataFrame = get_closest_hit_objects(target_id, timestamp, uid_hit_objects, player_pos['HmdPosition_x'], player_pos['HmdPosition_y'], player_pos['HmdPosition_z'])
#         
#         uid_data.set_index(['uid','UnixTimeStamp'], inplace=True)
#         topHitObjectsDataFrame.set_index(['uid','UnixTimeStamp'], inplace=True)
#         uid_data = uid_data.join(topHitObjectsDataFrame, on=['uid','UnixTimeStamp'], how='left')
#         
#         df.to_csv(os.path.join(extracted_folder_path, target_id + '.csv'), index=True)


In [64]:
# Preprocess all files in the given raw folder (currently only EyeTracking input data)
# Per-uid CSVs are written into processed_folder_path; uids whose CSV already exists there are skipped.
preprocess_txt_files(raw_folder_path, processed_folder_path)


Working on uid a5a31c4af00b4199ac4c8760bfaa469e...
Working on uid a57132e2aeba484e8433ae8af9b1abbc...

Working on uid 8ba8fe140022448490326ad0d25ef5a3...

Working on uid 7de8141926e0497396bddddf37a1b115...


Working on uid 240d3d0b36a34accb42f0d98d27fd744...

Working on uid 8f9b8786312349639e8811f451054df0...

Working on uid d5daacd8fad2490fa1f12001b725271b...

Working on uid e635375fe03d455384d473da39fd48e1...

Working on uid 6f36a58eb7d843a593a01191c53bc0d0...

Working on uid ff4288f304e74bbf93aa6508c7df8145...

Working on uid 8d73ff4eb38f4c01b24ffda0981d3f3e...

Working on uid f78cb865f76b4ce4879083715f112d91...

Working on uid 0956f0cca5f546d79a0cf4fbae23d496...

Working on uid 9b9220b8bf8c4e61adde8bc7571540ef...

Working on uid 27ee7a0bfc2840e38225f653232d1487...

Working on uid 1137044be4694bb1a6dc0b6185f8292b...

Working on uid 81957954cd71450cacc5fd738dd9ebd2...

Working on uid cd9a3616783e49b787244cd621e9cdb9...

Working on uid 6ad4d759b7eb4ff392f6db00aec7678f...

Working on 

Process SpawnPoolWorker-48:
Process SpawnPoolWorker-49:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/pool.py", line 114, in worker
    task = get()
           ^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/queues.py", line 364, in get
    with self._rlock:
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
           ^^^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt
  File "/Library/Frameworks/Python.framework/Versi

KeyboardInterrupt: 

In [32]:
# Sample file processing
# Build the path from the configured raw data folder rather than a
# machine-specific absolute path, so the cell runs on any checkout
file_path = os.path.join(raw_folder_path, '66bdb97a653b48f98c02d5e764089c00_EyeTracking_Westbrueck.txt')  # Replace with the file you want to inspect

with open(file_path, 'r') as f:
    data_list = json.load(f)
# One row per recorded sample, nested fields flattened into columns
df = pd.DataFrame([flatten_dict(d) for d in data_list])

In [33]:
# Processing of a single file's hit objects using iteration
# Collect the one-row results first and concatenate once; concatenating inside
# the loop copies the growing frame on every iteration
closest_rows = [
    closest_objects('john', timestamp, hit_objects, pos_x, pos_y, pos_z)[0]
    for timestamp, hit_objects, pos_x, pos_y, pos_z in zip(
        df['UnixTimeStamp'], df['hitObjects'],
        df['HmdPosition_x'], df['HmdPosition_y'], df['HmdPosition_z'],
    )
]
allHitObjectsDataFrame = pd.concat(closest_rows, ignore_index=True) if closest_rows else pd.DataFrame()

allHitObjectsDataFrame

Unnamed: 0,uid,UnixTimeStamp,ObjectName_0,HitObjectPosition_x_0,HitObjectPosition_y_0,HitObjectPosition_z_0,HitPointOnObject_x_0,HitPointOnObject_y_0,HitPointOnObject_z_0,distanceToPlayer_0,...,HitPointOnObject_z_3,distanceToPlayer_3,ObjectName_4,HitObjectPosition_x_4,HitObjectPosition_y_4,HitObjectPosition_z_4,HitPointOnObject_x_4,HitPointOnObject_y_4,HitPointOnObject_z_4,distanceToPlayer_4
0,john,1.601117e+09,CarBody,536.973389,220.836456,1457.509766,538.671204,222.044189,1458.535645,6.952474,...,1127.379761,777.120193,Westbrueck_terrain,-502.709839,-16.4,949.637451,454.382202,221.526688,1417.338989,1188.116164
1,john,1.601117e+09,mons_LOD,-32.719116,210.965057,1185.078613,-26.976925,221.138657,1179.478271,631.687633,...,1412.311523,1181.482160,,,,,,,,
2,john,1.601117e+09,mons_LOD,-32.719116,210.965057,1185.078613,-29.614956,221.912354,1177.810791,630.870387,...,1412.132690,1180.676586,,,,,,,,
3,john,1.601117e+09,mons_LOD,-32.719116,210.965057,1185.078613,-25.666519,221.689301,1180.640381,630.463486,...,1412.279297,1180.275523,,,,,,,,
4,john,1.601117e+09,mons_LOD,-32.719116,210.965057,1185.078613,-24.435617,222.008591,1181.732056,630.058707,...,1412.311157,1179.876532,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6670,john,1.601117e+09,Speed Limit 70,-589.800049,182.723572,1053.971558,-589.300049,182.827957,1053.747803,103.553406,...,,,,,,,,,,
6671,john,1.601117e+09,Speed Limit 70,-589.800049,182.723572,1053.971558,-589.300049,182.761673,1054.104248,103.278730,...,,,,,,,,,,
6672,john,1.601117e+09,Motor Vehicles Only,-567.149902,182.989990,1056.198608,-567.332642,183.976868,1055.678223,80.290445,...,1045.923096,2419.228755,,,,,,,,
6673,john,1.601117e+09,Motor Vehicles Only,-567.149902,182.989990,1056.198608,-566.624634,183.881180,1055.965820,80.018746,...,,,,,,,,,,


In [ ]:
# Processing of a single file's hit objects using dataframe merge
# THIS FOR SOME REASON DOES NOT WORK, THROWS 'ValueError: If using all scalar values, you must pass an index'
#allHitObjectsDataFrame = df.apply(lambda row: closest_objects('1', row['UnixTimeStamp'], row['hitObjects'], row['HmdPosition_x'], row['HmdPosition_y'], row['HmdPosition_z']), axis=1, result_type='expand')
# 
# allHitObjectsDataFrame

In [ ]:
# Processing of a single file's hit objects using dask merge
# ddf = dd.from_pandas(df, npartitions=1)
# ddf.repartition(partition_size="100MB")
# 
# allHitObjectsDask = ddf.apply(lambda row: closest_objects('1', row['UnixTimeStamp'], row['hitObjects'], row['HmdPosition_x'], row['HmdPosition_y'], row['HmdPosition_z']), axis=1, meta=column_dict)
# 
# allHitObjectsDask.compute()