# LoopAR project:
## 1. Data preprocessing

In [70]:
import warnings

import pathos.pools

warnings.simplefilter(action='ignore', category=FutureWarning)

import json
import itertools
import os
import math
from functools import partial
from ast import literal_eval
import pandas as pd
import dask.dataframe as dd
# import dataframe_image as dfi
import numpy as np
from numpy import dtype

import seaborn as sns
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
from matplotlib import patches
from scipy.signal import find_peaks
from IPython.display import display
from tqdm.notebook import tqdm

## 1.1 Load data

In [71]:
def get_filename_as_parts(filename):
    # Split out parts of file name without .txt ending
    return filename.split(".")[0].split("_")
    
def analyze_folder(folder_path, target_id = '', file_type='txt'):
    """
    Given a folder path, identifies all .txt files, splits their names by '_',
    and returns a set of unique IDs extracted from the first part of each file name.
    """
    unique_ids = set() # Initialize an empty set to store unique IDs
    files_of_interest = [f for f in os.listdir(folder_path) if f.lower().endswith('.'+file_type)]
    matching_files = []
    
    # List all .txt files in the specified folder
    if target_id != '':
        print("Matching files:")
    for filename in files_of_interest:
        parts = get_filename_as_parts(filename)
        if len(parts) > 0:
            unique_ids.add(parts[0])  # Add the first part to the set
        if target_id in parts:
            print(parts)
            matching_files.append(filename)
        
    return unique_ids, matching_files if target_id != '' else files_of_interest

In [72]:
# Set folders
raw_folder_path = "Data/0_Raw/Eyetracking/"  # Replace with your actual folder path
processed_folder_path = "Data/1_Preprocessed/Eyetracking/"
# extracted_folder_path = "Data/2_Extracted/"

In [73]:
# Example usage:
ids, matching_files_list = analyze_folder(raw_folder_path)

print(f"Total unique IDs found in {raw_folder_path} ending with .txt files: {len(ids)} with a total of {len(matching_files_list)} files")
list(ids)[0]

Total unique IDs found in Data/0_Raw/Eyetracking/ ending with .txt files: 255 with a total of 1106 files


'ff0cb5fdc1884204937eca64b52ee041'

In [74]:
target_id = '39e5235b13274feb88430f08f3cd5369'  # Replace with the desired ID
_, matching_files_list = analyze_folder(raw_folder_path, target_id)

matching_files_list

Matching files:
['39e5235b13274feb88430f08f3cd5369', 'EyeTracking', 'MountainRoad']
['39e5235b13274feb88430f08f3cd5369', 'EyeTracking', 'Westbrueck']


['39e5235b13274feb88430f08f3cd5369_EyeTracking_MountainRoad.txt',
 '39e5235b13274feb88430f08f3cd5369_EyeTracking_Westbrueck.txt']

In [75]:
# Flatten the nested dictionaries
def flatten_dict(d, parent_key='', sep='_'):
    items = []
    for k, v in d.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten_dict(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)

In [76]:
# Set the default distance between player and object if none can be calculated
max_distance = 9999999
number_of_closest_hit_objects = 5

# Calculates the distance between the player and an object
def distance(playerX, playerY, playerZ, objectX, objectY, objectZ):
    global max_distance
    if (objectX is None or objectY is None or objectZ is None or 
        playerX is None or playerY is None or playerZ is None):
        return pd.Series([max_distance])
    else:
        return pd.Series([math.sqrt((objectX - playerX)**2 + (objectY - playerY)**2 + (objectZ - playerZ)**2)])

column_names = []
# Initialize column names -- was a try for using apply with Dask, but does not work
# column_names = ['uid', 'UnixTimeStamp', 'ObjectName_0', 'HitObjectPosition_x_0', 'HitObjectPosition_y_0', 'HitObjectPosition_z_0', 'HitPointOnObject_x_0', 'HitPointOnObject_y_0', 'HitPointOnObject_z_0', 'distanceToPlayer_0', 'ObjectName_1', 'HitObjectPosition_x_1', 'HitObjectPosition_y_1', 'HitObjectPosition_z_1', 'HitPointOnObject_x_1', 'HitPointOnObject_y_1', 'HitPointOnObject_z_1', 'distanceToPlayer_1', 'ObjectName_2', 'HitObjectPosition_x_2', 'HitObjectPosition_y_2', 'HitObjectPosition_z_2', 'HitPointOnObject_x_2', 'HitPointOnObject_y_2', 'HitPointOnObject_z_2', 'distanceToPlayer_2', 'ObjectName_3', 'HitObjectPosition_x_3', 'HitObjectPosition_y_3', 'HitObjectPosition_z_3', 'HitPointOnObject_x_3', 'HitPointOnObject_y_3', 'HitPointOnObject_z_3', 'distanceToPlayer_3', 'ObjectName_4', 'HitObjectPosition_x_4', 'HitObjectPosition_y_4', 'HitObjectPosition_z_4', 'HitPointOnObject_x_4', 'HitPointOnObject_y_4', 'HitPointOnObject_z_4', 'distanceToPlayer_4']
# 
# column_dict = {'uid': dtype('O'),
#  'UnixTimeStamp': dtype('float64'),
#  'ObjectName_0': dtype('O'),
#  'HitObjectPosition_x_0': dtype('float64'),
#  'HitObjectPosition_y_0': dtype('float64'),
#  'HitObjectPosition_z_0': dtype('float64'),
#  'HitPointOnObject_x_0': dtype('float64'),
#  'HitPointOnObject_y_0': dtype('float64'),
#  'HitPointOnObject_z_0': dtype('float64'),
#  'distanceToPlayer_0': dtype('float64'),
#  'ObjectName_1': dtype('O'),
#  'HitObjectPosition_x_1': dtype('float64'),
#  'HitObjectPosition_y_1': dtype('float64'),
#  'HitObjectPosition_z_1': dtype('float64'),
#  'HitPointOnObject_x_1': dtype('float64'),
#  'HitPointOnObject_y_1': dtype('float64'),
#  'HitPointOnObject_z_1': dtype('float64'),
#  'distanceToPlayer_1': dtype('float64'),
#  'ObjectName_2': dtype('O'),
#  'HitObjectPosition_x_2': dtype('float64'),
#  'HitObjectPosition_y_2': dtype('float64'),
#  'HitObjectPosition_z_2': dtype('float64'),
#  'HitPointOnObject_x_2': dtype('float64'),
#  'HitPointOnObject_y_2': dtype('float64'),
#  'HitPointOnObject_z_2': dtype('float64'),
#  'distanceToPlayer_2': dtype('float64'),
#  'ObjectName_3': dtype('O'),
#  'HitObjectPosition_x_3': dtype('float64'),
#  'HitObjectPosition_y_3': dtype('float64'),
#  'HitObjectPosition_z_3': dtype('float64'),
#  'HitPointOnObject_x_3': dtype('float64'),
#  'HitPointOnObject_y_3': dtype('float64'),
#  'HitPointOnObject_z_3': dtype('float64'),
#  'distanceToPlayer_3': dtype('float64'),
#  'ObjectName_4': dtype('O'),
#  'HitObjectPosition_x_4': dtype('float64'),
#  'HitObjectPosition_y_4': dtype('float64'),
#  'HitObjectPosition_z_4': dtype('float64'),
#  'HitPointOnObject_x_4': dtype('float64'),
#  'HitPointOnObject_y_4': dtype('float64'),
#  'HitPointOnObject_z_4': dtype('float64'),
#  'distanceToPlayer_4': dtype('float64')}

# def all_hit_objects(uid, time, hitObjectList, posX, posY, posZ):
#     global column_names, max_distance
# 
#     # If no hit objects exist, return an empty dataframe
#     if len(hitObjectList) == 0:
#         return pd.DataFrame()
#     
#     # Create a dataframe from the hit object list JSON, if it is of type string, evaluate first
#     hitObjectList = literal_eval(hitObjectList) if isinstance(hitObjectList, str) else hitObjectList
#     allHitObjectDataFrame = pd.concat([pd.DataFrame(columns=['uid', 'UnixTimeStamp']), pd.DataFrame([flatten_dict(d) for d in hitObjectList])], axis=1)
#     allHitObjectDataFrame['uid'] = uid # add column of player UID for later merge back
#     allHitObjectDataFrame['UnixTimeStamp'] = time # add column of player time for later merge back
#     
#     # Calculate the distance to the player for each hit object, if JSON is empty, return max distance
#     allHitObjectDataFrame['distanceToPlayer'] = allHitObjectDataFrame.apply(lambda row: distance(posX, posY, posZ, row['HitObjectPosition_x'], row['HitObjectPosition_y'], row['HitObjectPosition_z']), axis=1) if len(hitObjectList) != 0 else pd.Series([max_distance])
#     
#     return allHitObjectDataFrame
# 
# def get_closest_hit_objects(uid, time, allHitObjectDataFrame, posX, posY, posZ):
#     global column_names, max_distance, number_of_closest_hit_objects
#     # Create the hit objects helper dataframe with known column names
#     clostestHitObjectsDF = pd.DataFrame(columns=column_names)
# 
#     # Create start of the dictionary row being created by this function
#     clostestHitObjectsDict = {}
#     clostestHitObjectsDict['uid'] = uid # add column of player UID for later merge back
#     clostestHitObjectsDict['UnixTimeStamp'] = time # add column of player time for later merge back
#     
#     # Sort dataframe by distance from player and only select top 5 rows
#     top5HitObjectDataFrame = allHitObjectDataFrame.drop(['uid', 'UnixTimeStamp'], axis=1).sort_values(by=['distanceToPlayer'], ascending=True).head(number_of_closest_hit_objects).reset_index(drop=True)
#     
#     # Create returning dataframe
#     for i, row in top5HitObjectDataFrame.iterrows():
#         for col in top5HitObjectDataFrame.columns:
#             clostestHitObjectsDict[f"{col}_{i}"] = row[col]
#     
#     clostestHitObjectsDF = pd.concat([clostestHitObjectsDF, pd.DataFrame([clostestHitObjectsDict])], ignore_index=True)
#     
#     return clostestHitObjectsDF

def closest_objects(uid, time, hitObjectList, posX, posY, posZ):
    global column_names, max_distance, number_of_closest_hit_objects
    # Create the hit objects helper dataframe with known column names
    clostestHitObjectsDF = pd.DataFrame(columns=column_names)

    # Create start of the dictionary row being created by this function
    clostestHitObjectsDict = {}
    clostestHitObjectsDict['uid'] = uid # add column of player UID for later merge back
    clostestHitObjectsDict['UnixTimeStamp'] = time # add column of player time for later merge back

    # If no hit objects exist, return an empty dataframe
    if len(hitObjectList) == 0:
        return pd.concat([clostestHitObjectsDF, pd.DataFrame([clostestHitObjectsDict])], ignore_index=True), pd.DataFrame()
    
    # Create a dataframe from the hit object list JSON, if it is of type string, evaluate first
    hitObjectList = literal_eval(hitObjectList) if isinstance(hitObjectList, str) else hitObjectList
    allHitObjectDataFrame = pd.concat([pd.DataFrame(columns=['uid', 'UnixTimeStamp']), pd.DataFrame([flatten_dict(d) for d in hitObjectList])], axis=1)
    allHitObjectDataFrame['uid'] = uid # add column of player UID for later merge back
    allHitObjectDataFrame['UnixTimeStamp'] = time # add column of player time for later merge back
    
    # Calculate the distance to the player for each hit object, if JSON is empty, return max distance
    allHitObjectDataFrame['distanceToPlayer'] = allHitObjectDataFrame.apply(lambda row: distance(posX, posY, posZ, row['HitObjectPosition_x'], row['HitObjectPosition_y'], row['HitObjectPosition_z']), axis=1) if len(hitObjectList) != 0 else pd.Series([max_distance])

    # Calculate all hit point on object to player distances to be able to chose only the closest one, e.g., in case of Terrain multiple hits may happen per frame
    allHitObjectDataFrame['hitPointDistanceToPlayer'] = allHitObjectDataFrame.apply(lambda row: distance(posX, posY, posZ, row['HitPointOnObject_x'], row['HitPointOnObject_y'], row['HitPointOnObject_z']), axis=1) if len(hitObjectList) != 0 else pd.Series([max_distance])
    
    # Sort dataframe by distance from player and only select top 5 rows
    top5HitObjectDataFrame = allHitObjectDataFrame.drop(['uid', 'UnixTimeStamp'], axis=1).sort_values(by=['distanceToPlayer', 'hitPointDistanceToPlayer'], ascending=True).drop_duplicates(subset=['ObjectName']).drop('hitPointDistanceToPlayer', axis=1).head(number_of_closest_hit_objects).reset_index(drop=True)
    
    # Create returning dataframe
    for i, row in top5HitObjectDataFrame.iterrows():
        for col in top5HitObjectDataFrame.columns:
            clostestHitObjectsDict[f"{col}_{i}"] = row[col]
    
    clostestHitObjectsDF = pd.concat([clostestHitObjectsDF, pd.DataFrame([clostestHitObjectsDict])], ignore_index=True)
    
    # Update global column names list for dynamic discovery of column names
    column_names = clostestHitObjectsDF.columns.tolist()
    
    return clostestHitObjectsDF, allHitObjectDataFrame

    # OLD CODE WITH UNSTACK THAT DID NOT WORK DUE TO MULTI-LEVEL INDICES
    #helperDF = hitObjectDataFrameTop5.unstack(level=-2).reset_index().rename(columns={0: '', 1: 'level_1'})
    #return helperDF.pivot(index=['uid', 'time'], columns=['level_0','level_1']).reset_index(drop=True)


In [77]:
# def preprocess_txt_files_multi_step(raw_folder_path, preprocessed_folder_path):
#     """
#     Reads all .txt files in the specified folder.
#     """
# 
#     # Get all participant uids in the given raw data folder
#     ids, _ = analyze_folder(raw_folder_path)
#     
#     # Run extraction per uid
#     for target_id in ids:
#         # Do not re-process uids that were already done
#         if os.path.isfile(os.path.join(preprocessed_folder_path + target_id + '.csv')):
#             continue
#         
#         # Get list of files for given uid
#         print(f"\nWorking on uid {target_id}...")
#         _, matching_files_list = analyze_folder(raw_folder_path, target_id)
#     
#         # Create output data frame and set most important columns to be the first ones
#         df = pd.DataFrame(columns=['uid', 'dataset', 'city_section'])
#         for txt_file in matching_files_list:
#             # Read file from disk
#             file_path = os.path.join(raw_folder_path, txt_file)
#             with open(file_path, "r") as file:
#                 data_list = json.load(file)
#             
#             # If data file is empty, continue to next file
#             if len(data_list) == 0:
#                 print(f"INFO: No data found in {txt_file}")
#                 continue
#                 
#             # Create a DataFrame from the flattened data
#             flat_data_list = [flatten_dict(d) for d in data_list]
#             df1 = pd.DataFrame(flat_data_list)
#             
#             # Append file name information to dataframe
#             parts = get_filename_as_parts(txt_file)
#             df1['uid'] = parts[0]
#             df1['dataset'] = parts[1]
#             df1['city_section'] = parts[2]
#             
#             # Merge into master datafram
#             df = pd.concat([df, df1], ignore_index=True)
#     
#         # Only continue if there was data found for a given uid
#         if len(df) > 0:
#             # Process Hit Objects JSON column -> top 5 HitObjects with distances
#             allHitObjectsDataFrame = pd.DataFrame()
#             for i, _ in df.iterrows():
#                 if i % 5000 == 0:
#                     print(f"Processed {i} out of {len(df.index)} hit objects...")
#                 all = all_hit_objects(df['uid'][i], df['UnixTimeStamp'][i], df['hitObjects'][i], df['HmdPosition_x'][i], df['HmdPosition_y'][i], df['HmdPosition_z'][i])
#                 allHitObjectsDataFrame = pd.concat([allHitObjectsDataFrame, all], ignore_index=True)
#                         
#             # Sort and save proprocessed data per UID
#             df.drop(['hitObjects'], axis=1, inplace=True)
#             df.sort_values(by=['uid', 'UnixTimeStamp'], inplace=True)
#             df.to_csv(os.path.join(preprocessed_folder_path, target_id + '.csv'), index=False)   
# 
#             allHitObjectsDataFrame.sort_values(by=['uid', 'UnixTimeStamp'], inplace=True)
#             allHitObjectsDataFrame.to_csv(os.path.join(preprocessed_folder_path, target_id + '_hitobjects.csv'), index=False)

In [155]:
def threaded_processing_eyetracking(target_id, preprocessed_folder_path, excluded_uids):
    # Do not re-process uids that were already done
    if os.path.isfile(os.path.join(preprocessed_folder_path + target_id + '.csv')):
        print(f'File {target_id}.csv already exists. Skipping...')
        return excluded_uids
    
    # Get list of files for given uid
    print(f"\nWorking on uid {target_id}...")
    print(f"...using output folder {preprocessed_folder_path}")
    _, matching_files_list = analyze_folder(raw_folder_path, target_id)
    
    # Do not process uid if not all 5 files are present
    if len(matching_files_list) < 5:
        excluded_uids.append([target_id, 'Incomplete drive'])
        print(f"INFO: Excluded due to incomplete drive")
        return excluded_uids

    # Create output data frame and set most important columns to be the first ones
    df = pd.DataFrame(columns=['uid', 'dataset', 'city_section'])
    stop_flag = False
    for txt_file in matching_files_list:
        if stop_flag:
            continue
            
        # Read file from disk
        file_path = os.path.join(raw_folder_path, txt_file)
        with open(file_path, "r") as file:
            data_list = json.load(file)
        
        # If data file is empty, continue to next file
        if len(data_list) == 0:            
            excluded_uids.append([target_id, 'Empty files found'])
            print(f"INFO: Excluded due to no data found in {txt_file}")
            stop_flag = True
            continue
            
        # Create a DataFrame from the flattened data
        flat_data_list = [flatten_dict(d) for d in data_list]
        df1 = pd.DataFrame(flat_data_list)
        
        # Append file name information to dataframe
        parts = get_filename_as_parts(txt_file)
        df1['uid'] = parts[0]
        df1['dataset'] = parts[1]
        df1['city_section'] = parts[2]
        
        # Merge into master datafram
        df = pd.concat([df, df1], ignore_index=True)

    # Only continue if there was data found for a given uid
    if len(df) > 0 and not stop_flag:
        # Process Hit Objects JSON column -> top 5 HitObjects with distances
        allHitObjectsDataFrame = pd.DataFrame(columns=['uid', 'dataset', 'UnixTimeStamp'])
        allHitObjectsDataFrame['uid'] = parts[0]
        allHitObjectsDataFrame['dataset'] = parts[1]

        topHitObjectsDataFrame = pd.DataFrame()
        for i, _ in df.iterrows():
            if i % 5000 == 0:
                print(f"Processed {i} out of {len(df.index)} hit objects...")
            co, all = closest_objects(df['uid'][i], df['UnixTimeStamp'][i], df['hitObjects'][i], df['HmdPosition_x'][i], df['HmdPosition_y'][i], df['HmdPosition_z'][i])
            topHitObjectsDataFrame = pd.concat([topHitObjectsDataFrame, co], ignore_index=True)
            allHitObjectsDataFrame = pd.concat([allHitObjectsDataFrame, all], ignore_index=True)
        
        # Merge hit objects DF back to data DF and drop hitObjects column with JSON data
        df.set_index(['uid','UnixTimeStamp'], inplace=True)
        topHitObjectsDataFrame.set_index(['uid','UnixTimeStamp'], inplace=True)
        df = df.join(topHitObjectsDataFrame, on=['uid','UnixTimeStamp'], how='left')
        df.drop(['hitObjects'], axis=1, inplace=True)
        
        # Sort and save preprocessed data per UID
        df.sort_values(by=['uid', 'UnixTimeStamp'], inplace=True)
        df.to_csv(os.path.join(preprocessed_folder_path, target_id + '.csv'), index=True)
        allHitObjectsDataFrame.to_csv(os.path.join(preprocessed_folder_path, 'allHitObjects', target_id + '_hitobjects.csv'), index=False)
    
    return excluded_uids


In [176]:
def threaded_processing_input(target_id, preprocessed_folder_path, excluded_uids):
    # Do not re-process uids that were already done
    if os.path.isfile(os.path.join(preprocessed_folder_path + target_id + '.csv')):
        print(f'File {target_id}.csv already exists. Skipping...')
        return excluded_uids
    
    # Get list of files for given uid
    print(f"\nWorking on uid {target_id}...")
    print(f"...using output folder {preprocessed_folder_path}")
    _, matching_files_list = analyze_folder(raw_folder_path, target_id)

    # Create output data frame and set most important columns to be the first ones
    df = pd.DataFrame(columns=['uid', 'dataset', 'city_section'])
    stop_flag = False
    for txt_file in matching_files_list:
        if stop_flag:
            continue
            
        # Read file from disk
        file_path = os.path.join(raw_folder_path, txt_file)
        with open(file_path, "r") as file:
            data_list = json.load(file)
        
        # Create a DataFrame from the flattened data
        # flat_data_list = [flatten_dict(d) for d in data_list]
        # df1 = pd.DataFrame(flat_data_list)
        # Assuming the nested dictionary is in the 'acList' column
        df1 = pd.json_normalize(data_list,max_level=1)
        
        # Append file name information to dataframe
        parts = get_filename_as_parts(txt_file)
        df1['uid'] = parts[0]
        df1['dataset'] = parts[1]
        df1['city_section'] = parts[2]
        
        # Merge into master datafram
        df = pd.concat([df, df1], ignore_index=True)

        # Sort and save preprocessed data per UID
        df.sort_values(by=['uid', 'TimeStamp'], inplace=True)
        df.to_csv(os.path.join(preprocessed_folder_path, target_id + '.csv'), index=False)
    
    return excluded_uids

In [174]:
def preprocess_txt_files(raw_folder_path, preprocessed_folder_path, type, multi_processing = False):
    """
    Reads all .txt files in the specified folder.
    :param raw_folder_path:
    :param preprocessed_folder_path:
    :param multi_processing: Put false to save all excluded UIDs; in multiprocessing = True it will not work
    """
    
    # Save excluded UIDs
    excluded_uids_path = os.path.join(preprocessed_folder_path, '..', '_excluded_uids.csv')
    print(f"...using EXCLUDED UID output folder {excluded_uids_path}")
    excluded_uids = pd.DataFrame(columns=['uid', 'reason']) if not os.path.isfile(excluded_uids_path) else pd.read_csv(excluded_uids_path)
    # Get all participant uids in the given raw data folder
    ids, _ = analyze_folder(raw_folder_path)
    ids = [id for id in ids if id not in excluded_uids['uid'].values.tolist()]
    excluded_uids = excluded_uids.values.tolist()
    
    if multi_processing:
        # Create a process pool, using the pathos fork of multiprocessing
        pool = pathos.pools.ProcessPool()
        # Use partial if the resulting function has more than one input variable, here: processing path, excluded UIDs
        if type == 'eyetracking':
            function = partial(threaded_processing_eyetracking, preprocessed_folder_path=preprocessed_folder_path, excluded_uids=excluded_uids)
        elif type == 'input':
            function = partial(threaded_processing_input, preprocessed_folder_path=preprocessed_folder_path, excluded_uids=excluded_uids)
        else:
            print('Nothing to do. Choose a type.')
        # Execute pool across list of independent inputs
        pool.map(function, list(ids))
    else:
        for id in ids:
            if type == 'eyetracking':
                excluded_uids = threaded_processing_eyetracking(id, preprocessed_folder_path, excluded_uids)
            elif type == 'input':
                excluded_uids = threaded_processing_input(id, preprocessed_folder_path, excluded_uids)
            else:
                print('Nothing to do. Choose a type.')
    
    excluded_uids = pd.DataFrame(excluded_uids, columns=['uid', 'reason'])        
    excluded_uids.drop_duplicates(inplace=True)
    excluded_uids.to_csv(excluded_uids_path, index=False)


In [170]:
# def extracted_preprocessed_files_multi_step(preprocessed_folder_path, extracted_folder_path):
#     ids, _ = analyze_folder(preprocessed_folder_path, file_type='csv')
#     
#     for target_id in ids:
#         file_path = os.path.join(preprocessed_folder_path, target_id + '.csv')
#         with open(file_path, "r") as file:
#             uid_data = pd.read_csv(file)
# 
#         file_path = os.path.join(preprocessed_folder_path, target_id + '_hitobjects.csv')
#         with open(file_path, "r") as file:
#             uid_hit_objects = pd.read_csv(file)
#         
#         timestamps = uid_data['UnixTimeStamp'].unique()
#         for i, timestamp in enumerate(timestamps):
#             if i % 5000 == 0:
#                 print(f"Processed {i} out of {len(timestamps)} time stamps...")
#             player_pos = uid_data[(uid_data['uid'] == target_id) & (uid_data['UnixTimeStamp'] == timestamp)][['HmdPosition_x', 'HmdPosition_y','HmdPosition_z']]
#             topHitObjectsDataFrame = get_closest_hit_objects(target_id, timestamp, uid_hit_objects, player_pos['HmdPosition_x'], player_pos['HmdPosition_y'], player_pos['HmdPosition_z'])
#         
#         uid_data.set_index(['uid','UnixTimeStamp'], inplace=True)
#         topHitObjectsDataFrame.set_index(['uid','UnixTimeStamp'], inplace=True)
#         uid_data = uid_data.join(topHitObjectsDataFrame, on=['uid','UnixTimeStamp'], how='left')
#         
#         df.to_csv(os.path.join(extracted_folder_path, target_id + '.csv'), index=True)


In [153]:
# Preprocess all files in the given raw folder (currently only EyeTracking input data)
raw_folder_path = "Data/0_Raw/Eyetracking/"  # Replace with your actual folder path
processed_folder_path = "Data/1_Preprocessed/Eyetracking/"
preprocess_txt_files(raw_folder_path, processed_folder_path, 'eyetracking', multi_processing = False)

...using EXCLUDED UID output folder Data/1_Preprocessed/Eyetracking/../_excluded_uids.csv
File debf0264467d42c1854783622182005c.csv already exists. Skipping...
File ef2e9fc68dc5486084a0cc170db5462b.csv already exists. Skipping...
File cb5d0147ad15497680895773bcb2d217.csv already exists. Skipping...
File 6ad4d759b7eb4ff392f6db00aec7678f.csv already exists. Skipping...
File 677eb13e053d46edb23876a1cf070e34.csv already exists. Skipping...
File 5c776e10348f4e3589cfc57be88546d2.csv already exists. Skipping...
File 7e732acc694248ceaa3547de5fc77639.csv already exists. Skipping...
File beeea83429274984b2b016773492d208.csv already exists. Skipping...
File 7e7865d70c3c445a9a90aa6cc953ad67.csv already exists. Skipping...
File 228bdb9398f6492092c45467428bb95a.csv already exists. Skipping...
File 9af67bd8dba94cf599f830105cc0a5ab.csv already exists. Skipping...
File 3b4445b1f502437dab3e11c5eaeed1ee.csv already exists. Skipping...
File 2fe73e7e2534479aa59aeb91635906aa.csv already exists. Skipping...


In [177]:
raw_folder_path = "Data/0_Raw/Input/"  # Replace with your actual folder path
processed_folder_path = "Data/1_Preprocessed/Input/"
preprocess_txt_files(raw_folder_path, processed_folder_path, 'input', multi_processing = True)

...using EXCLUDED UID output folder Data/1_Preprocessed/Input/../_excluded_uids.csv


In [32]:
# Sample file processing
file_path = '/Users/johnmadrid/GitHub/WestdriveLoopARData/Data/0_Raw/EyeTracking/66bdb97a653b48f98c02d5e764089c00_EyeTracking_Westbrueck.txt'  # Replace with the actual file path

with open(file_path, 'r') as f:
    data_list = json.load(f)
df = pd.DataFrame([flatten_dict(d) for d in data_list])

In [33]:
# Processing of a single file's hit objects using iteration
allHitObjectsDataFrame = pd.DataFrame()
for i, _ in df.iterrows():
    co, _ = closest_objects('john', df['UnixTimeStamp'][i], df['hitObjects'][i], df['HmdPosition_x'][i], df['HmdPosition_y'][i], df['HmdPosition_z'][i])
    allHitObjectsDataFrame = pd.concat([allHitObjectsDataFrame, co], ignore_index=True)

allHitObjectsDataFrame

Unnamed: 0,uid,UnixTimeStamp,ObjectName_0,HitObjectPosition_x_0,HitObjectPosition_y_0,HitObjectPosition_z_0,HitPointOnObject_x_0,HitPointOnObject_y_0,HitPointOnObject_z_0,distanceToPlayer_0,...,HitPointOnObject_z_3,distanceToPlayer_3,ObjectName_4,HitObjectPosition_x_4,HitObjectPosition_y_4,HitObjectPosition_z_4,HitPointOnObject_x_4,HitPointOnObject_y_4,HitPointOnObject_z_4,distanceToPlayer_4
0,john,1.601117e+09,CarBody,536.973389,220.836456,1457.509766,538.671204,222.044189,1458.535645,6.952474,...,1127.379761,777.120193,Westbrueck_terrain,-502.709839,-16.4,949.637451,454.382202,221.526688,1417.338989,1188.116164
1,john,1.601117e+09,mons_LOD,-32.719116,210.965057,1185.078613,-26.976925,221.138657,1179.478271,631.687633,...,1412.311523,1181.482160,,,,,,,,
2,john,1.601117e+09,mons_LOD,-32.719116,210.965057,1185.078613,-29.614956,221.912354,1177.810791,630.870387,...,1412.132690,1180.676586,,,,,,,,
3,john,1.601117e+09,mons_LOD,-32.719116,210.965057,1185.078613,-25.666519,221.689301,1180.640381,630.463486,...,1412.279297,1180.275523,,,,,,,,
4,john,1.601117e+09,mons_LOD,-32.719116,210.965057,1185.078613,-24.435617,222.008591,1181.732056,630.058707,...,1412.311157,1179.876532,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6670,john,1.601117e+09,Speed Limit 70,-589.800049,182.723572,1053.971558,-589.300049,182.827957,1053.747803,103.553406,...,,,,,,,,,,
6671,john,1.601117e+09,Speed Limit 70,-589.800049,182.723572,1053.971558,-589.300049,182.761673,1054.104248,103.278730,...,,,,,,,,,,
6672,john,1.601117e+09,Motor Vehicles Only,-567.149902,182.989990,1056.198608,-567.332642,183.976868,1055.678223,80.290445,...,1045.923096,2419.228755,,,,,,,,
6673,john,1.601117e+09,Motor Vehicles Only,-567.149902,182.989990,1056.198608,-566.624634,183.881180,1055.965820,80.018746,...,,,,,,,,,,


In [ ]:
# Processing of a single file's hit objects using dataframe merge
# THIS FOR SOME REASON DOES NOT WORK, THROWS 'ValueError: If using all scalar values, you must pass an index'
#allHitObjectsDataFrame = df.apply(lambda row: closest_objects('1', row['UnixTimeStamp'], row['hitObjects'], row['HmdPosition_x'], row['HmdPosition_y'], row['HmdPosition_z']), axis=1, result_type='expand')
# 
# allHitObjectsDataFrame

In [ ]:
# Processing of a single file's hit objects using dask merge
# ddf = dd.from_pandas(df, npartitions=1)
# ddf.repartition(partition_size="100MB")
# 
# allHitObjectsDask = ddf.apply(lambda row: closest_objects('1', row['UnixTimeStamp'], row['hitObjects'], row['HmdPosition_x'], row['HmdPosition_y'], row['HmdPosition_z']), axis=1, meta=column_dict)
# 
# allHitObjectsDask.compute()

# 2. Extracting input data

In [8]:
raw_folder_input_path = "Data/0_Raw/EyeTracking/"

In [9]:
dask_input_df = dd.read_csv('/Users/johnmadrid/GitHub/WestdriveLoopARData/Data/0_Raw/Input/0a68f111e4f448d3b8279db69cd9df5e_Input_Autobahn.txt', sep='\t')
dask_input_df

Unnamed: 0_level_0,[
npartitions=1,Unnamed: 1_level_1
,string
,...


In [10]:
ids, matching_files_list = analyze_folder(raw_folder_input_path, '0a68f111e4f448d3b8279db69cd9df5e')


Matching files:
['0a68f111e4f448d3b8279db69cd9df5e', 'EyeTracking', 'CountryRoad']
['0a68f111e4f448d3b8279db69cd9df5e', 'EyeTracking', 'MountainRoad']
['0a68f111e4f448d3b8279db69cd9df5e', 'EyeTracking', 'Autobahn']
['0a68f111e4f448d3b8279db69cd9df5e', 'EyeTracking', 'Westbrueck']
['0a68f111e4f448d3b8279db69cd9df5e', 'EyeTracking', 'TrainingScene']


In [178]:
# file_path = os.path.join(raw_folder_input_path, txt_file)
with open('/Users/johnmadrid/GitHub/WestdriveLoopARData/Data/0_Raw/EyeTracking/0a68f111e4f448d3b8279db69cd9df5e_EyeTracking_Autobahn.txt', "r") as file:
    data_list = json.load(file)
# df = pd.DataFrame(data_list)

In [60]:
df

Unnamed: 0,UnixTimeStamp,TobiiTimeStamp,FPS,HmdPosition,NoseVector,EyePosWorldCombined,EyeDirWorldCombined,EyePosLocalCombined,EyeDirLocalCombined,RightEyeIsBlinkingWorld,RightEyeIsBlinkingLocal,LeftEyeIsBlinkingWorld,LeftEyeIsBlinkingLocal,hitObjects
0,1.602674e+09,744.496460,15.570679,"{'x': -1548.4849853515625, 'y': 134.3332672119...","{'x': -0.7808842062950134, 'y': 0.060032743960...","{'x': -1548.477783203125, 'y': 134.33255004882...","{'x': -0.8407058119773865, 'y': -0.10508625209...","{'x': -0.00013932801084592938, 'y': -0.0025241...","{'x': -0.068359375, 'y': -0.186676025390625, '...",False,False,False,False,"[{'ObjectName': 'ForestRightEntry (2)', 'HitOb..."
1,1.602674e+09,744.517700,16.688669,"{'x': -1554.8427734375, 'y': 133.8691558837890...","{'x': -0.7353829741477966, 'y': 0.061008527874...","{'x': -1554.8363037109375, 'y': 133.8678131103...","{'x': -0.7981982827186584, 'y': -0.11080456525...","{'x': 0.00011648560030153021, 'y': -0.00253092...","{'x': -0.068328857421875, 'y': -0.186691284179...",False,False,False,False,"[{'ObjectName': 'Default Road 002', 'HitObject..."
2,1.602674e+09,744.529968,18.130299,"{'x': -1555.24169921875, 'y': 133.835311889648...","{'x': -0.733123242855072, 'y': 0.0565054081380...","{'x': -1555.23486328125, 'y': 133.833862304687...","{'x': -0.7953356504440308, 'y': -0.11504980921...","{'x': 0.0007309723296202719, 'y': -0.002538742...","{'x': -0.0679168701171875, 'y': -0.18653869628...",False,False,False,False,"[{'ObjectName': 'Default Road 001', 'HitObject..."
3,1.602674e+09,744.550354,19.349819,"{'x': -1555.6414794921875, 'y': 133.8010559082...","{'x': -0.7310828566551208, 'y': 0.052004940807...","{'x': -1555.6351318359375, 'y': 133.7997436523...","{'x': -0.7930845022201538, 'y': -0.11975196748...","{'x': 0.0002455291978549212, 'y': -0.002539825...","{'x': -0.06817626953125, 'y': -0.1864166259765...",False,False,False,False,"[{'ObjectName': 'Default Road 001', 'HitObject..."
4,1.602674e+09,744.561890,20.978073,"{'x': -1556.041748046875, 'y': 133.76502990722...","{'x': -0.7293000221252441, 'y': 0.047270566225...","{'x': -1556.03515625, 'y': 133.76376342773438,...","{'x': -0.7913675904273987, 'y': -0.12440797686...","{'x': 0.00048110962961800396, 'y': -0.00254240...","{'x': -0.0692138671875, 'y': -0.18647766113281...",False,False,False,False,"[{'ObjectName': 'Default Road 001', 'HitObject..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7555,1.602675e+09,875.579163,84.273964,"{'x': -604.66748046875, 'y': 64.10457611083984...","{'x': 0.9606755375862122, 'y': 0.1503793001174...","{'x': -604.673828125, 'y': 64.10381317138672, ...","{'x': 0.9525787234306335, 'y': 0.0960365682840...","{'x': -0.0012636261526495218, 'y': -0.00168281...","{'x': -0.0416717529296875, 'y': -0.06274414062...",False,False,False,False,"[{'ObjectName': 'Autobahn_Terrain', 'HitObject..."
7556,1.602675e+09,875.601379,85.342766,"{'x': -604.4107666015625, 'y': 64.113227844238...","{'x': 0.961294412612915, 'y': 0.14921180903911...","{'x': -604.4171142578125, 'y': 64.112464904785...","{'x': 0.9521425366401672, 'y': 0.0953773781657...","{'x': -0.001482177758589387, 'y': -0.001686828...","{'x': -0.0458831787109375, 'y': -0.06246948242...",False,False,False,False,"[{'ObjectName': 'Autobahn_Terrain', 'HitObject..."
7557,1.602675e+09,875.624939,85.193512,"{'x': -604.1533203125, 'y': 64.12186431884766,...","{'x': 0.9616440534591675, 'y': 0.1485013365745...","{'x': -604.1596069335938, 'y': 64.121040344238...","{'x': 0.9530147314071655, 'y': 0.0949197337031...","{'x': -0.0010021667694672942, 'y': -0.00168832...","{'x': -0.0454559326171875, 'y': -0.06166076660...",False,False,False,False,"[{'ObjectName': 'Autobahn_Terrain', 'HitObject..."
7558,1.602675e+09,875.636169,85.541122,"{'x': -603.8947143554688, 'y': 64.130599975585...","{'x': 0.9618417024612427, 'y': 0.1481603533029...","{'x': -603.9013061523438, 'y': 64.129920959472...","{'x': 0.9513297080993652, 'y': 0.0961634665727...","{'x': -0.002255020197480917, 'y': -0.001688369...","{'x': -0.0518035888671875, 'y': -0.06083679199...",False,False,False,False,"[{'ObjectName': 'Autobahn_Terrain', 'HitObject..."


In [187]:
# df = pd.read_json('your_file.json', lines=True)

# Assuming the nested dictionary is in the 'acList' column
normalized_data = pd.json_normalize(data_list,max_level=1)
normalized_data
# Create new columns for each key-value pair
# df = pd.concat([df, normalized_data], axis=1)

Unnamed: 0,UnixTimeStamp,TobiiTimeStamp,FPS,RightEyeIsBlinkingWorld,RightEyeIsBlinkingLocal,LeftEyeIsBlinkingWorld,LeftEyeIsBlinkingLocal,hitObjects,HmdPosition.x,HmdPosition.y,...,EyePosWorldCombined.z,EyeDirWorldCombined.x,EyeDirWorldCombined.y,EyeDirWorldCombined.z,EyePosLocalCombined.x,EyePosLocalCombined.y,EyePosLocalCombined.z,EyeDirLocalCombined.x,EyeDirLocalCombined.y,EyeDirLocalCombined.z
0,1.602674e+09,744.496460,15.570679,False,False,False,False,"[{'ObjectName': 'ForestRightEntry (2)', 'HitOb...",-1548.484985,134.333267,...,2660.845947,-0.840706,-0.105086,0.531171,-0.000139,-0.002524,-0.022862,-0.068359,-0.186676,0.980026
1,1.602674e+09,744.517700,16.688669,False,False,False,False,"[{'ObjectName': 'Default Road 002', 'HitObject...",-1554.842773,133.869156,...,2665.263672,-0.798198,-0.110805,0.592094,0.000116,-0.002531,-0.022852,-0.068329,-0.186691,0.980026
2,1.602674e+09,744.529968,18.130299,False,False,False,False,"[{'ObjectName': 'Default Road 001', 'HitObject...",-1555.241699,133.835312,...,2665.557129,-0.795336,-0.115050,0.595135,0.000731,-0.002539,-0.022830,-0.067917,-0.186539,0.980087
3,1.602674e+09,744.550354,19.349819,False,False,False,False,"[{'ObjectName': 'Default Road 001', 'HitObject...",-1555.641479,133.801056,...,2665.851807,-0.793085,-0.119752,0.597199,0.000246,-0.002540,-0.022831,-0.068176,-0.186417,0.980087
4,1.602674e+09,744.561890,20.978073,False,False,False,False,"[{'ObjectName': 'Default Road 001', 'HitObject...",-1556.041748,133.765030,...,2666.147705,-0.791368,-0.124408,0.598536,0.000481,-0.002542,-0.022823,-0.069214,-0.186478,0.980011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7555,1.602675e+09,875.579163,84.273964,False,False,False,False,"[{'ObjectName': 'Autobahn_Terrain', 'HitObject...",-604.667480,64.104576,...,5073.665527,0.952579,0.096037,0.288696,-0.001264,-0.001683,-0.022942,-0.041672,-0.062744,0.997147
7556,1.602675e+09,875.601379,85.342766,False,False,False,False,"[{'ObjectName': 'Autobahn_Terrain', 'HitObject...",-604.410767,64.113228,...,5073.762695,0.952143,0.095377,0.290348,-0.001482,-0.001687,-0.022901,-0.045883,-0.062469,0.996979
7557,1.602675e+09,875.624939,85.193512,False,False,False,False,"[{'ObjectName': 'Autobahn_Terrain', 'HitObject...",-604.153320,64.121864,...,5073.859863,0.953015,0.094920,0.287646,-0.001002,-0.001688,-0.022887,-0.045456,-0.061661,0.997055
7558,1.602675e+09,875.636169,85.541122,False,False,False,False,"[{'ObjectName': 'Autobahn_Terrain', 'HitObject...",-603.894714,64.130600,...,5073.959961,0.951330,0.096163,0.292764,-0.002255,-0.001688,-0.022901,-0.051804,-0.060837,0.996796


In [185]:
from ast import literal_eval

normalized_data["hitObjects"] = hitObjectList = literal_eval(normalized_data["hitObjects"]) if isinstance(normalized_data["hitObjects"], str) else normalized_data["hitObjects"]
normalized_data.head()

Unnamed: 0,UnixTimeStamp,TobiiTimeStamp,FPS,RightEyeIsBlinkingWorld,RightEyeIsBlinkingLocal,LeftEyeIsBlinkingWorld,LeftEyeIsBlinkingLocal,hitObjects,HmdPosition.x,HmdPosition.y,...,EyePosWorldCombined.z,EyeDirWorldCombined.x,EyeDirWorldCombined.y,EyeDirWorldCombined.z,EyePosLocalCombined.x,EyePosLocalCombined.y,EyePosLocalCombined.z,EyeDirLocalCombined.x,EyeDirLocalCombined.y,EyeDirLocalCombined.z
0,1602674000.0,744.49646,15.570679,False,False,False,False,"[{'ObjectName': 'ForestRightEntry (2)', 'HitOb...",-1548.484985,134.333267,...,2660.845947,-0.840706,-0.105086,0.531171,-0.000139,-0.002524,-0.022862,-0.068359,-0.186676,0.980026
1,1602674000.0,744.5177,16.688669,False,False,False,False,"[{'ObjectName': 'Default Road 002', 'HitObject...",-1554.842773,133.869156,...,2665.263672,-0.798198,-0.110805,0.592094,0.000116,-0.002531,-0.022852,-0.068329,-0.186691,0.980026
2,1602674000.0,744.529968,18.130299,False,False,False,False,"[{'ObjectName': 'Default Road 001', 'HitObject...",-1555.241699,133.835312,...,2665.557129,-0.795336,-0.11505,0.595135,0.000731,-0.002539,-0.02283,-0.067917,-0.186539,0.980087
3,1602674000.0,744.550354,19.349819,False,False,False,False,"[{'ObjectName': 'Default Road 001', 'HitObject...",-1555.641479,133.801056,...,2665.851807,-0.793085,-0.119752,0.597199,0.000246,-0.00254,-0.022831,-0.068176,-0.186417,0.980087
4,1602674000.0,744.56189,20.978073,False,False,False,False,"[{'ObjectName': 'Default Road 001', 'HitObject...",-1556.041748,133.76503,...,2666.147705,-0.791368,-0.124408,0.598536,0.000481,-0.002542,-0.022823,-0.069214,-0.186478,0.980011


In [67]:
normalized_data

Unnamed: 0,UnixTimeStamp,TobiiTimeStamp,FPS,RightEyeIsBlinkingWorld,RightEyeIsBlinkingLocal,LeftEyeIsBlinkingWorld,LeftEyeIsBlinkingLocal,hitObjects,HmdPosition.x,HmdPosition.y,...,EyePosWorldCombined.z,EyeDirWorldCombined.x,EyeDirWorldCombined.y,EyeDirWorldCombined.z,EyePosLocalCombined.x,EyePosLocalCombined.y,EyePosLocalCombined.z,EyeDirLocalCombined.x,EyeDirLocalCombined.y,EyeDirLocalCombined.z
0,1.602674e+09,744.496460,15.570679,False,False,False,False,"[{'ObjectName': 'ForestRightEntry (2)', 'HitOb...",-1548.484985,134.333267,...,2660.845947,-0.840706,-0.105086,0.531171,-0.000139,-0.002524,-0.022862,-0.068359,-0.186676,0.980026
1,1.602674e+09,744.517700,16.688669,False,False,False,False,"[{'ObjectName': 'Default Road 002', 'HitObject...",-1554.842773,133.869156,...,2665.263672,-0.798198,-0.110805,0.592094,0.000116,-0.002531,-0.022852,-0.068329,-0.186691,0.980026
2,1.602674e+09,744.529968,18.130299,False,False,False,False,"[{'ObjectName': 'Default Road 001', 'HitObject...",-1555.241699,133.835312,...,2665.557129,-0.795336,-0.115050,0.595135,0.000731,-0.002539,-0.022830,-0.067917,-0.186539,0.980087
3,1.602674e+09,744.550354,19.349819,False,False,False,False,"[{'ObjectName': 'Default Road 001', 'HitObject...",-1555.641479,133.801056,...,2665.851807,-0.793085,-0.119752,0.597199,0.000246,-0.002540,-0.022831,-0.068176,-0.186417,0.980087
4,1.602674e+09,744.561890,20.978073,False,False,False,False,"[{'ObjectName': 'Default Road 001', 'HitObject...",-1556.041748,133.765030,...,2666.147705,-0.791368,-0.124408,0.598536,0.000481,-0.002542,-0.022823,-0.069214,-0.186478,0.980011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7555,1.602675e+09,875.579163,84.273964,False,False,False,False,"[{'ObjectName': 'Autobahn_Terrain', 'HitObject...",-604.667480,64.104576,...,5073.665527,0.952579,0.096037,0.288696,-0.001264,-0.001683,-0.022942,-0.041672,-0.062744,0.997147
7556,1.602675e+09,875.601379,85.342766,False,False,False,False,"[{'ObjectName': 'Autobahn_Terrain', 'HitObject...",-604.410767,64.113228,...,5073.762695,0.952143,0.095377,0.290348,-0.001482,-0.001687,-0.022901,-0.045883,-0.062469,0.996979
7557,1.602675e+09,875.624939,85.193512,False,False,False,False,"[{'ObjectName': 'Autobahn_Terrain', 'HitObject...",-604.153320,64.121864,...,5073.859863,0.953015,0.094920,0.287646,-0.001002,-0.001688,-0.022887,-0.045456,-0.061661,0.997055
7558,1.602675e+09,875.636169,85.541122,False,False,False,False,"[{'ObjectName': 'Autobahn_Terrain', 'HitObject...",-603.894714,64.130600,...,5073.959961,0.951330,0.096163,0.292764,-0.002255,-0.001688,-0.022901,-0.051804,-0.060837,0.996796


In [39]:
# from ast import literal_eval
# 
# df["HmdPosition"] = df["HmdPosition"].apply(literal_eval)
df = df.explode("HmdPosition")
df = pd.concat([df, df.pop("HmdPosition").apply(pd.Series)], axis=1)
df

Unnamed: 0,UnixTimeStamp,TobiiTimeStamp,FPS,NoseVector,EyePosWorldCombined,EyeDirWorldCombined,EyePosLocalCombined,EyeDirLocalCombined,RightEyeIsBlinkingWorld,RightEyeIsBlinkingLocal,LeftEyeIsBlinkingWorld,LeftEyeIsBlinkingLocal,hitObjects,0
0,1.602674e+09,744.517700,16.688669,"{'x': -0.7353829741477966, 'y': 0.061008527874...","{'x': -1554.8363037109375, 'y': 133.8678131103...","{'x': -0.7981982827186584, 'y': -0.11080456525...","{'x': 0.00011648560030153021, 'y': -0.00253092...","{'x': -0.068328857421875, 'y': -0.186691284179...",False,False,False,False,"[{'ObjectName': 'Default Road 002', 'HitObject...",x
0,1.602674e+09,744.517700,16.688669,"{'x': -0.7353829741477966, 'y': 0.061008527874...","{'x': -1554.8363037109375, 'y': 133.8678131103...","{'x': -0.7981982827186584, 'y': -0.11080456525...","{'x': 0.00011648560030153021, 'y': -0.00253092...","{'x': -0.068328857421875, 'y': -0.186691284179...",False,False,False,False,"[{'ObjectName': 'Default Road 002', 'HitObject...",y
0,1.602674e+09,744.517700,16.688669,"{'x': -0.7353829741477966, 'y': 0.061008527874...","{'x': -1554.8363037109375, 'y': 133.8678131103...","{'x': -0.7981982827186584, 'y': -0.11080456525...","{'x': 0.00011648560030153021, 'y': -0.00253092...","{'x': -0.068328857421875, 'y': -0.186691284179...",False,False,False,False,"[{'ObjectName': 'Default Road 002', 'HitObject...",z
1,1.602674e+09,744.529968,18.130299,"{'x': -0.733123242855072, 'y': 0.0565054081380...","{'x': -1555.23486328125, 'y': 133.833862304687...","{'x': -0.7953356504440308, 'y': -0.11504980921...","{'x': 0.0007309723296202719, 'y': -0.002538742...","{'x': -0.0679168701171875, 'y': -0.18653869628...",False,False,False,False,"[{'ObjectName': 'Default Road 001', 'HitObject...",x
1,1.602674e+09,744.529968,18.130299,"{'x': -0.733123242855072, 'y': 0.0565054081380...","{'x': -1555.23486328125, 'y': 133.833862304687...","{'x': -0.7953356504440308, 'y': -0.11504980921...","{'x': 0.0007309723296202719, 'y': -0.002538742...","{'x': -0.0679168701171875, 'y': -0.18653869628...",False,False,False,False,"[{'ObjectName': 'Default Road 001', 'HitObject...",y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7557,1.602675e+09,875.636169,85.541122,"{'x': 0.9618417024612427, 'y': 0.1481603533029...","{'x': -603.9013061523438, 'y': 64.129920959472...","{'x': 0.9513297080993652, 'y': 0.0961634665727...","{'x': -0.002255020197480917, 'y': -0.001688369...","{'x': -0.0518035888671875, 'y': -0.06083679199...",False,False,False,False,"[{'ObjectName': 'Autobahn_Terrain', 'HitObject...",y
7557,1.602675e+09,875.636169,85.541122,"{'x': 0.9618417024612427, 'y': 0.1481603533029...","{'x': -603.9013061523438, 'y': 64.129920959472...","{'x': 0.9513297080993652, 'y': 0.0961634665727...","{'x': -0.002255020197480917, 'y': -0.001688369...","{'x': -0.0518035888671875, 'y': -0.06083679199...",False,False,False,False,"[{'ObjectName': 'Autobahn_Terrain', 'HitObject...",z
7558,1.602675e+09,875.658813,85.997131,"{'x': 0.9621937870979309, 'y': 0.1475926935672...","{'x': -603.6407470703125, 'y': 64.138664245605...","{'x': 0.9521538615226746, 'y': 0.0970381423830...","{'x': -0.002260910114273429, 'y': -0.001692718...","{'x': -0.0506439208984375, 'y': -0.05909729003...",False,False,False,False,"[{'ObjectName': 'Autobahn_Terrain', 'HitObject...",x
7558,1.602675e+09,875.658813,85.997131,"{'x': 0.9621937870979309, 'y': 0.1475926935672...","{'x': -603.6407470703125, 'y': 64.138664245605...","{'x': 0.9521538615226746, 'y': 0.0970381423830...","{'x': -0.002260910114273429, 'y': -0.001692718...","{'x': -0.0506439208984375, 'y': -0.05909729003...",False,False,False,False,"[{'ObjectName': 'Autobahn_Terrain', 'HitObject...",y


In [24]:
# from io import StringIO
# df2 = pd.read_csv(StringIO('\n'.join(['|'.join(map(str, row)) for row in data_list])), sep='|', index_col=None)
# df2

In [65]:
input_df = pd.DataFrame(columns=['uid', 'dataset','city_section'])

for txt_file in matching_files_list:
    # Read file from disk
    file_path = os.path.join(raw_folder_input_path, txt_file)
    with open(file_path, "r") as file:
        data_list = json.load(file)
    df = pd.DataFrame(data_list)
    # If data file is empty, continue to next file
    # if len(data_list) == 0:
    #     print(f"INFO: Excluded due to no data found in {txt_file}")
    #     stop_flag = True
    #     continue
        
    # Append file name information to dataframe
    parts = get_filename_as_parts(txt_file)
    df['uid'] = parts[0]
    df['dataset'] = parts[1]
    df['city_section'] = parts[2]
    # Create a DataFrame from the flattened data
    # flat_data_list = [flatten_dict(d) for d in data_list]
    # df1 = pd.DataFrame(flat_data_list)
    
    # Merge into master dataframe
    input_df = pd.concat([input_df,df], ignore_index=True)


In [None]:
input_df.head()

In [26]:
# file_path = os.path.join(raw_folder_path, txt_file)
# with open(dask_input_df, "r") as file:
dask_input_df = json.loads('/Users/johnmadrid/GitHub/WestdriveLoopARData/Data/0_Raw/Input/0a68f111e4f448d3b8279db69cd9df5e_Input_Autobahn.txt'.read())
dask_input_df

AttributeError: 'str' object has no attribute 'read'

In [30]:
# Read file from disk
# file_path = os.path.join(raw_folder_path, txt_file)
with open('/Users/johnmadrid/GitHub/WestdriveLoopARData/Data/0_Raw/Input/0a68f111e4f448d3b8279db69cd9df5e_Input_Autobahn.txt', "r") as file:
    data_list = json.load(file)
    df_input = pd.DataFrame(data_list)
    

In [31]:
df_input

Unnamed: 0,TimeStamp,ReceivedInput,SteeringInput,AcellerationInput,BrakeInput
0,1.602674e+09,True,-0.005949,0.0,0.0
1,1.602674e+09,True,-0.005491,0.0,0.0
2,1.602674e+09,True,-0.005491,0.0,0.0
3,1.602674e+09,True,-0.005491,0.0,0.0
4,1.602674e+09,True,-0.005491,0.0,0.0
...,...,...,...,...,...
7555,1.602675e+09,True,-0.006865,0.0,0.0
7556,1.602675e+09,True,-0.006865,0.0,0.0
7557,1.602675e+09,True,-0.006865,0.0,0.0
7558,1.602675e+09,True,-0.006865,0.0,0.0
