# Step 1: removal of duplicates, cleaning, interpolation, and smoothing of coordinates

This code was originally developed and written by Debora Nolte, and was then extended and adapted by Jasmin L. Walter.

In [42]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


# Preprocessing

## import

In [6]:
import copy  # copy big/deep objects by value
import csv
import datetime  # datetime operations
import itertools  # operate with iterators
import json  # read/write from/into json format
import math
import os  # OS operations (read/write files/folders)
import sys
import time
import warnings  # hide warnings
from collections import Counter
from itertools import groupby
import matplotlib

# process parallelization
from multiprocessing import Manager, Pool, RawArray, cpu_count
from os.path import exists

import matplotlib.pyplot as plt  # mother of plots focr Python

# import mlxtend
import numpy as np  # array/matrix operations (e.g. linear algebra)
import pandas as pd  # operate with dataframes
import pyxdf  # read XDF files (LSL streams recordings)

import scipy.stats
import seaborn as sns  # matplotlib plotting nice with shortcuts
from IPython.display import Markdown, display  # print nicely
from ipywidgets import IntProgress
#from matplotlib.pyplot import cm
from scipy.signal import savgol_coeffs
from tqdm.notebook import tqdm, trange  # mother of progressbars
from scipy.stats import ks_2samp

## optimization

In [7]:
# warnings.simplefilter(action="ignore", category=FutureWarning)

# raw and processed data paths
# PATH_RAW = "C:/Users/schmi/Documents/PhD_Osnabruck_University/SpaRe-VR/Spare-VR-EEG/27.07.23/data"
# PATH_PROC = "C:/Users/schmi/Documents/PhD_Osnabruck_University/SpaRe-VR/Spare-VR-EEG/EEG_Data_Skripte_Debbie/Events"
# PATH_FOREYE = "C:/Users/schmi/Documents/PhD_Osnabruck_University/SpaRe-VR/Spare-VR-EEG/EEG_Data_Skripte_Debbie/ET_Output_MAD-sacc"
# PATH_TRG = "C:/Users/schmi/Documents/PhD_Osnabruck_University/SpaRe-VR/Spare-VR-EEG/EEG_Data_Skripte_Debbie/TriggerFiles_fEEG"


# specify decimals format on pandas tables
# pd.options.display.float_format = "{:.5f}".format

# inline static plotting (default)
%matplotlib inline
# interactive plotting
# %matplotlib widget

# progress bar customized format
B_FORMAT = """📄 {n_fmt} of {total_fmt} {desc} processed: {bar} 
            {percentage:3.0f}% ⏱️{elapsed} ⏳{remaining} ⚙️{rate_fmt}{postfix}"""


# CPU count as reported by multiprocessing.cpu_count(); printed once so the
# value used on this machine is visible in the notebook output.
CORES = cpu_count()  # number of cpu threads for multiprocessing
print(f"Total CPU threads: {CORES}")


def pbar_fork_hack():
    """
    Pool initializer that lets forked worker processes show progress bars in
    IPython front ends such as Jupyter Notebooks.

    It writes a single blank character to stdout (no newline) and flushes
    immediately. This works around the message
    "[IPKernelApp] WARNING | WARNING: attempted to send message from fork".

    Usage: hand this function to the ``initializer`` parameter when creating
    the multiprocessing pool, e.g.

        pool = Pool(processes=N_CORES, initializer=pbar_fork_hack)

    Source:
     - https://github.com/ipython/ipython/issues/11049#issue-306086846
     - https://github.com/tqdm/tqdm/issues/485#issuecomment-473338308
    """
    # no positional arguments: only the `end` string (one blank) is written
    print(end=" ", flush=True)

Total CPU threads: 8


## data paths & participant list

In [8]:
# DATA_PATH = "E:/WestbrookProject/SpaRe_Data/control_data/pre-processing_2023/step1_preparation"

# All input/output folders live below one base directory. Building every path
# from BASE_PATH keeps them consistent (the interpolated path previously
# contained an accidental double slash) and means only one line has to be
# edited when the data is moved to another drive or machine.
BASE_PATH = "E:/WestbrookProject/SpaRe_Data/control_data/pre-processing_2023"

# input: prepared raw data (output of step 0)
DATA_PATH = f"{BASE_PATH}/Step0_dataPreparation"

# output folders of the three processing stages of this notebook
STEP1_PATH = f"{BASE_PATH}/velocity_based/step1"
data_savepath_cleaned = f"{STEP1_PATH}/1_cleaned"
data_savepath_interpolated = f"{STEP1_PATH}/2_interpolated"
data_savepath_smoothed = f"{STEP1_PATH}/3_smoothed"

# Getting the Folder without hidden files in ascending order 
# DATA_FOLDER = sorted([f for f in os.listdir(DATA_PATH) if not f.startswith('.')], key=str.lower)
# PROCESSED_DATA_FOLDER = sorted([f for f in os.listdir(PROCESSED_DATA_PATH) if not f.startswith('.')], key=str.lower)

# savepath = r'F:\WestbrookProject\SpaRe_Data\pre-processing_2023\step1_preparation\'
# os.chdir(r'F:\WestbrookProject\SpaRe_Data\pre-processed_csv\')

In [9]:
# identify all participants from folder, use this code instead of the participant list based approach
# subIDs = []
# for sub in DATA_FOLDER:
#     if sub[0:4].isdigit() and sub.startswith('1'):
#         subIDs.append(int(sub[0:4]))
#     else:
#         pass
# subIDs = np.unique(subIDs)
# print(subIDs)

In [10]:
# Participant list of all participants that participated 5 sessions x 30 min
# in Westbrook city
# NOTE: PartList is reassigned in the following cell; that later assignment
# decides which participants are actually processed.
PartList = [1004, 1005, 1008, 1010, 1011, 1013, 1017, 1018, 1019, 1021, 1022, 1023, 1054, 1055, 1056, 1057, 1058, 1068, 1069, 1072, 1073, 1074, 1075, 1077, 1079, 1080]
print(len(PartList),'participants') 

26 participants


In [11]:
# Custom participant list: overrides the full PartList defined in the previous
# cell so that only a subset of participants is processed.
# Comment this assignment out to process all participants.
PartList = [1004]

## Convert the start time stamps from absolute UNIX time to seconds elapsed since the start of the recording

In [8]:
# Convert the absolute start timestamps into seconds elapsed since the first sample
def convertTimeStamp(dataP):
    """
    Add the column 'timeStampDataPointStart_converted' to the data frame.

    The new column holds 'timeStampDataPointStart' minus the value of the
    first row, i.e. the time that has passed since the first sample.

    Parameters:
        dataP (pd.DataFrame): Must contain the column 'timeStampDataPointStart'.
            The frame is modified in place.

    Returns:
        pd.DataFrame: The same object that was passed in, with the added column.
    """
    start_stamps = dataP['timeStampDataPointStart']

    # nothing to reference against in an empty recording: add the (empty) column only
    if start_stamps.empty:
        dataP['timeStampDataPointStart_converted'] = start_stamps
        return dataP

    # .iloc[0] picks the first row by position; the label-based .loc[0] would raise a
    # KeyError on any frame whose index does not contain 0 (e.g. after row filtering)
    dataP['timeStampDataPointStart_converted'] = start_stamps - start_stamps.iloc[0]
    return dataP

In [9]:
# print(data.loc[0,'timeStampDataPointStart'])

## Remove Duplicates

In [10]:
def removeDuplicates(dataframe, columns=None):
    """
    Drop rows that repeat the directly preceding row in all compared columns.

    A row counts as a duplicate when the difference to the previous row is
    exactly 0 in every compared column. The first row is never a duplicate,
    and a row with NaN in any compared column is never a duplicate either
    (its difference is NaN, not 0).

    Parameters:
        dataframe (pd.DataFrame): Input data; it is not modified.
        columns (list-like, optional): Labels of the (numeric) columns to
            compare. Defaults to the columns at positions 10 to 36
            (``dataframe.columns[10:37]``), the selection of unity and eye
            tracking variables used so far.

    Returns:
        pd.DataFrame: New frame without the duplicate rows, with a fresh
        0..n-1 index.

    Raises:
        ValueError: If no column is selected for the comparison. Without this
            check every row would be classified as a duplicate and silently
            dropped.
    """
    if columns is None:
        columns = dataframe.columns[10:37]
    if len(columns) == 0:
        raise ValueError(
            "removeDuplicates: no columns selected for the duplicate check"
        )

    # difference of each row to the previous row, per compared column
    row_to_row_change = dataframe[columns].diff()

    # duplicates are rows whose change is 0 in all compared columns
    is_repeated = row_to_row_change.eq(0).all(axis=1)

    # reset_index() without inplace returns an independent frame, so later column
    # assignments on the result do not raise a SettingWithCopyWarning
    return dataframe.loc[~is_repeated].reset_index(drop=True)
    

In [11]:
# # test code
# file= "1004_Session_2_ET_1_data_prepared.csv"
# data = pd.read_csv(os.path.join(DATA_PATH, file))

# # data = convertTimeStamp(data)
# dataNew = removeDuplicates(data)



In [12]:
# print(len(data))
# print(len(dataNew))

# samplingRate1 = data['timeStampDataPointStart'].diff()
# samplingRate2 = dataNew['timeStampDataPointStart'].diff()

# print("---------------------------------")

# print("max sampling rate 1 ", 1/np.nanmin(samplingRate1))
# print("max sampling rate 2 ", 1/np.nanmin(samplingRate2))
# print("---------------------------------")

# print("mean sampling rate 1 ", 1/np.nanmean(samplingRate1))
# print("mean sampling rate 2 ", 1/np.nanmean(samplingRate2))
# print("---------------------------------")

# print("median sampling rate 1 ", 1/np.nanmedian(samplingRate1))
# print("median sampling rate 2 ", 1/np.nanmedian(samplingRate2))

# print("---------------------------------")



# fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(20, 10))

# axes[0].hist(1/samplingRate1, color='blue')
# axes[0].set_title('sampling rate data')


# axes[1].hist(1/samplingRate2, color= 'red')

# axes[1].set_title('sampling rate data without duplicates')

# save_file = os.path.join(savepath, 'Histogram_consecutiveRepeatedRows.png')
# plt.savefig(save_file, format='png')


## clean and interpolate the data

In [13]:
# # test code
# file= "1004_Session_2_ET_1_data_prepared.csv"
# data = pd.read_csv(os.path.join(DATA_PATH, file))



In [14]:
# ## test code  for plotting full data structure
# pd.set_option('display.max_columns', None)

# # Assuming 'data' is your DataFrame
# data_head = data.head(10)

# # Display head horizontally
# with pd.option_context('display.max_rows', None, 'display.width', 1000):
# #     display(data_head)
#     display(data.iloc[0:120])

# Preprocess Data, Interpolate Blinks

## function clean_et_data to clean the eye tracking data

### function to set bad eye data to NaN

In [15]:
# sets all eye information variables (combined, left and right eye) to NaN
def eye_data_to_nan(dataF, time_interval):
    """
    Replace the eye tracking values of the selected samples with NaN.

    For the combined, left and right eye, the x/y/z components of
    eyePosition<Eye>World, eyeDirection<Eye>World and eyeDirection<Eye>Local
    (27 columns in total) are set to NaN in every row whose
    'timeStampDataPointStart_converted' value is contained in time_interval.

    Parameters:
        dataF (pd.DataFrame): Eye tracking data; modified in place.
        time_interval (list-like): Time stamp values of the rows to overwrite.

    Returns:
        pd.DataFrame: The same object that was passed in.
    """
    eye_columns = [
        f"{quantity}_{axis}"
        for eye in ("Combined", "Left", "Right")
        for quantity in (
            f"eyePosition{eye}World",
            f"eyeDirection{eye}World",
            f"eyeDirection{eye}Local",
        )
        for axis in "xyz"
    ]

    # the row selection is identical for every column, so it is computed only once
    affected_rows = dataF['timeStampDataPointStart_converted'].isin(time_interval)

    for column in eye_columns:
        dataF.loc[affected_rows, column] = np.nan

    return dataF
        

### function to set bad collider data to NaN

In [16]:
# sets all processed collider variables (regular and NH) to NaN and flags the rows as 'notClean'
def collider_data_to_nan(dataF, time_interval):
    """
    Replace the processed collider values of the selected samples with NaN.

    For both collider variants ('processedCollider' and 'processedCollider_NH')
    the collider name, the hit point on the object (x/y/z) and the collider
    bounds center (x/y/z) are set to NaN in every row whose
    'timeStampDataPointStart_converted' value is contained in time_interval.
    The same rows get the value 'notClean' in 'replacedRows' and
    'replacedRows_NH' respectively.

    Parameters:
        dataF (pd.DataFrame): Eye tracking data; modified in place.
        time_interval (list-like): Time stamp values of the rows to overwrite.

    Returns:
        pd.DataFrame: The same object that was passed in.
    """
    # the row selection is identical for every column, so it is computed only once
    affected_rows = dataF['timeStampDataPointStart_converted'].isin(time_interval)

    collider_variants = (
        ("processedCollider", "replacedRows"),
        ("processedCollider_NH", "replacedRows_NH"),
    )

    for prefix, flag_column in collider_variants:
        dataF.loc[affected_rows, f"{prefix}_name"] = np.nan

        for field in ("hitPointOnObject", "hitObjectColliderBoundsCenter"):
            for axis in "xyz":
                dataF.loc[affected_rows, f"{prefix}_{field}_{axis}"] = np.nan

        # keep track of the rows whose collider information was removed
        dataF.loc[affected_rows, flag_column] = 'notClean'

    return dataF

### cleaning function

In [17]:
### Function clean_et_data 
###Final Adjustment Blinks
# def generate_for_eye(uid):
def clean_et_data(dataF, file):
    """
    Flag invalid eye tracking samples, set their data to NaN and mark blinks.

    A sample is valid ('cleanData' == True) when combinedGazeValidityBitmask is 3,
    at least one eye has an openness >= 0.05 and at least one pupil diameter
    differs from -1. Every run of consecutive invalid samples is then handled
    as follows:

    - run duration (first to last invalid time stamp) >= min_blink_duration:
      the run is treated as a blink. The window is widened by dilate_nan on
      both sides, eye and collider data inside the window are set to NaN and
      'isBlink' is set to True.
    - shorter runs: only the eye and collider data of the run itself are set
      to NaN; 'isBlink' stays False.

    A run that is still open at the last sample of the recording is closed at
    the last time stamp and handled the same way (previously such a run was
    left untouched).

    Parameters:
        dataF (pd.DataFrame): Eye tracking data with the column
            'timeStampDataPointStart_converted' and a default 0..n-1 index.
            Modified in place (columns 'cleanData' and 'isBlink' are added).
        file (str): Not used by this function; kept so existing calls keep working.

    Returns:
        pd.DataFrame: The cleaned data frame.
    """
    time_column = 'timeStampDataPointStart_converted'

    # to determine the area around the blinks
    min_blink_duration = 0.02
    # add it as two samples before and after blink onset
    # In paper: 0.01 (<1 sample), therefore we use 0.023 (2 samples)
    dilate_nan = 0.023

    # True == valid (eyes open) and False == invalid (eyes closed / not detected)
    dataF['cleanData'] = ((dataF['combinedGazeValidityBitmask'] == 3) &
                    ((dataF['eyeOpennessLeft'] >= 0.05) | (dataF['eyeOpennessRight'] >= 0.05)) &
                    ((dataF['pupilDiameterMillimetersLeft']  != -1) | (dataF['pupilDiameterMillimetersRight'] != -1)))

    dataF['isBlink'] = False

    def blank_window(frame, window_start, window_end, mark_as_blink):
        # Set eye and collider data to NaN for all samples between window_start and window_end.
        stamps = frame[time_column]
        in_window = (stamps >= window_start) & (stamps <= window_end)
        if not in_window.any():
            return frame

        # time stamps from the first to the last sample inside the window
        window_labels = frame.index[in_window]
        ts = frame.loc[window_labels[0]:window_labels[-1], time_column]

        frame = eye_data_to_nan(frame, ts)
        frame = collider_data_to_nan(frame, ts)
        if mark_as_blink:
            frame.loc[frame[time_column].isin(ts), 'isBlink'] = True
        return frame

    def close_invalid_run(frame, onset, offset):
        # Decide whether a finished invalid run is a blink and blank the matching window.
        if offset - onset >= min_blink_duration:
            return blank_window(frame, onset - dilate_nan, offset + dilate_nan, True)
        # too short for a blink: no additional window around the run
        return blank_window(frame, onset, offset, False)

    # Snapshots for the scan. Neither column is changed while blanking, and positional
    # array access avoids a label-based Series lookup for every single sample.
    timestamps = dataF[time_column].to_numpy(copy=True)
    sample_is_clean = dataF['cleanData'].to_numpy(copy=True)

    blinking = False  # True while inside a run of invalid samples
    blink_time = 0.0  # time stamp of the first invalid sample of the current run

    for position in range(len(timestamps)):
        if not sample_is_clean[position]:
            if not blinking:
                # onset of a new invalid run
                blinking = True
                blink_time = timestamps[position]
        elif blinking:
            # first valid sample after a run: the run ended on the previous sample
            blinking = False
            dataF = close_invalid_run(dataF, blink_time, timestamps[position - 1])

    # the recording ended while the data was still invalid: close the run at the last sample
    if blinking:
        dataF = close_invalid_run(dataF, blink_time, timestamps[-1])

    return dataF


##  interpolation function for eye tracking data

In [4]:
def interpolate_et_data(dataF):
    """
    Interpolate missing collider names, eye data and hit point coordinates.

    Steps:
    1. Collider names ('processedCollider_name', 'processedCollider_NH_name'):
       a missing name is filled when the rows directly before and after it
       carry the same name. Filled rows get the value 'interpolated' in
       'replacedRows' / 'replacedRows_NH' (only if that column exists).
    2. The six hit point columns are duplicated with the prefix 'original_'
       so the values before interpolation stay available.
    3. The 27 eye position/direction columns and the six hit point columns
       are interpolated linearly (edges are filled as well). A gap is set
       back to NaN when the time from its first missing sample to the first
       valid sample after it exceeds 0.25 s. For a gap that runs to the end
       of the data, the last time stamp is used as its end (previously such
       gaps were always kept, regardless of their length).
    4. Tracking columns are added:
       - 'removedData': eyePositionCombinedWorld_x was NaN before interpolation
       - 'interpolated': eyePositionCombinedWorld_x was NaN and now has a value
       - 'interpolatedHitPoint': the same for processedCollider_hitPointOnObject_x

    Parameters:
        dataF (pd.DataFrame): Cleaned eye tracking data with the column
            'timeStampDataPointStart_converted'. Modified in place.

    Returns:
        pd.DataFrame: The same object that was passed in.
    """
    time_column = 'timeStampDataPointStart_converted'
    max_gap_duration = 0.25  # seconds; longer gaps are not interpolated

    # --- 1. collider names: fill a single missing name between two identical names ---
    for name_column, flag_column in (
        ('processedCollider_name', 'replacedRows'),
        ('processedCollider_NH_name', 'replacedRows_NH'),
    ):
        names = dataF[name_column].reset_index(drop=True)
        name_before = names.shift(1)
        name_after = names.shift(-1)

        # NaN never compares equal, so both neighbours must hold the same real name
        same_neighbours = np.asarray((name_before == name_after).fillna(False), dtype=bool)
        fill_rows = np.asarray(names.isna(), dtype=bool) & same_neighbours

        if fill_rows.any():
            dataF.loc[fill_rows, name_column] = name_before.to_numpy()[fill_rows]
            # only flag the rows when the bookkeeping column is present in the data
            if flag_column in dataF.columns:
                dataF.loc[fill_rows, flag_column] = 'interpolated'

    # --- 2. keep the original hit points ---
    # while interpolation is useful for the eye movement detection, it creates unrealistic
    # hit points at places where there might be no objects in the virtual reality
    # --> for some analyses or visualizations, the original hit points might be useful
    hit_point_columns = [
        f"{prefix}_hitPointOnObject_{axis}"
        for prefix in ("processedCollider", "processedCollider_NH")
        for axis in "xyz"
    ]
    for column_name in hit_point_columns:
        dataF[f"original_{column_name}"] = dataF[column_name]

    # --- 3. interpolate eye information and hit point coordinates ---
    # collider names and collider center coordinates are not interpolated here
    eye_columns = [
        f"{quantity}_{axis}"
        for eye in ("Combined", "Left", "Right")
        for quantity in (
            f"eyePosition{eye}World",
            f"eyeDirection{eye}World",
            f"eyeDirection{eye}Local",
        )
        for axis in "xyz"
    ]
    columns_to_interpolate = set(eye_columns) | set(hit_point_columns)

    timestamps = dataF[time_column].to_numpy()
    n_rows = len(dataF)

    removed_rows = None          # eye data missing before interpolation
    interpolated_rows = None     # eye data missing before, present afterwards
    interpolated_hp_rows = None  # the same for the hit point data

    for column_name in list(dataF.columns):
        if column_name not in columns_to_interpolate:
            continue

        was_missing = np.asarray(pd.isna(dataF[column_name]), dtype=bool)
        filled = dataF[column_name].interpolate(
            method="linear", limit_direction="both"
        ).to_numpy(copy=True)

        # start (inclusive) and stop (exclusive) position of every run of missing values
        padded = np.concatenate(([False], was_missing, [False]))
        run_edges = np.flatnonzero(padded[1:] != padded[:-1])

        for start, stop in zip(run_edges[0::2], run_edges[1::2]):
            # a gap ends at the first valid sample after it; a gap that reaches the end
            # of the data has no such sample, so the last time stamp is used instead
            gap_end_time = timestamps[stop] if stop < n_rows else timestamps[-1]
            if gap_end_time - timestamps[start] > max_gap_duration:
                filled[start:stop] = np.nan

        # replace the column with the interpolated one
        dataF[column_name] = filled

        if column_name == 'eyePositionCombinedWorld_x':
            removed_rows = was_missing.copy()
            interpolated_rows = was_missing & ~np.asarray(pd.isna(filled), dtype=bool)
        if column_name == 'processedCollider_hitPointOnObject_x':
            interpolated_hp_rows = was_missing & ~np.asarray(pd.isna(filled), dtype=bool)

    # --- 4. add the tracking columns ---
    if removed_rows is not None:
        dataF['removedData'] = removed_rows
        dataF['interpolated'] = interpolated_rows
    if interpolated_hp_rows is not None:
        dataF['interpolatedHitPoint'] = interpolated_hp_rows

    return dataF

# Smoothing the Data: 5-point median Filter

In [2]:
# smooth coordinates with 5-point median filter
# based on remodnav --> has almost the same length as our filter
def _clipped_nanmedian_filter(values, half_width=2):
    """Running median over a centred window that ignores NaN samples.

    For sample ``s`` the window is ``values[s - half_width : s + half_width + 1]``,
    clipped at the start and end of the sequence (so edge windows are shorter).
    NaN entries inside a window are ignored; a window containing only NaN
    yields NaN.

    Parameters
    ----------
    values : 1-D array-like of numbers
        Samples to filter; converted to a float array.
    half_width : int, default 2
        Samples taken on each side of the centre (2 -> 5-point filter).

    Returns
    -------
    numpy.ndarray
        Float array with the same length as ``values``.
    """
    samples = np.asarray(values, dtype=float)
    n_samples = samples.shape[0]
    if n_samples == 0:
        return samples.copy()

    # Padding both ends with NaN reproduces the clipped edge windows:
    # nanmedian simply ignores the padding.
    padded = np.pad(samples, half_width, mode="constant", constant_values=np.nan)
    window_len = 2 * half_width + 1
    # Row s holds padded[s : s + window_len], i.e. the window centred on sample s.
    windows = np.column_stack(
        [padded[offset : offset + n_samples] for offset in range(window_len)]
    )

    with warnings.catch_warnings():
        # Windows that contain only NaN are expected (gaps that were not
        # interpolated); their result is NaN and needs no warning.
        warnings.simplefilter("ignore", category=RuntimeWarning)
        return np.nanmedian(windows, axis=1)


def smooth_coordinates(file, data):
    """Smooth eye-position and hit-point coordinates with a 5-point median filter.

    Each of the nine coordinate columns listed below is replaced by its
    NaN-ignoring running median (window of 5 samples, shorter at the file
    start and end).

    Parameters
    ----------
    file : str
        Name of the file being processed. Not used by the computation; kept
        so existing calls keep working.
    data : pandas.DataFrame
        Must contain all nine coordinate columns; a missing column raises
        ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, modified in place.
    """
    columns_to_smooth = [
        "eyePositionCombinedWorld_x",
        "eyePositionCombinedWorld_y",
        "eyePositionCombinedWorld_z",
        "processedCollider_hitPointOnObject_x",
        "processedCollider_hitPointOnObject_y",
        "processedCollider_hitPointOnObject_z",
        # NH collider columns as well
        "processedCollider_NH_hitPointOnObject_x",
        "processedCollider_NH_hitPointOnObject_y",
        "processedCollider_NH_hitPointOnObject_z",
    ]

    # Filter every column from the unsmoothed input before writing anything
    # back, so a missing column raises before `data` is partially modified.
    smoothed = {
        name: _clipped_nanmedian_filter(data[name]) for name in columns_to_smooth
    }
    for name, values in smoothed.items():
        data[name] = values

    return data


## Loop over all data files and apply cleaning, interpolation & smoothing functions

In [5]:
# Batch driver: for every participant and recording session, remove duplicated
# timestamps, then clean, interpolate and smooth each prepared eye-tracking file,
# saving the output of every stage into its own folder.
#
# This cell uses names that are not defined here and must exist before it runs:
# PartList, DATA_PATH, B_FORMAT, data_savepath_cleaned, data_savepath_interpolated,
# data_savepath_smoothed, convertTimeStamp, removeDuplicates, clean_et_data,
# interpolate_et_data and smooth_coordinates. On a kernel where PartList has not
# been defined the cell stops with a NameError.
print('start')

# useful overviews
noFilePartList = [len(PartList)]
missing_file_records = []  # one record per participant/session without any file
list_dublicates = []  # [file, initial rows, removed rows, percentage removed]

# List the data folder once instead of once per participant/session; sorted so
# that the processing order (and the row order of the overview) is reproducible.
prepared_files = sorted(
    f for f in os.listdir(DATA_PATH) if f.endswith('_data_prepared.csv')
)

# loop code over all participants in participant list
parts_pbar = tqdm(
    iterable=PartList,
    total=len(PartList),
    desc="participants",
    dynamic_ncols=True,
    bar_format=B_FORMAT,
)

for indexPart, currentPart in enumerate(parts_pbar):
    print(f'Participant {indexPart} - participant ID {currentPart}')

    # recording sessions (should be 5 for each participant)
    sessions = list(range(1, 6))
#     sessions = list(range(2,3))  # restrict to a single session while debugging

    session_pbar = tqdm(
        iterable=sessions,
        total=len(sessions),
        desc="sessions________",
        dynamic_ncols=True,
        bar_format=B_FORMAT,
    )

    for indexSess in session_pbar:

        # get eye tracking files of this session (amount of ET files can vary)
        session_prefix = f'{currentPart}_Session_{indexSess}'
        dirSess = [f for f in prepared_files if f.startswith(session_prefix)]

        if not dirSess:
            # remember the missing session and move on to the next one
            missing_file_records.append({'Participant': currentPart, 'Session': indexSess})
            continue

        # Main part - runs if files exist
        file_pbar = tqdm(
            iterable=dirSess,
            total=len(dirSess),
            desc="📂 ET files____________",
            dynamic_ncols=True,
            bar_format=B_FORMAT,
        )

        for file in file_pbar:
            print('process file ', file)

            # read in data
            data = pd.read_csv(os.path.join(DATA_PATH, file))

            # remove all duplicates of the start timestamp and record how many rows went
            # --> no need to use the converted time stamp yet
            initial_rows = len(data)
            data = data[~data['timeStampDataPointStart'].duplicated(keep="first")].reset_index(drop=True)
            removed_rows = initial_rows - len(data)
            # an empty file has nothing to remove; avoid dividing by zero
            percentage_removed = (removed_rows / initial_rows) * 100 if initial_rows else 0.0

            list_dublicates.append([file, initial_rows, removed_rows, percentage_removed])

            ################################ apply functions to data ################################

            # convert the start time stamp by subtracting the first timestamp of the file,
            # like this, the converted timestamp starts at 0 and represents the sec passed since the start of the recording
            data = convertTimeStamp(data)

            # remove all duplicate rows (due to the bug in the saving script resulting in a too fast running coroutine)
            data = removeDuplicates(data)

            # apply the cleaning function to the data
            data = clean_et_data(data, file)

            # save the cleaned data
            newName = file.replace("data_prepared.csv", "data_cleaned.csv")
            data.to_csv(os.path.join(data_savepath_cleaned, newName), index=False)

            # interpolate the data if the consecutive missing data cluster is less than 250 ms long
            data = interpolate_et_data(data)

            # save the interpolated data
            newName = file.replace("data_prepared.csv", "data_interpolated.csv")
            data.to_csv(os.path.join(data_savepath_interpolated, newName), index=False)

            # smooth the data
            data = smooth_coordinates(file, data)

            # save the smoothed data
            saveName = file.replace("data_prepared.csv", "data_smoothed.csv")
            data.to_csv(os.path.join(data_savepath_smoothed, saveName), index=False)

            # delete data variable to improve memory efficiency
            del data

# overview tables, built once from the collected records
missingFiles = pd.DataFrame(missing_file_records, columns=['Participant', 'Session'])
info_dublicates = pd.DataFrame(list_dublicates, columns=['File', 'Initial Rows', 'Removed Rows', 'Percentage Removed'])

# next step - the data will be resampled to create a consistent sampling rate.
# This will be done in Matlab, since the processing functions there are convenient.
info_dublicates.head()

start


NameError: name 'PartList' is not defined

In [21]:
# store the per-file duplicate overview next to the cleaned data (row index not written)
info_dublicates.to_csv(
    os.path.join(data_savepath_cleaned, "info_dublicates.csv"),
    index=False,
)