# order_check_labeling_df script to order and check the hand-labeling files

In [None]:
%reset

In [2]:
import copy  # copy big/deep objects by value
import csv
import datetime  # datetime operations
import itertools  # operate with iterators
import json  # read/write from/into json format
import math
import os  # OS operations (read/write files/folders)
import sys
import time
import warnings  # hide warnings
from collections import Counter
from itertools import groupby
import matplotlib

# process parallelization
from multiprocessing import Manager, Pool, RawArray, cpu_count
from os.path import exists

import matplotlib.pyplot as plt  # mother of plots focr Python

# import mlxtend
import numpy as np  # array/matrix operations (e.g. linear algebra)
import pandas as pd  # operate with dataframes
import pyxdf  # read XDF files (LSL streams recordings)

import scipy.stats
import seaborn as sns  # matplotlib plotting nice with shortcuts
from IPython.display import Markdown, display  # print nicely
from ipywidgets import IntProgress
#from matplotlib.pyplot import cm
from scipy.signal import savgol_coeffs
from tqdm.notebook import tqdm, trange  # mother of progressbars
from scipy.stats import ks_2samp
from multiprocessing import Pool, cpu_count

import scipy.signal

In [3]:
# Folder locations used by the sorting/checking cell of this notebook.

# Output folder: reduced tables (frame number, timestamp, house number)
# prepared for the graph creation.
savepathGraphPrep = "E:/Cyprus_project_overview/data/graphs/graphPrepETdata"

# Output folder: full copies of the labelled files, sorted by frame number.
savepath = "E:/Cyprus_project_overview/data/ETdata/labeled_ET_data_sorted"

# Input folder: hand-labelled fixation files that are not yet sorted by frame number.
datapath = "E:/Cyprus_project_overview/data/ETdata/oldData/final_labeling_data_frame_nr_NOT_sorted"

In [9]:

# Sort every hand-labelled eye-tracking file by frame number, sanity-check the
# result, and write two outputs per file: the full sorted table and a reduced
# three-column table for the graph creation in Matlab.

# Counters for failed checks
problem_checkIndex = 0  # files whose stored row index is not 0..n-1 after sorting
problem_checkTSorder = 0  # files whose timestamps are not strictly increasing

indexChecks = 0  # files that contain a stored row index column to check

allTimestamp = []  # every timestamp of every file, in processing order

allFilesCounter = 0

# to_csv does not create missing folders, so make sure both targets exist.
os.makedirs(savepath, exist_ok=True)
os.makedirs(savepathGraphPrep, exist_ok=True)

# Iterate through sessions (1 to 5)
for indexSess in range(1, 6):
    # Get the eye-tracking files of this session, in alphabetical order
    session_files = sorted(
        f
        for f in os.listdir(datapath)
        if f.startswith(f"fixation_GPS_Expl_{indexSess}") and f.endswith("_labelled.csv")
    )

    # Main part - runs only if files exist for this session
    for indexET, filename in enumerate(session_files, start=1):
        allFilesCounter += 1
        print(f"Process file: Session_{indexSess}_ET_{indexET}: {filename}")

        # Read the data
        filepath = os.path.join(datapath, filename)
        data_raw = pd.read_csv(filepath)

        # Sort by "frame_nr" in ascending order. A stable sort keeps rows that
        # share a frame number in their original file order, so the result is
        # reproducible between runs.
        data_sorted = data_raw.sort_values(
            by="frame_nr", ascending=True, ignore_index=True, kind="mergesort"
        )

        # pandas names an unlabeled first CSV column "Unnamed: 0".
        # NOTE(review): presumably this is the row index saved by an earlier
        # export - confirm. After sorting it is expected to read 0, 1, 2, ...
        if "Unnamed: 0" in data_sorted.columns:
            indexChecks += 1
            # Generate the expected order
            test_order = pd.Series(range(len(data_sorted)))

            # ignore_index=True above already gives a 0..n-1 index, so both
            # series align position by position.
            is_sorted = data_sorted["Unnamed: 0"].eq(test_order)

            if not is_sorted.all():
                print("Problem in sorting detected!!!!!!!!!!")
                problem_checkIndex += 1

        timestamps = data_sorted["timestamp_[ns]"]

        # diff() has no predecessor for the first row (NaN), so that row is
        # excluded from the check.
        is_increasing = timestamps.diff().iloc[1:].gt(0)

        if not is_increasing.all():
            print("Problem detected: timestamp_[ns] values are not increasing!")
            problem_checkTSorder += 1

        # Keep ALL timestamps, including the first one of each file; otherwise
        # the cross-file check below never compares the end of one file with
        # the true start of the next.
        allTimestamp.extend(timestamps.tolist())
        data_sorted.to_csv(os.path.join(savepath, filename), index=False)

        ##################################################################################
        ## The csv files are formatted so differently that Matlab cannot open them
        ## reliably. Therefore a reduced data frame is saved specifically for the
        ## graph creation in Matlab.
        columns_to_keep = ["frame_nr", "timestamp_[ns]", "house_nr"]
        data_selected = data_sorted[columns_to_keep]

        saveName = filename.replace("fixation_GPS", "graphPrep")
        data_selected.to_csv(os.path.join(savepathGraphPrep, saveName), index=False)
        ######################################################################################


print(f"out of {indexChecks} index checks, {problem_checkIndex} were not successful")
print(f" {problem_checkTSorder} timestamp order checks were not successful")


# Check that all timestamps are strictly increasing across all files.
# Every neighbouring pair is compared, starting with elements 0 and 1.
is_increasing = all(t1 < t2 for t1, t2 in zip(allTimestamp, allTimestamp[1:]))

if is_increasing:
    print("All timestamps are strictly increasing across all files.")
else:
    print("Timestamps are NOT strictly increasing across all files!")

# Print the number of collected timestamps for debugging
print(f"Number of timestamps collected: {len(allTimestamp)}")

print(f"{allFilesCounter} files were processed")

print("done")

Process file: Session_1_ET_1: fixation_GPS_Expl_1_ET_1_labelled.csv
Process file: Session_1_ET_2: fixation_GPS_Expl_1_ET_2_labelled.csv
Process file: Session_1_ET_3: fixation_GPS_Expl_1_ET_3_labelled.csv
Process file: Session_2_ET_1: fixation_GPS_Expl_2_ET_1_labelled.csv
Process file: Session_2_ET_2: fixation_GPS_Expl_2_ET_2_labelled.csv
Process file: Session_2_ET_3: fixation_GPS_Expl_2_ET_3_labelled.csv
Process file: Session_3_ET_1: fixation_GPS_Expl_3_ET_1_labelled.csv
Process file: Session_3_ET_2: fixation_GPS_Expl_3_ET_2_labelled.csv
Process file: Session_3_ET_3: fixation_GPS_Expl_3_ET_3_labelled.csv
Process file: Session_4_ET_1: fixation_GPS_Expl_4_ET_1_labelled.csv
Process file: Session_4_ET_2: fixation_GPS_Expl_4_ET_2_labelled.csv
Process file: Session_5_ET_1: fixation_GPS_Expl_5_ET_1_labelled.csv
Process file: Session_5_ET_2: fixation_GPS_Expl_5_ET_2_labelled.csv
out of 12 index checks, 0 were not successful
 0 timestamp order checks were not successful
All timestamps are stric