# Introduction
This is the better-organized version of the pipeline for turning data from SimpleLogger.cs into csv files which the DEPTH visualizer can use in Unity. 

# Section 1: Put Everything Into One Big Data Frame

`process_file` takes a position csv and an interaction csv and returns a single data frame with all of the information you need, for addition to The Big Data Frame (combined_df). This function will not work with data produced by SimpleLogger.cs, because SimpleLogger.cs uses a slightly different format. The changes required for compatibility with data produced by SimpleLogger shouldn't be huge, but some of the hard-coded column numbers will no longer correspond to the correct columns.

simplify_df should reduce that giant data frame to a dataframe that contains only what you need.

In [1]:
def process_file(positions_file, interactions_file, participant_number,
                 duration_column=2, step_column=4):
    """Label every position sample with the step it was recorded during.

    Reads one positions csv and one interactions csv, keeps only the
    interaction rows whose 'interaction' value is 'step', and for each of
    those steps tags every position row whose 'timeStamp' lies strictly
    between the step's 'startTime' and 'endTime'.

    Args:
        positions_file: path or file-like object accepted by pd.read_csv;
            must contain a 'timeStamp' column.
        interactions_file: path or file-like object accepted by pd.read_csv;
            must contain 'interaction', 'startTime' and 'endTime' columns.
        participant_number: value written to the 'participantNumber' column
            of every returned row.
        duration_column: zero-based position, in the interactions csv, of the
            column copied into 'duration'. Defaults to 2.
        step_column: zero-based position, in the interactions csv, of the
            column copied into 'currentStep'. Defaults to 4.

    Returns:
        The positions DataFrame with 'timeStamp' parsed to datetimes and three
        added columns: 'participantNumber', 'currentStep' and 'duration'.
        Rows that fall inside no step keep None in the last two.

    Raises:
        ValueError: if a time string does not match '%H:%M:%S.%f'.
    """
    positions = pd.read_csv(positions_file)
    interactions = pd.read_csv(interactions_file)

    # Add participant number to the positions DataFrame
    positions['participantNumber'] = participant_number

    # get just the interactions where we're going to the next step --
    # really should not be necessary when using SimpleLogger
    only_steps = interactions[interactions['interaction'] == 'step'].copy()

    # The logs hold a time of day only, so every parsed value lands on the
    # same default date and the values stay directly comparable.
    datetime_format = '%H:%M:%S.%f'
    only_steps['startTime'] = pd.to_datetime(only_steps['startTime'], format=datetime_format)
    only_steps['endTime'] = pd.to_datetime(only_steps['endTime'], format=datetime_format)
    positions['timeStamp'] = pd.to_datetime(positions['timeStamp'], format=datetime_format)

    # Initialize the 'currentStep', 'duration' columns
    positions['currentStep'] = None
    positions['duration'] = None

    time_stamps = positions['timeStamp']
    step_rows = zip(
        only_steps['startTime'],
        only_steps['endTime'],
        only_steps.iloc[:, duration_column],
        only_steps.iloc[:, step_column],
    )

    counter = 0  # number of steps that could not be used (missing endTime)

    # One vectorized comparison per step instead of one scalar comparison per
    # (position, step) pair. Steps are visited in file order, so when steps
    # overlap the later one overwrites the earlier one.
    for start_time, end_time, duration, step_number in step_rows:
        if pd.isna(end_time):
            counter += 1
            continue
        # Both bounds are exclusive; missing timestamps compare False and so
        # are never labelled.
        in_step = (time_stamps > start_time) & (time_stamps < end_time)
        positions.loc[in_step, 'currentStep'] = step_number
        positions.loc[in_step, 'duration'] = duration

    print("Number of errors found in endTime parsing:")
    print(counter)

    return positions

# Make the Very Big Data Frame
By calling the above function... so many times. Takes a few minutes to run.

In [2]:
import pandas as pd

#P7 WAS RECORDED IN THE WRONG SCENE-- DO NOT INCLUDE

# One entry per recording session: (positions csv, interactions csv, participant number).
# NOTE(review): the first three participant numbers (1, 2, 3) do not match
# their folder names (P2, P3, P3) the way later entries do -- confirm intended.
files_to_process = [
    ('P2/P2_positions.csv', 'P2/P2_interactions.csv', 1),
    ('P3/P3_positions.csv', 'P3/P3_interactions.csv', 2),
    ('P3/P3_positions2.csv', 'P3/P3_interactions2.csv', 3),
    ('P4/P4_positions.csv', 'P4/P4_interactions.csv', 4),
    ('P5/P5_positions.csv', 'P5/P5_interactions.csv', 5),
    ('P6/P6_positions.csv', 'P6/P6_interactions.csv', 6),
    ('P8/P8_positions.csv', 'P8/P8_interactions.csv', 8),
    ('P9/P9_positions.csv', 'P9/P9_interactions.csv', 9),
    ('P10/P10_positions.csv', 'P10/P10_interactions.csv', 10),
    ('P11/P11_positions.csv', 'P11/P11_interactions.csv', 11),
    ('P12/P12_positions.csv', 'P12/P12_interactions.csv', 12),
    ('P13/P13_positions.csv', 'P13/P13_interactions.csv', 13),
    ('P16/P16_positions.csv', 'P16/P16_interactions.csv', 16),
    ('P17/P17_positions.csv', 'P17/P17_interactions.csv', 17),
    ('P18/P18_positions.csv', 'P18/P18_interactions.csv', 18),
    ('P19/P19_positions.csv', 'P19/P19_interactions.csv', 19),
    ('P20/P20_positions.csv', 'P20/P20_interactions.csv', 20),
    ('P21/P21_positions.csv', 'P21/P21_interactions.csv', 21),
]

# Run every session through process_file, keeping the results in list order.
processed_dfs = [
    process_file(positions_path, interactions_path, participant_id)
    for positions_path, interactions_path, participant_id in files_to_process
]

# Stack the per-session frames into one frame with a fresh 0..n-1 index.
combined_df = pd.concat(processed_dfs, ignore_index=True)

print("done!")
print("number of rows in THE BIG DATA FRAME:")
print(combined_df.shape)

Number of errors found in endTime parsing:
0
Number of errors found in endTime parsing:
0
Number of errors found in endTime parsing:
0
Number of errors found in endTime parsing:
0
Number of errors found in endTime parsing:
0
Number of errors found in endTime parsing:
0
Number of errors found in endTime parsing:
0
Number of errors found in endTime parsing:
0
Number of errors found in endTime parsing:
0
Number of errors found in endTime parsing:
0
Number of errors found in endTime parsing:
0
Number of errors found in endTime parsing:
0
Number of errors found in endTime parsing:
0
Number of errors found in endTime parsing:
0
Number of errors found in endTime parsing:
0
Number of errors found in endTime parsing:
0
Number of errors found in endTime parsing:
0
Number of errors found in endTime parsing:
0
done!
number of rows in THE BIG DATA FRAME:
(86159, 13)


In [12]:
# check that it worked...
# Prints the leading rows of the combined frame (DataFrame.head with its default row count).
print(combined_df.head())

                      timeStamp        headPosition              headEuler  \
0 1900-01-01 11:19:07.683963100  (-1.5: -0.1: 17.7)      (0.0: 180.0: 0.0)   
1 1900-01-01 11:19:08.565464700   (-1.1: 1.9: 17.4)  (359.6: 170.2: 359.9)   
2 1900-01-01 11:19:08.898467400   (-1.1: 1.9: 17.4)    (0.3: 170.6: 359.8)   
3 1900-01-01 11:19:09.232968300   (-1.1: 1.9: 17.4)    (1.0: 171.5: 359.9)   
4 1900-01-01 11:19:09.565469000   (-1.1: 1.9: 17.4)    (0.9: 172.5: 359.9)   

               headQuat    leftHandPosition   rightHandPosition currentLeftGO  \
0  (0.0: 1.0: 0.0: 0.0)  (-1.5: -0.1: 17.7)  (-1.5: -0.1: 17.7)       INVALID   
1  (0.0: 1.0: 0.0: 0.1)   (-1.6: 0.0: 16.6)   (-1.6: 0.0: 16.6)       INVALID   
2  (0.0: 1.0: 0.0: 0.1)   (-1.6: 0.0: 16.6)   (-1.6: 0.0: 16.6)       INVALID   
3  (0.0: 1.0: 0.0: 0.1)   (-1.6: 0.0: 16.6)   (-1.6: 0.0: 16.6)       INVALID   
4  (0.0: 1.0: 0.0: 0.1)   (-1.6: 0.0: 16.6)   (-1.6: 0.0: 16.6)       INVALID   

  currentRightGO                            

In [27]:
def simplify_df(df, columns):
    """Select `columns` from `df` using plain bracket indexing.

    Args:
        df: the DataFrame to select from.
        columns: the selector handed to df[...]; in this notebook, a list of
            column names.

    Returns:
        Whatever df[columns] evaluates to.
    """
    selected = df[columns]
    return selected

# Keep only the columns the later sections need. The helper defined in this
# notebook is simplify_df; the previous call used a name that is not defined
# anywhere in it.
smaller_df = simplify_df(combined_df, ['timeStamp', 'currentStep', 'duration', 'participantNumber','headPosition', 'rightHandPosition', 'leftHandPosition'])

print(smaller_df.loc[7000])  # Spot check: display the single row labelled 7000

timeStamp            1900-01-01 11:25:35.367872600
currentStep                                   None
duration                                      None
participantNumber                                2
headPosition                      (-2.2: 1.7: 7.8)
rightHandPosition                 (-2.4: 1.4: 7.8)
leftHandPosition                  (-2.4: 1.4: 7.7)
Name: 7000, dtype: object


# Section 2: Make Voxel Dataframes
This section takes our position data, of which there could be any quantity, and turns it into one matrix of coordinates per tracked object per frame. Every voxel for which combined_df has data is associated with a DEPTH value. In this step, we abstract away from information about specific participants, or information about how much data is associated with each voxel. 

In [29]:
#creates a second data frame, calculated from the big one, of just voxels, stepNumbers, objects, and struggle values.
#one dataframe per tracked object
#for efficiency reasons, it might eventually make sense to make a separate data frame for each step
#but that sounds like it might make the files more annoying to work with

#this takes ~5 seconds to run
def create_voxel_df(column_name, df):
    """Aggregate step durations per (x, y, z, step) for one tracked object.

    Each value in df[column_name] is a position string of the form
    '(x: y: z)'. Rows are grouped by their parsed coordinates together with
    'currentStep'; each group gets the mean of its step durations in seconds
    ('struggleValue') and the number of rows that had a duration
    ('rowCount'). Rows whose 'currentStep' is missing fall out of the
    grouping.

    The input frame is left untouched: all intermediate columns live on a
    private working frame.

    Args:
        column_name: name of the position column to parse.
        df: DataFrame holding column_name, 'currentStep' and 'duration'
            (anything pd.to_timedelta accepts).

    Returns:
        A new DataFrame with columns voxel_x, voxel_y, voxel_z, currentStep,
        struggleValue and rowCount, sorted by currentStep. Empty input gives
        an empty frame with those same columns.
    """
    # Function to parse position string and convert to voxel coordinates
    def parse_position_to_voxel(pos_str):
        x, y, z = map(float, pos_str.strip('()').split(':'))
        return x, y, z

    # Building the frame from a list of tuples (rather than unpacking
    # zip(*...)) also works when there are no rows at all.
    working = pd.DataFrame(
        [parse_position_to_voxel(pos) for pos in df[column_name]],
        columns=['voxel_x', 'voxel_y', 'voxel_z'],
        index=df.index,
    )
    working['currentStep'] = df['currentStep']

    # Convert duration to a numerical value (total seconds)
    working['duration_seconds'] = pd.to_timedelta(df['duration']).dt.total_seconds()

    # Group by voxel coordinates and step number, then calculate the mean
    # duration and count the rows. Named aggregation already yields the final
    # column names, so no rename is needed afterwards.
    voxel_df = working.groupby(['voxel_x', 'voxel_y', 'voxel_z', 'currentStep']).agg(
        struggleValue=('duration_seconds', 'mean'),
        rowCount=('duration_seconds', 'count')
    ).reset_index()

    # Sort by step number
    voxel_df = voxel_df.sort_values(by='currentStep')

    print("voxel head:")
    print(voxel_df.head())
    print("voxel shape:")
    print(voxel_df.shape)

    return voxel_df

## Make Voxel DataFrames
One data frame per tracked object

In [30]:
# Build one voxel frame per tracked object: right hand, then left hand, then head.
rightHand_df, leftHand_df, headPos_df = (
    create_voxel_df(position_column, combined_df)
    for position_column in ('rightHandPosition', 'leftHandPosition', 'headPosition')
)

#save them to csv files
for voxel_frame, csv_name in (
    (rightHand_df, "rightHandVoxelVals.csv"),
    (leftHand_df, "leftHandVoxelVals.csv"),
    (headPos_df, "headPosVoxelVals.csv"),
):
    voxel_frame.to_csv(csv_name)

voxel head:
      voxel_x  voxel_y  voxel_z  currentStep  struggleValue  rowCount
923       0.9      0.8     -3.1            0      15.690191         3
327       0.6      1.1     -3.1            0       9.259005         4
325       0.6      1.1     -3.2            0      20.782579         1
324       0.6      1.1     -3.8            0       5.445603         1
4077      1.3      0.9     -3.0            0      22.845098         2
voxel shape:
(5370, 6)
voxel head:
      voxel_x  voxel_y  voxel_z  currentStep  struggleValue  rowCount
536       0.6      0.8     -3.3            0      20.782579         1
3592      1.2      0.7     -3.0            0      21.476953         1
4398      1.3      1.0     -2.8            0       9.581019         1
225       0.3      1.2     -2.6            0      20.470017         1
458       0.5      1.1     -3.9            0       5.445603         1
voxel shape:
(5112, 6)
voxel head:
      voxel_x  voxel_y  voxel_z  currentStep  struggleValue  rowCount
0       

# Section 3: Averages
When we have no data for a voxel, we should return the average duration for the overall step (rather than 0). 

In [32]:
def calculate_average_duration_per_step(df):
    """Return the average step duration, in seconds, for every step.

    The average is taken in two stages so that each participant counts once
    per step no matter how many rows they contributed: first the mean per
    (participantNumber, currentStep), then the mean of those values per
    currentStep.

    The input frame is left untouched; the seconds column is computed on a
    private working frame.

    Args:
        df: DataFrame with 'participantNumber', 'currentStep' and 'duration'
            (anything pd.to_timedelta accepts) columns.

    Returns:
        DataFrame with one row per step and the columns 'currentStep' and
        'averageDuration'.
    """
    working = df[['participantNumber', 'currentStep']].copy()

    # Convert duration to a numerical value (total seconds)
    working['duration_seconds'] = pd.to_timedelta(df['duration']).dt.total_seconds()

    # Group by participant number and 'currentStep', then calculate the average duration
    step_duration_df = (
        working.groupby(['participantNumber', 'currentStep'])['duration_seconds']
        .mean()
        .reset_index()
    )

    # Further group by 'currentStep' and calculate the overall average duration per step
    overall_step_duration_df = (
        step_duration_df.groupby('currentStep')['duration_seconds']
        .mean()
        .reset_index()
        .rename(columns={'duration_seconds': 'averageDuration'})
    )

    print("done! Here's the average duration dataframe:")
    print(overall_step_duration_df)

    return overall_step_duration_df

# Compute the per-step averages from the combined frame and save them to csv.
average_struggle_df = calculate_average_duration_per_step(combined_df)
average_struggle_df.to_csv("averageStepDuration.csv")

done! Here's the average duration dataframe:
    currentStep  averageDuration
0             0        16.917993
1             1        31.883381
2             2        21.184851
3             3        48.474720
4             4       125.149054
5             5        47.282039
6             6        32.069374
7             7        59.709139
8             8        14.358170
9             9        17.338134
10           10        26.171734
11           11        11.046651
12           12        76.970495
13           13         6.477943
14           25        35.506851
15           26        69.298364
16           27        34.435676
17           28        35.327887
18           29       103.530542
19           30        60.309307
20           31        29.562788
21           32       207.833538
22           33        42.554830
23           34        31.826103


# Section 4: Utilities
Useful stuff, but not essential for getting data re-sorted and ready to be given to the Unity visualizer.

## a function for getting the DEPTH value for a specific step and set of voxels:

In [None]:
#coordinate of voxel, step, df of all struggle indexes for one tracked object, df of average duration for steps
def get_depth_value(x, y, z, step, voxel_df, fallback_df):
    """Look up the struggle value for one voxel at one step.

    Args:
        x, y, z: voxel coordinates, compared for exact equality against the
            'voxel_x', 'voxel_y' and 'voxel_z' columns.
        step: step number, compared against 'currentStep'.
        voxel_df: per-object frame with the columns voxel_x, voxel_y, voxel_z,
            currentStep, struggleValue and rowCount.
        fallback_df: frame with the columns currentStep and averageDuration,
            used when the voxel has no entry for this step.

    Returns:
        The 'struggleValue' of the first matching row; otherwise the
        'averageDuration' of the first fallback row for the step; otherwise
        None when the step appears in neither frame.
    """
    # Query the DataFrame for the given coordinates and step number
    at_voxel = (
        (voxel_df['voxel_x'] == x)
        & (voxel_df['voxel_y'] == y)
        & (voxel_df['voxel_z'] == z)
        & (voxel_df['currentStep'] == step)
    )
    matching_row = voxel_df[at_voxel]

    if not matching_row.empty:
        struggle_value = matching_row['struggleValue'].iloc[0]
        print("matching row is: ")
        print(struggle_value)
        print("based on this many data points: ")
        # Print the count of the same first row, not the whole column slice.
        print(matching_row['rowCount'].iloc[0])
        return struggle_value

    # Handle case where there is no matching voxel
    print("No struggle value found for the following values of x, y, z, and step:")
    print(x)
    print(y)
    print(z)
    print(step)
    print("Returning average duration for this step instead: ")

    # Find and return the average duration for the given step
    fallback_row = fallback_df[fallback_df['currentStep'] == step]
    if not fallback_row.empty:
        average_duration = fallback_row['averageDuration'].iloc[0]
        print(average_duration)
        return average_duration

    print("Something went wrong-- cannot get struggle value.")
    return None  # Or a default value as needed