# Kalman Filter Multi-step Prediction

## Import and Initial Setup
Set directory to load log files from.

In [1]:
import os
import rospkg

# Resolve the on-disk location of the 'hri_predict_ros' package through rospkg,
# so that every path below is relative to the package and not to the current
# working directory of the notebook kernel.
rospack = rospkg.RosPack()
package_path = rospack.get_path('hri_predict_ros')

# Input data: everything is read from <package>/logs
log_dir = os.path.join(package_path, 'logs')
bag_dir = os.path.join(log_dir, 'bag')        # rosbag files
gui_dir = os.path.join(log_dir, 'gui_data')   # gui data
npz_dir = os.path.join(log_dir, 'npz')        # npz files (one sub-folder per subject)

# Output: plots are written to <package>/plots.
# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
plot_dir = os.path.join(package_path, 'plots')
os.makedirs(plot_dir, exist_ok=True)

Load the npz files into one pandas DataFrame per subject

In [2]:
import os, sys
import pandas as pd
import numpy as np

# Every sub-directory of npz_dir is treated as one subject
folders = [f for f in os.listdir(npz_dir) if os.path.isdir(os.path.join(npz_dir, f))]

# Maps the subject folder name to the DataFrame built from all of its npz files
npz_dfs = {}

for folder in folders:
    folder_path = os.path.join(npz_dir, folder)

    # Sort numerically on the second '_'-separated token of the file name (up to
    # its first '.'), so that index 10 comes after index 9 rather than after 1.
    npz_files = [f for f in os.listdir(folder_path) if f.endswith('.npz')]
    sorted_npz_files = sorted(npz_files, key=lambda s: int(s.split('_')[1].split('.')[0]))

    print("Processing folder '%s' with %d npz files" % (folder, len(sorted_npz_files)))

    if not sorted_npz_files:
        # pd.concat raises ValueError on an empty list, so skip this subject
        print("No npz files found for subject %s, skipping\n" % folder)
        continue

    # Per-file frames, concatenated once after the loop
    frames = []

    for npz_file in sorted_npz_files:
        npz_file_path = os.path.join(folder_path, npz_file)

        # np.load returns a lazily-read NpzFile that holds the archive open;
        # read every array inside the `with` block so the handle is released
        # before moving on to the next file.
        with np.load(npz_file_path) as data:
            arrays = {item: data[item] for item in data.files}

        # Keys of the archive become the columns of the frame
        df = pd.DataFrame.from_dict(arrays, orient='index')
        df = df.transpose()

        frames.append(df)

    # Stack the per-file frames in file-index order
    df_all = pd.concat(frames, ignore_index=True)

    print("Processed subject %s. Number of rows: %d\n" % (folder, len(df_all)))

    npz_dfs[folder] = df_all

# Show the result once, after all subjects are loaded (printing inside the loop
# would dump the whole growing dictionary again for every subject).
print(npz_dfs)

Processing folder 'sub_3' with 2449 npz files
Processed subject sub_3. Number of rows: 2449

{'sub_3':                timestamp                                     human_meas_pos  \
0     1716796745.5509915  [0.819883158784425, -0.05287422669015035, 1.68...   
1      1716796745.651167  [0.8205125603757503, -0.052190181002376324, 1....   
2      1716796745.750892  [0.8206716144323638, -0.05128366088315833, 1.6...   
3     1716796745.8510232  [0.8209651998729581, -0.05046633399422007, 1.6...   
4     1716796745.9510567  [0.8200319485494555, -0.051277938283696534, 1....   
...                  ...                                                ...   
2444  1716797087.2509308  [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...   
2445  1716797087.3509202  [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...   
2446  1716797087.4510727  [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...   
2447  1716797087.5512564  [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...   
2448  1716797087.6506524  [n

Import gui_data for each subject

In [3]:
import os
import pandas as pd

# List whatever is stored in the GUI log directory
file_names = os.listdir(gui_dir)

# NOTE(review): debug leftover - the listing above is discarded and only this
# hard-coded subset of subjects is loaded. Confirm whether this is still wanted.
file_names = ['gui_log_sub_3.txt', 'gui_log_sub_4.txt', 'gui_log_sub_6.txt', 'gui_log_sub_7.txt']

# Maps a subject key (e.g. 'sub_3') to the DataFrame read from its GUI log
gui_dfs = {}

for file_name in file_names:
    # Only .txt files are GUI logs; anything else in the list is ignored
    if not file_name.endswith('.txt'):
        continue

    # Despite the extension the logs are comma-separated, so read_csv parses them
    file_path = os.path.join(gui_dir, file_name)
    df = pd.read_csv(file_path)

    # The key is the last two '_'-separated tokens of the name without extension,
    # e.g. 'gui_log_sub_3.txt' -> 'sub_3'
    key = '_'.join(file_name.split('.')[0].split('_')[-2:])
    gui_dfs[key] = df

# DEBUG: print the whole dictionary of GUI dataframes
print(gui_dfs)

{'sub_3':                   Timestamp     Task_name Velocity  Instruction_id  \
0   2024-05-27 09:59:25.821  PICK-&-PLACE     SLOW               0   
1   2024-05-27 09:59:32.939  PICK-&-PLACE     SLOW               1   
2   2024-05-27 09:59:40.066  PICK-&-PLACE     SLOW               2   
3   2024-05-27 09:59:47.190  PICK-&-PLACE     SLOW               3   
4   2024-05-27 09:59:54.305  PICK-&-PLACE     SLOW               4   
5   2024-05-27 10:00:01.437  PICK-&-PLACE     SLOW               5   
6   2024-05-27 10:00:08.552  PICK-&-PLACE     SLOW               6   
7   2024-05-27 10:00:15.675  PICK-&-PLACE     SLOW               7   
8   2024-05-27 10:00:22.795  PICK-&-PLACE     SLOW               8   
9   2024-05-27 10:00:34.994       WALKING     SLOW               0   
10  2024-05-27 10:00:41.088       WALKING     SLOW               1   
11  2024-05-27 10:00:47.182       WALKING     SLOW               2   
12  2024-05-27 10:00:53.275       WALKING     SLOW               3   
13  2024-0

Create lists of column names

In [13]:
def _keypoint_columns(count, suffixes):
    """Return the names 'kp<i>_<suffix>' for i in range(count), keypoint-major.

    All suffixes of keypoint 0 come first, then all suffixes of keypoint 1, etc.
    """
    return ['kp%d_%s' % (idx, sfx) for idx in range(count) for sfx in suffixes]


n_keypoints = 18

# State columns: nine entries per keypoint (presumably position, velocity and
# acceleration along x, y and z - confirm against the filter's state layout).
state_names = _keypoint_columns(
    n_keypoints,
    ['x', 'xd', 'xdd', 'y', 'yd', 'ydd', 'z', 'zd', 'zdd'],
)
print(state_names)
print(len(state_names))

# Measurement columns: only the x, y, z entries of each keypoint
measurement_names = _keypoint_columns(n_keypoints, ['x', 'y', 'z'])
print(measurement_names)
print(len(measurement_names))

['kp0_x', 'kp0_xd', 'kp0_xdd', 'kp0_y', 'kp0_yd', 'kp0_ydd', 'kp0_z', 'kp0_zd', 'kp0_zdd', 'kp1_x', 'kp1_xd', 'kp1_xdd', 'kp1_y', 'kp1_yd', 'kp1_ydd', 'kp1_z', 'kp1_zd', 'kp1_zdd', 'kp2_x', 'kp2_xd', 'kp2_xdd', 'kp2_y', 'kp2_yd', 'kp2_ydd', 'kp2_z', 'kp2_zd', 'kp2_zdd', 'kp3_x', 'kp3_xd', 'kp3_xdd', 'kp3_y', 'kp3_yd', 'kp3_ydd', 'kp3_z', 'kp3_zd', 'kp3_zdd', 'kp4_x', 'kp4_xd', 'kp4_xdd', 'kp4_y', 'kp4_yd', 'kp4_ydd', 'kp4_z', 'kp4_zd', 'kp4_zdd', 'kp5_x', 'kp5_xd', 'kp5_xdd', 'kp5_y', 'kp5_yd', 'kp5_ydd', 'kp5_z', 'kp5_zd', 'kp5_zdd', 'kp6_x', 'kp6_xd', 'kp6_xdd', 'kp6_y', 'kp6_yd', 'kp6_ydd', 'kp6_z', 'kp6_zd', 'kp6_zdd', 'kp7_x', 'kp7_xd', 'kp7_xdd', 'kp7_y', 'kp7_yd', 'kp7_ydd', 'kp7_z', 'kp7_zd', 'kp7_zdd', 'kp8_x', 'kp8_xd', 'kp8_xdd', 'kp8_y', 'kp8_yd', 'kp8_ydd', 'kp8_z', 'kp8_zd', 'kp8_zdd', 'kp9_x', 'kp9_xd', 'kp9_xdd', 'kp9_y', 'kp9_yd', 'kp9_ydd', 'kp9_z', 'kp9_zd', 'kp9_zdd', 'kp10_x', 'kp10_xd', 'kp10_xdd', 'kp10_y', 'kp10_yd', 'kp10_ydd', 'kp10_z', 'kp10_zd', 'kp10_zdd', 

Manipulate time series
- Change the datatype for the timestamp from string to datetime
- Separate measured, filtered and predicted values into different dataframes

In [18]:
def _expand_array_columns(frame, column_names):
    """Spread the listed columns of `frame` into one column per element.

    Each column `c` in `column_names` is expanded with `Series.apply(pd.Series)`
    and its new columns are named `c_0`, `c_1`, ...; a column holding scalars
    therefore becomes the single column `c_0`. The expanded groups are joined
    side by side in the order of `column_names`. `frame` itself is not modified.
    """
    return pd.concat(
        [frame[c].apply(pd.Series).add_prefix(c + "_") for c in column_names],
        axis=1,
    )


# GUI_LOG DATAFRAMES (gui_dfs)
# Parse the 'Timestamp' column in place; re-running on parsed values is harmless.
for df in gui_dfs.values():
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# NPZ DATAFRAMES (npz_dfs)
for df in npz_dfs.values():
    # Convert the timestamp of EVERY row. Indexing the column with [0] would pick
    # only the first row and broadcast that single instant to the whole frame.
    # map(float) unwraps each cell first (presumably a 0-d array or numeric
    # string as loaded from the npz files - confirm), then the values are read
    # as seconds. The dtype guard keeps the cell idempotent: on a re-run the
    # column is already datetime and must not be converted again.
    if not pd.api.types.is_datetime64_any_dtype(df['timestamp']):
        df['timestamp'] = pd.to_datetime(df['timestamp'].map(float), unit='s')

    # Rename 'timestamp' column to 'Timestamp' (returns a new frame; the frame
    # stored in npz_dfs keeps the original column name)
    df = df.rename(columns={'timestamp': 'Timestamp'})

    columns = df.columns.values
    meas_columns = ['Timestamp', 'human_meas_pos']
    filt_columns = ['human_filt_x', 'human_filt_var']
    pred_columns = ['pred_human_x', 'pred_human_var']

    # Measured and filtered values get one column per array element.
    # The predicted values are only selected here, not expanded.
    df_meas = _expand_array_columns(df, meas_columns)
    df_filt = _expand_array_columns(df, filt_columns)
    df_pred = df[pred_columns]

    print(df_meas.columns.values)
    print(df_meas.shape)

    print(df_filt.columns.values)
    print(df_filt.shape)

# TODO: pivot the expanded frames on the timestamp and resample them to a known
# frequency (earlier experiments used dt = 0.01 s and the mean of each time bin).

Columns:  ['Timestamp' 'human_meas_pos' 'human_filt_x' 'human_filt_var'
 'pred_human_x' 'pred_human_var']
Scalar Columns:  ['Timestamp', 'human_meas_pos', 'human_filt_x', 'human_filt_var']
['Timestamp_0' 'human_meas_pos_0' 'human_meas_pos_1' 'human_meas_pos_2'
 'human_meas_pos_3' 'human_meas_pos_4' 'human_meas_pos_5'
 'human_meas_pos_6' 'human_meas_pos_7' 'human_meas_pos_8'
 'human_meas_pos_9' 'human_meas_pos_10' 'human_meas_pos_11'
 'human_meas_pos_12' 'human_meas_pos_13' 'human_meas_pos_14'
 'human_meas_pos_15' 'human_meas_pos_16' 'human_meas_pos_17'
 'human_meas_pos_18' 'human_meas_pos_19' 'human_meas_pos_20'
 'human_meas_pos_21' 'human_meas_pos_22' 'human_meas_pos_23'
 'human_meas_pos_24' 'human_meas_pos_25' 'human_meas_pos_26'
 'human_meas_pos_27' 'human_meas_pos_28' 'human_meas_pos_29'
 'human_meas_pos_30' 'human_meas_pos_31' 'human_meas_pos_32'
 'human_meas_pos_33' 'human_meas_pos_34' 'human_meas_pos_35'
 'human_meas_pos_36' 'human_meas_pos_37' 'human_meas_pos_38'
 'human_meas_p