# Kalman Filter Multi-step Prediction

## Import and Initial Setup
Set directory to load log files from.

In [1]:
import os
import rospkg

# Create a RosPack object and resolve the path of the package this notebook belongs to
rospack = rospkg.RosPack()
package_path = rospack.get_path('hri_predict_ros')

# Define the path to the logs directory
log_dir = os.path.join(package_path, 'logs')

# Sub-directories of the logs: rosbag files, GUI data and npz files
bag_dir = os.path.join(log_dir, 'bag')
gui_dir = os.path.join(log_dir, 'gui_data')
npz_dir = os.path.join(log_dir, 'npz')

# Define the path to the plots directory and create it when missing.
# exist_ok=True replaces the separate existence check, so there is no window
# between the check and the creation in which the directory could appear.
plot_dir = os.path.join(package_path, 'plots')
os.makedirs(plot_dir, exist_ok=True)

Import the npz files into pandas DataFrames, one DataFrame per subject

In [21]:
import os, sys
import pandas as pd
import numpy as np

# Get a list of all folders in npz_dir, ordered by the integer that follows
# the first underscore of the folder name (e.g. 'sub_3' -> 3)
folders = [f for f in os.listdir(npz_dir) if os.path.isdir(os.path.join(npz_dir, f))]
folders = sorted(folders, key=lambda s: int(s.split('_')[1]))

# DEBUG: the last subject folder is dropped; the GUI-log cell hard-codes the
# file names of the remaining subjects, so keep the two selections in sync.
folders = folders[:-1]

# Dictionary mapping each subject folder name to its concatenated dataframe
npz_dfs = {}

# Iterate over each folder
for folder in folders:
    folder_path = os.path.join(npz_dir, folder)

    # Get a list of all npz files in the folder, ordered by the integer between
    # the first underscore and the first dot of the file name
    npz_files = [f for f in os.listdir(folder_path) if f.endswith('.npz')]
    sorted_npz_files = sorted(npz_files, key=lambda s: int(s.split('_')[1].split('.')[0]))

    print("Processing folder '%s' with %d npz files" % (folder, len(sorted_npz_files)))

    # pd.concat raises on an empty list, so skip folders without any npz file
    if not sorted_npz_files:
        print("Skipping folder '%s': no npz files found\n" % folder)
        continue

    frames = []

    # Iterate over each npz file
    for npz_file in sorted_npz_files:
        npz_file_path = os.path.join(folder_path, npz_file)

        # Load the npz file. The archive is opened in a with-block so its file
        # handle is released right away; the arrays are read eagerly inside it.
        with np.load(npz_file_path) as data:
            arrays = {item: data[item] for item in data.files}

        # One row per archive entry, then transposed so that entries become columns
        df = pd.DataFrame.from_dict(arrays, orient='index')
        df = df.transpose()

        # Append the dataframe to the list
        frames.append(df)

    # Concatenate all the dataframes in the list
    df_all = pd.concat(frames, ignore_index=True)

    print("Processed subject %s. Number of rows: %d\n" % (folder, len(df_all)))

    # Store the dataframe in the dictionary
    npz_dfs[folder] = df_all

Processing folder 'sub_3' with 2449 npz files
Processed subject sub_3. Number of rows: 2449

Processing folder 'sub_4' with 2630 npz files
Processed subject sub_4. Number of rows: 2630

Processing folder 'sub_6' with 2694 npz files
Processed subject sub_6. Number of rows: 2694

Processing folder 'sub_7' with 3756 npz files
Processed subject sub_7. Number of rows: 3756



Import gui_data for each subject

In [22]:
import os
import pandas as pd

# List everything found in the GUI data directory
file_names = os.listdir(gui_dir)

# DEBUG: the directory listing above is replaced by this fixed selection
file_names = [
    'gui_log_sub_3.txt',
    'gui_log_sub_4.txt',
    'gui_log_sub_6.txt',
    'gui_log_sub_7.txt',
]

# Dataframes of the GUI logs, keyed by the last two '_'-separated tokens of the file stem
gui_dfs = {}

for file_name in file_names:
    # Only .txt files are read; despite the extension they are structured as csv
    if not file_name.endswith('.txt'):
        continue

    # Key = last two underscore-separated tokens of the name before the first dot
    stem = file_name.partition('.')[0]
    key = '_'.join(stem.rsplit('_', 2)[-2:])

    # Read the file as a dataframe and store it under its key
    gui_dfs[key] = pd.read_csv(os.path.join(gui_dir, file_name))

Create lists of column names

In [23]:
n_keypoints = 18

# Every axis contributes three state entries per keypoint: the bare axis name
# followed by its 'd' and 'dd' variants (x, xd, xdd, y, yd, ydd, z, zd, zdd)
state_suffixes = [axis + order for axis in ['x', 'y', 'z'] for order in ['', 'd', 'dd']]

state_names = []
for i in range(n_keypoints):
    for suffix in state_suffixes:
        state_names.append('kp{}_{}'.format(i, suffix))
print(state_names)
print(len(state_names))

# Measurements only carry the bare x, y, z entries of each keypoint
measurement_names = []
for i in range(n_keypoints):
    for suffix in ['x', 'y', 'z']:
        measurement_names.append('kp{}_{}'.format(i, suffix))
print(measurement_names)
print(len(measurement_names))

['kp0_x', 'kp0_xd', 'kp0_xdd', 'kp0_y', 'kp0_yd', 'kp0_ydd', 'kp0_z', 'kp0_zd', 'kp0_zdd', 'kp1_x', 'kp1_xd', 'kp1_xdd', 'kp1_y', 'kp1_yd', 'kp1_ydd', 'kp1_z', 'kp1_zd', 'kp1_zdd', 'kp2_x', 'kp2_xd', 'kp2_xdd', 'kp2_y', 'kp2_yd', 'kp2_ydd', 'kp2_z', 'kp2_zd', 'kp2_zdd', 'kp3_x', 'kp3_xd', 'kp3_xdd', 'kp3_y', 'kp3_yd', 'kp3_ydd', 'kp3_z', 'kp3_zd', 'kp3_zdd', 'kp4_x', 'kp4_xd', 'kp4_xdd', 'kp4_y', 'kp4_yd', 'kp4_ydd', 'kp4_z', 'kp4_zd', 'kp4_zdd', 'kp5_x', 'kp5_xd', 'kp5_xdd', 'kp5_y', 'kp5_yd', 'kp5_ydd', 'kp5_z', 'kp5_zd', 'kp5_zdd', 'kp6_x', 'kp6_xd', 'kp6_xdd', 'kp6_y', 'kp6_yd', 'kp6_ydd', 'kp6_z', 'kp6_zd', 'kp6_zdd', 'kp7_x', 'kp7_xd', 'kp7_xdd', 'kp7_y', 'kp7_yd', 'kp7_ydd', 'kp7_z', 'kp7_zd', 'kp7_zdd', 'kp8_x', 'kp8_xd', 'kp8_xdd', 'kp8_y', 'kp8_yd', 'kp8_ydd', 'kp8_z', 'kp8_zd', 'kp8_zdd', 'kp9_x', 'kp9_xd', 'kp9_xdd', 'kp9_y', 'kp9_yd', 'kp9_ydd', 'kp9_z', 'kp9_zd', 'kp9_zdd', 'kp10_x', 'kp10_xd', 'kp10_xdd', 'kp10_y', 'kp10_yd', 'kp10_ydd', 'kp10_z', 'kp10_zd', 'kp10_zdd', 

Manipulate time series
- Change the datatype of the timestamps to datetime (parsed from strings in the GUI logs, converted from seconds in the npz data)
- Separate the measured, filtered and predicted values into different dataframes

In [33]:
def _expand_array_column(frame, column, names):
    """Expand an array-valued column into one named scalar column per element.

    Args:
        frame: DataFrame holding a 'Timestamp' column and `column`.
        column: Name of the column to expand; each cell is expected to hold a
            1-D array-like (it is converted with pd.Series).
        names: Names given to the expanded columns, one per array element.

    Returns:
        A new DataFrame indexed by 'Timestamp' whose columns are `names`.

    Raises:
        ValueError: If the number of expanded columns differs from len(names),
            which would otherwise leave columns silently mislabelled.
    """
    expanded = frame[column].apply(pd.Series)
    if expanded.shape[1] != len(names):
        raise ValueError(
            "Column '%s' expands to %d values, but %d names were given"
            % (column, expanded.shape[1], len(names))
        )
    expanded.columns = names
    expanded.insert(0, 'Timestamp', frame['Timestamp'])
    return expanded.set_index('Timestamp')


# GUI_LOG DATAFRAMES (gui_dfs): convert the 'Timestamp' column to datetime
for gui_df in gui_dfs.values():
    gui_df['Timestamp'] = pd.to_datetime(gui_df['Timestamp'])

# NPZ DATAFRAMES (npz_dfs)
final_npz_dfs = {}
for subj, df in npz_dfs.items():
    # check if the timestamp column is present
    if 'timestamp' in df.columns:
        # Convert the whole 'timestamp' column (seconds) to datetime, row by row.
        # np.ravel(...)[0] accepts both plain scalars and length-1 arrays.
        seconds = df['timestamp'].map(lambda t: float(np.ravel(t)[0]))

        # assign/rename return new frames, so the dataframe stored in npz_dfs
        # is left untouched and this cell can be re-run safely.
        df = (df.assign(timestamp=pd.to_datetime(seconds, unit='s'))
                .rename(columns={'timestamp': 'Timestamp'}))

    # TEMP: the predicted values are only selected, not unpacked yet
    df_pred_state = df[['Timestamp', 'pred_human_x']]
    df_pred_var = df[['Timestamp', 'pred_human_var']]
    first_pred = df_pred_state['pred_human_x'].iloc[0]
    print(type(first_pred), first_pred.shape)

    # Unpack the array-valued columns using the column names defined in the cell before
    df_meas = _expand_array_column(df, 'human_meas_pos', measurement_names)
    df_filt_state = _expand_array_column(df, 'human_filt_x', state_names)
    df_filt_var = _expand_array_column(df, 'human_filt_var', state_names)

    print("%s: df_meas %s, df_filt_state %s, df_filt_var %s"
          % (subj, df_meas.shape, df_filt_state.shape, df_filt_var.shape))

    # Store the unpacked dataframes in the final_npz_dfs dictionary
    final_npz_dfs[subj] = {'df_meas': df_meas, 'df_filt_state': df_filt_state, 'df_filt_var': df_filt_var}

    # for column in columns:
    #     df = df[column].apply(pd.Series)
    
    # print(df.shape)
    # pivoted_df = df.pivot(index='Timestamp', columns=df.columns.values, values=)

    # pivoted_df = df.pivot(index='Timestamp', columns='Variable', values='Value')

# # PIVOT THE DATAFRAMES
# pivoted_df = pd.pivot_table(df, values='value', index='timestamp', columns='variable')

# # Pivot the DataFrame
# pivoted_df = df.pivot(index='timestamp', columns='topic', values='message') # use the timestamp as the index
# pivoted_df = pivoted_df.reset_index() # reset the index to make the timestamp a column
# print(f"Dataframe columns: {pivoted_df.columns.values}")
# # Convert the 'timestamp' column to a TimeDeltaIndex
# pivoted_df['timestamp'] = pd.to_timedelta(pivoted_df['timestamp'], unit='s')

# # Resample the DataFrame to a known frequency
# dt = 0.01
# freq_str = f'{dt}S' # seconds
# resampled_df = pivoted_df.resample(freq_str, on='timestamp').mean() # compute the mean of the values in each time bin
# resampled_df = resampled_df.reset_index() # reset the index to make the timestamp a column

AttributeError: 'dict' object has no attribute 'columns'