<span style="color:red;font-weight:bold"> Evaluates predictions, computes metrics, and plots results. </span>

In [None]:
import numpy as np
import pickle
import glob
import matplotlib.pyplot as plt
import time
import random
import pandas as pd
import seaborn as sns
import pdb

import os
# Select the GPU for this session: devices are ordered by PCI bus id and
# only device "0" is made visible.  Change the second value to pick another GPU.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",      # choose which GPU to run on.
})

import matplotlib
# Default font settings applied to every figure created after this point.
font = dict(weight='normal', size=14)
matplotlib.rc('font', **font)

# Trajectory Regression Metrics:
# min_dist_by_timestep: returns the min 2-norm error by timestep across all multimodal preds
# weighted_dist_by_timestep: returns the avg 2-norm error by timestep across all multimodal preds
# dist_by_timestep: returns the 2-norm error by timestep for unimodal prediction
from evaluation_metrics import min_dist_by_timestep, weighted_dist_by_timestep, dist_by_timestep

# Intent Classification Metrics 
# top_k_accuracy: returns the avg. top-k-accuracy for intent prediction
# mean_entropy: returns the avg. entropy of the intent prediction distribution
from evaluation_metrics import top_k_accuracy, mean_entropy                              

### Set notebook parameters (change each time you run).

In [None]:
MODE = 'COMPUTE'
# 'COMPUTE'   : computes metric results and saves to eval_metrics_file
# 'LOAD'      : load precomputed metric results from eval_metrics_file

# Results directory where predictions have been saved.
results_dir = './results'

# Directory to save summary figures of the evaluation metrics.
figs_dir = results_dir + '/figs'

# Name of pickle file where to save evaluation metrics.
eval_metrics_file = results_dir + '/eval_metrics.pkl'

# Create the output directories if they are missing.  exist_ok=True replaces
# the separate "exists?" check, so a directory that appears between the check
# and the creation no longer raises.
os.makedirs(results_dir, exist_ok=True)
os.makedirs(figs_dir, exist_ok=True)

### Metric Evaluation Results

In [None]:
def compute_metric_results_dict(results_dir):
    """Compute evaluation metrics for every prediction file in a directory.

    Every file matching ``<results_dir>/*pred.pkl`` is unpickled and scored.
    The model name is the file's base name up to (not including) ``'_fold'``,
    so all files sharing that prefix are accumulated under the same entry.

    Each pickle is expected to hold a dict with ``'train'`` and ``'test'``
    entries, each providing ``'goal_pred'``, ``'goal_gt'``,
    ``'traj_pred_dict'`` and ``'traj_gt'``; models that are neither EKF nor
    ``no_goal`` variants must additionally provide ``'traj_pred_dict_mm'``.

    Args:
        results_dir: Directory containing the ``*pred.pkl`` prediction files.

    Returns:
        dict: ``result[model_name][split][metric]`` is a list with one entry
        per prediction file of that model (``split`` is ``'train'`` or
        ``'test'``).  An empty dict is returned when no file matches.
    """
    # Sorted so that the per-model list order (used as the fold index by the
    # plotting cells) does not depend on the filesystem's listing order.
    pred_files = sorted(glob.glob(results_dir + '/*pred.pkl'))
    result_dict = {}  # final dictionary returned with metrics

    for pred_file in pred_files:
        # Model name (e.g. EKF_CV, CNN_b0.100_g1.000, etc.).
        name = os.path.basename(pred_file).split('_fold')[0]

        # EKF and "no_goal" models produce a single, intent-agnostic trajectory.
        is_unimodal = 'EKF' in name or 'no_goal' in name

        # Make a new dictionary entry if required.
        if name not in result_dict:
            # Common metric information for every model.
            metric_fields = ['N_instances', 'goal_top_1_acc', 'goal_top_3_acc',
                             'goal_top_5_acc', 'goal_entropy']
            if is_unimodal:
                # Unimodal, intent-agnostic prediction.
                metric_fields.append('traj_dist_vs_N')
            else:
                metric_fields.extend([
                    'gtraj_dist_vs_N',  # multimodal model but using ground truth intent label (1 mode)
                    'wtraj_dist_vs_N',  # multimodal model, weighted distance error over top-n modes
                    'mtraj_dist_vs_N',  # multimodal model, min distance error over top-n modes
                    'min_ade',          # multimodal model, min average displacement error
                ])
            result_dict[name] = {
                split: {field: [] for field in metric_fields}
                for split in ('train', 'test')
            }

        # NOTE: pickle can execute arbitrary code while loading; only point
        # this function at prediction files you produced yourself.
        with open(pred_file, 'rb') as f:
            preds = pickle.load(f)

        # Compute metrics for this model and fold.  Add to result_dict.
        for tkey in ('train', 'test'):
            split_preds = preds[tkey]
            split_results = result_dict[name][tkey]

            goal_pred = split_preds['goal_pred']
            goal_gt = split_preds['goal_gt']
            traj_pred_dict = split_preds['traj_pred_dict']
            # Only the first two entries of the last axis are compared against
            # the predictions (presumably the x/y position -- confirm).
            traj_gt_xy = split_preds['traj_gt'][:, :, :2]

            # Intent Prediction Metrics
            split_results['N_instances'].append(goal_pred.shape[0])
            split_results['goal_top_1_acc'].append(top_k_accuracy(goal_pred, goal_gt, k=1))
            split_results['goal_top_3_acc'].append(top_k_accuracy(goal_pred, goal_gt, k=3))
            split_results['goal_top_5_acc'].append(top_k_accuracy(goal_pred, goal_gt, k=5))
            split_results['goal_entropy'].append(mean_entropy(goal_pred))

            # Trajectory Regression Metrics.  dist_by_timestep returns three
            # values; only the first (mean) one is stored.
            meand, _, _ = dist_by_timestep(traj_pred_dict, traj_gt_xy)

            if is_unimodal:
                # Unimodal, intent-agnostic prediction.
                split_results['traj_dist_vs_N'].append(meand)
            else:
                # Ground truth intent input used for regression.
                # Unimodal but intent-conditioned.
                split_results['gtraj_dist_vs_N'].append(meand)

                # Multimodal predictions and top-k intent predictions used for regression.
                traj_pred_dict_multimodal = split_preds['traj_pred_dict_mm']
                wmeand = weighted_dist_by_timestep(goal_pred, traj_pred_dict_multimodal, traj_gt_xy)
                mmeand, mmade = min_dist_by_timestep(traj_pred_dict_multimodal, traj_gt_xy)
                split_results['wtraj_dist_vs_N'].append(wmeand)
                split_results['mtraj_dist_vs_N'].append(mmeand)
                split_results['min_ade'].append(mmade)

    return result_dict

In [None]:
# Either recompute the metrics from the saved predictions (and cache them in
# eval_metrics_file) or reload a previously cached result.
# Strings are compared with ==: `is` tests object identity, which is not
# guaranteed to hold for equal strings.
if MODE == 'COMPUTE':
    model_res_dict = compute_metric_results_dict(results_dir)
    with open(eval_metrics_file, 'wb') as f:
        pickle.dump(model_res_dict, f)
elif MODE == 'LOAD':
    # NOTE: pickle can execute arbitrary code while loading; only load
    # metric files you produced yourself.
    with open(eval_metrics_file, 'rb') as f:
        model_res_dict = pickle.load(f)
else:
    # Fail here rather than with a NameError on model_res_dict further down.
    raise ValueError("MODE must be 'COMPUTE' or 'LOAD', got %r" % (MODE,))

# List the models for which results are available.
for k in sorted(model_res_dict.keys()):
    print(k)

### Plot 1: Unimodal trajectory evaluation

In [None]:
# Visualizes trajectory distance error by timestep for EKF, LSTM no/gt intent, CNN no/gt intent variants.
# gt_intent generated by passing ground truth intent label to trajectory submodule of a multimodal LSTM/CNN-LSTM.

# Load data to put into a Pandas dataframe.
data_list = []
for model in sorted(model_res_dict.keys()):
    for split in model_res_dict[model].keys(): # split = 'train' or 'test'
        if 'no_goal' in model:
            # Intent-agnostic trajectory prediction with CNN/LSTM.
            name = model.replace('goal', 'intent') # for plots, want name to say 'no_intent'
            traj_dist_vs_N = model_res_dict[model][split]['traj_dist_vs_N']
        elif 'EKF' in model:
            # EKF baseline trajectory prediction.
            name = model
            traj_dist_vs_N = model_res_dict[model][split]['traj_dist_vs_N']
        elif 'b1.000_g1.000' in model:
            # Intent-conditioned trajectory prediction with CNN/LSTM.
            # This is a hack to choose one of the multimodal models and evaluate on
            # ground truth label for intent input.  May need to change key in general
            # if different beta/gammas are used.
            name = model.split('b1.000_g1.000')[0] + 'gt_intent'
            traj_dist_vs_N = model_res_dict[model][split]['gtraj_dist_vs_N']
        else:
            continue

        for i_fold, td in enumerate(traj_dist_vs_N):
            for j_timestep, dist_timestep in enumerate(td):
                # j_timestep is 0-indexed but should be 1-indexed in plotting.
                data_list.append([name, split, i_fold, j_timestep+1, dist_timestep])

# Assemble dataframes and split into train/test results.
# Only the numeric columns are cast to float: passing dtype=float to the
# DataFrame constructor would also try to convert the 'Model'/'Split' strings,
# which recent pandas versions reject.
traj_df = pd.DataFrame(data_list, columns=['Model', 'Split', 'Fold', 'Timestep', 'Distance Error'])
traj_df = traj_df.astype({'Fold': float, 'Timestep': float, 'Distance Error': float})
traj_df_train = traj_df[traj_df.Split == 'train']
traj_df_test  = traj_df[traj_df.Split == 'test']

# TODO: x and y ticks manually adjusted below based on data range.
# Can use data limits to automate in future.
for split, df in zip(['TRAIN', 'TEST'], [traj_df_train, traj_df_test]):
    print('='*50, split, '='*50)

    # One figure per model subset; the tag is used in the output file name.
    subsets = [
        ('all', df),                                  # all models on one plot
        ('ng',  df[~df.Model.str.contains('gt')]),    # all "no intent" models + EKF
        ('gt',  df[~df.Model.str.contains('no')]),    # all "ground truth intent" models + EKF
    ]
    for tag, df_plot in subsets:
        fig = plt.figure(dpi=200)
        ax = sns.lineplot(x="Timestep", y="Distance Error", hue="Model", data=df_plot)
        plt.legend()
        plt.ylabel('Distance Error (m)')
        plt.xticks(np.arange(0, 21, step=2))
        plt.yticks(np.arange(0, 1.51, step=0.25))
        plt.draw(); plt.pause(0.01)
        fig.savefig('%s/traj_info_level_%s_%s.svg' % (figs_dir, tag, split), bbox_inches='tight')

### Plot 2: Multimodal trajectory evaluation using minimum distance to trajectory metric.

In [None]:
# Visualizes trajectory distance error by timestep for the following models:
# LSTM multimodal, LSTM no intent, CNN multimodal, and CNN no intent variants.
# Multimodal models are scored with the min-distance-by-timestep metric.

# Load data to put into a Pandas dataframe.
data_list = []
for model in sorted(model_res_dict.keys()):
    for split in model_res_dict[model].keys(): # split = 'train' or 'test'
        if 'no_goal' in model:
            # Intent-agnostic trajectory prediction with CNN/LSTM.
            name = model.replace('goal', 'intent') # for plots, want name to say 'no_intent'
            traj_dist_vs_N = model_res_dict[model][split]['traj_dist_vs_N']
        elif 'EKF' in model:
            # The EKF baseline is not shown in this plot.
            continue
        else:
            # Use the min_dist_by_timestep metric, which finds the dist_by_timestep
            # to the trajectory that is the argmin entry to the min average displacement
            # error metric (e.g. used by Waymo in the MultiPath paper).
            # Build a mathtext legend label from the part of the name before '_g'.
            # NOTE: str.replace rewrites every 'b' in the name, so this assumes
            # the only 'b' is the beta marker.
            name = model.split('_g')[0]
            name = name.replace('_', '\\_')     # escape underscores for mathtext
            name = name.replace('b', '\\beta')
            name = r'$%s$' % name
            traj_dist_vs_N = model_res_dict[model][split]['mtraj_dist_vs_N']

        for i_fold, td in enumerate(traj_dist_vs_N):
            for j_timestep, dist_timestep in enumerate(td):
                # j_timestep is 0-indexed but should be 1-indexed in plotting.
                data_list.append([name, split, i_fold, j_timestep+1, dist_timestep])

# Assemble dataframes and split into train/test results.
# Only the numeric columns are cast to float: passing dtype=float to the
# DataFrame constructor would also try to convert the 'Model'/'Split' strings,
# which recent pandas versions reject.
traj_df = pd.DataFrame(data_list, columns=['Model', 'Split', 'Fold', 'Timestep', 'Distance Error'])
traj_df = traj_df.astype({'Fold': float, 'Timestep': float, 'Distance Error': float})
traj_df_train = traj_df[traj_df.Split == 'train']
traj_df_test  = traj_df[traj_df.Split == 'test']

# TODO: x and y ticks manually adjusted below based on data range.
# Can use data limits to automate in future.
for split, df in zip(['TRAIN', 'TEST'], [traj_df_train, traj_df_test]):
    print('='*50, split, '='*50)

    # All models on one plot for the given split (legend placed outside the axes).
    f1 = plt.figure(dpi=200)
    ax = sns.lineplot(x="Timestep", y="Distance Error", hue="Model", data=df)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.ylabel('Distance Error (m)')
    plt.xticks(np.arange(0, 21, step=2))
    plt.yticks(np.arange(0, 1.51, step=0.25))
    #plt.minorticks_on()
    #plt.grid(which='minor', linestyle='-', linewidth=2)
    plt.draw(); plt.pause(0.01)
    f1.savefig('%s/traj_multimodal_mdist_all_%s.svg' % (figs_dir, split), bbox_inches='tight')

    # CNN models only for the given split.
    f2 = plt.figure(dpi=200)
    with sns.color_palette('bright'):
        ax = sns.lineplot(x="Timestep", y="Distance Error", hue="Model", data=df[df.Model.str.contains('CNN')])
        plt.legend()
        plt.ylabel('Distance Error (m)')
        plt.xticks(np.arange(0, 21, step=2))
        plt.yticks(np.arange(0, 1.51, step=0.25))
        #plt.minorticks_on()
        #plt.grid(which='minor', linestyle='-', linewidth=2)
        plt.draw(); plt.pause(0.01)
    f2.savefig('%s/traj_multimodal_mdist_cnn_%s.svg' % (figs_dir, split), bbox_inches='tight')

    # LSTM models only for the given split.
    f3 = plt.figure(dpi=200)
    with sns.color_palette('deep'):
        ax = sns.lineplot(x="Timestep", y="Distance Error", hue="Model", data=df[df.Model.str.contains('LSTM')])
        plt.legend()
        plt.ylabel('Distance Error (m)')
        plt.xticks(np.arange(0, 21, step=2))
        plt.yticks(np.arange(0, 1.51, step=0.25))
        #plt.minorticks_on()
        #plt.grid(which='minor', linestyle='-', linewidth=2)
        plt.draw(); plt.pause(0.01)
    f3.savefig('%s/traj_multimodal_mdist_lstm_%s.svg' % (figs_dir, split), bbox_inches='tight')

### Plot 3: Multimodal trajectory evaluation using minimum average displacement error (min ADE) metric.

In [None]:
# Visualizes minimum average displacement (Waymo MultiPath paper) for the following models:
# EKF, LSTM multimodal, LSTM no intent, CNN multimodal, and CNN no intent variants.
# For unimodal models (EKF / no intent) the ADE is the mean over timesteps of
# the per-timestep distance error; multimodal models use the stored 'min_ade'.

# Load data to put into a Pandas dataframe.
data_list = []
for model in sorted(model_res_dict.keys()):
    for split in model_res_dict[model].keys(): # split = 'train' or 'test'
        if 'no_goal' in model:
            # Intent-agnostic trajectory prediction with CNN/LSTM.
            name = model.replace('goal', 'intent') # for plots, want name to say 'no_intent'
            ades = [np.mean(x) for x in model_res_dict[model][split]['traj_dist_vs_N']]
        elif 'EKF' in model:
            name = model
            ades = [np.mean(x) for x in model_res_dict[model][split]['traj_dist_vs_N']]
        else:
            # Use the min average displacement error metric
            # (e.g. used by Waymo in the MultiPath paper).
            # Build a mathtext label from the part of the name before '_g'.
            # NOTE: str.replace rewrites every 'b' in the name, so this assumes
            # the only 'b' is the beta marker.
            name = model.split('_g')[0]
            name = name.replace('_', '\\_')     # escape underscores for mathtext
            name = name.replace('b', '\\beta')
            name = r'$%s$' % name
            ades = model_res_dict[model][split]['min_ade']

        for i_fold, ade_fold in enumerate(ades):
            data_list.append([name, split, i_fold, ade_fold])

# Assemble dataframes and split into train/test results.
# Only the numeric columns are cast to float: passing dtype=float to the
# DataFrame constructor would also try to convert the 'Model'/'Split' strings,
# which recent pandas versions reject.
traj_df = pd.DataFrame(data_list, columns=['Model', 'Split', 'Fold', 'Min ADE'])
traj_df = traj_df.astype({'Fold': float, 'Min ADE': float})
traj_df_train = traj_df[traj_df.Split == 'train']
traj_df_test  = traj_df[traj_df.Split == 'test']

# Not used in this cell; import kept (hoisted out of the loop) in case other
# cells rely on the name being defined.
from IPython.display import Latex

# TODO: x and y ticks manually adjusted below based on data range.
# Can use data limits to automate in future.
for split, df in zip(['TRAIN', 'TEST'], [traj_df_train, traj_df_test]):
    print('='*50, split, '='*50)

    # Per-model mean/std of min ADE across folds, best (lowest mean) first.
    # The aggregations are named by string so that pandas' own mean/std are
    # used, independent of how numpy callables are dispatched.
    print( df.groupby("Model")["Min ADE"].agg(['mean', 'std']).sort_values(by='mean') )

#     # All models on one plot for the given split.
#     f1 = plt.figure(dpi=200)
#     ax = sns.barplot(x="Model", y="Min ADE", data=df)
#     plt.ylabel('Min Average Distance Error (m)')
#     print([label for label in ax.get_xticklabels()])
#     ax.set_xticklabels(['']*len(ax.get_xticklabels()))
#     #plt.xticks(np.arange(0, 21, step=2))      
#     plt.yticks(np.arange(0, 1.01, step=0.25))
#     plt.draw(); plt.pause(0.01)
#     #f1.savefig('%s/traj_multimodal_min_ade_%s.svg' % (figs_dir, split), bbox_inches='tight')

### Plot 4: Intent Prediction Evaluation using Top-k accuracy (also called Top-n in the paper).

In [None]:
# Bar plot and summary tables of top-1/3/5 intent prediction accuracy per model.

# Load data to put into a Pandas dataframe.
data_list = []
model_list = sorted(model_res_dict.keys())
# Put the EKF result first in the barplot.  Guarded so the cell still runs
# when no EKF_CV results are present.
if 'EKF_CV' in model_list:
    model_list.remove('EKF_CV')
    model_list.insert(0, 'EKF_CV')

for model in model_list:
    if 'no_goal' in model:
        # Intent-agnostic models are left out of the intent accuracy plot.
        continue
    if 'EKF' in model:
        name = model
    else:
        # Build a mathtext label from the part of the name before '_g'.
        # NOTE: str.replace rewrites every 'b' in the name, so this assumes
        # the only 'b' is the beta marker.
        name = model.split('_g')[0]
        name = name.replace('_', '\\_')     # escape underscores for mathtext
        name = name.replace('b', '\\beta')
        name = r'$%s$' % name

    for split in model_res_dict[model].keys():
        goal_top_1_acc = model_res_dict[model][split]['goal_top_1_acc']
        goal_top_3_acc = model_res_dict[model][split]['goal_top_3_acc']
        goal_top_5_acc = model_res_dict[model][split]['goal_top_5_acc']

        # One row per (fold, k) so that seaborn can group the bars by k.
        for i_fold, (t1, t3, t5) in enumerate(zip(goal_top_1_acc,
                                                  goal_top_3_acc,
                                                  goal_top_5_acc)):
            data_list.append([name, split, i_fold, 1, t1])
            data_list.append([name, split, i_fold, 3, t3])
            data_list.append([name, split, i_fold, 5, t5])

# Only the numeric columns are cast to float: passing dtype=float to the
# DataFrame constructor would also try to convert the 'Model'/'Split' strings,
# which recent pandas versions reject.
goal_df = pd.DataFrame(data_list, columns=['Model', 'Split', 'Fold', 'k', 'Accuracy'])
goal_df = goal_df.astype({'Fold': float, 'k': float, 'Accuracy': float})
goal_train_df = goal_df[goal_df.Split == 'train']
goal_test_df = goal_df[goal_df.Split == 'test']

# Top-k Accuracy across all models.
for split_label, df in zip(['TRAIN', 'TEST'], [goal_train_df, goal_test_df]):
    print('='*50, split_label, '='*50)
    f = plt.figure(dpi=200)
    ax = sns.barplot(x='k', y='Accuracy', hue='Model', data=df)
    plt.xlabel('n')
    plt.ylabel('Top-n Accuracy')
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

    # Save before showing: some backends release the figure on show(), after
    # which further pyplot calls would act on a new, empty figure.
    f.savefig('%s/intent_acc_%s.svg' % (figs_dir, split_label), bbox_inches='tight')
    plt.show()

    # Per-model mean/std of the accuracy across folds for each k.
    for k in (1, 3, 5):
        print('TOP %d' % k)
        print( df[df.k == float(k)].groupby("Model")["Accuracy"].agg(['mean', 'std']) )