In [None]:
# Author: Tian Yun
import os
from collections import defaultdict

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# Domain whose results are analysed; it selects the ./outputs/<target_domain> directory used below.
target_domain = "block"  # ["block", "kitchen"]

In [None]:
# Global matplotlib font settings applied to every figure in this notebook.
# 'sans-serif' is a generic family matplotlib can resolve; the previous value
# 'normal' is not a font family, so matplotlib emitted "findfont: Font family
# ['normal'] not found" warnings and fell back to its default font.
font = {'family' : 'sans-serif',
        'weight' : 'bold',
        'size'   : 14}

matplotlib.rc('font', **font)

In [None]:
# Category label -> case numbers that belong to it.
case_category_tuples = {
    "Single Goal, Correct Steps": [1, 2, 3],
    "Multiple Goal, Correct Steps": [5, 6],  # skip case 4
    "Single Goal, Wrong Steps": [7, 8, 9, 10],
    "Multiple Goal, Wrong Steps": [11, 12],
}

# Agent key (as used in `args.agent_type` and in the output folder names) -> legend label.
baseline_agent_dict = {
    "random": "Random - Correct",
    "htn": "HTN",
    "fixed_always_ask": "ALWAYS-ASK",
    "standard": "DGR-POMDP",
    # "SIPS" is configured as an agent type further down; without an entry here the
    # legend lookup `baseline_agent_dict[agent]` in the plotting cells raises KeyError.
    "SIPS": "SIPS",
}

# 1. Parsing HTN-based Results

In [None]:
def get_case_name(case_id, category_map=None):
    """Return the category label whose case list contains ``case_id``.

    Args:
        case_id: Case number to look up.
        category_map: Optional mapping of category label -> collection of case
            numbers. When omitted, the notebook-level ``case_category_tuples``
            is used, which keeps existing one-argument calls working.

    Returns:
        The first category label (in mapping order) that lists ``case_id``, or
        ``None`` when no category contains it.
    """
    if category_map is None:
        category_map = case_category_tuples
    return next(
        (case_cat for case_cat, cases in category_map.items() if case_id in cases),
        None,  # explicit "not found" result instead of falling off the end
    )

In [None]:
class parseArguments:
    """Settings used to locate and parse the HTN-based baseline results."""

    def __init__(self):
        # Root folder of the selected domain's outputs.
        output_root = "./outputs/{}".format(target_domain)
        # Name of the run sub-directory that sits under each agent's folder.
        run_tag = "dp17_sn5_df0.95_e1_wp-5_qr5_qp-5_oh0.76_dt0.001"
        # Result categories.
        categories = ["single_correct", "single_wrong", "multi_correct", "multi_wrong"]

        self.base_dir_path = output_root
        self.agent_type = ["htn", "fixed_always_ask"]
        self.hyperparams = run_tag
        self.reliability_scores = [0.8, 0.9, 0.95, 0.99]
        self.num_goals = 5
        self.num_cases = 2
        self.category_list = categories


# Notebook-wide settings object read by the cells that follow.
args = parseArguments()

In [None]:
# Load the per-agent goal/plan accuracy tables and their spread tables.
goal_acc_csv_dict, goal_sem_csv_dict = {}, {}
plan_acc_csv_dict, plan_sem_csv_dict = {}, {}

# (destination dict, file name) pairs, in the order the files are read.
# NOTE(review): the "*_std.csv" files are stored under "*_sem_*" names — confirm
# they really hold the standard error of the mean rather than the standard deviation.
_summary_tables = (
    (goal_acc_csv_dict, "goal_accuracy.csv"),
    (goal_sem_csv_dict, "goal_std.csv"),
    (plan_acc_csv_dict, "plan_accuracy.csv"),
    (plan_sem_csv_dict, "plan_std.csv"),
)

for agent in args.agent_type:
    # Every summary file of one agent lives in the same run directory.
    agent_dir = os.path.join(args.base_dir_path, agent, args.hyperparams)
    for table, file_name in _summary_tables:
        table[agent] = pd.read_csv(os.path.join(agent_dir, file_name))

In [None]:
# Scratch check: list the files of the HTN run directory (path is hard-coded to the "block" domain).
!ls ./outputs/block/htn/dp17_sn5_df0.95_e1_wp-5_qr5_qp-5_oh0.76_dt0.001


In [None]:
# Load runtime / number-of-questions / reward statistics from the per-case episode logs.
cumu_runtime_mean_csv_dict, cumu_runtime_sem_csv_dict = {}, {}
cumu_questions_mean_csv_dict, cumu_questions_sem_csv_dict = {}, {}
cumu_reward_mean_csv_dict, cumu_reward_sem_csv_dict = {}, {}

# Cases that get no statistics. Case 4 is skipped on purpose.
# NOTE(review): cases 7, 9, 10 and 11 were skipped by the original cell as well,
# without an explanation — confirm whether their episode files are simply missing.
SKIPPED_CASE_IDS = {4, 7, 9, 10, 11}

# (episode CSV column, destination for the means, destination for the SEMs)
_EPISODE_METRICS = (
    ("normalized_time", cumu_runtime_mean_csv_dict, cumu_runtime_sem_csv_dict),
    ("normalized_num_question_asked", cumu_questions_mean_csv_dict, cumu_questions_sem_csv_dict),
    ("cumu_discounted_reward", cumu_reward_mean_csv_dict, cumu_reward_sem_csv_dict),
)


def _blank_case_table(template_df):
    """Return an all-NaN frame with the template's columns, indexed by ``Case_Num``.

    Only the layout of the accuracy table is reused. Starting from NaN (rather
    than from a copy of the accuracy values) means that cells which are never
    filled cannot be mistaken for runtime/question/reward statistics.
    """
    by_case = template_df.set_index("Case_Num")
    return pd.DataFrame(np.nan, index=by_case.index, columns=by_case.columns)


for agent in args.agent_type:
    layout_df = goal_acc_csv_dict[agent]  # same dataframe format
    mean_tables = {column: _blank_case_table(layout_df) for column, _, _ in _EPISODE_METRICS}
    sem_tables = {column: _blank_case_table(layout_df) for column, _, _ in _EPISODE_METRICS}
    episode_dir = os.path.join(args.base_dir_path, agent, args.hyperparams, "episode_reward")

    for sensor_reliability in args.reliability_scores:
        reliability_col = str(sensor_reliability)

        for case_id in range(1, 12 + 1):
            if case_id in SKIPPED_CASE_IDS:
                continue

            case_csv_name = f"Episode-Case{case_id}_{sensor_reliability}.csv"
            curr_df = pd.read_csv(os.path.join(episode_dir, case_csv_name))
            num_episodes = len(curr_df)

            for column in mean_tables:
                # Write to the row of this case number. The original used a running
                # counter as the row label, which does not follow Case_Num once cases
                # are skipped, so values landed in rows belonging to other cases.
                mean_tables[column].loc[case_id, reliability_col] = curr_df[column].mean()
                sem_tables[column].loc[case_id, reliability_col] = (
                    curr_df[column].std() / np.sqrt(num_episodes)
                )

    for column, mean_store, sem_store in _EPISODE_METRICS:
        # Restore "Case_Num" as a regular column so the tables can be filtered by
        # case number in the same way as the accuracy tables.
        mean_store[agent] = mean_tables[column].rename_axis("Case_Num").reset_index()
        sem_store[agent] = sem_tables[column].rename_axis("Case_Num").reset_index()

# 2. Parsing SIPS Results

In [None]:
# Scratch check: list the SIPS "optimal" result files for reliability 0.8 (hard-coded to the "block" domain).
!ls ./outputs/block/SIPS/optimal/block-words-0.8

In [None]:
class parseArguments:
    """Settings used to locate and parse the SIPS results.

    Defining this class again rebinds the name ``parseArguments`` (and ``args``
    below), replacing the settings object created earlier in the notebook.
    """

    def __init__(self):
        # Where the results live.
        self.base_dir_path = "./outputs/{}".format(target_domain)
        self.agent_type = ["SIPS"]
        self.hyperparams = "dp17_sn5_df0.95_e1_wp-5_qr5_qp-5_oh0.76_dt0.001"

        # Sensor reliability settings.
        self.reliability_scores = [0.8, 0.9, 0.95, 0.99]

        # Problem sizes and result categories.
        self.num_goals = 7
        self.num_cases = 2
        self.category_list = [
            "single_correct",
            "single_wrong",
            "multi_correct",
            "multi_wrong",
        ]


# From here on, `args` refers to the SIPS settings.
args = parseArguments()

In [None]:
# Start over with empty per-agent summary tables.
# NOTE(review): this discards the tables loaded for the HTN-based agents, and no
# visible cell fills these dicts again before the Section 3 aggregation reads
# them — confirm whether the reset is intended.
goal_acc_csv_dict = dict()
goal_sem_csv_dict = dict()
plan_acc_csv_dict = dict()
plan_sem_csv_dict = dict()



In [None]:
# NOTE(review): `data_root_path` is not assigned anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel — presumably a leftover from an earlier draft; confirm.
data_root_path

In [None]:
# Read the SIPS "optimal" runs, which cover ["single_correct", "multi_correct"].


def _read_sips_goal_csv(run_dir, reliability, goal_idx):
    """Load the result CSV of one goal from ``run_dir`` for one reliability value."""
    return pd.read_csv(
        os.path.join(run_dir, f"block-words-{reliability}", f"block-words_problem_0_goal{goal_idx}.csv")
    )


for agent in args.agent_type:
    data_dir_path = os.path.join(args.base_dir_path, agent, "optimal")

    for sensor_reliability in args.reliability_scores:
        # Goal ranges as (start, stop): "single_correct" first, then "multi_correct".
        for first_goal, stop_goal in ((0, 6), (6, 7)):
            for goal in range(first_goal, stop_goal):
                curr_df = _read_sips_goal_csv(data_dir_path, sensor_reliability, goal)
                # Only the first file of each range is read; the result is not stored
                # anywhere except `curr_df`, which each read overwrites.
                break

In [None]:
# Preview the most recently loaded frame; show the first rows instead of dumping all of it.
curr_df.head()

# 3. Visualizations

## 3.1. Goal Accuracy vs. Reliability Score

In [None]:
# Get aggregated results for each case for each agent
goal_acc_line_df = defaultdict(dict)
goal_sem_line_df = defaultdict(dict)
x_ticklabels = None

for case, case_nums in case_category_tuples.items():
    #print(case, case_nums)
    for agent in args.agent_type:
        curr_df = goal_acc_csv_dict[agent].copy()
        #print(curr_df)
        curr_df = curr_df[curr_df["Case_Num"].isin(case_nums)]
        curr_df.drop("Case_Num", axis=1, inplace=True)
        goal_acc_line_df[case][agent] = list(curr_df.mean().to_dict().values())[::-1]  # Make the sensor reliability ascending
        
        curr_df = goal_sem_csv_dict[agent].copy()
        curr_df = curr_df[curr_df["Case_Num"].isin(case_nums)]
        curr_df.drop("Case_Num", axis=1, inplace=True)
        goal_sem_line_df[case][agent] = list(curr_df.mean().to_dict().values())[::-1] # Make the sensor reliability ascending
        x_ticklabels = curr_df.columns.to_list()[::-1]  # Make the sensor reliability ascending

In [None]:
# Visualize Performance by Case Category
x = [float(val) for val in x_ticklabels]

n_row = 2
n_col = 2
fig, ax = plt.subplots(n_row, n_col, figsize=(16,10))

for case_id, case in enumerate(case_category_tuples.keys()):
    
    plot_row = case_id // n_col
    plot_col = case_id % n_col

    for agent in (args.agent_type):
        acc = goal_acc_line_df[case][agent]
        sem = goal_sem_line_df[case][agent]
        ax[plot_row][plot_col].errorbar(x, acc, yerr=sem, label=baseline_agent_dict[agent])

    ax[plot_row][plot_col].legend(loc="lower right")
    ax[plot_row][plot_col].set_title(case)
    ax[plot_row][plot_col].set_ylabel('Goal Accuracy')
    if plot_row == 1:
        ax[plot_row][plot_col].set_xlabel('Sensor Reliability')
        ax[plot_row][plot_col].set_ylim([0.5, 1.05])

## 3.2. Plan Accuracy vs. Reliability Score

In [None]:
# Get aggregated results for each case for each agent
plan_acc_line_df = defaultdict(dict)
plan_sem_line_df = defaultdict(dict)
x_ticklabels = None

for case, case_nums in case_category_tuples.items():
    #print(case, case_nums)
    for agent in args.agent_type:
        curr_df = plan_acc_csv_dict[agent].copy()
        #print(curr_df)
        curr_df = curr_df[curr_df["Case_Num"].isin(case_nums)]
        curr_df.drop("Case_Num", axis=1, inplace=True)
        plan_acc_line_df[case][agent] = list(curr_df.mean().to_dict().values())[::-1]  # Make the sensor reliability ascending
        
        curr_df = plan_sem_csv_dict[agent].copy()
        curr_df = curr_df[curr_df["Case_Num"].isin(case_nums)]
        curr_df.drop("Case_Num", axis=1, inplace=True)
        plan_sem_line_df[case][agent] = list(curr_df.mean().to_dict().values())[::-1] # Make the sensor reliability ascending
        x_ticklabels = curr_df.columns.to_list()[::-1]  # Make the sensor reliability ascending

In [None]:
# Visualize Performance by Case Category
x = [float(val) for val in x_ticklabels]

n_row = 2
n_col = 2
fig, ax = plt.subplots(n_row, n_col, figsize=(16,10))

for case_id, case in enumerate(case_category_tuples.keys()):
    
    plot_row = case_id // n_col
    plot_col = case_id % n_col

    for agent in (args.agent_type):
        acc = plan_acc_line_df[case][agent]
        sem = plan_sem_line_df[case][agent]
        ax[plot_row][plot_col].errorbar(x, acc, yerr=sem, label=baseline_agent_dict[agent])

    ax[plot_row][plot_col].legend(loc="lower right")
    ax[plot_row][plot_col].set_title(case)
    ax[plot_row][plot_col].set_ylabel('Goal Accuracy')
    if plot_row == 1:
        ax[plot_row][plot_col].set_xlabel('Sensor Reliability')
        ax[plot_row][plot_col].set_ylim([0.5, 1.05])

## 3.3. Aggregating #Questions/Rewards/Runtime

In [None]:
# Scratch check: list the contents of the SIPS "optimal" result folder (hard-coded to the "block" domain).
!ls outputs/block/SIPS/optimal


In [None]:
class parseArguments:
    """Settings for the aggregation cells (results root, models, categories, run name).

    Defining this class again rebinds ``parseArguments`` and ``args``; the
    attribute names differ from the earlier settings objects (for example
    ``models`` and ``reliability_list``).
    """

    def __init__(self):
        # Built inside __init__ so every instance gets its own list objects.
        settings = {
            "results_dir_path": "./outputs/block",
            "models": ["htn"],
            "category_list": ["single_correct", "single_wrong", "multi_correct", "multi_wrong"],
            "hyperparams": "dp17_sn5_df0.95_e1_wp-5_qr5_qp-5_oh0.76_dt0.001",
            "reliability_list": [0.8, 0.9, 0.95, 0.99],
            "num_goals": 5,
            "num_cases": 2,
        }
        for name, value in settings.items():
            setattr(self, name, value)


args = parseArguments()


In [None]:
def get_top1_accuracy(data_df, label, num_goals):
    """Top-1 accuracy for goal inference.

    A row counts as correct when the probability in column
    ``goal_probs_<label>`` equals the largest probability among the columns
    ``goal_probs_0`` ... ``goal_probs_<num_goals - 1>`` of that row, so ties
    for the maximum count as correct.

    Args:
        data_df: Frame with one ``goal_probs_<i>`` column per goal.
        label: Index of the true goal.
        num_goals: Number of goal columns to compare. This argument is now
            honoured; previously the global ``args.num_goals`` was read instead.

    Returns:
        Fraction of correct rows as a float. ``nan`` for an empty frame
        (previously a ZeroDivisionError) and ``0.0`` when ``label`` is not a
        valid goal index. Rows whose compared probabilities contain NaN count
        as incorrect.
    """
    num_rows = len(data_df)
    if num_rows == 0:
        return float("nan")
    if not 0 <= label < num_goals:
        # No compared column can match a label outside the goal range.
        return 0.0

    prob_columns = [f"goal_probs_{goal_id}" for goal_id in range(num_goals)]
    # Compare on a real ndarray: comparing a Python list with a scalar only
    # broadcasts element-wise when the scalar happens to be a numpy type.
    goal_probs = data_df[prob_columns].to_numpy(dtype=float)
    is_top1 = goal_probs[:, label] == goal_probs.max(axis=1)
    return int(is_top1.sum()) / num_rows

In [None]:
def _new_result_dict():
    """Build a fresh ``{model: {category: defaultdict(dict)}}`` container."""
    return {
        model_name: {
            category: defaultdict(dict)
            for category in ("single_correct", "single_wrong", "multi_correct", "multi_wrong")
        }
        for model_name in args.models
    }


template_dict = _new_result_dict()

# Each result container is built independently. `template_dict.copy()` is a shallow
# copy, so all six dicts used to share the same inner per-model dicts and a write to
# one of them (e.g. goal accuracy) showed up in all the others (e.g. runtime).
goal_acc_dict = _new_result_dict()
goal_sem_dict = _new_result_dict()
plan_acc_dict = _new_result_dict()
plan_sem_dict = _new_result_dict()
runtime_mean_dict = _new_result_dict()
runtime_sem_dict = _new_result_dict()

# 1. Parse SIPS Results

In [None]:
# Scratch check: list the SIPS "optimal" result files for reliability 0.8 (hard-coded to the "block" domain).
!ls ./outputs/block/SIPS/optimal/block-words-0.8

In [None]:
# Walk the SIPS result folders (work in progress: only the first goal file of each
# folder is read, and the loaded frame is not stored anywhere yet).
result_dir_root = os.path.join(args.results_dir_path, "SIPS")

for group in ["optimal", "suboptimal"]:
    for reliability in [0.8, 0.9, 0.95, 0.99]:
        data_dir_path = os.path.join(result_dir_root, group, f"block-words-{reliability}")
        for goal_label in [0,1,2,3,4,5]:  # the ':' was missing here, a SyntaxError
            # `data_path` was never assigned in the original cell. The file name follows
            # the per-goal pattern used elsewhere in this notebook.
            # NOTE(review): the "suboptimal" folder may use the
            # "..._goal{goal}_{case}.csv" naming instead — confirm before relying on it.
            data_path = os.path.join(data_dir_path, f"block-words_problem_0_goal{goal_label}.csv")
            data = pd.read_csv(data_path)
            break

In [None]:
# Scratch check: list the SIPS "optimal" result files for reliability 0.8 (hard-coded to the "block" domain).
!ls ./outputs/block/SIPS/optimal/block-words-0.8


# 2. Parse Other Results

# 3. Visualizations

In [None]:
# Scratch check: list the per-case episode files of the HTN run (hard-coded to the "block" domain).
!ls ./outputs/block/htn/dp17_sn5_df0.95_e1_wp-5_qr5_qp-5_oh0.76_dt0.001/episode_reward


In [None]:
# Result category -> case numbers that belong to it.
# Case numbers 20 and 21 are not assigned to any category.
block_category_case_map = {
    "single_correct": list(range(1, 6)),
    "single_wrong": list(range(6, 11)),
    "multi_correct": [11, 12, 13] + list(range(23, 30)),
    "multi_wrong": list(range(14, 20)) + [22, 30],
}

In [None]:
# Load the summary tables (goal/plan accuracy and their spread) of each model.
for model in args.models:
    data_dir_path = os.path.join(args.results_dir_path, model, args.hyperparams)
    # Join directory and file name explicitly: read_csv takes a single path, so passing
    # the file name as a second positional argument does not append it to the directory.
    goal_acc_data = pd.read_csv(os.path.join(data_dir_path, "goal_accuracy.csv"))
    goal_std_data = pd.read_csv(os.path.join(data_dir_path, "goal_std.csv"))
    plan_acc_data = pd.read_csv(os.path.join(data_dir_path, "plan_accuracy.csv"))
    plan_std_data = pd.read_csv(os.path.join(data_dir_path, "plan_std.csv"))

    # TODO: the original cell stopped at an unfinished `data_dir_path = ` statement here
    # (a SyntaxError). Presumably it was meant to point at the "episode_reward"
    # sub-directory — confirm and fill in together with the loop body below.
    for reliability in args.reliability_list:
        break  # placeholder: per-reliability processing is not written yet
In [None]:
# Display the goal-accuracy table read by the previous cell (the one of the last model in the loop).
goal_acc_data

In [None]:
# Load & process data
#
# For every model and sensor reliability, compute the top-1 goal accuracy and the mean
# step duration per result file, then store the mean and the standard error over files.
#
# NOTE(review): `output_name`, `acc_dict` and `sem_dict` are not assigned anywhere in the
# visible notebook, so this cell raises NameError on a fresh kernel — confirm where they
# are meant to come from (possibly `goal_acc_dict` / `goal_sem_dict`).
# NOTE(review): the "correct_steps" / "wrong_steps" keys are not among the categories
# created for `runtime_mean_dict` / `runtime_sem_dict` ("single_correct", "single_wrong",
# "multi_correct", "multi_wrong"), so those lookups would raise KeyError — confirm.

for model in args.models:
    
    for reliability in args.reliability_list:
        # Name of the per-reliability result folder.
        data_dir_name = f"{output_name}-{reliability}"

        # Compute the average top-1 accuracy for Case 0 (correct steps)
        acc_list, runtime_list = [], []
        for goal_id in range(args.num_goals):
            data_path = os.path.join(
                args.results_dir_path, 
                data_dir_name, 
                f"block-words_problem_0_goal{goal_id}.csv"
            )
            data = pd.read_csv(data_path)
            # The goal index doubles as the ground-truth label of the file.
            acc = get_top1_accuracy(data, goal_id, args.num_goals)
            acc_list.append(acc)
            runtime_list.append(data["step_durs"].mean())
            
        acc_list = np.array(acc_list)
        #print(acc_list)
        runtime_list = np.array(runtime_list)
        # Standard error over files: ndarray.std() divided by sqrt(number of files).
        # NOTE(review): ndarray.std() defaults to ddof=0, whereas the pandas .std() used
        # elsewhere in this notebook defaults to ddof=1 — confirm which one is intended.
        acc_dict[model]["correct_steps"][str(reliability)] = acc_list.mean()
        sem_dict[model]["correct_steps"][str(reliability)] = acc_list.std() / np.sqrt(len(acc_list))
        runtime_mean_dict[model]["correct_steps"][str(reliability)] = runtime_list.mean()
        runtime_sem_dict[model]["correct_steps"][str(reliability)] = runtime_list.std() / np.sqrt(len(runtime_list))

        # Compute the average top-1 accuracy for Case 1 & 2 (wrong steps)
        acc_list, runtime_list = [], []
        for case_id in range(args.num_cases):
            for goal_id in range(args.num_goals):
                # Wrong-step files carry an extra "_<case_id>" suffix after the goal index.
                data_path = os.path.join(
                    args.results_dir_path, 
                    data_dir_name, 
                    f"block-words_problem_0_goal{goal_id}_{case_id}.csv"
                )
                data = pd.read_csv(data_path)
                acc = get_top1_accuracy(data, goal_id, args.num_goals)
                acc_list.append(acc)
                runtime_list.append(data["step_durs"].mean())
        acc_list = np.array(acc_list)
        # Debug output: prints the per-file accuracies for every model/reliability pair.
        print(acc_list)
        runtime_list = np.array(runtime_list)
        acc_dict[model]["wrong_steps"][str(reliability)] = acc_list.mean()
        sem_dict[model]["wrong_steps"][str(reliability)] = acc_list.std() / np.sqrt(len(acc_list))
        runtime_mean_dict[model]["wrong_steps"][str(reliability)] = runtime_list.mean()
        runtime_sem_dict[model]["wrong_steps"][str(reliability)] = runtime_list.std() / np.sqrt(len(runtime_list))


# 1. Top-1 Accuracy vs. Sensor Reliability

In [None]:
# Top-1 accuracy per category: one panel per category, one line per model.
# NOTE(review): `acc_dict` and `sem_dict` are not assigned anywhere in the visible
# notebook — confirm where they are meant to come from.
categories = list(args.category_list)

# Size the grid from the number of categories: a fixed 1x2 grid raised IndexError
# as soon as more than two categories were configured.
n_col = 2
n_row = max(1, -(-len(categories) // n_col))  # ceiling division
fig, ax = plt.subplots(n_row, n_col, figsize=(12, 4 * n_row), squeeze=False)
panels = ax.ravel()

for plot_id, category in enumerate(categories):
    panel = panels[plot_id]

    # Iterate over the models explicitly instead of relying on a `model` variable
    # left behind by a loop in an earlier cell.
    for model in args.models:
        # Materialise the dict views: errorbar expects sequences/arrays.
        x = list(acc_dict[model][category].keys())
        acc_list = list(acc_dict[model][category].values())
        sem_list = list(sem_dict[model][category].values())

        panel.errorbar(x, acc_list, yerr=sem_list, label=model)

    panel.legend()
    panel.set_title(category, fontweight="bold")
    panel.set_ylabel("Top1 Accuracy", fontweight="bold")
    panel.set_xlabel("Sensor Reliability", fontweight="bold")

# Hide grid cells that have no category to show.
for unused_panel in panels[len(categories):]:
    unused_panel.set_visible(False)

# 2. Runtime vs. Sensor Reliability

In [None]:
# Runtime per category: one panel per category, one line per model.
categories = list(args.category_list)

# Size the grid from the number of categories: a fixed 1x2 grid raised IndexError
# as soon as more than two categories were configured.
n_col = 2
n_row = max(1, -(-len(categories) // n_col))  # ceiling division
fig, ax = plt.subplots(n_row, n_col, figsize=(12, 4 * n_row), squeeze=False)
panels = ax.ravel()

for plot_id, category in enumerate(categories):
    panel = panels[plot_id]

    # Iterate over the models explicitly instead of relying on a `model` variable
    # left behind by a loop in an earlier cell.
    for model in args.models:
        # Materialise the dict views: errorbar expects sequences/arrays.
        x = list(runtime_mean_dict[model][category].keys())
        runtime_mean = list(runtime_mean_dict[model][category].values())
        runtime_sem = list(runtime_sem_dict[model][category].values())

        panel.errorbar(x, runtime_mean, yerr=runtime_sem, label=model)

    panel.legend()
    panel.set_title(category, fontweight="bold")
    panel.set_ylabel("Runtime", fontweight="bold")
    panel.set_xlabel("Sensor Reliability", fontweight="bold")

# Hide grid cells that have no category to show.
for unused_panel in panels[len(categories):]:
    unused_panel.set_visible(False)