In [1]:
import matplotlib.pyplot as plt
import matplotlib
import tensorflow as tf
import time, gym
from collections import defaultdict
import seaborn as sns
import seaborn
import numpy as np
from stable_baselines import GAIL
from hr_coordination.utils import save_pickle, load_pickle, reset_tf, load_dict_from_file, create_dir_if_not_exists, profile, cross_entropy, accuracy
from hr_coordination.agents.agent import AgentPair, RandomAgent, GreedyHumanModel, AgentFromPolicy, StayAgent, EmbeddedPlanningAgent
from hr_coordination.agents.benchmarking import AgentEvaluator
from hr_coordination.pbt.pbt_utils import get_config_from_pbt_dir, load_pickle, setup_mdp_env, get_vectorized_gym_env, create_model, update_model
from hr_coordination.imitation.behavioural_cloning import train_bc_agent, get_bc_agent_from_saved, DEFAULT_DATA_PARAMS, DEFAULT_ENV_PARAMS, eval_with_benchmarking_from_model, eval_with_benchmarking_from_saved, BC_SAVE_DIR, get_bc_agent_from_model, symmetric_bc
from hr_coordination.mdp.overcooked_mdp import Action, NO_REW_SHAPING_PARAMS, Direction, OvercookedGridworld
from hr_coordination.mdp.overcooked_env import OvercookedEnv
from hr_coordination.human.process_data import save_npz_file, get_trajs_from_data
from hr_coordination.ppo.ppo import load_training_data, get_ppo_agent
from hr_coordination.planning.planners import NO_COUNTERS_PARAMS, MediumLevelPlanner

In [2]:
# Saved behavioural-cloning runs to load for each layout. Every value is a
# (model_name, best_type) pair; P_BC_evaluation_for_layout unpacks it and
# forwards both items to get_bc_agent_from_saved.
BC_MODELS_TRAINING_BEST = dict(
    simple=("simple_training_nr2", None),
    unident_s=("unident_s_training_nr2", None),
    random1=("random1_training_nr1", None),
    random0=("random0_training_nr3", None),
    random3=("random3_training_nr0", None),
)

# Same structure as above, for the runs used as the "test" BC model.
BC_MODELS_TESTING_BEST = dict(
    simple=("simple_training_nr4", None),
    unident_s=("unident_s_training_nr2", "acc"),
    random1=("random1_training_nr3", None),
    random0=("random0_training_nr3", None),
    random3=("random3_training_nr0", None),
)

In [3]:
# Work on a private copy of the library defaults so DEFAULT_ENV_PARAMS itself
# is never modified, then override the ENV_HORIZON entry with 100.
env_params = DEFAULT_ENV_PARAMS.copy()
env_params.update(ENV_HORIZON=100)

In [6]:
def get_delivery_horizon(layout):
    """Return the delivery horizon used for the planner on `layout`.

    The layouts "simple" and "random1" get a horizon of 2; every other
    value gets 3.
    """
    short_horizon_layouts = ("simple", "random1")
    return 2 if layout in short_horizon_layouts else 3

def _load_bc_agent_and_planner(model_name, best_type, delivery_horizon, planner_env_params):
    """Load a saved BC agent and a planning agent built around a second load of it.

    Args:
        model_name: name of the saved BC run, passed to get_bc_agent_from_saved.
        best_type: second argument of get_bc_agent_from_saved (None or e.g. "acc").
        delivery_horizon: third argument of EmbeddedPlanningAgent.
        planner_env_params: config dict used to build the planner's OvercookedEnv.

    Returns:
        (bc_agent, planner): the plain BC agent with `stochastic` set to False,
        and the EmbeddedPlanningAgent with `debug` set to True.
    """
    bc_agent, _, _ = get_bc_agent_from_saved(model_name, best_type)
    bc_agent.stochastic = False

    # The model embedded in the planner comes from its own, second load and
    # keeps whatever `stochastic` setting the loader gives it.
    embedded_bc, _, _ = get_bc_agent_from_saved(model_name, best_type)
    planner = EmbeddedPlanningAgent(embedded_bc, embedded_bc.mlp, delivery_horizon)
    planner.env = OvercookedEnv.from_config(planner_env_params)
    planner.debug = True
    return bc_agent, planner


def _single_game_returns_both_orders(agent_a, agent_b, eval_env_params):
    """Play one game as (agent_a, agent_b) and one as (agent_b, agent_a).

    A fresh AgentEvaluator is built from `eval_env_params` for each game.

    Returns:
        (return_a_first, return_b_first): the first entry of 'ep_returns'
        from each of the two evaluations.
    """
    returns = []
    for first, second in ((agent_a, agent_b), (agent_b, agent_a)):
        evaluator = AgentEvaluator.from_config(eval_env_params)
        data = evaluator.evaluate_agent_pair(AgentPair(first, second), num_games=1)
        returns.append(data['ep_returns'][0])
    return tuple(returns)


def P_BC_evaluation_for_layout(layout):
    """Evaluate planners with an embedded BC model on one layout and print the returns.

    Two pairings are run, each for a single game in both player orders:
      1. the planner built on the "test" BC model, partnered with the "train" BC agent;
      2. the planner built on the "train" BC model, partnered with the "test" BC agent.

    Args:
        layout: key into BC_MODELS_TRAINING_BEST / BC_MODELS_TESTING_BEST; also
            used as the FIXED_MDP entry of the environment config.

    Returns:
        None. Results are printed.
    """
    # Use a per-call copy: writing FIXED_MDP into the shared module-level
    # env_params would silently change the layout for every later user of it.
    layout_env_params = env_params.copy()
    layout_env_params["FIXED_MDP"] = layout

    delivery_horizon = get_delivery_horizon(layout)
    print(delivery_horizon)

    train_model_name, train_best_type = BC_MODELS_TRAINING_BEST[layout]
    agent_bc_train, p_bc_train = _load_bc_agent_and_planner(
        train_model_name, train_best_type, delivery_horizon, layout_env_params
    )

    test_model_name, test_best_type = BC_MODELS_TESTING_BEST[layout]
    agent_bc_test, p_bc_test = _load_bc_agent_and_planner(
        test_model_name, test_best_type, delivery_horizon, layout_env_params
    )

    # P_BC_test + BC_train
    # NOTE(review): this pairing was previously printed as "P_BC_test + BC_test"
    # although the partner is agent_bc_train. The label now states what is
    # actually evaluated; if the BC_test partner was intended, pass
    # agent_bc_test here instead - confirm with the experiment's author.
    rew0, rew1 = _single_game_returns_both_orders(p_bc_test, agent_bc_train, layout_env_params)
    print("P_BC_test + BC_train", rew0, rew1)

    # P_BC_train + BC_test
    rew0, rew1 = _single_game_returns_both_orders(p_bc_train, agent_bc_test, layout_env_params)
    print("P_BC_train + BC_test", rew0, rew1)

In [7]:
# Every layout this notebook has been run on, kept under its own name instead
# of being assigned to `layouts` and immediately overwritten.
ALL_LAYOUTS = ['simple', 'unident_s', 'random1']

# Layouts evaluated by the next cell; currently restricted to random1.
layouts = ['random1']

In [8]:
# Run the evaluation for every selected layout.
# Expect this cell to be very slow: in the recorded output below, individual
# planning steps on random1 took roughly 570-970 seconds each and the run was
# stopped with a KeyboardInterrupt before the first game finished.
for layout in layouts:
    P_BC_evaluation_for_layout(layout)

2
Loading a model without an environment, this model cannot be trained until it has a valid environment.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Loaded MediumLevelPlanner from data/planners/random1_am.pkl
Loading a model without an environment, this model cannot be trained until it has a valid environment.
Loaded MediumLevelPlanner from data/planners/random1_am.pkl
{'pot-delivery': 5, 'dish-pot': 4}
Loading a model without an environment, this model cannot be trained until it has a valid environment.
Loaded MediumLevelPlanner from data/planners/random1_am.pkl
Loading a model without an environment, this model cannot be trained until it has a valid environment.


  0%|          | 0/1 [00:00<?, ?it/s]

Loaded MediumLevelPlanner from data/planners/random1_am.pkl
{'pot-delivery': 5, 'dish-pot': 4}
X X X P X 
X   ↑0  P 
D ↑1X   X 
O       X 
X O S X X 
Current orders: 20/20 are any's

Found goal after: 	 971.9179887771606 seconds, 	 5763 state expanded (0.6559083810515357 frac. unique) 	 ~5.929512640516966 expansions/s
expected joint action ((-1, 0), (0, 0))
Timestep: 1
Joint action: ((-1, 0), (0, 0)) 	 Reward: 0 + shape * 0 
X X X P X 
X ←0    P 
D ↑1X   X 
O       X 
X O S X X 
Current orders: 20/20 are any's

Found goal after: 	 766.1907057762146 seconds, 	 4521 state expanded (0.6509621765096217 frac. unique) 	 ~5.900619735944007 expansions/s
expected joint action ((1, 0), (0, -1))
Timestep: 2
Joint action: ((1, 0), (0, -1)) 	 Reward: 0 + shape * 0 
X X X P X 
X ↑1→0  P 
D   X   X 
O       X 
X O S X X 
Current orders: 20/20 are any's

Found goal after: 	 573.1503758430481 seconds, 	 5553 state expanded (0.6509994597514857 frac. unique) 	 ~9.688556850079843 expansions/s
expected joi

KeyboardInterrupt: 

P_BC with horizon 100

simple
P_BC_test + P_BC_test: 40, 40 -> 160
P_BC_train + BC_test: 40, 40 -> 160

unident_s
P_BC_test + P_BC_test: 80, 60 -> 280
P_BC_train + BC_test: 60, 40 -> 200

random1 (delivery horizon of 2)
P_BC_test + P_BC_test: 
P_BC_train + BC_test: 

-----

180, 180
P_BC_train + BC_test: 180, 200

