# Setup

In [None]:
# Imports
import os
import os.path as osp

import joblib
import numpy as np
import gym
import pandas as pd
import matplotlib.pyplot as plt

import pirl
from pirl.experiments import config, experiments, plots as myplots

In [None]:
# Config
# Name of the experiment run to analyse; it doubles as the name of its directory under data/.
experiment = 'dummy-continuous-test-slow-20180423_171736-f12823ea19bd9c1321eb9b7bfbec69066ae73641'
# Directory handed to the analysis cells below as `experiment_dir`.
experiment_dir = osp.join('data', experiment)

# Value difference

In [None]:
def plot_value(experiment_dir, algo_pattern='(.*)', env_pattern='(.*)', algos=('.*',), dps=2):
    '''Build a rounded table of values from an experiment's results.pkl.

    Despite the name, nothing is plotted: a DataFrame is returned (and is
    rendered as a table when it is the last expression of a cell).

    Args:
        experiment_dir: directory containing results.pkl.
        algo_pattern: regex with exactly one capture group, applied to every
            column label; the captured text becomes the new label.
        env_pattern: regex with exactly one capture group, applied in the same
            way to level 0 of the row MultiIndex.
        algos: iterable of regexes matched (anchored at the start, like
            re.match) against the relabelled columns. Columns are emitted in
            pattern order; a column matched by several patterns appears once,
            under the first pattern that matches it; columns matching no
            pattern are dropped.
        dps: number of decimal places to round to.

    Returns:
        The DataFrame produced by myplots.extract_value, relabelled, filtered
        and reordered as described above, with underscores in column labels
        replaced by spaces and all values rounded to dps places.
    '''
    # NOTE: unpickling can execute arbitrary code; only load trusted results.
    data = pd.read_pickle(osp.join(experiment_dir, 'results.pkl'))

    value = myplots.extract_value(data)
    value.columns = value.columns.str.extract(algo_pattern, expand=False)
    envs = value.index.levels[0].str.extract(env_pattern, expand=False)
    value.index = value.index.set_levels(envs, level=0)

    # Select columns by position rather than by label: relabelling may have
    # produced duplicate names, and label-based selection would then return
    # every duplicate once per occurrence.
    labels = value.columns
    taken = np.zeros(len(labels), dtype=bool)
    positions = []
    for pattern in algos:
        # na=False: labels that algo_pattern failed to extract are NaN and
        # must simply count as "no match" instead of poisoning the mask.
        hits = np.asarray(labels.str.match(pattern, na=False), dtype=bool)
        positions.extend(np.flatnonzero(hits & ~taken))
        taken |= hits
    value = value.iloc[:, positions]

    value.columns = value.columns.str.split('_').str.join(' ')  # so lines wrap
    return value.round(dps)

In [None]:
# Identity patterns: every algorithm and environment label is kept unchanged.
algo_pattern = env_pattern = '(.*)'
plot_value(experiment_dir, algo_pattern=algo_pattern, env_pattern=env_pattern)

# Policy rollout

In [None]:
def expert_value(rl, env_name, discount=0.99, seed=1234, episodes=100):
    '''Print the cached and a freshly rolled-out value of an expert policy.

    WARNING: This will be slow or just break if policy is not in cache!

    Args:
        rl: key into config.RL_ALGORITHMS selecting the RL algorithm.
        env_name: Gym environment id passed to gym.make.
        discount: discount factor forwarded to training and to the rollout.
        seed: seed forwarded to experiments._train_policy.
        episodes: number of episodes used for the rollout estimate.

    Both values are printed as mean +/- 1.96 standard errors; nothing is
    returned.
    '''
    # Only the value-estimation routine of the algorithm triple is needed.
    _gen_policy, _sample, compute_value = config.RL_ALGORITHMS[rl]
    # NOTE(review): the meaning of the trailing None argument is not visible
    # from this notebook — confirm against experiments._train_policy.
    policy, value = experiments._train_policy(rl, discount, env_name, seed, None)
    vmean, vse = value
    print('Cached value: {:.3f} +/- {:.3f}'.format(vmean, 1.96 * vse))

    env = gym.make(env_name)
    try:
        rmean, rse = compute_value(env, policy, discount, num_episodes=episodes)
    finally:
        # Release the environment even if the rollout raises.
        env.close()
    print('Rollout value: {:.3f} +/- {:.3f}'.format(rmean, 1.96 * rse))
    
def irl_value(experiment_dir, irl_name, env_name, num_traj, discount=0.99, episodes=100):
    '''Print the rollout value of a policy saved by an IRL algorithm.

    The policy is looked up under <experiment_dir>/irl/<irl_name>/ in two
    places, in this order:
      1. <num_traj>/policies.pkl, a mapping indexed by env_name;
      2. <env_name>/<num_traj>/policy.pkl, holding the policy itself.

    Args:
        experiment_dir: root directory of the experiment.
        irl_name: IRL algorithm name, passed to experiments.make_irl_algo.
        env_name: Gym environment id passed to gym.make.
        num_traj: number of trajectories; used as a directory name.
        discount: discount factor forwarded to the rollout.
        episodes: number of episodes used for the rollout estimate.

    Raises:
        FileNotFoundError: if the algorithm's result directory or both policy
            files are missing.

    The value is printed as mean +/- 1.96 standard errors; nothing is
    returned.
    '''
    _irl_algo, _reward_wrapper, compute_value = experiments.make_irl_algo(irl_name)
    irl_dir = osp.join(experiment_dir, 'irl', irl_name)
    if not osp.exists(irl_dir):
        raise FileNotFoundError("No result directory {}".format(irl_dir))

    pop_fname = osp.join(irl_dir, str(num_traj), 'policies.pkl')
    sin_fname = osp.join(irl_dir, env_name, str(num_traj), 'policy.pkl')
    # NOTE: joblib.load unpickles; only load trusted experiment output.
    if osp.exists(pop_fname):
        policies = joblib.load(pop_fname)
        # Shows which environments are available if the lookup below fails.
        print(policies.keys())
        policy = policies[env_name]
    elif osp.exists(sin_fname):
        policy = joblib.load(sin_fname)
    else:
        raise FileNotFoundError("Neither {} or {} exists".format(pop_fname, sin_fname))

    env = gym.make(env_name)
    try:
        mean, se = compute_value(env, policy, discount, num_episodes=episodes)
    finally:
        # Release the environment even if the rollout raises.
        env.close()
    print('Rollout value: {} +/- {}'.format(mean, 1.96 * se))

In [None]:
# Quick check: 10 rollout episodes of the cached PPO expert.
expert_value(rl='ppo_cts', env_name='Reacher-v2', episodes=10)

In [None]:
# Rollout value of the AIRL policy learnt from 1000 trajectories.
irl_value(experiment_dir, irl_name='airl', env_name='Reacher-v2', num_traj=1000, episodes=100)

# Visualizing rewards (gridworld only)

In [None]:
# `data` was never defined at notebook scope (plot_value only loads it into a
# local), so this cell failed with NameError after Restart & Run All.
# NOTE(review): assumes the experiment's results.pkl is the intended source of
# `data` — confirm. Unpickling runs arbitrary code; only load trusted output.
data = pd.read_pickle(osp.join(experiment_dir, 'results.pkl'))

shape = (4,4)
irl_algo = 'mces'
# NOTE(review): this cell reads data['rewards'] while the save-to-disk cell
# reads data['reward']; one of the two keys is presumably wrong — confirm
# against the contents of results.pkl.
figs = myplots.gridworld_heatmap(data['rewards'][irl_algo], shape)
for fig in figs:
    # Each item is indexable; its second element is what gets rendered.
    display(fig[1])

In [None]:
# Save to disk
# `data` is not defined anywhere at notebook scope, so load it here to keep
# the cell runnable from a fresh kernel.
# NOTE(review): assumes the experiment's results.pkl is the intended source of
# `data` — confirm. Unpickling runs arbitrary code; only load trusted output.
data = pd.read_pickle(osp.join(experiment_dir, 'results.pkl'))

out_dir = './figs/some-experiment'
# NOTE(review): the heatmap display cell reads data['rewards'], this one reads
# data['reward']; one of the two keys is presumably wrong — confirm.
for k, v in data['reward'].items():
    # myplots is the notebook's alias for pirl.experiments.plots.
    myplots.save_figs(myplots.gridworld_heatmap(v, (4, 4)), osp.join(out_dir, k))

# Learning curve (PPO only)

In [None]:
def ppo_progress(results_dir):
    '''Load results_dir/progress.csv as a DataFrame indexed by its serial_timesteps column.'''
    progress = pd.read_csv(osp.join(results_dir, 'progress.csv'))
    return progress.set_index('serial_timesteps')

def expert_ppo_progress(experiment_dir, env_name, rl_name):
    '''Load the progress table stored under <experiment_dir>/expert/<env_name>/<rl_name>.'''
    path_parts = (experiment_dir, 'expert', env_name, rl_name)
    return ppo_progress(osp.join(*path_parts))

# Progress of the PPO expert; the x-axis is the serial_timesteps index.
df = expert_ppo_progress(experiment_dir, 'Reacher-v2', 'ppo_cts')
# Title and y-label let the figure stand alone when the notebook is skimmed.
ax = df['eprewmean'].plot(title='ppo_cts on Reacher-v2')
ax.set_ylabel('eprewmean');