# Setup

In [None]:
# Imports
import os
import os.path as osp

import joblib
import numpy as np
import gym
import pandas as pd
import matplotlib.pyplot as plt

import pirl
from pirl import config
from pirl.experiments import experiments

from analysis import common

In [None]:
# Config
# Identifier of the experiment run analysed by the rest of this notebook.
experiment = 'dummy-test-20180520_185729-e7ea21a03a6d0c94184d024631196687a4c018a7'
# Directory holding that run's results; the path is relative to the current
# working directory.
experiment_dir = osp.join('..', 'data', 'experiments', experiment)

# Value difference

In [None]:
# Patterns passed to common.load_value to select algorithms and environments.
# NOTE(review): these look like regexes where '(.*)' matches everything --
# confirm the matching semantics in common.load_value.
algo_pattern = '(.*)'
env_pattern = '(.*)'
df = common.load_value(experiment_dir, algo_pattern, env_pattern)
# The pair returned by aggregate_value is named (mean, se) here and is
# consumed by the plotting cell that follows.
mean, se = common.aggregate_value(df)

In [None]:
# Plot the aggregated values computed in the previous cell.
# NOTE(review): presumably draws confidence intervals from `se` -- confirm
# in common.plot_ci.
common.plot_ci(mean, se)

# Policy rollout

In [None]:
def expert_cached_value(rl, env_name, pol_discount=0.99, eval_discount=1.00, seed=1234, episodes=100):
    '''Compare an expert policy's stored value with a fresh rollout estimate.

    The policy and its stored value come from experiments._train_policy; the
    policy is then rolled out again in a new gym environment.  Both
    estimates are printed as "mean +/- 1.96 * se".

    WARNING: This will be slow or just break if policy is not in cache!

    Args:
        rl: key into config.RL_ALGORITHMS.
        env_name: gym environment id.
        pol_discount: discount passed to experiments._train_policy.
        eval_discount: discount passed to the rollout evaluation.
        seed: seed passed to both _train_policy and the rollout evaluation.
        episodes: number of rollout episodes.

    Returns:
        ((cached_mean, cached_se), (rollout_mean, rollout_se)).
    '''
    # Only the third entry of the algorithm triple (value computation) is used.
    _gen, _sampler, evaluate = config.RL_ALGORITHMS[rl]

    expert, cached = experiments._train_policy(rl, pol_discount, env_name, seed, None)
    cached_mean, cached_se = cached
    print('Cached value: {:.3f} +/- {:.3f}'.format(cached_mean, 1.96 * cached_se))

    rollout = evaluate(gym.make(env_name), expert, eval_discount,
                       num_episodes=episodes, seed=seed)
    rollout_mean, rollout_se = rollout
    print('Rollout value: {:.3f} +/- {:.3f}'.format(rollout_mean, 1.96 * rollout_se))

    return (cached_mean, cached_se), (rollout_mean, rollout_se)

def _policy_value(results_dir, rl, env_name, pol_discount, eval_discount, episodes, seed):
    '''Load results_dir/policy.pkl and estimate its value by rollouts.

    Args:
        results_dir: directory containing 'policy.pkl'.
        rl: key into config.RL_ALGORITHMS; selects the value computation.
        env_name: gym environment id used for the rollouts.
        pol_discount: accepted for signature compatibility; not used here.
        eval_discount: discount passed to the value computation.
        episodes: number of rollout episodes.
        seed: seed passed to the value computation.

    Returns:
        (mean, se) as produced by the algorithm's value computation.
    '''
    _gen, _sampler, evaluate = config.RL_ALGORITHMS[rl]

    policy_path = osp.join(results_dir, 'policy.pkl')
    print('Loading policy from ', policy_path)
    loaded = joblib.load(policy_path)

    estimate = evaluate(gym.make(env_name), loaded, eval_discount,
                        num_episodes=episodes, seed=seed)
    mean, se = estimate
    print('Rollout value: {:.3f} +/- {:.3f}'.format(mean, 1.96 * se))
    return mean, se

def expert_value(experiment_dir, rl, env_name, pol_discount=0.99, eval_discount=1.00, episodes=100, seed=1234):
    '''Evaluate the expert policy stored at <experiment_dir>/expert/<env_name>/<rl>.

    Delegates to _policy_value; returns its (mean, se) pair.
    '''
    return _policy_value(
        osp.join(experiment_dir, 'expert', env_name, rl),
        rl,
        env_name,
        pol_discount,
        eval_discount,
        episodes,
        seed,
    )

def irl_eval_value(experiment_dir, irl_name, num_traj, rl, env_name, pol_discount=0.99, eval_discount=1.00, episodes=100, seed=1234):
    '''Evaluate the policy stored under the 'eval' tree of an experiment.

    The policy directory is
    <experiment_dir>/eval/<env_name>/<irl_name>:<num_traj>:<num_traj>/<rl>.
    Delegates to _policy_value; returns its (mean, se) pair.
    '''
    # NOTE(review): num_traj is used for both trailing fields of the tag --
    # confirm this matches how the experiment runner names the directory.
    run_tag = '{}:{}:{}'.format(irl_name, num_traj, num_traj)
    policy_dir = osp.join(experiment_dir, 'eval', env_name, run_tag, rl)
    return _policy_value(policy_dir, rl, env_name, pol_discount, eval_discount, episodes, seed)
    
def irl_value(experiment_dir, irl_name, env_name, num_traj, eval_discount=1.00, episodes=100):
    '''Evaluate the policy produced by an IRL algorithm by rolling it out.

    Two on-disk layouts under <experiment_dir>/irl/<irl_name> are tried, in
    this order:
      1. <num_traj>/policies.pkl -- a mapping indexed by env_name;
      2. <env_name>/<num_traj>/policy.pkl -- a single policy.

    Args:
        experiment_dir: root directory of the experiment results.
        irl_name: IRL algorithm name, passed to experiments.make_irl_algo.
        env_name: gym environment id.
        num_traj: number of trajectories; used as a directory name.
        eval_discount: discount passed to the value computation.
        episodes: number of rollout episodes.

    Returns:
        (mean, se) as produced by the IRL algorithm's value computation,
        matching what the other evaluation helpers in this notebook return.

    Raises:
        FileNotFoundError: if the algorithm's result directory, or both
            candidate policy files, do not exist.
    '''
    _irl_algo, _reward_wrapper, compute_value = experiments.make_irl_algo(irl_name)
    irl_dir = osp.join(experiment_dir, 'irl', irl_name)
    if not os.path.exists(irl_dir):
        raise FileNotFoundError("No result directory {}".format(irl_dir))

    pop_fname = osp.join(irl_dir, str(num_traj), 'policies.pkl')
    sin_fname = osp.join(irl_dir, env_name, str(num_traj), 'policy.pkl')
    if os.path.exists(pop_fname):
        policies = joblib.load(pop_fname)
        print(policies.keys())
        policy = policies[env_name]
    elif os.path.exists(sin_fname):
        policy = joblib.load(sin_fname)
    else:
        raise FileNotFoundError("Neither {} or {} exists".format(pop_fname, sin_fname))

    env = gym.make(env_name)
    mean, se = compute_value(env, policy, discount=eval_discount, num_episodes=episodes)
    print('Rollout value: {} +/- {}'.format(mean, 1.96 * se))
    # Previously the estimate was only printed; return it so callers can use
    # it, as the sibling helpers do.
    return mean, se

In [None]:
# Compare the cached and freshly rolled-out value of the 'ppo_cts' expert on
# Reacher-v2, using 100 episodes.
expert_cached_value('ppo_cts', 'Reacher-v2', episodes=100)

In [None]:
# Roll out the expert policy saved in this experiment's results directory
# (500 episodes, fixed seed).
expert_value(experiment_dir, 'ppo_cts', 'Reacher-v2', episodes=500, seed=1234)

In [None]:
# Roll out the policy saved by the 'airl' IRL algorithm for Reacher-v2 with
# num_traj=1000, using 100 episodes.
irl_value(experiment_dir, 'airl', 'Reacher-v2', 1000, episodes=100)

# Visualizing rewards (gridworld only)

In [None]:
def show_heatmaps(irl_algo, kind='inline', out_dir=None, shape=(9,9), **kwargs):
    '''Render the rewards inferred by an IRL algorithm as gridworld heatmaps.

    Rewards are read from 'results.pkl' inside the notebook-level
    experiment_dir, under data['rewards'][irl_algo].

    Args:
        irl_algo: key into the 'rewards' entry of the results file.
        kind: 'inline' or 'pdf' build figures with common.gridworld_heatmap;
            'movie' calls common.gridworld_heatmap_movie.
        out_dir: output directory.  For 'inline'/'pdf', None means the
            figures are displayed in the notebook instead of being saved.
        shape: gridworld shape forwarded to the plotting helpers.
        **kwargs: accepted but not used.

    Raises:
        ValueError: if kind is not one of 'inline', 'pdf' or 'movie'.
    '''
    # Validate before the (potentially slow) load; an explicit exception also
    # survives `python -O`, unlike the `assert False` used previously.
    if kind not in ('inline', 'pdf', 'movie'):
        raise ValueError(
            "Unknown kind {!r}; expected 'inline', 'pdf' or 'movie'".format(kind))

    data = pd.read_pickle(osp.join(experiment_dir, 'results.pkl'))
    rewards = data['rewards'][irl_algo]

    if kind == 'movie':
        common.gridworld_heatmap_movie(out_dir, rewards, shape)
        return

    figs = common.gridworld_heatmap(rewards, shape)
    if out_dir is None:
        for fig in figs:
            # The second element of each entry is passed to the notebook's
            # display(); NOTE(review): presumably the figure object --
            # confirm in common.gridworld_heatmap.
            display(fig[1])
    else:
        common.save_figs(figs, out_dir)

In [None]:
# IRL algorithms whose inferred rewards are visualised; each name must be a
# key under 'rewards' in the experiment's results.pkl.
irl_algos = ['mce', 'mcep_reg1e0']
for irl in irl_algos:
    # Display inline on a 4x4 grid (overrides show_heatmaps' 9x9 default).
    show_heatmaps(irl, kind='inline', shape=(4,4))
    # Alternative invocations that write movies to disk instead:
    #show_heatmaps(irl, kind='movie', out_dir='figs/jungle/movies/' + irl)
    #show_heatmaps(irl, kind='movie', out_dir='figs/jungle/' + irl)

# Loss curve (PPO only)

In [None]:
def ppo_progress(results_dir):
    '''Read results_dir/progress.csv into a DataFrame indexed by 'serial_timesteps'.'''
    progress = pd.read_csv(osp.join(results_dir, 'progress.csv'))
    return progress.set_index('serial_timesteps')

def expert_ppo_progress(experiment_dir, env_name, rl_name):
    '''Load the training progress table of an expert policy.

    The results directory is
    <experiment_dir>/expert/<sanitized env_name>/<rl_name>, where the
    environment name is passed through experiments.sanitize_env_name.
    Returns the DataFrame produced by ppo_progress.
    '''
    safe_env = experiments.sanitize_env_name(env_name)
    return ppo_progress(osp.join(experiment_dir, 'expert', safe_env, rl_name))

In [None]:
# Environments whose 'ppo_cts' expert training curves are plotted.
envs = ['InvertedPendulum-v2', 'InvertedDoublePendulum-v2', 'Reacher-v2']
for env in envs:
    df = expert_ppo_progress(experiment_dir, env, 'ppo_cts')
    # One figure per environment: the 'eprewmean' column against the
    # 'serial_timesteps' index set by ppo_progress.
    plt.figure()
    df['eprewmean'].plot()
    # Also print the best value of 'eprewmean' seen during training.
    print(df['eprewmean'].max())
    plt.title(env)

# Evaluate checkpoint

In [None]:
def load_checkpoint(results_dir, checkpoint_num=None):
    '''Load a policy checkpoint from results_dir/checkpoints.

    Args:
        results_dir: directory containing a 'checkpoints' subdirectory.
        checkpoint_num: file name of the checkpoint inside that subdirectory
            (e.g. '00488').  If None, the lexicographically greatest entry
            is used.  NOTE(review): that is the most recent checkpoint only
            if names are zero-padded to a common width -- confirm.

    Returns:
        The object deserialised by joblib.load from the checkpoint file.

    Raises:
        ValueError: if checkpoint_num is None and the checkpoint directory
            contains no entries.
    '''
    checkpoint_dir = osp.join(results_dir, 'checkpoints')
    if checkpoint_num is None:
        candidates = os.listdir(checkpoint_dir)
        if not candidates:
            # max() on an empty list raises an opaque ValueError; keep the
            # exception type but say which directory was empty.
            raise ValueError('No checkpoints found in {}'.format(checkpoint_dir))
        checkpoint_num = max(candidates)
    checkpoint_fname = osp.join(checkpoint_dir, checkpoint_num)
    print('Loading from ', checkpoint_fname)
    return joblib.load(checkpoint_fname)

def ppo_value2(results_dir, rl, env_name, pol_discount, eval_discount, episodes):
    '''Estimate the value of the policy saved as results_dir/policy.pkl.

    Same procedure as _policy_value, but no seed is passed to the value
    computation.  pol_discount is accepted but not used.

    Returns:
        (mean, se) as produced by the algorithm's value computation.
    '''
    value_fn = config.RL_ALGORITHMS[rl]
    _gen_policy, _sample, value_fn = value_fn

    pkl_path = osp.join(results_dir, 'policy.pkl')
    print('Loading policy from ', pkl_path)
    saved_policy = joblib.load(pkl_path)
    rollout_env = gym.make(env_name)

    mean, se = value_fn(rollout_env, saved_policy, eval_discount, num_episodes=episodes)
    summary = 'Rollout value: {:.3f} +/- {:.3f}'.format(mean, 1.96 * se)
    print(summary)
    return mean, se

def ppo_value(policy, env_name, episodes, seed=1234):
    '''Estimate the value of an in-memory policy by rollouts.

    Uses the value computation of the 'ppo_cts' entry in
    config.RL_ALGORITHMS with an evaluation discount of 1.00.

    Args:
        policy: policy object, e.g. as returned by load_checkpoint.
        env_name: gym environment id.
        episodes: number of rollout episodes.
        seed: seed passed to the value computation.

    Returns:
        (mean, se) as produced by the value computation, matching what
        ppo_value2 returns.
    '''
    _, _, compute_value = config.RL_ALGORITHMS['ppo_cts']
    env = gym.make(env_name)
    mean, se = compute_value(env, policy, 1.00, num_episodes=episodes, seed=seed)
    print('Rollout value: {:.3f} +/- {:.3f}'.format(mean, 1.96 * se))
    # Previously the estimate was only printed; return it so callers can use it.
    return mean, se

In [None]:
# Load checkpoint '00488' of the InvertedPendulum-v2 'ppo_cts' expert and
# evaluate it over 500 rollout episodes with seed 1234.
policy = load_checkpoint(osp.join(experiment_dir, 'expert/InvertedPendulum-v2/ppo_cts'), '00488')
ppo_value(policy, 'InvertedPendulum-v2', 500, 1234)