In [1]:
"""
Launcher for experiments with PEARL

"""
import os
import pathlib
import numpy as np
import click
import json
import torch

from rlkit.envs import ENVS
from rlkit.envs.wrappers import NormalizedBoxEnv
from rlkit.torch.sac.policies import TanhGaussianPolicy
from rlkit.torch.networks import FlattenMlp, MlpEncoder, RecurrentEncoder
from rlkit.torch.sac.sac import PEARLSoftActorCritic
from rlkit.torch.sac.agent import PEARLAgent
from rlkit.launchers.launcher_util import setup_logger
import rlkit.torch.pytorch_util as ptu
from configs.default import default_config



# Extend the module search path with the site-packages directory of the
# "pearl" conda environment before importing gym.
# NOTE(review): this is a hard-coded, machine-specific absolute path that
# points at a python3.5 environment; it will not exist on other machines.
# Prefer running the notebook with a kernel created from that environment.
import sys
sys.path.append("/home/user/anaconda3/envs/pearl/lib/python3.5/site-packages")
# Debug aid: dumps the resulting search path into the cell output.
print(sys.path)
# Imported only after the path tweak above -- presumably gym is resolved from
# the appended directory; TODO confirm.
import gym

['mujoco_env.py', 'ant_dir.py', 'assets', 'ant_goal.py', 'half_cheetah_vel.py', 'half_cheetah.py', 'wrappers.py', 'ant.py', 'ant_multitask_base.py', '__init__.py', '__pycache__', 'humanoid_dir.py', 'trash']
mujoco_env.py
mujoco_env
ant_dir.py
ant_dir
assets
ant_goal.py
ant_goal
half_cheetah_vel.py
half_cheetah_vel
half_cheetah.py
half_cheetah
wrappers.py
wrappers
ant.py
ant
ant_multitask_base.py
ant_multitask_base
__init__.py
__pycache__
humanoid_dir.py
humanoid_dir
trash
['/home/user/Dokumente/#master/Sem_03/ALDR/git/tum-adlr-ws20-08/papers/oyster', '/home/user/anaconda3/lib/python38.zip', '/home/user/anaconda3/lib/python3.8', '/home/user/anaconda3/lib/python3.8/lib-dynload', '', '/home/user/.local/lib/python3.8/site-packages', '/home/user/anaconda3/lib/python3.8/site-packages', '/home/user/Dokumente/#master/Sem_03/ALDR/nbdev', '/home/user/.local/lib/python3.8/site-packages/IPython/extensions', '/home/user/.ipython', '/home/user/anaconda3/envs/pearl/lib/python3.5/site-packages', '/hom

In [2]:
def experiment(variant):
    """Build the PEARL networks and algorithm described by ``variant`` and train.

    Args:
        variant: nested configuration dict. Keys read here:
            ``env_name``, ``env_params``, ``latent_size``, ``net_size``,
            ``n_train_tasks``, ``n_eval_tasks``, ``path_to_weights``,
            ``algo_params`` (``use_next_obs_in_context``,
            ``use_information_bottleneck``, ``recurrent``,
            ``dump_eval_paths``; the whole sub-dict is also forwarded as
            keyword arguments to ``PEARLAgent`` and ``PEARLSoftActorCritic``)
            and ``util_params`` (``use_gpu``, ``gpu_id``, ``debug``,
            ``base_log_dir``).

    Side effects:
        Sets the ``DEBUG`` environment variable, sets up the logger, optionally
        creates an ``eval_trajectories`` directory, and runs training.
    """
    algo_params = variant['algo_params']
    util_params = variant['util_params']

    # create multi-task environment and sample tasks
    env = NormalizedBoxEnv(ENVS[variant['env_name']](**variant['env_params']))
    tasks = env.get_all_task_idx()
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    reward_dim = 1

    # instantiate networks
    latent_dim = variant['latent_size']
    # a context entry is (obs, action, reward), plus next_obs when enabled
    context_encoder_input_dim = obs_dim + action_dim + reward_dim
    if algo_params['use_next_obs_in_context']:
        context_encoder_input_dim += obs_dim
    # with the information bottleneck the encoder output is twice the latent size
    context_encoder_output_dim = latent_dim * 2 if algo_params['use_information_bottleneck'] else latent_dim
    net_size = variant['net_size']
    encoder_model = RecurrentEncoder if algo_params['recurrent'] else MlpEncoder

    context_encoder = encoder_model(
        hidden_sizes=[200, 200, 200],
        input_size=context_encoder_input_dim,
        output_size=context_encoder_output_dim,
    )
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + latent_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )
    agent = PEARLAgent(
        latent_dim,
        context_encoder,
        policy,
        zIsZeros=False,
        **algo_params
    )
    # NOTE(review): with n_eval_tasks == 0 the slice tasks[-0:] selects ALL
    # tasks rather than none -- confirm that 0 is never configured.
    algorithm = PEARLSoftActorCritic(
        env=env,
        train_tasks=list(tasks[:variant['n_train_tasks']]),
        eval_tasks=list(tasks[-variant['n_eval_tasks']:]),
        nets=[agent, qf1, qf2, vf],
        latent_dim=latent_dim,
        **algo_params
    )

    # optionally load pre-trained weights
    if variant['path_to_weights'] is not None:
        _load_pretrained_weights(
            variant['path_to_weights'],
            context_encoder=context_encoder,
            qf1=qf1,
            qf2=qf2,
            vf=vf,
            # TODO hacky, revisit after model refactor
            target_vf=algorithm.networks[-2],
            policy=policy,
        )

    # optional GPU mode
    ptu.set_gpu_mode(util_params['use_gpu'], util_params['gpu_id'])
    if ptu.gpu_enabled():
        algorithm.to()

    # debugging triggers a lot of printing and logs to a debug directory
    DEBUG = util_params['debug']
    os.environ['DEBUG'] = str(int(DEBUG))

    # create logging directory
    # TODO support Docker
    exp_id = 'debug' if DEBUG else None
    experiment_log_dir = setup_logger(
        variant['env_name'],
        variant=variant,
        exp_id=exp_id,
        base_log_dir=util_params['base_log_dir'],
    )

    # optionally save eval trajectories as pkl files
    if algo_params['dump_eval_paths']:
        pickle_dir = os.path.join(experiment_log_dir, 'eval_trajectories')
        pathlib.Path(pickle_dir).mkdir(parents=True, exist_ok=True)

    # run the algorithm
    algorithm.train()


def _load_pretrained_weights(path, context_encoder, qf1, qf2, vf, target_vf, policy):
    """Restore each network's parameters from its ``<name>.pth`` file under ``path``."""
    checkpoints = (
        (context_encoder, 'context_encoder.pth'),
        (qf1, 'qf1.pth'),
        (qf2, 'qf2.pth'),
        (vf, 'vf.pth'),
        (target_vf, 'target_vf.pth'),
        (policy, 'policy.pth'),
    )
    for network, filename in checkpoints:
        # map_location='cpu' lets checkpoints that were saved on a GPU be read
        # on a machine without CUDA; load_state_dict copies the values into the
        # network's existing parameters, so their device is unaffected.
        state_dict = torch.load(os.path.join(path, filename), map_location='cpu')
        network.load_state_dict(state_dict)

In [3]:
def deep_update_dict(fr, to):
    """Recursively merge the nested dict ``fr`` into ``to`` (in place).

    Dict values in ``fr`` are merged key by key into the matching dict in
    ``to``; every other value overwrites the entry in ``to``. When ``to`` has
    no entry for a nested key, or the entry is not a dict, a fresh dict is
    created there, so the merged result never shares nested dicts with ``fr``.

    Args:
        fr: dict (possibly nested) holding the new values.
        to: dict that is updated in place.

    Returns:
        The same ``to`` object, after the update.
    """
    for key, value in fr.items():
        if isinstance(value, dict):
            target = to.get(key)
            if not isinstance(target, dict):
                # missing or non-dict entry: replace it instead of failing
                target = to[key] = {}
            deep_update_dict(value, target)
        else:
            to[key] = value
    return to

In [4]:
def main(config, gpu, docker, debug):
    """Assemble the experiment configuration and launch ``experiment``.

    Args:
        config: path to a JSON file whose values override ``default_config``;
            a falsy value runs with the defaults only.
        gpu: stored as ``util_params['gpu_id']`` after conversion to ``int``
            (so ``False`` becomes ``0``); presumably the GPU device index.
        docker: accepted for interface compatibility; not used here.
        debug: accepted for interface compatibility; not used here. The debug
            flag that ``experiment`` reads comes from ``util_params['debug']``
            in the configuration.
    """
    import copy  # local import: only needed for the defensive copy below

    # Work on a private copy: deep_update_dict mutates its target in place, so
    # merging into default_config itself would leak overrides from one run
    # into every later call in the same kernel.
    variant = copy.deepcopy(default_config)
    if config:
        with open(config) as f:
            exp_params = json.load(f)
        variant = deep_update_dict(exp_params, variant)
    variant['util_params']['gpu_id'] = int(gpu)

    experiment(variant)

In [None]:
# `gpu` is stored as util_params['gpu_id'], so pass an integer index rather
# than a bool. Whether a GPU is used at all comes from util_params['use_gpu']
# in the configuration.
main(
    config='./configs/cheetah-vel.json',
    gpu=0,
    docker=False,
    debug=False,
)

hello
<HalfCheetahVelEnv instance>
[1. 1. 1. 1. 1. 1.]
Box(-1.0, 1.0, (6,), float32)
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6
20
6




HBox(children=(FloatProgress(value=0.0, description='Total Iterations', max=500.0, style=ProgressStyle(descrip…

HBox(children=(FloatProgress(value=0.0, description='Train Tasks', max=5.0, style=ProgressStyle(description_wi…

HBox(children=(FloatProgress(value=0.0, description='Train Step', max=2000.0, style=ProgressStyle(description_…

collecting initial pool of data for train and eval
fixed z:  False


  Variable._execution_engine.run_backward(


[-274.0129071805964]
----------------------------------  ----------------
Z mean train                             0.00272045
Z variance train                         0.00694189
KL Divergence                          159.084
KL Loss                                 15.9084
QF Loss                                138.917
VF Loss                                 16.24
Policy Loss                             -3.99314
Q Predictions Mean                      -0.00578031
Q Predictions Std                        0.00204433
Q Predictions Max                        0.00215009
Q Predictions Min                       -0.0128725
V Predictions Mean                      -0.00192036
V Predictions Std                        0.00213404
V Predictions Max                        0.00473923
V Predictions Min                       -0.00879124
Log Pis Mean                            -4.01894
Log Pis Std                              0.528611
Log Pis Max                             -2.09103
Log Pis Min           

Policy log std Mean                     -0.35407
Policy log std Std                       0.195993
Policy log std Max                       0.221035
Policy log std Min                      -1.15144
Z mean eval                              0.101555
Z variance eval                          0.0695711
AverageTrainReturn_all_train_tasks    -202.042
AverageReturn_all_train_tasks         -161.236
AverageReturn_all_test_tasks          -150.448
Number of train steps total           8000
Number of env steps total           220000
Number of rollouts total                 0
Train Time (s)                        2101.4
(Previous) Eval Time (s)               134.503
Sample Time (s)                         13.0179
Epoch Time (s)                        2248.92
Total Train Time (s)                  9899.53
Epoch                                    3
----------------------------------  --------------
2020-12-29 19:20:15.002527 CET | [2020_12_29_16_35_15] Iteration #3 | Epoch Duration: 2275.3693327903748


[-274.0129071805964, -230.5835926598467, -253.69840493585428, -150.44784922840003, -136.77510359879528, -126.0996444202939, -140.34931504838934, -126.19928426976692]
----------------------------------  --------------
Z mean train                             0.0533855
Z variance train                         0.592025
KL Divergence                           49.8966
KL Loss                                  4.98966
QF Loss                                 32.9631
VF Loss                                  7.46208
Policy Loss                             60.8565
Q Predictions Mean                     -67.3031
Q Predictions Std                       70.2118
Q Predictions Max                        0.60479
Q Predictions Min                     -355.06
V Predictions Mean                     -60.3706
V Predictions Std                       68.7536
V Predictions Max                        5.75492
V Predictions Min                     -351.522
Log Pis Mean                            -2.42562
Log Pis 

V Predictions Std                      109.738
V Predictions Max                        5.72042
V Predictions Min                     -731.337
Log Pis Mean                            -2.02244
Log Pis Std                              2.12854
Log Pis Max                              9.66628
Log Pis Min                             -9.60878
Policy mu Mean                           0.0133468
Policy mu Std                            0.630439
Policy mu Max                            2.27757
Policy mu Min                           -2.87624
Policy log std Mean                     -0.405639
Policy log std Std                       0.227891
Policy log std Max                       0.130197
Policy log std Min                      -1.83979
Z mean eval                              0.3789
Z variance eval                          0.135977
AverageTrainReturn_all_train_tasks     -74.6979
AverageReturn_all_train_tasks          -93.9099
AverageReturn_all_test_tasks          -111.286
Number of train steps 

Policy log std Max                       0.0569306
Policy log std Min                      -1.87726
Z mean eval                              0.3464
Z variance eval                          0.119236
AverageTrainReturn_all_train_tasks     -75.7713
AverageReturn_all_train_tasks          -92.3541
AverageReturn_all_test_tasks           -80.6583
Number of train steps total          28000
Number of env steps total           270000
Number of rollouts total                 0
Train Time (s)                        1525.83
(Previous) Eval Time (s)               112.061
Sample Time (s)                         10.3067
Epoch Time (s)                        1648.2
Total Train Time (s)                 27161.5
Epoch                                   13
----------------------------------  --------------
2020-12-30 00:07:56.981959 CET | [2020_12_29_16_35_15] Iteration #13 | Epoch Duration: 1616.0638563632965
2020-12-30 00:07:56.996153 CET | [2020_12_29_16_35_15] Iteration #13 | Started Training: True
[-27

Number of rollouts total                 0
Train Time (s)                        1625.01
(Previous) Eval Time (s)               106.85
Sample Time (s)                          7.9771
Epoch Time (s)                        1739.83
Total Train Time (s)                 32309.3
Epoch                                   16
----------------------------------  --------------
2020-12-30 01:33:44.767483 CET | [2020_12_29_16_35_15] Iteration #16 | Epoch Duration: 1739.024486541748
2020-12-30 01:33:44.769404 CET | [2020_12_29_16_35_15] Iteration #16 | Started Training: True
[-274.0129071805964, -230.5835926598467, -253.69840493585428, -150.44784922840003, -136.77510359879528, -126.0996444202939, -140.34931504838934, -126.19928426976692, -103.16878583327883, -98.5243732837948, -111.28630985642714, -85.33634599694174, -75.7445294857944, -80.65833918667438, -83.36920843009855, -70.56862311006127, -68.64829346671951, -64.66744328437376]
----------------------------------  -------------
Z mean train     

Epoch                                   19
----------------------------------  ---------------
2020-12-30 02:58:05.484006 CET | [2020_12_29_16_35_15] Iteration #19 | Epoch Duration: 1728.4080967903137
2020-12-30 02:58:05.495125 CET | [2020_12_29_16_35_15] Iteration #19 | Started Training: True
[-274.0129071805964, -230.5835926598467, -253.69840493585428, -150.44784922840003, -136.77510359879528, -126.0996444202939, -140.34931504838934, -126.19928426976692, -103.16878583327883, -98.5243732837948, -111.28630985642714, -85.33634599694174, -75.7445294857944, -80.65833918667438, -83.36920843009855, -70.56862311006127, -68.64829346671951, -64.66744328437376, -68.77288061437083, -65.68487037920701, -66.84640300449469]
----------------------------------  --------------
Z mean train                             0.46881
Z variance train                         0.709125
KL Divergence                           73.9663
KL Loss                                  7.39663
QF Loss                         

2020-12-30 04:22:15.877142 CET | [2020_12_29_16_35_15] Iteration #22 | Epoch Duration: 1750.074473142624
2020-12-30 04:22:15.881813 CET | [2020_12_29_16_35_15] Iteration #22 | Started Training: True
[-274.0129071805964, -230.5835926598467, -253.69840493585428, -150.44784922840003, -136.77510359879528, -126.0996444202939, -140.34931504838934, -126.19928426976692, -103.16878583327883, -98.5243732837948, -111.28630985642714, -85.33634599694174, -75.7445294857944, -80.65833918667438, -83.36920843009855, -70.56862311006127, -68.64829346671951, -64.66744328437376, -68.77288061437083, -65.68487037920701, -66.84640300449469, -65.55570608492596, -68.90744423205724, -60.94047704276206]
----------------------------------  ---------------
Z mean train                             0.383867
Z variance train                         0.846625
KL Divergence                           62.6593
KL Loss                                  6.26593
QF Loss                                 37.124
VF Loss            

Epoch                                   25
----------------------------------  -------------
2020-12-30 05:47:30.809833 CET | [2020_12_29_16_35_15] Iteration #25 | Epoch Duration: 1754.3115832805634
2020-12-30 05:47:30.812149 CET | [2020_12_29_16_35_15] Iteration #25 | Started Training: True
[-274.0129071805964, -230.5835926598467, -253.69840493585428, -150.44784922840003, -136.77510359879528, -126.0996444202939, -140.34931504838934, -126.19928426976692, -103.16878583327883, -98.5243732837948, -111.28630985642714, -85.33634599694174, -75.7445294857944, -80.65833918667438, -83.36920843009855, -70.56862311006127, -68.64829346671951, -64.66744328437376, -68.77288061437083, -65.68487037920701, -66.84640300449469, -65.55570608492596, -68.90744423205724, -60.94047704276206, -67.30722624263971, -67.22452844380275, -68.6277523747863]
----------------------------------  --------------
Z mean train                             0.561342
Z variance train                         0.670754
KL Divergen

Number of rollouts total                 0
Train Time (s)                        1563.81
(Previous) Eval Time (s)                81.0358
Sample Time (s)                          6.05671
Epoch Time (s)                        1650.91
Total Train Time (s)                 52612.9
Epoch                                   28
----------------------------------  --------------
2020-12-30 07:12:08.452446 CET | [2020_12_29_16_35_15] Iteration #28 | Epoch Duration: 1671.4055936336517
2020-12-30 07:12:08.465903 CET | [2020_12_29_16_35_15] Iteration #28 | Started Training: True
[-274.0129071805964, -230.5835926598467, -253.69840493585428, -150.44784922840003, -136.77510359879528, -126.0996444202939, -140.34931504838934, -126.19928426976692, -103.16878583327883, -98.5243732837948, -111.28630985642714, -85.33634599694174, -75.7445294857944, -80.65833918667438, -83.36920843009855, -70.56862311006127, -68.64829346671951, -64.66744328437376, -68.77288061437083, -65.68487037920701, -66.84640300449469, -65

Policy log std Max                       0.0469297
Policy log std Min                      -2.07324
Z mean eval                              0.344856
Z variance eval                          0.139713
AverageTrainReturn_all_train_tasks     -63.5813
AverageReturn_all_train_tasks          -67.9411
AverageReturn_all_test_tasks           -69.032
Number of train steps total          64000
Number of env steps total           360000
Number of rollouts total                 0
Train Time (s)                        1475.5
(Previous) Eval Time (s)               114.093
Sample Time (s)                         10.9148
Epoch Time (s)                        1600.51
Total Train Time (s)                 57699.5
Epoch                                   31
----------------------------------  ---------------
2020-12-30 08:36:55.011369 CET | [2020_12_29_16_35_15] Iteration #31 | Epoch Duration: 1589.7790400981903
2020-12-30 08:36:55.012956 CET | [2020_12_29_16_35_15] Iteration #31 | Started Training: True


In [None]:
import sim_policy 

In [None]:
# FIXME: unfinished call -- `variant=` has no value, so this cell is a
# SyntaxError as written. The configuration dict built inside main() is local
# to that function, so one has to be constructed (or returned) and passed here.
sim_policy.sim_policy(variant=)