In [None]:
# Copyright 2022 Maximilien Le Clei.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [1]:
# We report in 'Neuroevolution of Recurrent Architectures on Control Tasks'
# the highest performing baselines that we could find. They are from 4 sources:
# 1. Stable Baselines 3
# (https://github.com/DLR-RM/rl-baselines3-zoo/blob/master/benchmark.md)
# We download and evaluate their pre-trained agents, but report their published
# results whenever those are much better than the ones we measure ourselves.
# For tasks Ant-v3, HalfCheetah-v3, Hopper-v3 and Humanoid-v3, we instead
# report results from other sources with higher baseline performance:
# 2. Controlling Overestimation Bias with Truncated Mixture of Continuous
# Distributional Quantile Critics (https://arxiv.org/pdf/2005.04269.pdf)
# 3. Generative Actor-Critic: An Off-policy Algorithm Using the Push-forward
# Model (https://arxiv.org/pdf/2105.03733.pdf)
# 4. Addressing Function Approximation Error in Actor-Critic Methods
# (https://arxiv.org/pdf/1802.09477.pdf)

# Below is the evaluation of pre-trained Stable Baselines 3 agents

import os
import sys
import time
import warnings

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')
# Make the directory three levels above the current working directory
# importable. Presumably this is the repository root that holds the `utils`
# package imported below -- TODO confirm. Must run before that import.
sys.path.append(os.path.abspath('') + '/../../..')

import gym
import numpy as np
import random
from sb3_contrib import TQC, QRDQN
from stable_baselines3 import A2C, DDPG, DQN, PPO, SAC, TD3
import torch

from utils.functions.control import get_task_name

# Home directory of the current user; `load` looks for the
# `rl-baselines3-zoo` checkout directly beneath it.
homedir = os.path.expanduser('~')

In [2]:
def load(model_name, task):
    """Load a pre-trained agent from the local rl-baselines3-zoo checkout.

    Args:
        model_name: Lower-case algorithm identifier; one of 'a2c', 'ddpg',
            'dqn', 'ppo', 'qrdqn', 'sac', 'td3' or 'tqc'.
        task: Task identifier, translated by `get_task_name` into the name
            used to build the agent's path.

    Returns:
        The object returned by the selected algorithm class's `load`.

    Raises:
        ValueError: If `model_name` is not one of the supported identifiers.
    """
    # Select the algorithm class first, so that an unsupported name fails
    # immediately instead of silently being treated as TQC.
    if model_name == 'a2c':
        algorithm = A2C
    elif model_name == 'ddpg':
        algorithm = DDPG
    elif model_name == 'dqn':
        algorithm = DQN
    elif model_name == 'ppo':
        algorithm = PPO
    elif model_name == 'qrdqn':
        algorithm = QRDQN
    elif model_name == 'sac':
        algorithm = SAC
    elif model_name == 'td3':
        algorithm = TD3
    elif model_name == 'tqc':
        algorithm = TQC
    else:
        raise ValueError('Unsupported model_name: ' + repr(model_name))

    task_name = get_task_name(task)

    # <home>/rl-baselines3-zoo/rl-trained-agents/<model>/<task>_1/<task>.zip
    path = homedir + '/rl-baselines3-zoo/rl-trained-agents/' + model_name + '/'
    path += task_name + '_1/' + task_name + '.zip'

    # Replace these saved objects with inert values while loading.
    # Presumably this avoids having to deserialise the stored schedules,
    # which are not needed for evaluation -- TODO confirm.
    custom_objects = {'learning_rate': 0.0,
                      'lr_schedule': lambda _: 0.0,
                      'clip_range': lambda _: 0.0}

    return algorithm.load(path, custom_objects=custom_objects)

In [3]:
def evaluate(env, model, nb_tests=10, max_nb_states=2**31-1, render=False,
             deterministic=False):
    """Run `model` on `env` for several seeded episodes and summarise scores.

    Episode `i` seeds the environment, NumPy, PyTorch and `random` with
    `2**31-1-i`, then steps until the environment reports `done` or
    `max_nb_states` steps have been taken.

    Args:
        env: Environment exposing `seed`, `reset`, `step` (returning a
            4-tuple) and, when `render` is set, `render`.
        model: Agent exposing `predict(state, deterministic=...)` that
            returns an `(action, _)` pair.
        nb_tests: Number of episodes to run; must be at least 1.
        max_nb_states: Maximum number of steps per episode.
        render: Whether to render each step (with a 10 ms pause).
        deterministic: Forwarded to `model.predict`. The default `False`
            keeps the behaviour of calling `predict` without the argument.

    Returns:
        A string of the form '-> <mean>±<std>', both rounded to 1 decimal.

    Raises:
        ValueError: If `nb_tests` is smaller than 1 (the statistics would
            otherwise be computed on an empty list and come out as nan).
    """
    if nb_tests < 1:
        raise ValueError('nb_tests must be at least 1, got ' + repr(nb_tests))

    scores = []

    for i in range(nb_tests):

        # One distinct seed per episode, shared by every source of randomness.
        seed = 2**31-1-i

        env.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        random.seed(seed)

        state = env.reset()

        score = 0
        done = False
        nb_states = 0

        while not done:

            action, _ = model.predict(state, deterministic=deterministic)
            state, reward, done, _ = env.step(action)
            score += reward

            if render:
                env.render()
                time.sleep(0.01)

            nb_states += 1

            # Cut the episode short once the step budget is used up.
            if nb_states == max_nb_states:
                break

        scores.append(score)

    mean_score = np.round(np.mean(scores), 1)
    std_score = np.round(np.std(scores), 1)

    return '-> ' + str(mean_score) + '±' + str(std_score)

# Acrobot-v1

In [4]:
env = gym.make('Acrobot-v1')

# Evaluate every pre-trained agent available for this task and print one
# '<ALGO> -> mean±std' line per agent.
for model_name in ('a2c', 'dqn', 'ppo', 'qrdqn'):
    model = load(model_name, 'acrobot')
    print(model_name.upper() + ' ' + evaluate(env, model))

A2C -> -81.0±12.8
DQN -> -80.4±8.6
PPO -> -89.0±23.7
QRDQN -> -81.5±16.7


# CartPole-v1

In [5]:
env = gym.make('CartPole-v1')

# Evaluate every pre-trained agent available for this task and print one
# '<ALGO> -> mean±std' line per agent.
for model_name in ('a2c', 'dqn', 'ppo', 'qrdqn'):
    model = load(model_name, 'cart_pole')
    print(model_name.upper() + ' ' + evaluate(env, model))

A2C -> 500.0±0.0
DQN -> 500.0±0.0
PPO -> 500.0±0.0
QRDQN -> 500.0±0.0


# MountainCar-v0

In [6]:
env = gym.make('MountainCar-v0')

# Evaluate every pre-trained agent available for this task and print one
# '<ALGO> -> mean±std' line per agent.
# The trailing numbers are annotations kept from the original cell;
# presumably the mean and std listed for that agent in the Stable Baselines 3
# zoo benchmark -- TODO confirm.
for model_name in (
    'a2c',  # -111.3, 24.1
    'dqn',
    'ppo',  # -110.4, 19.473
    'qrdqn',
):
    model = load(model_name, 'mountain_car')
    print(model_name.upper() + ' ' + evaluate(env, model))

A2C -> -200.0±0.0
DQN -> -119.9±23.5
PPO -> -200.0±0.0
QRDQN -> -128.7±31.7


# MountainCarContinuous-v0

In [7]:
env = gym.make('MountainCarContinuous-v0')

# Evaluate every pre-trained agent available for this task and print one
# '<ALGO> -> mean±std' line per agent.
# The trailing numbers are annotations kept from the original cell;
# presumably the mean and std listed for that agent in the Stable Baselines 3
# zoo benchmark -- TODO confirm.
for model_name in (
    'a2c',  # 91.2, 0.3
    'ddpg',
    'ppo',  # 88.3, 2.6
    'sac',
    'td3',
    'tqc',
):
    model = load(model_name, 'mountain_car_continuous')
    print(model_name.upper() + ' ' + evaluate(env, model))

A2C -> -99.9±0.0
DDPG -> 93.5±0.1
PPO -> -18.7±0.7
SAC -> 94.6±1.0
TD3 -> 93.4±0.1
TQC -> 83.9±30.9


# Pendulum-v1

In [8]:
env = gym.make('Pendulum-v1')

# Evaluate every pre-trained agent available for this task and print one
# '<ALGO> -> mean±std' line per agent.
# The trailing numbers are annotations kept from the original cell;
# presumably the mean and std listed for that agent in the Stable Baselines 3
# zoo benchmark -- TODO confirm.
for model_name in (
    'a2c',  # -163.0, 103.2
    'ddpg',
    'ppo',
    'sac',
    'td3',
    'tqc',
):
    model = load(model_name, 'pendulum')
    print(model_name.upper() + ' ' + evaluate(env, model))

A2C -> -1593.8±21.3
DDPG -> -149.5±60.6
PPO -> -206.9±76.8
SAC -> -176.7±64.5
TD3 -> -154.1±64.4
TQC -> -150.6±61.2


# BipedalWalker-v3

In [9]:
env = gym.make('BipedalWalker-v3')

# Evaluate every pre-trained agent available for this task and print one
# '<ALGO> -> mean±std' line per agent.
# The trailing numbers are annotations kept from the original cell;
# presumably the mean and std listed for that agent in the Stable Baselines 3
# zoo benchmark -- TODO confirm.
for model_name in (
    'a2c',  # 299.8, 23.5
    'ddpg',
    'ppo',  # 213.3, 129.5
    'sac',
    'td3',
    'tqc',
):
    model = load(model_name, 'bipedal_walker')
    print(model_name.upper() + ' ' + evaluate(env, model))

A2C -> -114.3±0.7
DDPG -> 213.9±145.3
PPO -> -117.1±4.2
SAC -> 295.4±1.1
TD3 -> 318.2±0.5
TQC -> 335.0±0.3


# BipedalWalkerHardcore-v3

In [10]:
env = gym.make('BipedalWalkerHardcore-v3')

# Evaluate every pre-trained agent available for this task and print one
# '<ALGO> -> mean±std' line per agent.
# The trailing numbers are annotations kept from the original cell;
# presumably the mean and std listed for that agent in the Stable Baselines 3
# zoo benchmark -- TODO confirm.
for model_name in (
    'a2c',  # 96.2, 122.9
    'ppo',  # 122.4, 117.6
    'sac',
    'td3',
    'tqc',
):
    model = load(model_name, 'bipedal_walker_hardcore')
    print(model_name.upper() + ' ' + evaluate(env, model))

A2C -> -119.1±0.2
PPO -> -116.7±4.1
SAC -> 16.9±124.2
TD3 -> -87.3±13.0
TQC -> 266.5±84.1


# LunarLander-v2

In [11]:
env = gym.make('LunarLander-v2')

# Evaluate every pre-trained agent available for this task and print one
# '<ALGO> -> mean±std' line per agent.
# The trailing numbers are annotations kept from the original cell;
# presumably the mean and std listed for that agent in the Stable Baselines 3
# zoo benchmark -- TODO confirm.
for model_name in (
    'a2c',
    'dqn',
    'ppo',  # 242.1, 31.8
    'qrdqn',
):
    model = load(model_name, 'lunar_lander')
    print(model_name.upper() + ' ' + evaluate(env, model))

A2C -> 150.8±132.3
DQN -> 115.0±103.1
PPO -> 142.7±21.0
QRDQN -> 156.4±133.1


# LunarLanderContinuous-v2

In [12]:
env = gym.make('LunarLanderContinuous-v2')

# Evaluate every pre-trained agent available for this task and print one
# '<ALGO> -> mean±std' line per agent.
# The trailing numbers are annotations kept from the original cell;
# presumably the mean and std listed for that agent in the Stable Baselines 3
# zoo benchmark -- TODO confirm.
for model_name in (
    'a2c',  # 84.2, 145.9
    'ddpg',
    'ppo',
    'sac',
    'td3',
    'tqc',
):
    model = load(model_name, 'lunar_lander_continuous')
    print(model_name.upper() + ' ' + evaluate(env, model))

A2C -> -102.5±17.5
DDPG -> 194.4±147.7
PPO -> 128.7±41.4
SAC -> 269.7±20.4
TD3 -> 228.8±50.8
TQC -> 239.1±75.2


# Swimmer-v3

In [13]:
env = gym.make('Swimmer-v3')

# The PPO agent is intentionally left out of the loop below. The original
# cell kept it commented out:
#     model = load('ppo', 'swimmer')
#     print('PPO ' + evaluate(env, model) ) # 281.6, 9.7
# together with the error it was annotated with:
#     ValueError: Error: Unexpected observation shape (8,) for Box
#     environment, please use (9,) or (n_env, 9) for the observation shape.
# The trailing numbers are presumably the mean and std listed for that agent
# in the Stable Baselines 3 zoo benchmark -- TODO confirm.

# Evaluate the remaining pre-trained agents and print one
# '<ALGO> -> mean±std' line per agent.
for model_name in ('a2c', 'sac', 'td3', 'tqc'):
    model = load(model_name, 'swimmer')
    print(model_name.upper() + ' ' + evaluate(env, model))

A2C -> 122.9±5.7
SAC -> 334.6±2.8
TD3 -> 358.3±1.6
TQC -> 328.7±1.7
