In [1]:
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [None]:
#conda create -n <ENVNAME> python=3.10
#conda activate <ENVNAME>
#conda install jupyter notebook; python3 -m ipykernel install --user --name RL
#conda install pytorch torchvision -c pytorch
!pip install -q tqdm d4rl stable-baselines3 seaborn scikit-learn colorama tensorboard tensorboardX opencv-python structlog "gym[classic_control]"

In [None]:
# Run only once.
# Downloading `d3rlpy` package from source.

#!git clone https://github.com/takuseno/d3rlpy d3rlpy_GitHub
#!pip3 install -e d3rlpy_GitHub/
#!mv d3rlpy_GitHub/d3rlpy .
#!rm -rf d3rlpy_GitHub/

In [None]:
# Reflection of modifications:
# We will implement SAM optimizer into this package.

#!cp -rf changes/algos/* d3rlpy/algos/
#!rm -rf changes/algos/
#!mv -f changes/* d3rlpy/

In [2]:
# Ignore This
#import os
#os.environ['CC']="/opt/homebrew/bin/gcc-12"
#os.environ['CXX']="/opt/homebrew/bin/g++-12"
#import d4rl  # solely running this line, we meet GCC error

## Get Dataset

In [3]:
import d3rlpy
from sklearn.model_selection import train_test_split

In [4]:
dataset, env = d3rlpy.datasets.get_pendulum()
task_name='Pendulum'

In [5]:
env.reset()

array([-0.909701 ,  0.4152639,  0.7662913], dtype=float32)

In [6]:
train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)
len(train_episodes), len(test_episodes)

(400, 100)

In [7]:
epi0 = train_episodes[0]
print(epi0.observations.shape, epi0.actions.shape, len(epi0.transitions))

(199, 3) (199, 1) 198


## Algorithms

In [8]:
#!pip3 install tensorboard --upgrade
%load_ext tensorboard

In [23]:
from d3rlpy.models.optimizers import OptimizerFactory
import torch
from torch.optim import SGD, Adam
from sam import SAM
# optimizer
# Learning rates: Use default values.
# Notice: `base_optimizer` must be a `str`
rho = 0.05
opt_factories = {
    #'SGD': OptimizerFactory(optim_cls=SGD),
    'Adam': OptimizerFactory(optim_cls=Adam),
    #'SamSGD': OptimizerFactory(optim_cls=SAM, base_optimizer="SGD", rho=rho),
    'SamAdam': OptimizerFactory(optim_cls=SAM, base_optimizer="Adam", rho=rho)
}

In [101]:
# optimizers for temperature and alpha: we fix them by Adam
# otherwise we may or may not apply SAM.
algo_opt_lists = {
    'DDPG': ['actor_optim_factory', 'critic_optim_factory'],
    'SAC': ['actor_optim_factory', 'critic_optim_factory'],
    'TD3': ['actor_optim_factory', 'critic_optim_factory'],
    'BCQ': ['actor_optim_factory', 'critic_optim_factory', 'imitator_optim_factory'],
    'BEAR': ['actor_optim_factory', 'critic_optim_factory', 'imitator_optim_factory'],
    'CQL': ['actor_optim_factory', 'critic_optim_factory'],
    'AWAC': ['actor_optim_factory', 'critic_optim_factory'],
    'IQL': ['actor_optim_factory', 'critic_optim_factory'],
    'BC': ['optim_factory'],
}
misc_opt_kwargs={
    'temp_optim_factory': opt_factories['Adam'],
    'alpha_optim_factory': opt_factories['Adam']
}

In [99]:
import os

def run(algo, opt_dict, train_episodes, test_episodes, use_gpu=False, n_epochs=20, pretrained_path=None):
    """
    - algo (d3rlpy.algos.AlgoBase): RL algorithms (e.g. DQN, DDPG, CQL, ...)
    - opt_dict (dict[str->str]): optimizer info in strings (e.g. {'actor_optim_factory': 'SGD', 'critic_optim_factor': 'SamAdam'})
    - train_episodes: training data
    - test_episodes: evaluation data
    - use_gpu (bool): use CUDA if True
    - n_epochs (int): number of epochs
    """

    opt_kwargs = {opt_type: opt_factories[opt_name] for opt_type, opt_name in opt_dict.items()}
    opt_string = '_'.join(sum([[opt_type.split('_')[0], opt_name] for opt_type, opt_name in opt_dict.items()], start=[]))
    experiment_name = f"{algo.__name__}_{opt_string}"
    opt_kwargs.update(misc_opt_kwargs)
    model = algo(use_gpu=use_gpu, **opt_kwargs)
    
    if pretrained_path:
        model.build_with_env(env)
        model.load_model(pretrained_path)

    # Train
    model.fit(
        train_episodes,
        eval_episodes=None if algo.__name__=='BC' else test_episodes,
        experiment_name=os.path.join(task_name, experiment_name),
        n_epochs=n_epochs,
        scorers={
            'environment': d3rlpy.metrics.evaluate_on_environment(env),
            'td_error': d3rlpy.metrics.td_error_scorer, # smaller is better
            'advantage': d3rlpy.metrics.discounted_sum_of_advantage_scorer, # smaller is better
            'value_scale': d3rlpy.metrics.average_value_estimation_scorer # smaller is better
        },
        tensorboard_dir="tensorboard",
        verbose=False,
        show_progress=False,
        save_interval=n_epochs-1,
    )

    # Deployment
    writer = 
    print("Deployment:", d3rlpy.metrics.evaluate_on_environment(env)(model))
    print()
    return model

In [100]:
from itertools import product
from d3rlpy.algos import DDPG, SAC, TD3, BCQ, BEAR, CQL, AWAC, IQL, BC

In [98]:
train_episodes, test_episodes = train_test_split(dataset, test_size=0.2) # New split.
for algorithm in [DDPG, SAC, TD3, BCQ, BEAR, CQL, AWAC, IQL]:
    opt_arguments = algo_opt_lists[algorithm.__name__]
    for opts in product(opt_factories.keys(), repeat=len(opt_arguments)):
        opt_dict = {opt_type: opt_name for opt_type, opt_name in zip(opt_arguments, opts)}
        run(algorithm, opt_dict, train_episodes, test_episodes, use_gpu=False, n_epochs=50)

2022-12-01 03:33:49 [debug    ] RoundIterator is selected.
2022-12-01 03:33:49 [info     ] Directory is created at d3rlpy_logs/Pendulum/DDPG_actor_Adam_critic_Adam_20221201033349
2022-12-01 03:33:49 [debug    ] Building models...
2022-12-01 03:33:49 [debug    ] Models have been built.
2022-12-01 03:35:35 [info     ] Model parameters are saved to d3rlpy_logs/Pendulum/DDPG_actor_Adam_critic_Adam_20221201033349/model_38808.pt
Deployment: -1338.9880413035285

2022-12-01 03:35:38 [debug    ] RoundIterator is selected.
2022-12-01 03:35:38 [info     ] Directory is created at d3rlpy_logs/Pendulum/DDPG_actor_Adam_critic_SamAdam_20221201033538
2022-12-01 03:35:38 [debug    ] Building models...
2022-12-01 03:35:38 [debug    ] Models have been built.
2022-12-01 03:37:44 [info     ] Model parameters are saved to d3rlpy_logs/Pendulum/DDPG_actor_Adam_critic_SamAdam_20221201033538/model_38808.pt
Deployment: -888.3039452453089

2022-12-01 03:37:46 [debug    ] RoundIterator is selected.
2022-12-01 03:37

In [102]:
bc = run(
    BC, 
    {opt_type: opt_name for opt_type, opt_name in zip(algo_opt_lists['BC'], ['Adam'])},
    train_episodes, None,
    n_epochs=50
)

2022-12-01 12:41:38 [debug    ] RoundIterator is selected.
2022-12-01 12:41:38 [info     ] Directory is created at d3rlpy_logs/Pendulum/BC_optim_Adam_20221201124138
2022-12-01 12:41:38 [debug    ] Building models...
2022-12-01 12:41:38 [debug    ] Models have been built.
2022-12-01 12:42:05 [info     ] Model parameters are saved to d3rlpy_logs/Pendulum/BC_optim_Adam_20221201124138/model_38808.pt
Deployment: -653.3113198285071



In [112]:
d3rlpy.metrics.evaluate_on_environment(env)(bc)

-758.8095278283542