In [1]:
# Run only once.
# Installs the build/runtime dependencies and fetches the `d3rlpy` package from source.

# Versions are not pinned, so the resolved packages depend on when this cell is run.
!pip3 install gym Cython numpy opencv-python seaborn scikit-learn PyOpenGL
# Clone the upstream repository into a temporary directory.
!git clone https://github.com/takuseno/d3rlpy d3rlpy_GitHub
# NOTE(review): this editable install registers `d3rlpy_GitHub/`, which is deleted at the
# end of this cell; later imports presumably resolve to `./d3rlpy` through the working
# directory instead - confirm.
!pip3 install -e d3rlpy_GitHub/
# Keep only the package directory (moved next to this notebook) and discard the rest of the clone.
!mv d3rlpy_GitHub/d3rlpy .
!rm -rf d3rlpy_GitHub/

In [None]:
# Apply the local modifications:
# the SAM optimizer is added to the package by overlaying the files under `changes/`.

# Overwrite the algorithm modules in `d3rlpy/algos/` with the versions from `changes/algos/`.
!cp -rf changes/algos/* d3rlpy/algos/
# NOTE: the two commands below delete/move the contents of `changes/`, so this cell
# cannot simply be re-run; restore `changes/` first if the overlay has to be repeated.
!rm -rf changes/algos/
!mv -f changes/* d3rlpy/

In [2]:
import torch
# Select the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cpu'

## Get Dataset

In [9]:
import d3rlpy
from sklearn.model_selection import train_test_split

In [10]:
# Task label; it is used further down as a path component of the TensorBoard log directory.
task_name = 'Cartpole'

# Load the CartPole dataset together with its environment.
dataset, env = d3rlpy.datasets.get_cartpole()

Downloading cartpole.pkl into d3rlpy_data/cartpole_replay_v1.1.0.h5...


In [11]:
# Hold out 20% of the episodes for evaluation.
# A fixed `random_state` makes the shuffle (and therefore every downstream metric)
# reproducible after a kernel restart; change the seed to draw a different split.
SPLIT_SEED = 0
train_episodes, test_episodes = train_test_split(
    dataset,
    test_size=0.2,
    random_state=SPLIT_SEED,
)
len(train_episodes), len(test_episodes)

(1265, 317)

In [12]:
# Inspect the first training episode: shape of its observations, shape of its
# actions, and the number of transitions it contains.
epi0 = train_episodes[0]
obs_shape = epi0.observations.shape
act_shape = epi0.actions.shape
n_transitions = len(epi0.transitions)
print(obs_shape, act_shape, n_transitions)

(41, 4) (41,) 41


## Algorithms

In [34]:
# The extension below requires the `tensorboard` package; uncomment to install it.
#!pip3 install tensorboard
# Load the TensorBoard notebook extension so the `%tensorboard` magic can be used later.
%load_ext tensorboard

In [23]:
from d3rlpy.algos import DoubleDQN, DiscreteSAC, DiscreteBCQ, DiscreteCQL
from d3rlpy.models.optimizers import OptimizerFactory
import torch
from torch.optim import SGD, Adam
from sam import SAM

In [24]:
# One learning rate shared by every learning-rate keyword used by the algorithms below.
lr = 1e-4

# Map each learning-rate keyword to the same value (key order is preserved).
kwargs_lr = dict.fromkeys(
    (
        'learning_rate',
        'temp_learning_rate',
        'actor_learning_rate',
        'critic_learning_rate',
        'alpha_learning_rate',
        'imitator_learning_rate',
    ),
    lr,
)

In [25]:
# Optimizer selection: exactly one `opt_factory` assignment should be active.

# To train with the SAM optimizer, uncomment the line below (and comment out the Adam line).
# Note: `base_optimizer` must be given as a `str` (the optimizer's name), not as a class.
#opt_factory = OptimizerFactory(optim_cls=SAM, base_optimizer="SGD", rho=0.05)

# To train with any other optimizer, pass its class as `optim_cls`.
opt_factory = OptimizerFactory(optim_cls=Adam)

In [26]:
# Build a short optimizer label for experiment names: the optimizer class name,
# extended with the base optimizer's name when the class is SAM (e.g. 'SAM_SGD').
# Note: this reads the factory's private `_optim_cls` / `_optim_kwargs` attributes.
optim_cls_name = opt_factory._optim_cls.__name__
if optim_cls_name != 'SAM':
    opt_name = optim_cls_name
else:
    opt_name = optim_cls_name + '_' + opt_factory._optim_kwargs['base_optimizer']
opt_name

'Adam'

In [27]:
# Assemble the constructor arguments, then build the algorithm.
algo_kwargs = {
    'use_gpu': False,              # run on CPU; set to True to use the GPU
    'optim_factory': opt_factory,  # optimizer chosen above
    **kwargs_lr,                   # learning rates (may trigger a warning about unused arguments; harmless)
}
model = d3rlpy.algos.DiscreteCQL(**algo_kwargs)

# Show which algorithm class was instantiated.
type(model).__name__



'DiscreteCQL'

In [28]:
# Optionally start from a pretrained checkpoint instead of training from scratch.
# Set the path to a saved model, e.g.
# 'd3rlpy_logs/DiscreteCQL_Adam_20221129131616/model_5020.pt'; leave it as None to skip.

model_checkpoint_path = None
if model_checkpoint_path:
    # The networks must be built for this environment before the weights can be loaded.
    model.build_with_env(env)
    model.load_model(model_checkpoint_path)
    # Deploy on the environment (the score may differ from run to run).
    pretrained_score = d3rlpy.metrics.evaluate_on_environment(env)(model)
    # A notebook only echoes the last top-level expression of a cell, so a value
    # computed inside this `if` body has to be printed explicitly to be visible.
    print("PRETRAINED EVALUATION SCORE:", pretrained_score)

In [29]:
# Metrics computed during training.
eval_scorers = {
    'environment': d3rlpy.metrics.evaluate_on_environment(env),
    'td_error': d3rlpy.metrics.td_error_scorer,                      # smaller is better
    'advantage': d3rlpy.metrics.discounted_sum_of_advantage_scorer,  # smaller is better
    'value_scale': d3rlpy.metrics.average_value_estimation_scorer,   # smaller is better
}

# Experiment name: "<algorithm class>_<optimizer label>".
run_name = f"{model.__class__.__name__}_{opt_name}"

# Train on the training episodes and evaluate on the held-out ones.
model.fit(
    train_episodes,
    eval_episodes=test_episodes,
    experiment_name=run_name,
    n_epochs=2,
    scorers=eval_scorers,
    tensorboard_dir='./tensorboard',
)

# Simple deployment: roll out the trained model in the environment and show the score.
print("\nEVALUATION SCORE:")
d3rlpy.metrics.evaluate_on_environment(env)(model)

2022-11-29 16:04:24 [debug    ] RoundIterator is selected.
2022-11-29 16:04:24 [info     ] Directory is created at d3rlpy_logs/DiscreteCQL_Adam_20221129160424
2022-11-29 16:04:24 [debug    ] Building models...
2022-11-29 16:04:24 [debug    ] Models have been built.
2022-11-29 16:04:24 [info     ] Parameters are saved to d3rlpy_logs/DiscreteCQL_Adam_20221129160424/params.json params={'action_scaler': None, 'alpha': 1.0, 'batch_size': 32, 'encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'gamma': 0.99, 'generated_maxlen': 100000, 'learning_rate': 0.0001, 'n_critics': 1, 'n_frames': 1, 'n_steps': 1, 'optim_factory': {'optim_cls': 'Adam'}, 'q_func_factory': {'type': 'mean', 'params': {'share_encoder': False}}, 'real_ratio': 1.0, 'reward_scaler': None, 'scaler': None, 'target_update_interval': 8000, 'use_gpu': None, 'algorithm': 'DiscreteCQL', 'observation_shape': (4,), 'action_size': 2}


Epoch 1/2:   0%|          | 0/2522 [00:00<?, ?it/s]

2022-11-29 16:04:30 [info     ] DiscreteCQL_Adam_20221129160424: epoch=1 step=2522 epoch=1 metrics={'time_sample_batch': 5.0462776474495147e-05, 'time_algorithm_update': 0.0019154307579445896, 'loss': 0.6789304184166804, 'time_step': 0.0020183654742804338, 'environment': 200.0, 'td_error': 1.1924059022242497, 'advantage': -3.3904248963698085, 'value_scale': 1.1119657761956516} step=2522
2022-11-29 16:04:30 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteCQL_Adam_20221129160424/model_2522.pt


Epoch 2/2:   0%|          | 0/2522 [00:00<?, ?it/s]

2022-11-29 16:04:36 [info     ] DiscreteCQL_Adam_20221129160424: epoch=2 step=5044 epoch=2 metrics={'time_sample_batch': 4.409079888243225e-05, 'time_algorithm_update': 0.0019012997776246278, 'loss': 0.6657996094245744, 'time_step': 0.0019914517792135266, 'environment': 200.0, 'td_error': 1.200659126589767, 'advantage': -3.4002273799498743, 'value_scale': 1.109628295481205} step=5044
2022-11-29 16:04:36 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteCQL_Adam_20221129160424/model_5044.pt

EVALUATION SCORE:


200.0

In [None]:
# Open TensorBoard inside the notebook, reading the logs under `tensorboard/`
# (the training cells pass a `tensorboard_dir` inside that directory).
%tensorboard --logdir=tensorboard/

In [32]:
# Terminates the process with PID 74067 (presumably the TensorBoard server started earlier - confirm).
# NOTE(review): this PID was hard-coded from one particular session. In any other session it
# may match no process, or an unrelated one; look up the current PID before running this cell.
!kill 74067

## Monitoring (not completely implemented)

In [15]:
from d3rlpy.envs import Monitor

In [16]:
# Settings for recording evaluation rollouts.
algo = model        # algorithm to roll out
n_episodes = 3      # number of evaluation episodes
epsilon = 0.0       # forwarded as `epsilon` to the evaluation scorer

out = './video'     # output directory handed to Monitor
frame_rate = 60     # converted to float for Monitor's `frame_rate`
record_rate = 1     # converted to int for Monitor's `record_rate`

In [17]:
import os
import shutil

# Start from a clean output location.
# `os.rmdir` only removes *empty* directories, so it raises OSError as soon as a
# previous run has written files into `out`; `shutil.rmtree` removes the whole tree.
if os.path.isdir(out) and not os.path.islink(out):
    shutil.rmtree(out)
elif os.path.lexists(out):
    # `out` is a regular file or a symlink rather than a real directory: remove just that entry.
    os.remove(out)

In [18]:

# Wrap the environment with Monitor, passing `out` as the output location.
wrapped_env = Monitor(
    env,
    out,
    # `ep % 1 == 0` holds for every integer `ep`, so this callable returns True for each one.
    video_callable=lambda ep: ep % 1 == 0,
    frame_rate=float(frame_rate),
    record_rate=int(record_rate),
)

# Run `n_episodes` evaluation episodes of `algo` on the wrapped environment.
# NOTE(review): in the saved session this cell failed with
# "NameError: name 'glPushMatrix' is not defined"; the section header marks
# monitoring as not completely implemented.
d3rlpy.metrics.evaluate_on_environment(wrapped_env, n_episodes, epsilon=epsilon)(algo)

NameError: name 'glPushMatrix' is not defined

: 

In [1]:
from pyglet.gl import glPushMatrix

: 

### Multiple runs

In [None]:
# Benchmark grid: N_RUNS train/test splits x 4 algorithms x 2 base optimizers x (plain | SAM).
N_RUNS = 3       # number of independent train/test splits
N_EPOCHS = 16    # training epochs per configuration
SAM_RHO = 0.05   # `rho` passed to the SAM optimizer

# Metrics computed during training; shared by every configuration.
scorers = {
    'environment': d3rlpy.metrics.evaluate_on_environment(env),
    'td_error': d3rlpy.metrics.td_error_scorer,                      # smaller is better
    'advantage': d3rlpy.metrics.discounted_sum_of_advantage_scorer,  # smaller is better
    'value_scale': d3rlpy.metrics.average_value_estimation_scorer,   # smaller is better
}

for run_idx in range(N_RUNS):  # independent experiments
    print("\n\n############ Train/Test split #############\n")
    # New split for every run. Seeding with the run index keeps the runs distinct
    # from each other while making each split reproducible after a kernel restart.
    # (Only the split is seeded here; model initialisation and batch sampling are not.)
    train_episodes, test_episodes = train_test_split(
        dataset, test_size=0.2, random_state=run_idx
    )
    for algorithm in [DoubleDQN, DiscreteSAC, DiscreteBCQ, DiscreteCQL]:
        for opt in [Adam, SGD]:
            for use_SAM in [False, True]:
                # Choose the optimizer factory and a matching label.
                if use_SAM:
                    # SAM takes the base optimizer by name (a `str`), not as a class.
                    opt_factory = OptimizerFactory(
                        optim_cls=SAM, base_optimizer=opt.__name__, rho=SAM_RHO
                    )
                    opt_name = 'SAM_' + opt.__name__
                else:
                    opt_factory = OptimizerFactory(optim_cls=opt)
                    opt_name = opt.__name__

                # Build the algorithm.
                # NOTE(review): every algorithm receives the same `optim_factory` and the full
                # `kwargs_lr`; confirm that each class (in particular DiscreteSAC) actually
                # consumes `optim_factory` rather than reporting it as an unused argument,
                # otherwise its results do not reflect the optimizer under test.
                model = algorithm(use_gpu=False, optim_factory=opt_factory, **kwargs_lr)
                algo_name = model.__class__.__name__

                # Train and evaluate.
                model.fit(
                    train_episodes,
                    eval_episodes=test_episodes,
                    experiment_name=algo_name + '_' + opt_name,
                    n_epochs=N_EPOCHS,
                    scorers=scorers,
                    tensorboard_dir=f"tensorboard/{task_name}/{algo_name}/{opt_name}/",
                    verbose=False,
                    show_progress=False,
                )