In [1]:
# Run only once.

# Download `d3rlpy` package from source.
# We will implement SAM optimizer into this package.

!pip3 install gym Cython numpy
!git clone https://github.com/takuseno/d3rlpy d3rlpy_GitHub
!pip install -e d3rlpy_GitHub/
!mv d3rlpy_GitHub/d3rlpy .
!rm -rf d3rlpy_GitHub/
!mv -f changes/* d3rlpy/

In [None]:
# %%bash
# pip3 install gym

### Get Dataset

In [2]:
import d3rlpy
from sklearn.model_selection import train_test_split

In [3]:
# Label used further down when building the TensorBoard output directory.
task_name = 'Cartpole'
# CartPole dataset together with the environment used for online evaluation.
dataset, env = d3rlpy.datasets.get_cartpole()

In [11]:
# Hold out 20% of the episodes for evaluation. A fixed `random_state` makes the
# split identical on every run, so the cells that inspect `train_episodes`
# show the same episode after a kernel restart.
train_episodes, test_episodes = train_test_split(
    dataset, test_size=0.2, random_state=0
)
len(train_episodes), len(test_episodes)

(1265, 317)

In [12]:
# Look at the first training episode: shape of its observation array, shape of
# its action array, and how many transitions it contains.
epi0 = train_episodes[0]
print(
    epi0.observations.shape,
    epi0.actions.shape,
    len(epi0.transitions),
)

(199, 4) (199,) 198


### Algorithms

In [17]:
from d3rlpy.algos import NFQ, DoubleDQN, DiscreteSAC, DiscreteBCQ, DiscreteCQL
from d3rlpy.models.optimizers import OptimizerFactory
import torch
from torch.optim import SGD, Adam
from sam import SAM

In [18]:
# Single learning rate shared by every optimizer of every algorithm.
lr = 1e-4
# The same value is assigned to each learning-rate keyword so that this one
# dict can be unpacked into whichever algorithm constructor is used.
kwargs_lr = dict.fromkeys(
    (
        'learning_rate',
        'temp_learning_rate',
        'actor_learning_rate',
        'critic_learning_rate',
        'alpha_learning_rate',
        'imitator_learning_rate',
    ),
    lr,
)

In [19]:
# Benchmark grid: 3 train/test splits x 4 algorithms x 2 base optimizers x
# {plain, SAM-wrapped}. Every combination is trained for 16 epochs and logged
# to its own TensorBoard directory.
for split_seed in range(3):  # three different experiments
    print("\n\n############ Train/Test split #############\n")
    # New split for each experiment. Seeding it with the loop index keeps the
    # three splits different from one another while making them reproducible
    # across kernel restarts.
    train_episodes, test_episodes = train_test_split(
        dataset, test_size=0.2, random_state=split_seed
    )
    for algorithm in [DoubleDQN, DiscreteSAC, DiscreteBCQ, DiscreteCQL]:
        for opt in [Adam, SGD]:
            for use_SAM in [False, True]:
                # Choose optimizer factory & optimizer name
                if use_SAM:
                    # SAM receives the base optimizer by class name.
                    opt_factory = OptimizerFactory(optim_cls=SAM, base_optimizer=opt.__name__, rho=0.05)
                    opt_name = 'SAM_'+opt.__name__
                else:
                    opt_factory = OptimizerFactory(optim_cls=opt)
                    opt_name = opt.__name__

                # algorithm
                model = algorithm(use_gpu=False, optim_factory=opt_factory, **kwargs_lr)
                # train & evaluate
                model.fit(
                    train_episodes,
                    eval_episodes=test_episodes,
                    experiment_name=model.__class__.__name__+'_'+opt_name,
                    n_epochs=16,
                    scorers={
                        'environment': d3rlpy.metrics.evaluate_on_environment(env),
                        'td_error': d3rlpy.metrics.td_error_scorer, # smaller is better
                        'advantage': d3rlpy.metrics.discounted_sum_of_advantage_scorer, # smaller is better
                        'value_scale': d3rlpy.metrics.average_value_estimation_scorer # smaller is better
                    },
                    tensorboard_dir=f"tensorboard/{task_name}/{model.__class__.__name__}/{opt_name}/",
                    verbose=False,
                    show_progress=False
                )



############ First train/test split #############

2022-11-25 15:56:00 [debug    ] RoundIterator is selected.
2022-11-25 15:56:00 [info     ] Directory is created at d3rlpy_logs/DoubleDQN_Adam_20221125155600
2022-11-25 15:56:00 [debug    ] Building models...
2022-11-25 15:56:00 [debug    ] Models have been built.
2022-11-25 15:56:05 [info     ] Model parameters are saved to d3rlpy_logs/DoubleDQN_Adam_20221125155600/model_2488.pt
2022-11-25 15:56:10 [info     ] Model parameters are saved to d3rlpy_logs/DoubleDQN_Adam_20221125155600/model_4976.pt
2022-11-25 15:56:14 [info     ] Model parameters are saved to d3rlpy_logs/DoubleDQN_Adam_20221125155600/model_7464.pt
2022-11-25 15:56:17 [info     ] Model parameters are saved to d3rlpy_logs/DoubleDQN_Adam_20221125155600/model_9952.pt
2022-11-25 15:56:21 [info     ] Model parameters are saved to d3rlpy_logs/DoubleDQN_Adam_20221125155600/model_12440.pt
2022-11-25 15:56:26 [info     ] Model parameters are saved to d3rlpy_logs/DoubleDQN_Adam_202

### Single run

In [19]:
# optimizer
# Optimizer factory handed to the algorithm of the single run.
# Disabled alternative: SAM with "SGD" as its base optimizer.
#opt_factory = OptimizerFactory(optim_cls=SAM, base_optimizer="SGD", rho=0.05)
opt_factory = OptimizerFactory(optim_cls=Adam)

In [20]:
# Derive a readable optimizer label from the factory: the optimizer class name,
# extended with the base optimizer's name when the class is SAM.
opt_name = opt_factory._optim_cls.__name__
opt_name = (
    opt_name + '_' + opt_factory._optim_kwargs['base_optimizer']
    if opt_name == 'SAM'
    else opt_name
)
opt_name

'Adam'

In [21]:
# Build the algorithm for the single run, sharing the optimizer factory and
# the common learning-rate keywords.
# NOTE(review): the recorded output of this run (actor/temp/alpha losses and
# strongly negative environment returns) does not look like a CartPole run,
# while `dataset` is loaded with `get_cartpole()`. Presumably this cell was
# last executed against a different dataset; confirm whether `DiscreteCQL`
# (or another dataset) is intended here.
model = d3rlpy.algos.CQL(use_gpu=False, optim_factory=opt_factory, **kwargs_lr)



In [22]:
# Show the algorithm's class name (it is reused for the experiment name).
type(model).__name__

'CQL'

In [23]:
# Train for 6 epochs on the training episodes; the scorers below are computed
# on the held-out episodes and logged with every epoch.
model.fit(
    train_episodes,
    eval_episodes=test_episodes,
    experiment_name=f"{model.__class__.__name__}_{opt_name}",
    n_epochs=6,
    scorers={
        # return obtained by running the policy in the environment
        'environment': d3rlpy.metrics.evaluate_on_environment(env),
        # smaller is better
        'td_error': d3rlpy.metrics.td_error_scorer,
        # smaller is better
        'advantage': d3rlpy.metrics.discounted_sum_of_advantage_scorer,
        # smaller is better
        'value_scale': d3rlpy.metrics.average_value_estimation_scorer,
    },
    tensorboard_dir='./tensorboard',
)

# Final score: one more environment evaluation of the trained model (no video).
print("EVALUATION SCORE:")
d3rlpy.metrics.evaluate_on_environment(env)(model)

2022-11-25 14:51:06 [debug    ] RoundIterator is selected.
2022-11-25 14:51:06 [info     ] Directory is created at d3rlpy_logs/CQL_Adam_20221125145106
2022-11-25 14:51:06 [debug    ] Building models...
2022-11-25 14:51:06 [debug    ] Models have been built.
2022-11-25 14:51:06 [info     ] Parameters are saved to d3rlpy_logs/CQL_Adam_20221125145106/params.json params={'action_scaler': None, 'actor_encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'actor_learning_rate': 0.0001, 'actor_optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'alpha_learning_rate': 0.0001, 'alpha_optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'alpha_threshold': 10.0, 'batch_size': 256, 'conservative_weight': 5.0, 'critic_encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'd

Epoch 1/6:   0%|          | 0/309 [00:00<?, ?it/s]

2022-11-25 14:51:26 [info     ] CQL_Adam_20221125145106: epoch=1 step=309 epoch=1 metrics={'time_sample_batch': 0.00026612914495869363, 'time_algorithm_update': 0.059728625522848085, 'temp_loss': 1.6445452341369826, 'temp': 0.9847319801262667, 'alpha_loss': -10.050573753307553, 'alpha': 1.0154497554001298, 'critic_loss': 14.053355828072261, 'actor_loss': 0.77819493277342, 'time_step': 0.06005730829578387, 'environment': -1268.4343286285768, 'td_error': 8.688805318336676, 'advantage': 35.94140934700047, 'value_scale': -2.5621141567133896} step=309
2022-11-25 14:51:26 [info     ] Model parameters are saved to d3rlpy_logs/CQL_Adam_20221125145106/model_309.pt


Epoch 2/6:   0%|          | 0/309 [00:00<?, ?it/s]

2022-11-25 14:51:45 [info     ] CQL_Adam_20221125145106: epoch=2 step=618 epoch=2 metrics={'time_sample_batch': 0.0002641384655603699, 'time_algorithm_update': 0.05978964370431252, 'temp_loss': 1.57066376579618, 'temp': 0.9553442635968279, 'alpha_loss': -8.244375772074974, 'alpha': 1.044601938099537, 'critic_loss': 9.888421311733406, 'actor_loss': 2.9583288319288332, 'time_step': 0.06011512518700658, 'environment': -1204.3984287364406, 'td_error': 10.311675805622901, 'advantage': 44.4011565061486, 'value_scale': -4.8468259799119195} step=618
2022-11-25 14:51:45 [info     ] Model parameters are saved to d3rlpy_logs/CQL_Adam_20221125145106/model_618.pt


Epoch 3/6:   0%|          | 0/309 [00:00<?, ?it/s]

2022-11-25 14:52:04 [info     ] CQL_Adam_20221125145106: epoch=3 step=927 epoch=3 metrics={'time_sample_batch': 0.0002692301296493382, 'time_algorithm_update': 0.057864148254147624, 'temp_loss': 1.5126188345325804, 'temp': 0.9273077320898235, 'alpha_loss': -7.154325974411949, 'alpha': 1.0723976213183604, 'critic_loss': 9.1261273942719, 'actor_loss': 5.460112259997519, 'time_step': 0.05819496213425324, 'environment': -1316.3768126348464, 'td_error': 11.040522317692178, 'advantage': 42.342920708579946, 'value_scale': -7.050200566354424} step=927
2022-11-25 14:52:04 [info     ] Model parameters are saved to d3rlpy_logs/CQL_Adam_20221125145106/model_927.pt


Epoch 4/6:   0%|          | 0/309 [00:00<?, ?it/s]

2022-11-25 14:52:23 [info     ] CQL_Adam_20221125145106: epoch=4 step=1236 epoch=4 metrics={'time_sample_batch': 0.00026680119215091844, 'time_algorithm_update': 0.05787296434050625, 'temp_loss': 1.449540714615757, 'temp': 0.9004979336146012, 'alpha_loss': -6.48975617214314, 'alpha': 1.1000042841272446, 'critic_loss': 8.850449268871913, 'actor_loss': 7.97049947078174, 'time_step': 0.05820102290428186, 'environment': -1148.0472052270502, 'td_error': 11.16160213083222, 'advantage': 42.728771720533864, 'value_scale': -9.482634384439448} step=1236
2022-11-25 14:52:23 [info     ] Model parameters are saved to d3rlpy_logs/CQL_Adam_20221125145106/model_1236.pt


Epoch 5/6:   0%|          | 0/309 [00:00<?, ?it/s]

2022-11-25 14:52:42 [info     ] CQL_Adam_20221125145106: epoch=5 step=1545 epoch=5 metrics={'time_sample_batch': 0.0002675642859202758, 'time_algorithm_update': 0.057939102734562646, 'temp_loss': 1.3713045610193295, 'temp': 0.874974835651978, 'alpha_loss': -5.934909218723334, 'alpha': 1.1280435640063486, 'critic_loss': 8.694654540336634, 'actor_loss': 10.555530024963675, 'time_step': 0.05826703321586535, 'environment': -1161.131573910322, 'td_error': 11.447497932231988, 'advantage': 42.67554864529683, 'value_scale': -11.789008042186198} step=1545
2022-11-25 14:52:42 [info     ] Model parameters are saved to d3rlpy_logs/CQL_Adam_20221125145106/model_1545.pt


Epoch 6/6:   0%|          | 0/309 [00:00<?, ?it/s]

2022-11-25 14:53:01 [info     ] CQL_Adam_20221125145106: epoch=6 step=1854 epoch=6 metrics={'time_sample_batch': 0.0002664208026379829, 'time_algorithm_update': 0.05811441754831851, 'temp_loss': 1.2861715586054288, 'temp': 0.8508392597479342, 'alpha_loss': -5.314888575316247, 'alpha': 1.156222185270686, 'critic_loss': 8.567703254786124, 'actor_loss': 13.17135453147024, 'time_step': 0.05844202705185776, 'environment': -1261.3131667855132, 'td_error': 12.338109225578638, 'advantage': 46.368703182659885, 'value_scale': -14.406772040574237} step=1854
2022-11-25 14:53:01 [info     ] Model parameters are saved to d3rlpy_logs/CQL_Adam_20221125145106/model_1854.pt
EVALUATION SCORE:


-1235.3251330857627

In [None]:
# tensorboard --logdir=tensorboard/runs