# **Librería de Reinforcement Learning Offline**

In [1]:
import warnings
warnings.filterwarnings("ignore")
import d3rlpy
import numpy as np

# Getting Started

In [2]:
from d3rlpy.datasets import get_cartpole # CartPole-v0 dataset
from d3rlpy.datasets import get_pendulum # Pendulum-v0 dataset
from d3rlpy.datasets import get_pybullet # PyBullet task datasets
from d3rlpy.datasets import get_atari    # Atari 2600 task datasets
from d3rlpy.datasets import get_d4rl     # D4RL datasets


In [3]:
# Prepare the dataset.
# d3rlpy ships with ready-made datasets, so this tutorial starts from the bundled
# CartPole one. To build a dataset of your own, see MDPDataset.
# get_cartpole() returns two values: the dataset and an environment object.
dataset, env = get_cartpole()

In [4]:
# Split the dataset into a training part and a test part, just as in supervised learning.
from sklearn.model_selection import train_test_split

# Hold out 20% of the data for evaluation. No random_state is passed, so the
# split is not fixed across runs.
train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

In [5]:
# Set up the algorithm.
# There are many algorithms available in d3rlpy. Since CartPole is a simple task, start with DQN,
# the Q-learning algorithm proposed as the first deep reinforcement learning algorithm.
from d3rlpy.algos import DQN

# Train on the CPU here; pass use_gpu=True instead to train on a GPU.
dqn = DQN(use_gpu=False)

# Initialize the neural networks with the observation shape and action size taken from the dataset.
# This step is not necessary when you call fit or fit_online directly.
dqn.build_with_dataset(dataset)

In [6]:
# Set up metrics.
# Collecting evaluation metrics is important for training algorithms properly.
# In d3rlpy, metrics are computed through scikit-learn style scorer functions.
from d3rlpy.metrics.scorer import td_error_scorer
from d3rlpy.metrics.scorer import average_value_estimation_scorer

# Compute the TD error on the test episodes. When the notebook is run top to
# bottom, the networks are built but not yet trained at this point, so this
# value is a pre-training baseline.
td_error = td_error_scorer(dqn, test_episodes)

In [7]:
# Evaluating algorithms without access to the environment is still difficult, so
# when an environment is available to interact with, the algorithm can be
# evaluated on it directly with the evaluate_on_environment function.
from d3rlpy.metrics.scorer import evaluate_on_environment

# Bind the environment to a scorer function.
evaluate_scorer = evaluate_on_environment(env)

# Evaluate the algorithm on the environment.
rewards = evaluate_scorer(dqn)

In [8]:
# Display the value returned by evaluate_scorer(dqn).
rewards

9.6

In [11]:
# Train the DQN on the training episodes for 10 epochs. Every scorer listed
# below is computed once per epoch and reported under its dictionary key.
dqn.fit(
    train_episodes,
    eval_episodes=test_episodes,
    n_epochs=10,
    scorers={
        "td_error": td_error_scorer,
        "value_scale": average_value_estimation_scorer,
        "environment": evaluate_scorer,
    },
)

2023-07-21 12:45.02 [debug    ] RoundIterator is selected.
2023-07-21 12:45.02 [info     ] Directory is created at d3rlpy_logs\DQN_20230721124502
2023-07-21 12:45.02 [info     ] Parameters are saved to d3rlpy_logs\DQN_20230721124502\params.json params={'action_scaler': None, 'batch_size': 32, 'encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'gamma': 0.99, 'generated_maxlen': 100000, 'learning_rate': 6.25e-05, 'n_critics': 1, 'n_frames': 1, 'n_steps': 1, 'optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'q_func_factory': {'type': 'mean', 'params': {'bootstrap': False, 'share_encoder': False}}, 'real_ratio': 1.0, 'reward_scaler': None, 'scaler': None, 'target_reduction_type': 'min', 'target_update_interval': 8000, 'use_gpu': None, 'algorithm': 'DQN', 'observation_shape': (4,), 'action_size': 2}


Epoch 1/10: 100%|██████████| 2440/2440 [00:12<00:00, 191.17it/s, loss=0.00633]


2023-07-21 12:45.16 [info     ] DQN_20230721124502: epoch=1 step=2440 epoch=1 metrics={'time_sample_batch': 0.0001715829137895928, 'time_algorithm_update': 0.004842376220421713, 'loss': 0.0063401496063694345, 'time_step': 0.005166433092023506, 'td_error': 0.9887531877239056, 'value_scale': 2.1462713150384523, 'environment': 12.9} step=2440
2023-07-21 12:45.16 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230721124502\model_2440.pt


Epoch 2/10: 100%|██████████| 2440/2440 [00:12<00:00, 200.56it/s, loss=0.0129] 


2023-07-21 12:45.29 [info     ] DQN_20230721124502: epoch=2 step=4880 epoch=2 metrics={'time_sample_batch': 0.00016577742138846975, 'time_algorithm_update': 0.004608243215279501, 'loss': 0.01294155354284396, 'time_step': 0.004920014592467762, 'td_error': 0.9989416994397493, 'value_scale': 3.1374054795578044, 'environment': 12.7} step=4880
2023-07-21 12:45.29 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230721124502\model_4880.pt


Epoch 3/10: 100%|██████████| 2440/2440 [00:12<00:00, 202.72it/s, loss=0.0161]


2023-07-21 12:45.43 [info     ] DQN_20230721124502: epoch=3 step=7320 epoch=3 metrics={'time_sample_batch': 0.00015969618422086122, 'time_algorithm_update': 0.004557164184382704, 'loss': 0.016107622990689453, 'time_step': 0.00486732828812521, 'td_error': 1.002993860932244, 'value_scale': 3.1556021397670673, 'environment': 12.4} step=7320
2023-07-21 12:45.43 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230721124502\model_7320.pt


Epoch 4/10: 100%|██████████| 2440/2440 [00:11<00:00, 215.70it/s, loss=0.0159]


2023-07-21 12:45.55 [info     ] DQN_20230721124502: epoch=4 step=9760 epoch=4 metrics={'time_sample_batch': 0.00015843041607590971, 'time_algorithm_update': 0.004269175549022487, 'loss': 0.01592312576585904, 'time_step': 0.004571344520224899, 'td_error': 1.0039634617008921, 'value_scale': 3.1293724631769315, 'environment': 12.5} step=9760
2023-07-21 12:45.55 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230721124502\model_9760.pt


Epoch 5/10: 100%|██████████| 2440/2440 [00:10<00:00, 224.83it/s, loss=0.0192]


2023-07-21 12:46.07 [info     ] DQN_20230721124502: epoch=5 step=12200 epoch=5 metrics={'time_sample_batch': 0.00014062875606974618, 'time_algorithm_update': 0.004105119920167767, 'loss': 0.019289738690609053, 'time_step': 0.0043922926558822885, 'td_error': 1.0216993619783863, 'value_scale': 4.127768122991788, 'environment': 200.0} step=12200
2023-07-21 12:46.07 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230721124502\model_12200.pt


Epoch 6/10: 100%|██████████| 2440/2440 [00:10<00:00, 229.75it/s, loss=0.025] 


2023-07-21 12:46.20 [info     ] DQN_20230721124502: epoch=6 step=14640 epoch=6 metrics={'time_sample_batch': 0.00014899070145653895, 'time_algorithm_update': 0.004014461939452124, 'loss': 0.025042197123828554, 'time_step': 0.004296955612839245, 'td_error': 1.0206864424018842, 'value_scale': 4.1172733920874345, 'environment': 200.0} step=14640
2023-07-21 12:46.20 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230721124502\model_14640.pt


Epoch 7/10: 100%|██████████| 2440/2440 [00:10<00:00, 227.51it/s, loss=0.0247]


2023-07-21 12:46.32 [info     ] DQN_20230721124502: epoch=7 step=17080 epoch=7 metrics={'time_sample_batch': 0.00016906867261792794, 'time_algorithm_update': 0.004027238243915996, 'loss': 0.024717364357668692, 'time_step': 0.0043375322076140855, 'td_error': 1.0119994953007887, 'value_scale': 4.129594029074958, 'environment': 200.0} step=17080
2023-07-21 12:46.32 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230721124502\model_17080.pt


Epoch 8/10: 100%|██████████| 2440/2440 [00:10<00:00, 223.44it/s, loss=0.0245]


2023-07-21 12:46.45 [info     ] DQN_20230721124502: epoch=8 step=19520 epoch=8 metrics={'time_sample_batch': 0.0001572194646616451, 'time_algorithm_update': 0.004111890323826524, 'loss': 0.024464265018668514, 'time_step': 0.004418250087831841, 'td_error': 1.0116160353612327, 'value_scale': 4.113668713631029, 'environment': 200.0} step=19520
2023-07-21 12:46.45 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230721124502\model_19520.pt


Epoch 9/10: 100%|██████████| 2440/2440 [00:11<00:00, 219.87it/s, loss=0.0348]


2023-07-21 12:46.58 [info     ] DQN_20230721124502: epoch=9 step=21960 epoch=9 metrics={'time_sample_batch': 0.0001308032723723865, 'time_algorithm_update': 0.004215994037565638, 'loss': 0.034716994769304184, 'time_step': 0.004490793142162386, 'td_error': 1.067666762769251, 'value_scale': 5.1628017496732665, 'environment': 200.0} step=21960
2023-07-21 12:46.58 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230721124502\model_21960.pt


Epoch 10/10: 100%|██████████| 2440/2440 [00:13<00:00, 184.72it/s, loss=0.0335]


2023-07-21 12:47.13 [info     ] DQN_20230721124502: epoch=10 step=24400 epoch=10 metrics={'time_sample_batch': 0.00018423033542320377, 'time_algorithm_update': 0.005010115514036084, 'loss': 0.03352285356546626, 'time_step': 0.005343269226980991, 'td_error': 1.0540503472733067, 'value_scale': 5.140501433507642, 'environment': 200.0} step=24400
2023-07-21 12:47.13 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230721124502\model_24400.pt


[(1,
  {'time_sample_batch': 0.0001715829137895928,
   'time_algorithm_update': 0.004842376220421713,
   'loss': 0.0063401496063694345,
   'time_step': 0.005166433092023506,
   'td_error': 0.9887531877239056,
   'value_scale': 2.1462713150384523,
   'environment': 12.9}),
 (2,
  {'time_sample_batch': 0.00016577742138846975,
   'time_algorithm_update': 0.004608243215279501,
   'loss': 0.01294155354284396,
   'time_step': 0.004920014592467762,
   'td_error': 0.9989416994397493,
   'value_scale': 3.1374054795578044,
   'environment': 12.7}),
 (3,
  {'time_sample_batch': 0.00015969618422086122,
   'time_algorithm_update': 0.004557164184382704,
   'loss': 0.016107622990689453,
   'time_step': 0.00486732828812521,
   'td_error': 1.002993860932244,
   'value_scale': 3.1556021397670673,
   'environment': 12.4}),
 (4,
  {'time_sample_batch': 0.00015843041607590971,
   'time_algorithm_update': 0.004269175549022487,
   'loss': 0.01592312576585904,
   'time_step': 0.004571344520224899,
   'td_erro

In [12]:
# Recompute the TD error on the test episodes and display it.
td_error = td_error_scorer(dqn, test_episodes)
td_error

1.0540503472733067

In [9]:
# Once training is done, the algorithm is ready to make decisions.
observation = env.reset()

# Return the action chosen by the greedy policy. The single observation is
# wrapped in a list to form a batch, and [0] takes the result for that one entry.
action = dqn.predict([observation])[0]

# Estimate the action-value of that observation/action pair.
value = dqn.predict_value([observation], [action])[0]

In [12]:
# Save and load
# d3rlpy provides several ways to save trained models.

# Save the full set of parameters.
dqn.save_model('dqn.pt')

# Load the full parameters into a fresh DQN. The networks are built first
# (build_with_dataset) so there is a model to load the parameters into.
dqn2 = DQN()
dqn2.build_with_dataset(dataset)
dqn2.load_model('dqn.pt')

# Save the greedy policy as TorchScript.
dqn.save_policy('policy.pt')

# Save the greedy policy as ONNX.
dqn.save_policy('policy.onnx')

# Play with MDPDataset

In [15]:
# Prepare the dataset. The second value returned by get_dataset is discarded here.
dataset, _ = d3rlpy.datasets.get_dataset("cartpole-random")

In [49]:
# Take the first episode of the dataset.
episode = dataset.episodes[0]

# Per-episode data. These bare expressions are not displayed (the cell ends
# with a statement); they only show which attributes are available.
episode.observations
episode.actions
episode.rewards

# Take the first transition of the episode.
transition = episode.transitions[0]

# Fields of a single transition.
transition.observation
transition.action
transition.reward
transition.next_observation

# Transitions are linked to their neighbours: the successor's prev_transition
# is this very object.
next_transition = transition.next_transition
assert transition is next_transition.prev_transition

In [17]:
# Feed MDPDataset to the algorithm.
dqn = d3rlpy.algos.DQN()

# Feed the data as an MDPDataset.
dqn.fit(dataset, n_steps=10000)

# Feed the data as a list of episodes.
dqn.fit(dataset.episodes, n_steps=10000)

# Feed the data as a flat list of transitions gathered from every episode.
# Note that all three fit calls train the same dqn instance one after another.
transitions = []
for episode in dataset.episodes:
    transitions.extend(episode.transitions)
dqn.fit(transitions, n_steps=10000)


2023-07-11 11:25.42 [debug    ] RandomIterator is selected.
2023-07-11 11:25.42 [info     ] Directory is created at d3rlpy_logs\DQN_20230711112542
2023-07-11 11:25.42 [debug    ] Building models...
2023-07-11 11:25.42 [debug    ] Models have been built.
2023-07-11 11:25.42 [info     ] Parameters are saved to d3rlpy_logs\DQN_20230711112542\params.json params={'action_scaler': None, 'batch_size': 32, 'encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'gamma': 0.99, 'generated_maxlen': 100000, 'learning_rate': 6.25e-05, 'n_critics': 1, 'n_frames': 1, 'n_steps': 1, 'optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'q_func_factory': {'type': 'mean', 'params': {'bootstrap': False, 'share_encoder': False}}, 'real_ratio': 1.0, 'reward_scaler': None, 'scaler': None, 'target_reduction_type': 'min', 'target_update_interval': 8000, 'use_gpu': None, 'algorithm': 'DQN

Epoch 1/1: 100%|██████████| 10000/10000 [00:56<00:00, 177.85it/s, loss=0.00598]

2023-07-11 11:26.38 [info     ] DQN_20230711112542: epoch=1 step=10000 epoch=1 metrics={'time_sample_batch': 0.0003293832302093506, 'time_algorithm_update': 0.004991362094879151, 'loss': 0.005985863227147752, 'time_step': 0.005548441505432129} step=10000





2023-07-11 11:26.38 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230711112542\model_10000.pt
2023-07-11 11:26.38 [debug    ] RandomIterator is selected.
2023-07-11 11:26.38 [info     ] Directory is created at d3rlpy_logs\DQN_20230711112638
2023-07-11 11:26.38 [info     ] Parameters are saved to d3rlpy_logs\DQN_20230711112638\params.json params={'action_scaler': None, 'batch_size': 32, 'encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'gamma': 0.99, 'generated_maxlen': 100000, 'learning_rate': 6.25e-05, 'n_critics': 1, 'n_frames': 1, 'n_steps': 1, 'optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'q_func_factory': {'type': 'mean', 'params': {'bootstrap': False, 'share_encoder': False}}, 'real_ratio': 1.0, 'reward_scaler': None, 'scaler': None, 'target_reduction_type': 'min', 'target_update_interval': 8000, 'use_gpu': None, 'algorithm': 'DQ

Epoch 1/1: 100%|██████████| 10000/10000 [01:02<00:00, 160.27it/s, loss=0.0173]

2023-07-11 11:27.41 [info     ] DQN_20230711112638: epoch=1 step=10000 epoch=1 metrics={'time_sample_batch': 0.0003325154542922974, 'time_algorithm_update': 0.005597547674179077, 'loss': 0.017282132790329342, 'time_step': 0.006155061841011047} step=10000
2023-07-11 11:27.41 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230711112638\model_10000.pt
2023-07-11 11:27.41 [debug    ] RandomIterator is selected.
2023-07-11 11:27.41 [info     ] Directory is created at d3rlpy_logs\DQN_20230711112741





2023-07-11 11:27.41 [info     ] Parameters are saved to d3rlpy_logs\DQN_20230711112741\params.json params={'action_scaler': None, 'batch_size': 32, 'encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'gamma': 0.99, 'generated_maxlen': 100000, 'learning_rate': 6.25e-05, 'n_critics': 1, 'n_frames': 1, 'n_steps': 1, 'optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'q_func_factory': {'type': 'mean', 'params': {'bootstrap': False, 'share_encoder': False}}, 'real_ratio': 1.0, 'reward_scaler': None, 'scaler': None, 'target_reduction_type': 'min', 'target_update_interval': 8000, 'use_gpu': None, 'algorithm': 'DQN', 'observation_shape': (4,), 'action_size': 2}


Epoch 1/1: 100%|██████████| 10000/10000 [01:02<00:00, 160.47it/s, loss=0.0227]

2023-07-11 11:28.43 [info     ] DQN_20230711112741: epoch=1 step=10000 epoch=1 metrics={'time_sample_batch': 0.00035825092792510986, 'time_algorithm_update': 0.0055570454120635986, 'loss': 0.022723227619935643, 'time_step': 0.006148022413253784} step=10000





2023-07-11 11:28.43 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230711112741\model_10000.pt


[(1,
  {'time_sample_batch': 0.00035825092792510986,
   'time_algorithm_update': 0.0055570454120635986,
   'loss': 0.022723227619935643,
   'time_step': 0.006148022413253784})]

In [18]:
# scikit-learn helper used for the split
from sklearn.model_selection import train_test_split

# Split by episode, using train_test_split's default proportions.
train_episodes, test_episodes = train_test_split(dataset.episodes)

# Scorers to compute on the held-out episodes, keyed by the name they are
# reported under.
metrics = dict(
    soft_opc=d3rlpy.metrics.scorer.soft_opc_scorer(return_threshold=180),
    initial_value=d3rlpy.metrics.scorer.initial_state_value_estimation_scorer,
)

# Train on the training split and score against the test split.
dqn.fit(train_episodes, n_steps=10000, scorers=metrics, eval_episodes=test_episodes)

2023-07-11 11:29.07 [debug    ] RandomIterator is selected.
2023-07-11 11:29.07 [info     ] Directory is created at d3rlpy_logs\DQN_20230711112907
2023-07-11 11:29.07 [info     ] Parameters are saved to d3rlpy_logs\DQN_20230711112907\params.json params={'action_scaler': None, 'batch_size': 32, 'encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'gamma': 0.99, 'generated_maxlen': 100000, 'learning_rate': 6.25e-05, 'n_critics': 1, 'n_frames': 1, 'n_steps': 1, 'optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'q_func_factory': {'type': 'mean', 'params': {'bootstrap': False, 'share_encoder': False}}, 'real_ratio': 1.0, 'reward_scaler': None, 'scaler': None, 'target_reduction_type': 'min', 'target_update_interval': 8000, 'use_gpu': None, 'algorithm': 'DQN', 'observation_shape': (4,), 'action_size': 2}


Epoch 1/1: 100%|██████████| 10000/10000 [01:00<00:00, 164.72it/s, loss=0.0256]


2023-07-11 11:30.09 [info     ] DQN_20230711112907: epoch=1 step=10000 epoch=1 metrics={'time_sample_batch': 0.0003408435583114624, 'time_algorithm_update': 0.005417067193984985, 'loss': 0.025643449136131678, 'time_step': 0.00598792986869812, 'soft_opc': nan, 'initial_value': 4.949024677276611} step=10000
2023-07-11 11:30.09 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230711112907\model_10000.pt


[(1,
  {'time_sample_batch': 0.0003408435583114624,
   'time_algorithm_update': 0.005417067193984985,
   'loss': 0.025643449136131678,
   'time_step': 0.00598792986869812,
   'soft_opc': nan,
   'initial_value': 4.949024677276611})]

In [20]:
# Mix Datasets
from d3rlpy.dataset import MDPDataset

In [21]:
# Multiple datasets can be mixed to train algorithms.
replay_dataset, _ = d3rlpy.datasets.get_dataset("cartpole-replay")

# Extend the replay dataset with the contents of `dataset`
# (presumably the random dataset loaded earlier; this depends on execution order).
replay_dataset.extend(dataset)

# The mixed dataset can be saved to disk and loaded again later.
replay_dataset.dump("mixed_dataset.h5")
mixed_dataset = MDPDataset.load("mixed_dataset.h5")

# Data collection

**d3rlpy provides APIs to support data collection from environments. This feature is especially useful if you want to build your own datasets for research or practice purposes.**

**Prepare Environment**

**d3rlpy supports environments with the OpenAI Gym interface. In this tutorial, let’s use the simple CartPole environment.**

In [23]:
import gym

# Create the CartPole-v0 environment through the Gym interface.
env = gym.make("CartPole-v0")

**Data Collection with Random Policy**

In [25]:
# To collect experiences with a uniformly random policy, use RandomPolicy or
# DiscreteRandomPolicy. This procedure corresponds to the "random" datasets in D4RL.
import d3rlpy

# Set up the algorithm: a random policy for discrete actions.
random_policy = d3rlpy.algos.DiscreteRandomPolicy()

# Prepare the experience replay buffer, capped at 100000 entries.
buffer = d3rlpy.online.buffers.ReplayBuffer(maxlen=100000, env=env)

# Start data collection: 100000 environment steps are stored in the buffer.
random_policy.collect(env, buffer, n_steps=100000)

# Export the buffer contents as an MDPDataset.
dataset = buffer.to_mdp_dataset()

# Save the MDPDataset to disk.
dataset.dump("random_policy_dataset.h5")

2023-07-11 11:38.08 [debug    ] Building model...
2023-07-11 11:38.08 [debug    ] Model has been built.


100%|██████████| 100000/100000 [00:04<00:00, 23225.88it/s]


**Data Collection with Trained Policy**

In [None]:
# To collect experiences with a previously trained policy, the same set of
# APIs applies. This procedure corresponds to the "medium" datasets in D4RL.

# Set up the algorithm.
dqn = d3rlpy.algos.DQN()

# Initialize the neural networks before loading parameters.
dqn.build_with_env(env)

# Load the pretrained parameters. "dqn.pt" is the file written by
# dqn.save_model('dqn.pt') in the "Save and load" cell of this notebook;
# the previous name ("dqn_model.pt") is never created anywhere in it.
dqn.load_model("dqn.pt")

# Prepare the experience replay buffer, capped at 100000 entries.
buffer = d3rlpy.online.buffers.ReplayBuffer(maxlen=100000, env=env)

# Start data collection: 100000 environment steps are stored in the buffer.
dqn.collect(env, buffer, n_steps=100000)

# Export the buffer contents as an MDPDataset.
dataset = buffer.to_mdp_dataset()

# Save the MDPDataset to disk.
dataset.dump("trained_policy_dataset.h5")

**Data Collection while Training Policy**

In [None]:
# Set up the algorithm.
dqn = d3rlpy.algos.DQN()

# Prepare the experience replay buffer, capped at 100000 entries.
buffer = d3rlpy.online.buffers.ReplayBuffer(maxlen=100000, env=env)

# Exploration strategy: epsilon-greedy with a constant epsilon of 0.3.
explorer = d3rlpy.online.explorers.ConstantEpsilonGreedy(0.3)

# Train online while collecting data. The explorer has to be passed to
# fit_online explicitly; previously it was created but never used, so the
# exploration strategy had no effect.
dqn.fit_online(env, buffer, explorer=explorer, n_steps=100000)

# Export the buffer contents as an MDPDataset.
dataset = buffer.to_mdp_dataset()

# Save the MDPDataset to disk.
dataset.dump("replay_dataset.h5")

# Create Your Dataset

**The data collection API is introduced in Data Collection. In this tutorial, you can learn how to build your dataset from logged data such as the user data collected in your web service.**

### Prepare Logged Data

**First of all, you need to prepare your logged data. In this tutorial, let’s use randomly generated data. `terminals` marks the last step of each episode: if `terminals[i] == 1.0`, the i-th step is a terminal state; every non-terminal step must be set to zero.**

In [41]:
import numpy as np

# Randomly generated stand-in for logged data: 1000 steps in total.

# observations: one 100-dimensional vector per step
observations = np.random.random(size=(1000, 100))

# actions: one 4-dimensional vector per step
actions = np.random.random(size=(1000, 4))

# rewards: one scalar per step
rewards = np.random.random(size=1000)

# terminal flags: an integer 0 or 1 per step
terminals = np.random.randint(0, 2, size=1000)

In [42]:
# Wrap the four logged arrays in an MDPDataset.
dataset = d3rlpy.dataset.MDPDataset(
    observations=observations,
    actions=actions,
    rewards=rewards,
    terminals=terminals,
)

In [None]:
# Split by episode, using train_test_split's default proportions.
train_episodes, test_episodes = train_test_split(dataset.episodes)

# Scorers to compute on the held-out episodes.
# NOTE(review): confirm that a return threshold of 180 is reachable for the
# dataset in use; if no episode reaches it, soft_opc may come out undefined.
metrics = {
  "soft_opc": d3rlpy.metrics.scorer.soft_opc_scorer(return_threshold=180),
  "initial_value": d3rlpy.metrics.scorer.initial_state_value_estimation_scorer,
}

# NOTE: despite its name, `dqn` holds a CQL instance from here on.
dqn = d3rlpy.algos.CQL()

# Train on the training split and score against the test split.
dqn.fit(
    train_episodes,
    n_steps=10000,
    scorers=metrics,
    eval_episodes=test_episodes,
)

In [22]:
# Randomly generated data, 1000 steps each:
# 100-dimensional observations, 4-dimensional actions, scalar rewards
# and 0/1 terminal flags.
observations = np.random.random(size=(1000, 100))
actions = np.random.random(size=(1000, 4))
rewards = np.random.random(size=1000)
terminals = np.random.randint(0, 2, size=1000)

dataset = MDPDataset(
    observations=observations,
    actions=actions,
    rewards=rewards,
    terminals=terminals,
)

# the data is exposed as a collection of episodes
dataset.episodes

# indexing an episode yields a transition carrying the per-step fields
episode = dataset.episodes[0]
episode[0].observation
episode[0].action
episode[0].reward
episode[0].next_observation
episode[0].terminal

# transitions are chained through next_transition; follow the chain from the
# first transition until there is no successor
transition = episode[0]
while transition.next_transition:
    transition = transition.next_transition

# write the dataset to an HDF5 file
dataset.dump("dataset.h5")

# read it back from the HDF5 file
new_dataset = MDPDataset.load("dataset.h5")

In [None]:
# Split by episode, using train_test_split's default proportions.
train_episodes, test_episodes = train_test_split(dataset.episodes)

# Scorers to compute on the held-out episodes.
# NOTE(review): confirm that a return threshold of 180 is reachable for the
# dataset in use; if no episode reaches it, soft_opc may come out undefined.
metrics = {
  "soft_opc": d3rlpy.metrics.scorer.soft_opc_scorer(return_threshold=180),
  "initial_value": d3rlpy.metrics.scorer.initial_state_value_estimation_scorer,
}

# NOTE: despite its name, `dqn` holds a CQL instance from here on.
dqn = d3rlpy.algos.CQL()

# Train on the training split and score against the test split.
dqn.fit(
    train_episodes,
    n_steps=10000,
    scorers=metrics,
    eval_episodes=test_episodes,
)

# Evaluation on environment

In [60]:
import d3rlpy

# Load the CartPole replay dataset together with its environment.
dataset, env = d3rlpy.datasets.get_dataset("cartpole-replay")

# DQN with default settings.
dqn = d3rlpy.algos.DQN()

# Offline training: 100000 steps split into epochs of 10000 steps. After each
# epoch the policy is evaluated in the environment and the result is logged
# under the "environment" key.
dqn.fit(
    dataset,
    eval_episodes=dataset.episodes,
    n_steps=100000,
    n_steps_per_epoch=10000,
    scorers={"environment": d3rlpy.metrics.evaluate_on_environment(env)},
)

2023-07-11 15:00.17 [debug    ] RandomIterator is selected.
2023-07-11 15:00.17 [info     ] Directory is created at d3rlpy_logs\DQN_20230711150017
2023-07-11 15:00.17 [debug    ] Building models...
2023-07-11 15:00.17 [debug    ] Models have been built.
2023-07-11 15:00.17 [info     ] Parameters are saved to d3rlpy_logs\DQN_20230711150017\params.json params={'action_scaler': None, 'batch_size': 32, 'encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'gamma': 0.99, 'generated_maxlen': 100000, 'learning_rate': 6.25e-05, 'n_critics': 1, 'n_frames': 1, 'n_steps': 1, 'optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'q_func_factory': {'type': 'mean', 'params': {'bootstrap': False, 'share_encoder': False}}, 'real_ratio': 1.0, 'reward_scaler': None, 'scaler': None, 'target_reduction_type': 'min', 'target_update_interval': 8000, 'use_gpu': None, 'algorithm': 'DQN

Epoch 1/10: 100%|██████████| 10000/10000 [00:57<00:00, 174.70it/s, loss=0.00445]


2023-07-11 15:01.15 [info     ] DQN_20230711150017: epoch=1 step=10000 epoch=1 metrics={'time_sample_batch': 0.0003295729637145996, 'time_algorithm_update': 0.005085583972930908, 'loss': 0.004443957178714391, 'time_step': 0.005646009087562561, 'environment': 11.5} step=10000
2023-07-11 15:01.15 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230711150017\model_10000.pt


Epoch 2/10: 100%|██████████| 10000/10000 [01:03<00:00, 158.64it/s, loss=0.00974]


2023-07-11 15:02.18 [info     ] DQN_20230711150017: epoch=2 step=20000 epoch=2 metrics={'time_sample_batch': 0.00034461705684661864, 'time_algorithm_update': 0.005644531631469727, 'loss': 0.00974919555754168, 'time_step': 0.006218902659416199, 'environment': 11.7} step=20000
2023-07-11 15:02.18 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230711150017\model_20000.pt


Epoch 3/10: 100%|██████████| 10000/10000 [01:05<00:00, 152.43it/s, loss=0.0206]


2023-07-11 15:03.25 [info     ] DQN_20230711150017: epoch=3 step=30000 epoch=3 metrics={'time_sample_batch': 0.00036180675029754637, 'time_algorithm_update': 0.005866950345039368, 'loss': 0.02059645506235247, 'time_step': 0.006469796752929687, 'environment': 200.0} step=30000
2023-07-11 15:03.25 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230711150017\model_30000.pt


Epoch 4/10: 100%|██████████| 10000/10000 [01:14<00:00, 134.51it/s, loss=0.0299]


2023-07-11 15:04.40 [info     ] DQN_20230711150017: epoch=4 step=40000 epoch=4 metrics={'time_sample_batch': 0.0003942628860473633, 'time_algorithm_update': 0.0066790269851684575, 'loss': 0.029912366072440636, 'time_step': 0.007328744006156921, 'environment': 200.0} step=40000
2023-07-11 15:04.40 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230711150017\model_40000.pt


Epoch 5/10: 100%|██████████| 10000/10000 [00:50<00:00, 199.99it/s, loss=0.0408]


2023-07-11 15:05.31 [info     ] DQN_20230711150017: epoch=5 step=50000 epoch=5 metrics={'time_sample_batch': 0.0002907989740371704, 'time_algorithm_update': 0.004437735104560852, 'loss': 0.04076685120333568, 'time_step': 0.004935858654975891, 'environment': 200.0} step=50000
2023-07-11 15:05.31 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230711150017\model_50000.pt


Epoch 6/10: 100%|██████████| 10000/10000 [00:50<00:00, 196.38it/s, loss=0.0509]


2023-07-11 15:06.23 [info     ] DQN_20230711150017: epoch=6 step=60000 epoch=6 metrics={'time_sample_batch': 0.00030595717430114745, 'time_algorithm_update': 0.004517669820785522, 'loss': 0.05093365594213537, 'time_step': 0.005028146553039551, 'environment': 200.0} step=60000
2023-07-11 15:06.23 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230711150017\model_60000.pt


Epoch 7/10: 100%|██████████| 10000/10000 [00:54<00:00, 183.81it/s, loss=0.0589]


2023-07-11 15:07.18 [info     ] DQN_20230711150017: epoch=7 step=70000 epoch=7 metrics={'time_sample_batch': 0.00030831203460693357, 'time_algorithm_update': 0.004849077177047729, 'loss': 0.05888998315929493, 'time_step': 0.005369064044952392, 'environment': 186.2} step=70000
2023-07-11 15:07.18 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230711150017\model_70000.pt


Epoch 8/10: 100%|██████████| 10000/10000 [01:03<00:00, 157.10it/s, loss=0.0682]


2023-07-11 15:08.23 [info     ] DQN_20230711150017: epoch=8 step=80000 epoch=8 metrics={'time_sample_batch': 0.00035675320625305177, 'time_algorithm_update': 0.005683566427230835, 'loss': 0.06820846518946345, 'time_step': 0.006278869700431824, 'environment': 133.3} step=80000
2023-07-11 15:08.23 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230711150017\model_80000.pt


Epoch 9/10: 100%|██████████| 10000/10000 [01:11<00:00, 140.49it/s, loss=0.0764]


2023-07-11 15:09.34 [info     ] DQN_20230711150017: epoch=9 step=90000 epoch=9 metrics={'time_sample_batch': 0.00034275228977203367, 'time_algorithm_update': 0.006470977687835693, 'loss': 0.07638739716591081, 'time_step': 0.0070374038696289064, 'environment': 181.6} step=90000
2023-07-11 15:09.34 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230711150017\model_90000.pt


Epoch 10/10: 100%|██████████| 10000/10000 [00:49<00:00, 200.22it/s, loss=0.0815]


2023-07-11 15:10.25 [info     ] DQN_20230711150017: epoch=10 step=100000 epoch=10 metrics={'time_sample_batch': 0.0002918296098709106, 'time_algorithm_update': 0.004440448784828186, 'loss': 0.08148027238445939, 'time_step': 0.00493109176158905, 'environment': 113.6} step=100000
2023-07-11 15:10.25 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230711150017\model_100000.pt


[(1,
  {'time_sample_batch': 0.0003295729637145996,
   'time_algorithm_update': 0.005085583972930908,
   'loss': 0.004443957178714391,
   'time_step': 0.005646009087562561,
   'environment': 11.5}),
 (2,
  {'time_sample_batch': 0.00034461705684661864,
   'time_algorithm_update': 0.005644531631469727,
   'loss': 0.00974919555754168,
   'time_step': 0.006218902659416199,
   'environment': 11.7}),
 (3,
  {'time_sample_batch': 0.00036180675029754637,
   'time_algorithm_update': 0.005866950345039368,
   'loss': 0.02059645506235247,
   'time_step': 0.006469796752929687,
   'environment': 200.0}),
 (4,
  {'time_sample_batch': 0.0003942628860473633,
   'time_algorithm_update': 0.0066790269851684575,
   'loss': 0.029912366072440636,
   'time_step': 0.007328744006156921,
   'environment': 200.0}),
 (5,
  {'time_sample_batch': 0.0002907989740371704,
   'time_algorithm_update': 0.004437735104560852,
   'loss': 0.04076685120333568,
   'time_step': 0.004935858654975891,
   'environment': 200.0}),
 (