# **Libreria de Reinforcement Learning Offline**

In [1]:
import warnings
warnings.filterwarnings("ignore")
import d3rlpy
import numpy as np

# Getting Stated

In [2]:
from d3rlpy.datasets import get_cartpole # CartPole-v0 dataset
from d3rlpy.datasets import get_pendulum # Pendulum-v0 dataset
from d3rlpy.datasets import get_pybullet # PyBullet task datasets
from d3rlpy.datasets import get_atari    # Atari 2600 task datasets
from d3rlpy.datasets import get_d4rl     # D4RL datasets


In [3]:
# Prepare Dataset
# You can make your own dataset without any efforts. In this tutorial, let’s use integrated datasets to start. If you want to make a new dataset, see MDPDataset.
dataset, env = get_cartpole()

Donwloading cartpole.pkl into d3rlpy_data\cartpole_replay.h5...


In [4]:
# You can split dataset into a training dataset and a test dataset just like supervised learning as follows.
from sklearn.model_selection import train_test_split

train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

In [5]:
# Setup Algorithm
# There are many algorithms avaiable in d3rlpy. Since CartPole is the simple task, let’s start from DQN, 
# which is the Q-learnig algorithm proposed as the first deep reinforcement learning algorithm.
from d3rlpy.algos import DQN

# if you don't use GPU, set use_gpu=False instead.
dqn = DQN(use_gpu=False)

# initialize neural networks with the given observation shape and action size.
# this is not necessary when you directly call fit or fit_online method.
dqn.build_with_dataset(dataset)

In [6]:
# Setup metrics
# Collecting evaluation metrics is important to train algorithms properly. In d3rlpy, the metrics is computed through scikit-learn style scorer functions.
from d3rlpy.metrics.scorer import td_error_scorer
from d3rlpy.metrics.scorer import average_value_estimation_scorer

# calculate metrics with test dataset
td_error = td_error_scorer(dqn, test_episodes)

In [7]:
# Since evaluating algorithms without access to environment is still difficult, 
# the algorithm can be directly evaluated with evaluate_on_environment function if the environment is available to interactF
from d3rlpy.metrics.scorer import evaluate_on_environment

# set environment in scorer function
evaluate_scorer = evaluate_on_environment(env)

# evaluate algorithm on the environment
rewards = evaluate_scorer(dqn)

In [8]:
# Start Training
dqn.fit(train_episodes,
        eval_episodes=test_episodes,
        n_epochs=10,
        scorers={
            'td_error': td_error_scorer,
            'value_scale': average_value_estimation_scorer,
            'environment': evaluate_scorer
        })

2023-07-22 15:57.34 [debug    ] RoundIterator is selected.
2023-07-22 15:57.34 [info     ] Directory is created at d3rlpy_logs\DQN_20230722155734
2023-07-22 15:57.34 [info     ] Parameters are saved to d3rlpy_logs\DQN_20230722155734\params.json params={'action_scaler': None, 'batch_size': 32, 'encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'gamma': 0.99, 'generated_maxlen': 100000, 'learning_rate': 6.25e-05, 'n_critics': 1, 'n_frames': 1, 'n_steps': 1, 'optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'q_func_factory': {'type': 'mean', 'params': {'bootstrap': False, 'share_encoder': False}}, 'real_ratio': 1.0, 'reward_scaler': None, 'scaler': None, 'target_reduction_type': 'min', 'target_update_interval': 8000, 'use_gpu': None, 'algorithm': 'DQN', 'observation_shape': (4,), 'action_size': 2}


Epoch 1/10: 100%|██████████| 2455/2455 [00:08<00:00, 283.62it/s, loss=0.0123]


2023-07-22 15:57.44 [info     ] DQN_20230722155734: epoch=1 step=2455 epoch=1 metrics={'time_sample_batch': 0.00013246565389536064, 'time_algorithm_update': 0.0032221866965051094, 'loss': 0.012281907083886802, 'time_step': 0.0034810172079051344, 'td_error': 0.9828593678075406, 'value_scale': 1.0686198768667219, 'environment': 24.7} step=2455
2023-07-22 15:57.44 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230722155734\model_2455.pt


Epoch 2/10: 100%|██████████| 2455/2455 [00:08<00:00, 283.92it/s, loss=4.35e-5]


2023-07-22 15:57.53 [info     ] DQN_20230722155734: epoch=2 step=4910 epoch=2 metrics={'time_sample_batch': 0.00013296084588511169, 'time_algorithm_update': 0.003215025010274044, 'loss': 4.352232922438073e-05, 'time_step': 0.0034783150659316426, 'td_error': 0.9814925859578073, 'value_scale': 1.069139284579945, 'environment': 23.7} step=4910
2023-07-22 15:57.53 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230722155734\model_4910.pt


Epoch 3/10: 100%|██████████| 2455/2455 [00:09<00:00, 254.07it/s, loss=4.39e-5]


2023-07-22 15:58.04 [info     ] DQN_20230722155734: epoch=3 step=7365 epoch=3 metrics={'time_sample_batch': 0.0001451052376306227, 'time_algorithm_update': 0.0036011406457594117, 'loss': 4.388858133245715e-05, 'time_step': 0.0038847468778217885, 'td_error': 0.9848852583939095, 'value_scale': 1.071899532763913, 'environment': 17.8} step=7365
2023-07-22 15:58.04 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230722155734\model_7365.pt


Epoch 4/10: 100%|██████████| 2455/2455 [00:09<00:00, 249.42it/s, loss=0.00596]


2023-07-22 15:58.15 [info     ] DQN_20230722155734: epoch=4 step=9820 epoch=4 metrics={'time_sample_batch': 0.00014725964812300113, 'time_algorithm_update': 0.0036692880564454617, 'loss': 0.005957207730484891, 'time_step': 0.003961686447050324, 'td_error': 0.9823096207421752, 'value_scale': 2.043620167373371, 'environment': 13.9} step=9820
2023-07-22 15:58.15 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230722155734\model_9820.pt


Epoch 5/10: 100%|██████████| 2455/2455 [00:09<00:00, 261.94it/s, loss=0.00555]


2023-07-22 15:58.25 [info     ] DQN_20230722155734: epoch=5 step=12275 epoch=5 metrics={'time_sample_batch': 0.00013872717645649026, 'time_algorithm_update': 0.003494910844231817, 'loss': 0.005546377595540614, 'time_step': 0.003769836406358148, 'td_error': 0.9897012044913254, 'value_scale': 2.072487372928269, 'environment': 10.8} step=12275
2023-07-22 15:58.25 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230722155734\model_12275.pt


Epoch 6/10: 100%|██████████| 2455/2455 [00:09<00:00, 260.52it/s, loss=0.00543]


2023-07-22 15:58.36 [info     ] DQN_20230722155734: epoch=6 step=14730 epoch=6 metrics={'time_sample_batch': 0.00014040678921639312, 'time_algorithm_update': 0.0035124654439704726, 'loss': 0.005424922674712597, 'time_step': 0.00378511869737427, 'td_error': 0.9886783514698539, 'value_scale': 2.070560606486448, 'environment': 10.5} step=14730
2023-07-22 15:58.36 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230722155734\model_14730.pt


Epoch 7/10: 100%|██████████| 2455/2455 [00:10<00:00, 229.85it/s, loss=0.0115] 


2023-07-22 15:58.47 [info     ] DQN_20230722155734: epoch=7 step=17185 epoch=7 metrics={'time_sample_batch': 0.00016654581741985623, 'time_algorithm_update': 0.003976325444922671, 'loss': 0.01153813735067431, 'time_step': 0.004287922989326679, 'td_error': 0.9807624643832595, 'value_scale': 3.0503221229626734, 'environment': 11.3} step=17185
2023-07-22 15:58.47 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230722155734\model_17185.pt


Epoch 8/10: 100%|██████████| 2455/2455 [00:09<00:00, 267.52it/s, loss=0.015] 


2023-07-22 15:58.57 [info     ] DQN_20230722155734: epoch=8 step=19640 epoch=8 metrics={'time_sample_batch': 0.00013864754173760501, 'time_algorithm_update': 0.0034182579599912684, 'loss': 0.015012446653268341, 'time_step': 0.0036913715409164272, 'td_error': 0.9830217093781463, 'value_scale': 3.041459730816445, 'environment': 12.8} step=19640
2023-07-22 15:58.57 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230722155734\model_19640.pt


Epoch 9/10: 100%|██████████| 2455/2455 [00:09<00:00, 266.94it/s, loss=0.0148]


2023-07-22 15:59.08 [info     ] DQN_20230722155734: epoch=9 step=22095 epoch=9 metrics={'time_sample_batch': 0.0001406926001647825, 'time_algorithm_update': 0.0034274097372703786, 'loss': 0.014792572408774678, 'time_step': 0.003701819616034172, 'td_error': 0.9748706296610429, 'value_scale': 3.041053770956903, 'environment': 155.0} step=22095
2023-07-22 15:59.08 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230722155734\model_22095.pt


Epoch 10/10: 100%|██████████| 2455/2455 [00:09<00:00, 258.19it/s, loss=0.0183]


2023-07-22 15:59.18 [info     ] DQN_20230722155734: epoch=10 step=24550 epoch=10 metrics={'time_sample_batch': 0.0001419148959841602, 'time_algorithm_update': 0.0035513148531165968, 'loss': 0.018254761820733253, 'time_step': 0.0038233139364394052, 'td_error': 0.9966057883545109, 'value_scale': 3.995312971342641, 'environment': 200.0} step=24550
2023-07-22 15:59.18 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230722155734\model_24550.pt


[(1,
  {'time_sample_batch': 0.00013246565389536064,
   'time_algorithm_update': 0.0032221866965051094,
   'loss': 0.012281907083886802,
   'time_step': 0.0034810172079051344,
   'td_error': 0.9828593678075406,
   'value_scale': 1.0686198768667219,
   'environment': 24.7}),
 (2,
  {'time_sample_batch': 0.00013296084588511169,
   'time_algorithm_update': 0.003215025010274044,
   'loss': 4.352232922438073e-05,
   'time_step': 0.0034783150659316426,
   'td_error': 0.9814925859578073,
   'value_scale': 1.069139284579945,
   'environment': 23.7}),
 (3,
  {'time_sample_batch': 0.0001451052376306227,
   'time_algorithm_update': 0.0036011406457594117,
   'loss': 4.388858133245715e-05,
   'time_step': 0.0038847468778217885,
   'td_error': 0.9848852583939095,
   'value_scale': 1.071899532763913,
   'environment': 17.8}),
 (4,
  {'time_sample_batch': 0.00014725964812300113,
   'time_algorithm_update': 0.0036692880564454617,
   'loss': 0.005957207730484891,
   'time_step': 0.003961686447050324,
  

In [10]:
# calculate metrics with test dataset
td_error = td_error_scorer(dqn, test_episodes)
td_error

0.9966057883545109

In [11]:
# Once the training is done, your algorithm is ready to make decisions.
observation = env.reset()

# return actions based on the greedy-policy
action = dqn.predict([observation])[0]

# estimate action-values
value = dqn.predict_value([observation], [action])[0]

In [12]:
# Save and load
# d3rlpy provides several ways to save trained models.

# save full parameters
dqn.save_model('dqn.pt')

# load full parameters
dqn2 = DQN()
dqn2.build_with_dataset(dataset)
dqn2.load_model('dqn.pt')

# save the greedy-policy as TorchScript
dqn.save_policy('policy.pt')

# save the greedy-policy as ONNX
dqn.save_policy('policy.onnx')

# Play with MDPDataset

In [13]:
# prepare dataset
dataset, _ = d3rlpy.datasets.get_dataset("cartpole-random")

Donwloading cartpole.pkl into d3rlpy_data\cartpole_random.h5...


In [14]:
# first episode
episode = dataset.episodes[0]

# access to episode data
episode.observations
episode.actions
episode.rewards

# first transition
transition = episode.transitions[0]

# access to tuple
transition.observation
transition.action
transition.reward
transition.next_observation

# linked list structure
next_transition = transition.next_transition
assert transition is next_transition.prev_transition

In [15]:
# Feed MDPDataset to Algorithm
dqn = d3rlpy.algos.DQN()

# feed as MDPDataset
dqn.fit(dataset, n_steps=10000)

# feed as Episode
dqn.fit(dataset.episodes, n_steps=10000)

# feed as Transition
transitions = []
for episode in dataset.episodes:
    transitions.extend(episode.transitions)
dqn.fit(transitions, n_steps=10000)


2023-07-22 17:06.24 [debug    ] RandomIterator is selected.
2023-07-22 17:06.24 [info     ] Directory is created at d3rlpy_logs\DQN_20230722170624
2023-07-22 17:06.24 [debug    ] Building models...
2023-07-22 17:06.24 [debug    ] Models have been built.
2023-07-22 17:06.24 [info     ] Parameters are saved to d3rlpy_logs\DQN_20230722170624\params.json params={'action_scaler': None, 'batch_size': 32, 'encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'gamma': 0.99, 'generated_maxlen': 100000, 'learning_rate': 6.25e-05, 'n_critics': 1, 'n_frames': 1, 'n_steps': 1, 'optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'q_func_factory': {'type': 'mean', 'params': {'bootstrap': False, 'share_encoder': False}}, 'real_ratio': 1.0, 'reward_scaler': None, 'scaler': None, 'target_reduction_type': 'min', 'target_update_interval': 8000, 'use_gpu': None, 'algorithm': 'DQN

Epoch 1/1: 100%|██████████| 10000/10000 [00:37<00:00, 264.49it/s, loss=0.00643]

2023-07-22 17:07.02 [info     ] DQN_20230722170624: epoch=1 step=10000 epoch=1 metrics={'time_sample_batch': 0.0002513176918029785, 'time_algorithm_update': 0.0033109166383743288, 'loss': 0.00643815588111861, 'time_step': 0.0037374186992645262} step=10000
2023-07-22 17:07.02 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230722170624\model_10000.pt





2023-07-22 17:07.02 [debug    ] RandomIterator is selected.
2023-07-22 17:07.02 [info     ] Directory is created at d3rlpy_logs\DQN_20230722170702
2023-07-22 17:07.02 [info     ] Parameters are saved to d3rlpy_logs\DQN_20230722170702\params.json params={'action_scaler': None, 'batch_size': 32, 'encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'gamma': 0.99, 'generated_maxlen': 100000, 'learning_rate': 6.25e-05, 'n_critics': 1, 'n_frames': 1, 'n_steps': 1, 'optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'q_func_factory': {'type': 'mean', 'params': {'bootstrap': False, 'share_encoder': False}}, 'real_ratio': 1.0, 'reward_scaler': None, 'scaler': None, 'target_reduction_type': 'min', 'target_update_interval': 8000, 'use_gpu': None, 'algorithm': 'DQN', 'observation_shape': (4,), 'action_size': 2}


Epoch 1/1: 100%|██████████| 10000/10000 [00:43<00:00, 229.81it/s, loss=0.0168]

2023-07-22 17:07.45 [info     ] DQN_20230722170702: epoch=1 step=10000 epoch=1 metrics={'time_sample_batch': 0.0002714428663253784, 'time_algorithm_update': 0.003835451602935791, 'loss': 0.016797652586744517, 'time_step': 0.0042962648153305055} step=10000
2023-07-22 17:07.45 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230722170702\model_10000.pt
2023-07-22 17:07.45 [debug    ] RandomIterator is selected.





2023-07-22 17:07.45 [info     ] Directory is created at d3rlpy_logs\DQN_20230722170745
2023-07-22 17:07.45 [info     ] Parameters are saved to d3rlpy_logs\DQN_20230722170745\params.json params={'action_scaler': None, 'batch_size': 32, 'encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'gamma': 0.99, 'generated_maxlen': 100000, 'learning_rate': 6.25e-05, 'n_critics': 1, 'n_frames': 1, 'n_steps': 1, 'optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'q_func_factory': {'type': 'mean', 'params': {'bootstrap': False, 'share_encoder': False}}, 'real_ratio': 1.0, 'reward_scaler': None, 'scaler': None, 'target_reduction_type': 'min', 'target_update_interval': 8000, 'use_gpu': None, 'algorithm': 'DQN', 'observation_shape': (4,), 'action_size': 2}


Epoch 1/1: 100%|██████████| 10000/10000 [00:46<00:00, 212.84it/s, loss=0.0217]

2023-07-22 17:08.32 [info     ] DQN_20230722170745: epoch=1 step=10000 epoch=1 metrics={'time_sample_batch': 0.0002897111654281616, 'time_algorithm_update': 0.0041527874946594235, 'loss': 0.021663313985122657, 'time_step': 0.004640909504890442} step=10000





2023-07-22 17:08.32 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230722170745\model_10000.pt


[(1,
  {'time_sample_batch': 0.0002897111654281616,
   'time_algorithm_update': 0.0041527874946594235,
   'loss': 0.021663313985122657,
   'time_step': 0.004640909504890442})]

In [16]:
# use scikit-learn utility
from sklearn.model_selection import train_test_split

# episode-wise split
train_episodes, test_episodes = train_test_split(dataset.episodes)

# setup metrics
metrics = {
  "soft_opc": d3rlpy.metrics.scorer.soft_opc_scorer(return_threshold=180),
  "initial_value": d3rlpy.metrics.scorer.initial_state_value_estimation_scorer,
}

# start training with episode-wise splits
dqn.fit(
    train_episodes,
    n_steps=10000,
    scorers=metrics,
    eval_episodes=test_episodes,
)

2023-07-22 17:08.35 [debug    ] RandomIterator is selected.
2023-07-22 17:08.35 [info     ] Directory is created at d3rlpy_logs\DQN_20230722170835
2023-07-22 17:08.35 [info     ] Parameters are saved to d3rlpy_logs\DQN_20230722170835\params.json params={'action_scaler': None, 'batch_size': 32, 'encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'gamma': 0.99, 'generated_maxlen': 100000, 'learning_rate': 6.25e-05, 'n_critics': 1, 'n_frames': 1, 'n_steps': 1, 'optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'q_func_factory': {'type': 'mean', 'params': {'bootstrap': False, 'share_encoder': False}}, 'real_ratio': 1.0, 'reward_scaler': None, 'scaler': None, 'target_reduction_type': 'min', 'target_update_interval': 8000, 'use_gpu': None, 'algorithm': 'DQN', 'observation_shape': (4,), 'action_size': 2}


Epoch 1/1:   0%|          | 0/10000 [00:00<?, ?it/s]

Epoch 1/1: 100%|██████████| 10000/10000 [00:43<00:00, 229.46it/s, loss=0.0233]


2023-07-22 17:09.19 [info     ] DQN_20230722170835: epoch=1 step=10000 epoch=1 metrics={'time_sample_batch': 0.0002636854887008667, 'time_algorithm_update': 0.0038532357692718506, 'loss': 0.023311340535826456, 'time_step': 0.004305591797828674, 'soft_opc': nan, 'initial_value': 4.9983391761779785} step=10000
2023-07-22 17:09.20 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20230722170835\model_10000.pt


[(1,
  {'time_sample_batch': 0.0002636854887008667,
   'time_algorithm_update': 0.0038532357692718506,
   'loss': 0.023311340535826456,
   'time_step': 0.004305591797828674,
   'soft_opc': nan,
   'initial_value': 4.9983391761779785})]

In [17]:
# Mix Datasets
from d3rlpy.dataset import MDPDataset

In [18]:
# You can also mix multiple datasets to train algorithms.
replay_dataset, _ = d3rlpy.datasets.get_dataset("cartpole-replay")

# extends replay dataset with random dataset
replay_dataset.extend(dataset)

# you can also save it and load it later
replay_dataset.dump("mixed_dataset.h5")
mixed_dataset = MDPDataset.load("mixed_dataset.h5")

# Data collection

**d3rlpy provides APIs to support data collection from environments. This feature is specifically useful if you want to build your own original datasets for research or practice purposes.**

**Prepare Environment
d3rlpy supports environments with OpenAI Gym interface. In this tutorial, let’s use simple CartPole environment.**

In [19]:
import gym

env = gym.make("CartPole-v0")

**Data Collection with Random Policy**

In [20]:
# If you want to collect experiences with uniformly random policy, you can use RandomPolicy and DiscreteRandomPolicy. This procedure corresponds to random datasets in D4RL.
import d3rlpy

# setup algorithm
random_policy = d3rlpy.algos.DiscreteRandomPolicy()

# prepare experience replay buffer
buffer = d3rlpy.online.buffers.ReplayBuffer(maxlen=100000, env=env)

# start data collection
random_policy.collect(env, buffer, n_steps=100000)

# export as MDPDataset
dataset = buffer.to_mdp_dataset()

# save MDPDataset
dataset.dump("random_policy_dataset.h5")

2023-07-22 17:10.04 [debug    ] Building model...
2023-07-22 17:10.04 [debug    ] Model has been built.


100%|██████████| 100000/100000 [00:03<00:00, 31430.53it/s]


**Data Collection while Training Policy**

In [22]:
# setup algorithm
dqn = d3rlpy.algos.DQN()

# prepare experience replay buffer
buffer = d3rlpy.online.buffers.ReplayBuffer(maxlen=100000, env=env)

# prepare exploration strategy if necessary
explorer = d3rlpy.online.explorers.ConstantEpsilonGreedy(0.3)

# start data collection
dqn.fit_online(env, buffer, n_steps=100000)

# export as MDPDataset
dataset = buffer.to_mdp_dataset()

# save MDPDataset
dataset.dump("replay_dataset.h5")

2023-07-22 17:10.20 [info     ] Directory is created at d3rlpy_logs\DQN_online_20230722171020
2023-07-22 17:10.20 [debug    ] Building model...
2023-07-22 17:10.20 [debug    ] Model has been built.
2023-07-22 17:10.20 [info     ] Parameters are saved to d3rlpy_logs\DQN_online_20230722171020\params.json params={'action_scaler': None, 'batch_size': 32, 'encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'gamma': 0.99, 'generated_maxlen': 100000, 'learning_rate': 6.25e-05, 'n_critics': 1, 'n_frames': 1, 'n_steps': 1, 'optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'q_func_factory': {'type': 'mean', 'params': {'bootstrap': False, 'share_encoder': False}}, 'real_ratio': 1.0, 'reward_scaler': None, 'scaler': None, 'target_reduction_type': 'min', 'target_update_interval': 8000, 'use_gpu': None, 'algorithm': 'DQN', 'observation_shape': (4,), 'action_size': 2}


 10%|▉         | 9986/100000 [00:38<05:44, 260.93it/s]

2023-07-22 17:10.59 [info     ] Model parameters are saved to d3rlpy_logs\DQN_online_20230722171020\model_10000.pt
2023-07-22 17:10.59 [info     ] DQN_online_20230722171020: epoch=1 step=10000 epoch=1 metrics={'time_inference': 0.00047210686206817625, 'time_environment_step': 5.235147086984402e-05, 'time_step': 0.0038315746307373048, 'rollout_return': 9.697002141327623, 'time_sample_batch': 0.00015877052657051958, 'time_algorithm_update': 0.0030934831695066593, 'loss': 0.006750749032357123} step=10000


 20%|█▉        | 19990/100000 [01:18<05:01, 265.74it/s]

2023-07-22 17:11.39 [info     ] Model parameters are saved to d3rlpy_logs\DQN_online_20230722171020\model_20000.pt
2023-07-22 17:11.39 [info     ] DQN_online_20230722171020: epoch=2 step=20000 epoch=2 metrics={'time_inference': 0.0004633970022201538, 'time_environment_step': 5.215113156183146e-05, 'time_sample_batch': 0.00015937888622283937, 'time_algorithm_update': 0.003209559440612793, 'loss': 0.017139763402959217, 'time_step': 0.003952449345588684, 'rollout_return': 17.22040072859745} step=20000


 30%|██▉       | 29985/100000 [02:08<05:25, 215.24it/s]

2023-07-22 17:12.29 [info     ] Model parameters are saved to d3rlpy_logs\DQN_online_20230722171020\model_30000.pt
2023-07-22 17:12.29 [info     ] DQN_online_20230722171020: epoch=3 step=30000 epoch=3 metrics={'time_inference': 0.0006068842172622681, 'time_environment_step': 6.684611831229256e-05, 'time_sample_batch': 0.00019521639347076417, 'time_algorithm_update': 0.003957726454734802, 'loss': 0.01692864636857703, 'time_step': 0.004913048768043518, 'rollout_return': 23.608374384236452} step=30000


 40%|███▉      | 39980/100000 [02:59<03:48, 262.26it/s]

2023-07-22 17:13.19 [info     ] Model parameters are saved to d3rlpy_logs\DQN_online_20230722171020\model_40000.pt
2023-07-22 17:13.19 [info     ] DQN_online_20230722171020: epoch=4 step=40000 epoch=4 metrics={'time_inference': 0.0005936670541763306, 'time_environment_step': 6.44308381631862e-05, 'time_sample_batch': 0.00020092337131500244, 'time_algorithm_update': 0.004069307279586792, 'loss': 0.01467409262912115, 'time_step': 0.005013094902038574, 'rollout_return': 94.0576923076923} step=40000


 50%|████▉     | 49989/100000 [03:55<06:49, 122.17it/s]

2023-07-22 17:14.16 [info     ] Model parameters are saved to d3rlpy_logs\DQN_online_20230722171020\model_50000.pt
2023-07-22 17:14.16 [info     ] DQN_online_20230722171020: epoch=5 step=50000 epoch=5 metrics={'time_inference': 0.0006939249992370606, 'time_environment_step': 5.972176939997841e-05, 'time_sample_batch': 0.00021233177185058592, 'time_algorithm_update': 0.00455575442314148, 'loss': 0.01425086851908127, 'time_step': 0.005612196755409241, 'rollout_return': 200.0} step=50000


 60%|█████▉    | 59995/100000 [05:00<03:24, 196.01it/s]

2023-07-22 17:15.21 [info     ] Model parameters are saved to d3rlpy_logs\DQN_online_20230722171020\model_60000.pt
2023-07-22 17:15.21 [info     ] DQN_online_20230722171020: epoch=6 step=60000 epoch=6 metrics={'time_inference': 0.0007891584634780884, 'time_environment_step': 7.415888896539583e-05, 'time_sample_batch': 0.00022498390674591064, 'time_algorithm_update': 0.005196636295318604, 'loss': 0.013201305827248143, 'time_step': 0.006386096835136414, 'rollout_return': 200.0} step=60000


 70%|██████▉   | 69976/100000 [05:45<02:10, 230.67it/s]

2023-07-22 17:16.06 [info     ] Model parameters are saved to d3rlpy_logs\DQN_online_20230722171020\model_70000.pt
2023-07-22 17:16.06 [info     ] DQN_online_20230722171020: epoch=7 step=70000 epoch=7 metrics={'time_inference': 0.000527955675125122, 'time_environment_step': 5.458477285923928e-05, 'time_sample_batch': 0.00017214436531066895, 'time_algorithm_update': 0.00362412006855011, 'loss': 0.01430074846589705, 'time_step': 0.0044466694831848145, 'rollout_return': 200.0} step=70000


 80%|███████▉  | 79985/100000 [06:52<01:31, 218.22it/s]

2023-07-22 17:17.13 [info     ] Model parameters are saved to d3rlpy_logs\DQN_online_20230722171020\model_80000.pt
2023-07-22 17:17.13 [info     ] DQN_online_20230722171020: epoch=8 step=80000 epoch=8 metrics={'time_inference': 0.0007444080829620361, 'time_environment_step': 7.314001495514683e-05, 'time_sample_batch': 0.00021344006061553954, 'time_algorithm_update': 0.005481677055358887, 'loss': 0.013926662999260589, 'time_step': 0.00660682737827301, 'rollout_return': 200.0} step=80000


 90%|████████▉ | 89980/100000 [07:46<00:43, 227.87it/s]

2023-07-22 17:18.06 [info     ] Model parameters are saved to d3rlpy_logs\DQN_online_20230722171020\model_90000.pt
2023-07-22 17:18.06 [info     ] DQN_online_20230722171020: epoch=9 step=90000 epoch=9 metrics={'time_inference': 0.0006250709533691406, 'time_environment_step': 6.749057290542066e-05, 'time_sample_batch': 0.0001952970266342163, 'time_algorithm_update': 0.00433855791091919, 'loss': 0.016248252808925463, 'time_step': 0.0053152955293655395, 'rollout_return': 200.0} step=90000


100%|█████████▉| 99988/100000 [08:35<00:00, 208.36it/s]

2023-07-22 17:18.55 [info     ] Model parameters are saved to d3rlpy_logs\DQN_online_20230722171020\model_100000.pt
2023-07-22 17:18.55 [info     ] DQN_online_20230722171020: epoch=10 step=100000 epoch=10 metrics={'time_inference': 0.0005566834688186645, 'time_environment_step': 5.422606540085682e-05, 'time_sample_batch': 0.0001874948024749756, 'time_algorithm_update': 0.003967680883407593, 'loss': 0.015128838215080032, 'time_step': 0.004846218276023865, 'rollout_return': 200.0} step=100000


100%|██████████| 100000/100000 [08:35<00:00, 194.00it/s]


# Create Your Dataset

**The data collection API is introduced in Data Collection. In this tutorial, you can learn how to build your dataset from logged data such as the user data collected in your web service.**

### Prepare Logged Data

**First of all, you need to prepare your logged data. In this tutorial, let’s use randomly generated data. terminals represents the last step of episodes. If terminals[i] == 1.0, i-th step is the terminal state. Otherwise you need to set zeros for non-terminal states.**

In [23]:
import numpy as np

# vector observation
# 1000 steps of observations with shape of (100,)
observations = np.random.random((1000, 100))

# 1000 steps of actions with shape of (4,)
actions = np.random.random((1000, 4))

# 1000 steps of rewards
rewards = np.random.random(1000)

# 1000 steps of terminal flags
terminals = np.random.randint(2, size=1000)

In [24]:
dataset = d3rlpy.dataset.MDPDataset(
    observations=observations,
    actions=actions,
    rewards=rewards,
    terminals=terminals,
)

In [None]:
# episode-wise split
train_episodes, test_episodes = train_test_split(dataset.episodes)

# setup metrics
metrics = {
  "soft_opc": d3rlpy.metrics.scorer.soft_opc_scorer(return_threshold=180),
  "initial_value": d3rlpy.metrics.scorer.initial_state_value_estimation_scorer,
}

dqn = d3rlpy.algos.CQL()

# start training with episode-wise splits
dqn.fit(
    train_episodes,
    n_steps=10000,
    scorers=metrics,
    eval_episodes=test_episodes,
)

In [22]:
# 1000 steps of observations with shape of (100,)
observations = np.random.random((1000, 100))
# 1000 steps of actions with shape of (4,)
actions = np.random.random((1000, 4))
# 1000 steps of rewards
rewards = np.random.random(1000)
# 1000 steps of terminal flags
terminals = np.random.randint(2, size=1000)

dataset = MDPDataset(observations, actions, rewards, terminals)

# automatically splitted into d3rlpy.dataset.Episode objects
dataset.episodes

# each episode is also splitted into d3rlpy.dataset.Transition objects
episode = dataset.episodes[0]
episode[0].observation
episode[0].action
episode[0].reward
episode[0].next_observation
episode[0].terminal

# d3rlpy.dataset.Transition object has pointers to previous and next
# transitions like linked list.
transition = episode[0]
while transition.next_transition:
    transition = transition.next_transition

# save as HDF5
dataset.dump('dataset.h5')

# load from HDF5
new_dataset = MDPDataset.load('dataset.h5')

In [None]:
# episode-wise split
train_episodes, test_episodes = train_test_split(dataset.episodes)

# setup metrics
metrics = {
  "soft_opc": d3rlpy.metrics.scorer.soft_opc_scorer(return_threshold=180),
  "initial_value": d3rlpy.metrics.scorer.initial_state_value_estimation_scorer,
}

dqn = d3rlpy.algos.CQL()

# start training with episode-wise splits
dqn.fit(
    train_episodes,
    n_steps=10000,
    scorers=metrics,
    eval_episodes=test_episodes,
)

# Evaluation on environment

In [None]:
import d3rlpy

# setup replay CartPole-v0 dataset and environment
dataset, env = d3rlpy.datasets.get_dataset("cartpole-replay")

# setup algorithm
dqn = d3rlpy.algos.DQN()

# start offline training
dqn.fit(
   dataset,
   eval_episodes=dataset.episodes,
   n_steps=100000,
   n_steps_per_epoch=10000,
   scorers={
       "environment": d3rlpy.metrics.evaluate_on_environment(env),
   },
)