In [None]:
#| default_exp dqn_pong

In [None]:
# |export
import gym
import pytorch_lightning as pl
import d3rlpy
import numpy as np
import torch
from d3rlpy.algos import DQN
from d3rlpy.models.optimizers import OptimizerFactory
from d3rlpy.online.buffers import ReplayBuffer
from d3rlpy.online.explorers import LinearDecayEpsilonGreedy
from torch.optim import Adam, Optimizer
from pytorch_lightning import LightningModule, Trainer
from d3rlpy.metrics.scorer import evaluate_on_environment

The following class defines a DQN (Deep Q-Network) agent for training on, and playing, games in a gym environment.

The DQN algorithm is a model-free, value-based reinforcement learning technique used to learn a policy that selects, in each state, the action that maximizes the long-term reward. It involves training a neural network to approximate the Q-function, which represents the expected future reward for each action in a given state.

calc_loss: a function that calculates the loss for a given batch of experiences. The current network predicts the Q-values of the actions taken in the current states, while the target network supplies the Q-values of the next states, which the Bellman equation turns into target values. The loss is then calculated as the mean squared error between the predicted and target Q-values.

ExperienceBuffer: a class that stores and samples a fixed-size buffer of Experience tuples.

In [None]:
import gym
import pytorch_lightning as pl
import d3rlpy
from d3rlpy.algos import DQN
from d3rlpy.models.optimizers import OptimizerFactory
from d3rlpy.online.buffers import ReplayBuffer
from d3rlpy.online.explorers import LinearDecayEpsilonGreedy
from torch.optim import Adam, Optimizer
from pytorch_lightning import LightningModule, Trainer

class PongModel(LightningModule):
    """LightningModule wrapper around a d3rlpy DQN agent for "PongNoFrameskip-v4".

    The constructor creates a training and an evaluation gym environment, a
    d3rlpy ``DQN``, a ``ReplayBuffer``, a ``LinearDecayEpsilonGreedy`` explorer
    and a separate torch ``Adam`` optimizer.

    NOTE(review): the notebook output recorded after this class is instantiated
    shows ``TypeError: optimizer can only optimize Tensors, but one of the
    params is str``; see the note on the ``Adam(...)`` call in ``__init__``.
    """

    def __init__(self):
        """Create the environments, the DQN agent, the replay buffer, the explorer and the optimizer."""
        super().__init__()
        # create the environment
        self.env = gym.make("PongNoFrameskip-v4")
        # create eval environment
        self.eval_env = gym.make("PongNoFrameskip-v4")
        # set an optimizer factory (Adam with weight decay) for the DQN
        optim_factory = OptimizerFactory(Adam, weight_decay=1e-4)
        # DQN model
        self.dqn = DQN(
            batch_size=32,
            learning_rate=2.5e-4,
            target_update_interval=100,
            optim_factory=optim_factory,
        )
        # create the experience replay buffer
        self.buffer = ReplayBuffer(maxlen=1000, env=self.env)
        # epsilon-greedy explorer
        self.explorer = LinearDecayEpsilonGreedy(
            start_epsilon=1.0, end_epsilon=0.1, duration=1000
        )
        # NOTE(review): this is a plain dict copy of whatever get_params() returns.
        # Presumably those are the algorithm's hyperparameters keyed by name, not
        # torch tensors -- confirm against the d3rlpy documentation.
        model_parameters = {name: param for name, param in self.dqn.get_params().items()}
        # create optimizer
        # NOTE(review): iterating a dict yields its string keys, which is consistent
        # with the TypeError recorded in the notebook output ("one of the params is
        # str"); presumably it is raised on this line. The lr repeats the
        # learning_rate already given to DQN above.
        self.optimizer = Adam(model_parameters, lr=2.5e-4)
        
    def forward(self, x):
        """Return the result of calling the wrapped DQN object on the state ``x``.

        The intent is to obtain the Q-value of each action for ``x``.

        NOTE(review): this calls ``self.dqn`` directly; whether a d3rlpy ``DQN``
        instance is callable is not visible here -- TODO confirm.
        """
        return self.dqn(x)

    def training_step(self, batch, batch_idx):
        """Unpack a 5-tuple batch, pass it to ``self.dqn.update`` and return ``{'loss': loss}``.

        ``batch_idx`` is unused.
        """
        # No environment step happens here: the batch is only unpacked and forwarded.
        states, actions, rewards, next_states, dones = batch
        # NOTE(review): the five batch fields plus the buffer and the explorer are
        # passed positionally -- verify against the signature of d3rlpy's DQN.update.
        loss = self.dqn.update(
            states, actions, rewards, next_states, dones, self.buffer, self.explorer
        )
        return {'loss': loss}
    
    def configure_optimizers(self):
        """Return the Adam optimizer created in ``__init__``."""
        return self.optimizer
    
    def __dataloader(self):
        """Build a DataLoader (batch size 16) over the replay buffer's dataset.

        NOTE(review): the leading double underscore name-mangles this method to
        ``_PongModel__dataloader`` and nothing in this class calls it; presumably
        Lightning expects a ``train_dataloader`` hook instead -- TODO confirm.
        """
        # NOTE(review): ``to_mdp_dataset`` is referenced without being called; if it
        # is a method (it looks like one), ``dataset`` is a bound method, not a dataset.
        dataset = self.buffer.to_mdp_dataset
        # NOTE(review): ``DataLoader`` is not imported in the import cells visible in
        # this notebook; batch_size=16 also differs from the DQN's batch_size=32.
        dataloader = DataLoader(dataset=dataset, batch_size=16)
        return dataloader


In [None]:
# Build the Lightning-wrapped Pong agent.
model = PongModel()

# Configure a Lightning Trainer capped at 1000 epochs.
trainer = Trainer(max_epochs=1000)

# Launch the fit loop on the agent.
trainer.fit(model=model)


TypeError: optimizer can only optimize Tensors, but one of the params is str

# Version 2

In [None]:
class DQNTrainer:
    """Bundle gym environments, a d3rlpy DQN agent and its online-training settings.

    The constructor stores every hyperparameter on the instance and builds the
    training environment, the evaluation environment, the ``DQN`` algorithm,
    the ``ReplayBuffer`` and the ``LinearDecayEpsilonGreedy`` explorer.
    ``train`` runs ``DQN.fit_online`` with them; ``evaluate`` scores the agent
    with ``evaluate_on_environment``.
    """

    def __init__(
        self,
        env_name: str,
        eval_env_name: str,
        buffer_maxlen: int,
        start_epsilon: float,
        end_epsilon: float,
        duration: int,
        batch_size: int,
        learning_rate: float,
        target_update_interval: int,
        optim_factory: "OptimizerFactory",
        n_steps: int,
        n_steps_per_epoch: int,
        update_start_step: int,
    ):
        """Store the configuration and build the environments, agent, buffer and explorer.

        Args:
            env_name: id passed to ``gym.make`` for the training environment.
            eval_env_name: id passed to ``gym.make`` for the evaluation environment.
            buffer_maxlen: ``maxlen`` of the ``ReplayBuffer``.
            start_epsilon: ``start_epsilon`` of the epsilon-greedy explorer.
            end_epsilon: ``end_epsilon`` of the epsilon-greedy explorer.
            duration: ``duration`` of the explorer's linear epsilon decay.
            batch_size: ``batch_size`` of the DQN.
            learning_rate: ``learning_rate`` of the DQN.
            target_update_interval: ``target_update_interval`` of the DQN.
            optim_factory: optimizer factory forwarded to the DQN. ``None`` is
                tolerated and leaves the DQN on its own default optimizer.
            n_steps: ``n_steps`` passed to ``fit_online``.
            n_steps_per_epoch: ``n_steps_per_epoch`` passed to ``fit_online``.
            update_start_step: ``update_start_step`` passed to ``fit_online``.
        """
        self.env_name = env_name
        self.eval_env_name = eval_env_name
        self.buffer_maxlen = buffer_maxlen
        self.start_epsilon = start_epsilon
        self.end_epsilon = end_epsilon
        self.duration = duration
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.target_update_interval = target_update_interval
        self.optim_factory = optim_factory
        self.n_steps = n_steps
        self.n_steps_per_epoch = n_steps_per_epoch
        self.update_start_step = update_start_step

        # create the environment
        self.env = gym.make(self.env_name)
        # create the evaluation environment
        self.eval_env = gym.make(self.eval_env_name)

        # create the DQN model
        dqn_kwargs = dict(
            batch_size=self.batch_size,
            learning_rate=self.learning_rate,
            target_update_interval=self.target_update_interval,
        )
        # The factory used to be stored but never handed to DQN, so the requested
        # optimizer settings (e.g. weight_decay) were silently dropped. It is only
        # forwarded when given, so callers that passed None keep the DQN default.
        if self.optim_factory is not None:
            dqn_kwargs["optim_factory"] = self.optim_factory
        self.dqn = DQN(**dqn_kwargs)

        # create the experience replay buffer
        self.buffer = ReplayBuffer(maxlen=self.buffer_maxlen, env=self.env)
        # create the epsilon-greedy explorer
        self.explorer = LinearDecayEpsilonGreedy(
            start_epsilon=self.start_epsilon,
            end_epsilon=self.end_epsilon,
            duration=self.duration,
        )

    def train(self):
        """Run ``DQN.fit_online`` on the training environment with the stored settings."""
        self.dqn.fit_online(
            self.env,
            self.buffer,
            self.explorer,
            n_steps=self.n_steps,
            eval_env=self.eval_env,
            n_steps_per_epoch=self.n_steps_per_epoch,
            update_start_step=self.update_start_step,
        )

    def evaluate(self, render=True):
        """Score the agent on the evaluation environment and return the result.

        Args:
            render: forwarded to ``evaluate_on_environment`` as ``render``.

        Returns:
            Whatever the scorer built by ``evaluate_on_environment`` returns for
            ``self.dqn`` (presumably the mean episode return -- confirm against
            the d3rlpy documentation).
        """
        # Use the dedicated evaluation environment rather than the training one,
        # and return the score so callers can actually use it.
        scorer = evaluate_on_environment(self.eval_env, render=render)
        return scorer(self.dqn)

In [None]:
# Configure a DQNTrainer for CartPole; every setting is passed by keyword.
trainer = DQNTrainer(
    # environments (training and evaluation use the same id)
    env_name="CartPole-v0",
    eval_env_name="CartPole-v0",
    # DQN hyperparameters
    batch_size=32,
    learning_rate=2.5e-4,
    target_update_interval=100,
    optim_factory=OptimizerFactory(Adam, weight_decay=1e-4),
    # replay buffer size
    buffer_maxlen=1000,
    # epsilon-greedy exploration schedule
    start_epsilon=1.0,
    end_epsilon=0.1,
    duration=1000,
    # online-training schedule
    n_steps=30000,
    n_steps_per_epoch=1000,
    update_start_step=1000,
)

# Run online training, then evaluate the agent and show the result.
trainer.train()
score = trainer.evaluate()
print(score)

2022-12-29 19:21.13 [info     ] Directory is created at d3rlpy_logs/DQN_online_20221229192113
2022-12-29 19:21.13 [debug    ] Building model...
2022-12-29 19:21.13 [debug    ] Model has been built.
2022-12-29 19:21.13 [info     ] Parameters are saved to d3rlpy_logs/DQN_online_20221229192113/params.json params={'action_scaler': None, 'batch_size': 32, 'encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'gamma': 0.99, 'generated_maxlen': 100000, 'learning_rate': 0.00025, 'n_critics': 1, 'n_frames': 1, 'n_steps': 1, 'optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'q_func_factory': {'type': 'mean', 'params': {'share_encoder': False}}, 'real_ratio': 1.0, 'reward_scaler': None, 'scaler': None, 'target_update_interval': 100, 'use_gpu': None, 'algorithm': 'DQN', 'observation_shape': (4,), 'action_size': 2}


  0%|          | 0/30000 [00:00<?, ?it/s]

2022-12-29 19:21.13 [info     ] Model parameters are saved to d3rlpy_logs/DQN_online_20221229192113/model_1000.pt
2022-12-29 19:21.13 [info     ] DQN_online_20221229192113: epoch=1 step=1000 epoch=1 metrics={'time_inference': 0.0002002553939819336, 'time_environment_step': 1.322793960571289e-05, 'time_step': 0.00022890019416809082, 'rollout_return': 14.271428571428572, 'evaluation': 9.4} step=1000
2022-12-29 19:21.15 [info     ] Model parameters are saved to d3rlpy_logs/DQN_online_20221229192113/model_2000.pt
2022-12-29 19:21.15 [info     ] DQN_online_20221229192113: epoch=2 step=2000 epoch=2 metrics={'time_inference': 0.00022844457626342775, 'time_environment_step': 1.7648935317993165e-05, 'time_sample_batch': 5.428791046142578e-05, 'time_algorithm_update': 0.0011993114948272705, 'loss': 0.20065757550066338, 'time_step': 0.001529106855392456, 'rollout_return': 32.733333333333334, 'evaluation': 101.3} step=2000
2022-12-29 19:21.17 [info     ] Model parameters are saved to d3rlpy_logs/D

2022-12-29 19:21.47 [info     ] Model parameters are saved to d3rlpy_logs/DQN_online_20221229192113/model_17000.pt
2022-12-29 19:21.47 [info     ] DQN_online_20221229192113: epoch=17 step=17000 epoch=17 metrics={'time_inference': 0.0002535417079925537, 'time_environment_step': 1.930856704711914e-05, 'time_sample_batch': 6.063437461853027e-05, 'time_algorithm_update': 0.0014595577716827392, 'loss': 0.28266326214402215, 'time_step': 0.0018254013061523436, 'rollout_return': 196.0, 'evaluation': 183.7} step=17000
2022-12-29 19:21.49 [info     ] Model parameters are saved to d3rlpy_logs/DQN_online_20221229192113/model_18000.pt
2022-12-29 19:21.49 [info     ] DQN_online_20221229192113: epoch=18 step=18000 epoch=18 metrics={'time_inference': 0.0002378098964691162, 'time_environment_step': 1.804494857788086e-05, 'time_sample_batch': 5.429720878601074e-05, 'time_algorithm_update': 0.0013373541831970216, 'loss': 0.1859688821757445, 'time_step': 0.0016769711971282959, 'rollout_return': 197.4, 'ev