# Deep Reinforcement Learning: 
### Deep Q-Networks, Double DQN, Dueling DQN & Prioritized Experience Replay (PER)
<br>
James Chapman<br>
CIS 730 Artificial Intelligence – Term Project<br>
Kansas State University

<hr style="border:2px solid gray">
This notebook was CONVERTED from OpenAI's Gym to Farama's Gymnasium.

This notebook began as part of the tutorial series ["Deep Reinforcement Learning Explained"](https://github.com/jorditorresBCN/Deep-Reinforcement-Learning-Explained/tree/master) by [Jordi Torres](https://torres.ai/). Much of reinforcement learning is not covered in CIS 730, and CIS 732 (Machine Learning) covered only select chapters of "Reinforcement Learning: An Introduction" by Sutton & Barto [[1](#citations)]. This series is a great supplement for value iteration and Q-learning in PyTorch.<br>

Extending from the series, this project explores 3 variants of Deep Q-Networks, including their application in Breakout.
<hr style="border:2px solid gray">

DEEP REINFORCEMENT LEARNING EXPLAINED - 15 - 16 - 17
# **Deep Q-Network (DQN)**

OpenAI Pong

In [1]:
############################
#CONVERTED
# import gym 
# import gym.spaces 
import gymnasium as gym 
import gymnasium.spaces 

# Environment id used throughout the notebook (inspection, training and playback).
DEFAULT_ENV_NAME = "PongNoFrameskip-v4" 
# Raw environment (none of the wrappers defined below applied yet), created
# only to inspect its action and observation spaces.
test_env = gym.make(DEFAULT_ENV_NAME)
print(test_env.action_space.n)

6


In [2]:
print(test_env.unwrapped.get_action_meanings())

['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']


In [3]:
print(test_env.observation_space.shape)

(210, 160, 3)



Type of hardware accelerator available (the original tutorial targeted Colab; the output below comes from a local GPU)

In [4]:
!nvidia-smi 

Thu Nov  2 14:19:18 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 536.40                 Driver Version: 536.40       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090      WDDM  | 00000000:01:00.0  On |                  Off |
|  0%   36C    P8               6W / 450W |    900MiB / 24564MiB |      6%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [5]:
import warnings
warnings.filterwarnings('ignore')

## OpenAI Gym Wrappers

In [6]:
# Taken from 
# https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/blob/master/Chapter06/lib/wrappers.py

import cv2
import numpy as np
import collections

class FireResetEnv(gym.Wrapper):
    """Wrapper that issues action 1 (FIRE) and then action 2 after each reset.

    Presumably this is needed for games that stay idle until FIRE is
    pressed -- TODO confirm for the environments used here.

    Gymnasium port notes: ``reset`` accepts keyword arguments and returns an
    ``(observation, info)`` pair, and ``step`` reports ``terminated`` and
    ``truncated`` separately. No ``step`` override is defined, so stepping is
    handled by ``gym.Wrapper``.
    """

    def __init__(self, env=None):
        super(FireResetEnv, self).__init__(env)
        # reset() blindly sends actions 1 and 2, so action 1 must be FIRE and
        # the action set must contain at least three actions.
        assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
        assert len(env.unwrapped.get_action_meanings()) >= 3

    def reset(self, **kwargs):
        """Reset the wrapped env, then play actions 1 and 2.

        Args:
            **kwargs: forwarded unchanged to every ``self.env.reset`` call.

        Returns:
            ``(obs, info)`` describing the state the environment is actually
            in when this method returns.
        """
        self.env.reset(**kwargs)
        obs, _, terminated, truncated, info = self.env.step(1)
        if terminated or truncated:
            # The observation from this branch is overwritten by the next
            # step, so only the reset itself matters here.
            self.env.reset(**kwargs)
        obs, _, terminated, truncated, info = self.env.step(2)
        if terminated or truncated:
            # Return the observation/info of the fresh episode rather than
            # the last frame of the episode that just ended.
            obs, info = self.env.reset(**kwargs)
        return obs, info

class MaxAndSkipEnv(gym.Wrapper):
    """Repeat every action ``skip`` times and return a max-pooled frame.

    The returned observation is the element-wise maximum of the two buffered
    frames (those produced by the second-to-last and last repeat of the
    action). Rewards of all repeated steps are summed.

    Gymnasium port notes: the Gym version kept the frames in a
    ``collections.deque(maxlen=2)`` and cleared it in its own ``reset``.
    This version uses a preallocated two-slot array and defines no ``reset``
    override, so the slots keep their previous contents across episodes.
    """

    def __init__(self, env=None, skip=4):
        super(MaxAndSkipEnv, self).__init__(env)
        # Two raw frames, allocated once with the env's observation shape/dtype.
        frame_shape = env.observation_space.shape
        frame_dtype = env.observation_space.dtype
        self._obs_buffer = np.zeros((2, *frame_shape), dtype=frame_dtype)
        self._skip = skip

    def step(self, action):
        """Apply ``action`` up to ``skip`` times.

        Returns:
            ``(max_frame, total_reward, terminated, truncated, info)`` where
            the flags and ``info`` come from the last inner step executed.
        """
        reward_sum = 0.0
        terminated = truncated = False
        second_to_last = self._skip - 2
        last = self._skip - 1
        for repeat in range(self._skip):
            obs, reward, terminated, truncated, info = self.env.step(action)
            # Only the final two repeats are buffered; if the episode ends
            # earlier, the slots still hold frames from a previous call.
            if repeat == second_to_last:
                self._obs_buffer[0] = obs
            if repeat == last:
                self._obs_buffer[1] = obs
            reward_sum += float(reward)
            if terminated or truncated:
                break
        pooled = self._obs_buffer.max(axis=0)

        return pooled, reward_sum, terminated, truncated, info


class ProcessFrame84(gym.ObservationWrapper):
    """Turn raw RGB frames into 84x84x1 ``uint8`` grayscale observations."""

    def __init__(self, env=None):
        super(ProcessFrame84, self).__init__(env)
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)

    def observation(self, obs):
        """Delegate to the stateless :meth:`process` helper."""
        return ProcessFrame84.process(obs)

    @staticmethod
    def process(frame):
        """Grayscale, downscale and crop one frame.

        Args:
            frame: array holding 210*160*3 or 250*160*3 values.

        Returns:
            ``uint8`` array of shape ``(84, 84, 1)``.

        Raises:
            AssertionError: if ``frame`` has any other number of elements.
        """
        n_values = frame.size
        if n_values == 210 * 160 * 3:
            rgb = np.reshape(frame, [210, 160, 3]).astype(np.float32)
        elif n_values == 250 * 160 * 3:
            rgb = np.reshape(frame, [250, 160, 3]).astype(np.float32)
        else:
            assert False, "Unknown resolution."
        # Weighted sum of the three colour channels gives a single gray channel.
        red, green, blue = rgb[:, :, 0], rgb[:, :, 1], rgb[:, :, 2]
        gray = red * 0.299 + green * 0.587 + blue * 0.114
        # cv2.resize takes (width, height): the result has 110 rows, 84 columns.
        shrunk = cv2.resize(gray, (84, 110), interpolation=cv2.INTER_AREA)
        # Keep rows 18..101 so the image becomes 84x84.
        cropped = shrunk[18:102, :]
        return np.reshape(cropped, [84, 84, 1]).astype(np.uint8)


class BufferWrapper(gym.ObservationWrapper):
    """Keep a rolling window of the most recent observations along axis 0.

    The observation space is the wrapped env's space with its bounds repeated
    ``n_steps`` times along axis 0. Gymnasium port notes: ``reset`` accepts
    keyword arguments and returns ``(observation, info)``.
    """

    def __init__(self, env, n_steps, dtype=np.float32):
        super(BufferWrapper, self).__init__(env)
        self.dtype = dtype
        base_space = env.observation_space
        stacked_low = base_space.low.repeat(n_steps, axis=0)
        stacked_high = base_space.high.repeat(n_steps, axis=0)
        self.observation_space = gym.spaces.Box(stacked_low, stacked_high, dtype=dtype)

    def reset(self, **kwargs):
        """Zero the window, reset the wrapped env and push its first frame."""
        self.buffer = np.zeros_like(self.observation_space.low, dtype=self.dtype)
        first_obs, info = self.env.reset(**kwargs)
        return self.observation(first_obs), info

    def observation(self, observation):
        """Shift the window by one slot and store ``observation`` last.

        Returns the internal buffer object itself (not a copy), so the
        returned array is modified again by the next call.
        """
        self.buffer[:-1] = self.buffer[1:]
        self.buffer[-1] = observation
        return self.buffer


class ImageToPyTorch(gym.ObservationWrapper):
    """Move the last (channel) axis of each observation to the front."""

    def __init__(self, env):
        super(ImageToPyTorch, self).__init__(env)
        src_shape = self.observation_space.shape
        # New layout: (last axis, first axis, second axis) of the old shape.
        channels_first = (src_shape[-1], src_shape[0], src_shape[1])
        self.observation_space = gym.spaces.Box(
            low=0.0, high=1.0, shape=channels_first, dtype=np.float32)

    def observation(self, observation):
        """Return a view of ``observation`` with axis 2 moved to position 0."""
        return np.moveaxis(observation, source=2, destination=0)


class ScaledFloatFrame(gym.ObservationWrapper):
    """Convert observations to ``float32`` and divide them by 255."""

    def observation(self, obs):
        """Return a new ``float32`` array equal to ``obs / 255``."""
        as_float = np.array(obs).astype(np.float32)
        return as_float / 255.0

def make_env(env_name):
    """Create ``env_name`` and apply the preprocessing wrappers in order.

    Order (innermost first): MaxAndSkipEnv, FireResetEnv, ProcessFrame84,
    ImageToPyTorch, BufferWrapper with 4 steps, ScaledFloatFrame.

    Args:
        env_name: environment id passed to ``gym.make``.

    Returns:
        The fully wrapped environment.
    """
    wrapper_stack = (
        MaxAndSkipEnv,
        FireResetEnv,
        ProcessFrame84,
        ImageToPyTorch,
        lambda inner: BufferWrapper(inner, 4),
        ScaledFloatFrame,
    )
    env = gym.make(env_name)
    for wrap in wrapper_stack:
        env = wrap(env)
    return env

## The DQN model


In [7]:
import torch
import torch.nn as nn        # Pytorch neural network package
import torch.optim as optim  # Pytorch optimization package

# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA (a hard-coded "cuda" device fails on the first .to()).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
# Taken from 
# https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/blob/master/Chapter06/lib/dqn_model.py

import numpy as np

class DQN(nn.Module):
    """Convolutional Q-network: three conv layers followed by two linear layers.

    Args:
        input_shape: shape of one observation; ``input_shape[0]`` is used as
            the number of input channels.
        n_actions: number of outputs (one value per action).
    """

    def __init__(self, input_shape, n_actions):
        super(DQN, self).__init__()

        in_channels = input_shape[0]
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        )

        # The flattened conv output size depends on the input resolution, so
        # it is measured with a dummy forward pass instead of being hard-coded.
        n_features = self._get_conv_out(input_shape)
        self.fc = nn.Sequential(
            nn.Linear(n_features, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions),
        )

    def _get_conv_out(self, shape):
        """Return the number of values the conv stack emits for one input."""
        probe = torch.zeros(1, *shape)
        return int(self.conv(probe).numel())

    def forward(self, x):
        """Map a batch of observations to a ``(batch, n_actions)`` tensor."""
        batch = x.size(0)
        features = self.conv(x)
        return self.fc(features.view(batch, -1))

In [9]:
# Sanity check: build the fully wrapped environment, size a network from its
# observation and action spaces, and print the resulting layer summary.
test_env = make_env(DEFAULT_ENV_NAME)
test_net = DQN(test_env.observation_space.shape, test_env.action_space.n).to(device)
print(test_net)

DQN(
  (conv): Sequential(
    (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (5): ReLU()
  )
  (fc): Sequential(
    (0): Linear(in_features=3136, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=6, bias=True)
  )
)


## Training

Load Tensorboard extension

In [10]:
from torch.utils.tensorboard import SummaryWriter
%load_ext tensorboard

Import required modules and define the hyperparameters

In [11]:
import time
import numpy as np
import collections


# Training stops once the mean reward over the last 100 games exceeds this.
MEAN_REWARD_BOUND = 19.0

gamma = 0.99                  # discount applied to the next-state value
batch_size = 32               # transitions sampled per optimisation step
replay_size = 10_000          # capacity of the experience replay buffer
learning_rate = 1e-4          # Adam learning rate
sync_target_frames = 1_000    # copy online weights to the target net every N frames
replay_start_size = 10_000    # frames collected before optimisation starts

# Epsilon-greedy schedule: multiplied by eps_decay every frame, floored at eps_min.
eps_start = 1.0
eps_decay = 0.999985
eps_min = 0.02

Experience replay buffer

In [12]:
# One environment transition as stored in the replay buffer.
Experience = collections.namedtuple(
    'Experience', field_names=['state', 'action', 'reward', 'done', 'new_state'])


class ExperienceReplay:
    """Fixed-capacity store of transitions with uniform random sampling.

    Backed by a ``collections.deque`` with ``maxlen=capacity``, so appending
    to a full buffer discards the oldest transition.
    """

    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):
        """Add one ``Experience`` to the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            Tuple ``(states, actions, rewards, dones, next_states)`` of NumPy
            arrays. ``actions`` is ``int64``, ``rewards`` is ``float32`` and
            ``dones`` is ``uint8``.

        Raises:
            ValueError: if ``batch_size`` exceeds the number of stored
                transitions (sampling is without replacement).
        """
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        batch = [self.buffer[idx] for idx in indices]
        states, actions, rewards, dones, next_states = zip(*batch)
        return (
            np.array(states),
            # Pin the dtype: NumPy's default integer width is platform
            # dependent, while tensor index operations expect 64-bit indices.
            np.array(actions, dtype=np.int64),
            np.array(rewards, dtype=np.float32),
            np.array(dones, dtype=np.uint8),
            np.array(next_states),
        )


Agent

In [13]:
class Agent:
    """Plays an environment step by step and records transitions.

    Args:
        env: Gymnasium-style environment (``reset`` returns ``(obs, info)``,
            ``step`` returns a 5-tuple).
        exp_buffer: object with an ``append`` method that receives each
            ``Experience``.
    """

    def __init__(self, env, exp_buffer):
        self.env = env
        self.exp_buffer = exp_buffer
        self._reset()

    def _reset(self):
        """Start a new episode and clear the running reward."""
        # Use the agent's own environment, not a module-level ``env`` global.
        self.state, _ = self.env.reset()
        self.total_reward = 0.0

    def play_step(self, net, epsilon=0.0, device="cpu"):
        """Take one epsilon-greedy step and store the transition.

        Args:
            net: network mapping a batch of states to per-action values.
            epsilon: probability of choosing a random action.
            device: device the state tensor is moved to before the forward pass.

        Returns:
            The episode's total reward if this step ended the episode
            (terminated or truncated), otherwise ``None``.
        """
        done_reward = None
        if np.random.random() < epsilon:
            action = self.env.action_space.sample()
        else:
            # Add a leading batch axis. (np.array([...], copy=False) raises
            # under NumPy 2 because building the batch requires a copy.)
            state_a = np.expand_dims(np.asarray(self.state), axis=0)
            state_v = torch.tensor(state_a).to(device)
            # Action selection needs no gradients.
            with torch.no_grad():
                q_vals_v = net(state_v)
            _, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())

        new_state, reward, terminated, truncated, _ = self.env.step(action)
        is_done = terminated or truncated

        self.total_reward += reward

        exp = Experience(self.state, action, reward, is_done, new_state)
        self.exp_buffer.append(exp)
        self.state = new_state
        if is_done:
            done_reward = self.total_reward
            self._reset()
        return done_reward


In [14]:
import datetime
print(">>>Training starts at ",datetime.datetime.now())

>>>Training starts at  2023-11-02 14:19:23.824102


Main training loop

In [15]:
# Main DQN training loop: play one environment step per iteration, and once
# the replay buffer holds replay_start_size transitions, run one optimisation
# step per frame until the 100-game mean reward exceeds MEAN_REWARD_BOUND.
env = make_env(DEFAULT_ENV_NAME)

net = DQN(env.observation_space.shape, env.action_space.n).to(device)
target_net = DQN(env.observation_space.shape, env.action_space.n).to(device)
# Start the target network as an exact copy of the online network instead of
# an unrelated random initialisation.
target_net.load_state_dict(net.state_dict())
writer = SummaryWriter(comment="-" + DEFAULT_ENV_NAME)

buffer = ExperienceReplay(replay_size)
agent = Agent(env, buffer)

epsilon = eps_start

optimizer = optim.Adam(net.parameters(), lr=learning_rate)
loss_fn = nn.MSELoss()  # built once rather than on every optimisation step
total_rewards = []
frame_idx = 0

best_mean_reward = None

try:
    while True:
        frame_idx += 1
        epsilon = max(epsilon * eps_decay, eps_min)

        # play_step returns the episode's total reward only when a game ends.
        reward = agent.play_step(net, epsilon, device=device)
        if reward is not None:
            total_rewards.append(reward)

            mean_reward = np.mean(total_rewards[-100:])

            print("%d:  %d games, mean reward %.3f, (epsilon %.2f)" % (
                frame_idx, len(total_rewards), mean_reward, epsilon))

            writer.add_scalar("epsilon", epsilon, frame_idx)
            writer.add_scalar("reward_100", mean_reward, frame_idx)
            writer.add_scalar("reward", reward, frame_idx)

            if best_mean_reward is None or best_mean_reward < mean_reward:
                # Checkpointing is disabled in this notebook. The playback
                # cell under "Using the model" loads
                # DEFAULT_ENV_NAME + "-best.dat", so re-enable the next line
                # to produce that file.
                #torch.save(net.state_dict(), DEFAULT_ENV_NAME + "-best.dat")
                best_mean_reward = mean_reward
                print("Best mean reward updated %.3f" % (best_mean_reward))

            if mean_reward > MEAN_REWARD_BOUND:
                print("Solved in %d frames!" % frame_idx)
                break

        # Do not optimise until the replay buffer has been filled.
        if len(buffer) < replay_start_size:
            continue

        states, actions, rewards, dones, next_states = buffer.sample(batch_size)

        states_v = torch.tensor(states).to(device)
        next_states_v = torch.tensor(next_states).to(device)
        actions_v = torch.tensor(actions).to(device)
        rewards_v = torch.tensor(rewards).to(device)
        # Boolean mask: indexing with uint8 (ByteTensor) masks is deprecated.
        done_mask = torch.tensor(dones, dtype=torch.bool).to(device)

        # Q(s, a) of the online network for the actions actually taken;
        # gather requires int64 indices, hence .long().
        state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1).long()).squeeze(-1)

        # The bootstrap target is a constant for the optimiser, so compute it
        # without building an autograd graph.
        with torch.no_grad():
            next_state_values = target_net(next_states_v).max(1)[0]
            # No future value after the final transition of an episode.
            next_state_values[done_mask] = 0.0
            expected_state_action_values = next_state_values * gamma + rewards_v

        loss_t = loss_fn(state_action_values, expected_state_action_values)

        optimizer.zero_grad()
        loss_t.backward()
        optimizer.step()

        if frame_idx % sync_target_frames == 0:
            target_net.load_state_dict(net.state_dict())
finally:
    # Flush TensorBoard events even if training is interrupted.
    writer.close()

880:  1 games, mean reward -21.000, (epsilon 0.99)
Best mean reward updated -21.000
1838:  2 games, mean reward -21.000, (epsilon 0.97)
2679:  3 games, mean reward -20.667, (epsilon 0.96)
Best mean reward updated -20.667
3469:  4 games, mean reward -20.750, (epsilon 0.95)
4413:  5 games, mean reward -20.800, (epsilon 0.94)
5402:  6 games, mean reward -20.833, (epsilon 0.92)
6398:  7 games, mean reward -20.714, (epsilon 0.91)
7222:  8 games, mean reward -20.750, (epsilon 0.90)
8032:  9 games, mean reward -20.778, (epsilon 0.89)
8947:  10 games, mean reward -20.700, (epsilon 0.87)
10216:  11 games, mean reward -20.455, (epsilon 0.86)
Best mean reward updated -20.455
10978:  12 games, mean reward -20.500, (epsilon 0.85)
11878:  13 games, mean reward -20.462, (epsilon 0.84)
12807:  14 games, mean reward -20.429, (epsilon 0.83)
Best mean reward updated -20.429
13569:  15 games, mean reward -20.467, (epsilon 0.82)
14331:  16 games, mean reward -20.500, (epsilon 0.81)
15093:  17 games, mean r

160326:  113 games, mean reward -18.200, (epsilon 0.09)
Best mean reward updated -18.200
162986:  114 games, mean reward -18.160, (epsilon 0.09)
Best mean reward updated -18.160
165687:  115 games, mean reward -18.080, (epsilon 0.08)
Best mean reward updated -18.080
168060:  116 games, mean reward -18.050, (epsilon 0.08)
Best mean reward updated -18.050
170270:  117 games, mean reward -17.970, (epsilon 0.08)
Best mean reward updated -17.970
172954:  118 games, mean reward -17.860, (epsilon 0.07)
Best mean reward updated -17.860
175062:  119 games, mean reward -17.810, (epsilon 0.07)
Best mean reward updated -17.810
177079:  120 games, mean reward -17.740, (epsilon 0.07)
Best mean reward updated -17.740
179872:  121 games, mean reward -17.630, (epsilon 0.07)
Best mean reward updated -17.630
182430:  122 games, mean reward -17.590, (epsilon 0.06)
Best mean reward updated -17.590
185276:  123 games, mean reward -17.480, (epsilon 0.06)
Best mean reward updated -17.480
187710:  124 games, m

422050:  207 games, mean reward 5.660, (epsilon 0.02)
Best mean reward updated 5.660
423880:  208 games, mean reward 6.030, (epsilon 0.02)
Best mean reward updated 6.030
425656:  209 games, mean reward 6.390, (epsilon 0.02)
Best mean reward updated 6.390
427553:  210 games, mean reward 6.670, (epsilon 0.02)
Best mean reward updated 6.670
430197:  211 games, mean reward 6.930, (epsilon 0.02)
Best mean reward updated 6.930
432218:  212 games, mean reward 7.250, (epsilon 0.02)
Best mean reward updated 7.250
433920:  213 games, mean reward 7.590, (epsilon 0.02)
Best mean reward updated 7.590
435747:  214 games, mean reward 7.930, (epsilon 0.02)
Best mean reward updated 7.930
438044:  215 games, mean reward 8.240, (epsilon 0.02)
Best mean reward updated 8.240
439858:  216 games, mean reward 8.600, (epsilon 0.02)
Best mean reward updated 8.600
442053:  217 games, mean reward 8.900, (epsilon 0.02)
Best mean reward updated 8.900
443862:  218 games, mean reward 9.200, (epsilon 0.02)
Best mean r

613992:  310 games, mean reward 18.070, (epsilon 0.02)
616116:  311 games, mean reward 18.080, (epsilon 0.02)
618190:  312 games, mean reward 18.080, (epsilon 0.02)
620148:  313 games, mean reward 18.060, (epsilon 0.02)
621804:  314 games, mean reward 18.090, (epsilon 0.02)
623659:  315 games, mean reward 18.110, (epsilon 0.02)
625589:  316 games, mean reward 18.120, (epsilon 0.02)
627285:  317 games, mean reward 18.160, (epsilon 0.02)
Best mean reward updated 18.160
629333:  318 games, mean reward 18.110, (epsilon 0.02)
631088:  319 games, mean reward 18.110, (epsilon 0.02)
633007:  320 games, mean reward 18.090, (epsilon 0.02)
634931:  321 games, mean reward 18.110, (epsilon 0.02)
636945:  322 games, mean reward 18.060, (epsilon 0.02)
639028:  323 games, mean reward 18.040, (epsilon 0.02)
640848:  324 games, mean reward 18.060, (epsilon 0.02)
642683:  325 games, mean reward 18.050, (epsilon 0.02)
644381:  326 games, mean reward 18.090, (epsilon 0.02)
646339:  327 games, mean reward 1

In [16]:
print(">>>Training ends at ",datetime.datetime.now())

>>>Training ends at  2023-11-02 15:46:18.541311


Performance

In [17]:
tensorboard  --logdir=runs

## Using the model

In [19]:
# import gym
# import time
# import numpy as np

# import torch

# import collections

# DEFAULT_ENV_NAME = "PongNoFrameskip-v4"
# Target playback rate: the play loop below sleeps so that each step takes at
# least 1/FPS seconds when visualisation is enabled.
FPS = 25

Tuning the image rendering in Colab


In [20]:
# Taken from 
# https://towardsdatascience.com/rendering-openai-gym-envs-on-binder-and-google-colab-536f99391cc7

# NOTE: this cell targets a Linux/Colab runtime -- it relies on apt-get and
# Xvfb. The recorded output below shows it failing on Windows.
!apt-get install -y xvfb x11-utils

!pip install pyvirtualdisplay==0.2.* \
             PyOpenGL==3.1.* \
             PyOpenGL-accelerate==3.1.*

# NOTE(review): this pins the legacy gym 0.17 package although the rest of the
# notebook imports gymnasium -- confirm it is still needed.
!pip install gym[box2d]==0.17.*

import pyvirtualdisplay

# Start an off-screen virtual display.
_display = pyvirtualdisplay.Display(visible=False, size=(1400, 900))
_ = _display.start()

'apt-get' is not recognized as an internal or external command,
operable program or batch file.


Collecting pyvirtualdisplay==0.2.*
  Downloading PyVirtualDisplay-0.2.5-py2.py3-none-any.whl (13 kB)
Collecting PyOpenGL-accelerate==3.1.*
  Downloading PyOpenGL_accelerate-3.1.7-cp310-cp310-win_amd64.whl (318 kB)
     -------------------------------------- 319.0/319.0 kB 2.8 MB/s eta 0:00:00
Collecting EasyProcess
  Downloading EasyProcess-1.1-py3-none-any.whl (8.7 kB)
Installing collected packages: PyOpenGL-accelerate, EasyProcess, pyvirtualdisplay
Successfully installed EasyProcess-1.1 PyOpenGL-accelerate-3.1.7 pyvirtualdisplay-0.2.5


  error: subprocess-exited-with-error
  
  python setup.py bdist_wheel did not run successfully.
  exit code: 1
  
  [28 lines of output]
  Using setuptools (version 65.6.3).
  running bdist_wheel
  running build
  running build_py
  creating build
  creating build\lib.win-amd64-cpython-310
  creating build\lib.win-amd64-cpython-310\Box2D
  copying library\Box2D\Box2D.py -> build\lib.win-amd64-cpython-310\Box2D
  copying library\Box2D\__init__.py -> build\lib.win-amd64-cpython-310\Box2D
  creating build\lib.win-amd64-cpython-310\Box2D\b2
  copying library\Box2D\b2\__init__.py -> build\lib.win-amd64-cpython-310\Box2D\b2
  running build_ext
  building 'Box2D._Box2D' extension
  swigging Box2D\Box2D.i to Box2D\Box2D_wrap.cpp
  swig.exe -python -c++ -IBox2D -small -O -includeall -ignoremissing -w201 -globals b2Globals -outdir library\Box2D -keyword -w511 -D_SWIG_KWARGS -o Box2D\Box2D_wrap.cpp Box2D\Box2D.i
  error: Microsoft Visual C++ 14.0 or greater is required. Get it with "Microsoft C+

Collecting gym[box2d]==0.17.*
  Downloading gym-0.17.3.tar.gz (1.6 MB)
     ---------------------------------------- 1.6/1.6 MB 1.4 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting pyglet<=1.5.0,>=1.4.0
  Downloading pyglet-1.5.0-py2.py3-none-any.whl (1.0 MB)
     ---------------------------------------- 1.0/1.0 MB 831.0 kB/s eta 0:00:00
Collecting cloudpickle<1.7.0,>=1.2.0
  Downloading cloudpickle-1.6.0-py3-none-any.whl (23 kB)
Collecting box2d-py~=2.3.5
  Downloading box2d-py-2.3.8.tar.gz (374 kB)
     -------------------------------------- 374.5/374.5 kB 1.5 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: box2d-py, gym
  Building wheel for box2d-py (setup.py): started
  Building wheel for box2d-py (setup.py): finished with status 'error'
  Running setup.py clean for box2d-py
  Building wheel f

EasyProcessError: start error <EasyProcess cmd_param=['Xvfb', '-help'] cmd=['Xvfb', '-help'] oserror=[WinError 2] The system cannot find the file specified return_code=None stdout="None" stderr="None" timeout_happened=False>

In [None]:
# Taken (partially) from 
# https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/blob/master/Chapter06/03_dqn_play.py
# and ported to the Gymnasium API: reset() returns (obs, info), step() returns
# a 5-tuple, and gym.wrappers.Monitor is replaced by RecordVideo.

# Checkpoint to load. The training loop above has its torch.save call
# commented out, so this file must be produced separately.
model = 'PongNoFrameskip-v4-best.dat'
record_folder = "video"
visualize = True

# Gymnasium fixes the render mode when the environment is created. Video
# recording needs image frames ("rgb_array"); without recording, "human"
# opens a window that is refreshed on every step.
if record_folder:
    render_mode = "rgb_array"
elif visualize:
    render_mode = "human"
else:
    render_mode = None

env = gym.make(DEFAULT_ENV_NAME, render_mode=render_mode)
if record_folder:
    # Wrap the raw environment so the video contains the original frames
    # rather than the preprocessed 84x84 input. RecordVideo needs moviepy.
    env = gym.wrappers.RecordVideo(env, record_folder)
# Same preprocessing stack as make_env(); spelled out here because make_env()
# does not accept a render mode.
env = MaxAndSkipEnv(env)
env = FireResetEnv(env)
env = ProcessFrame84(env)
env = ImageToPyTorch(env)
env = BufferWrapper(env, 4)
env = ScaledFloatFrame(env)

net = DQN(env.observation_space.shape, env.action_space.n)
# map_location keeps every tensor on the CPU regardless of where it was saved.
net.load_state_dict(torch.load(model, map_location=lambda storage, loc: storage))

state, _ = env.reset()
total_reward = 0.0

while True:
    start_ts = time.time()
    # Add a batch axis. (np.array([...], copy=False) raises under NumPy 2.)
    state_v = torch.tensor(np.expand_dims(state, axis=0))
    with torch.no_grad():
        q_vals = net(state_v).numpy()[0]
    action = int(np.argmax(q_vals))

    state, reward, terminated, truncated, _ = env.step(action)
    total_reward += reward
    if terminated or truncated:
        break
    if visualize:
        # Pace the loop so that it runs at roughly FPS steps per second.
        delta = 1/FPS - (time.time() - start_ts)
        if delta > 0:
            time.sleep(delta)
print("Total reward: %.2f" % total_reward)

# Closing the environment also finalises any video being recorded.
env.close()

<a id='citations'></a>
[1] Sutton, R. S., &amp; Barto, A. G. (2020). Reinforcement learning: An introduction. The MIT Press. 