<a href="https://colab.research.google.com/github/DarkPovoGang/DeepRL/blob/main/DeepRL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Deep Learning Project

## Dataset preparation

In [1]:
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git

Collecting ftfy
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ftfy
Successfully installed ftfy-6.1.1
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-qbsxsfa1
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-qbsxsfa1
  Resolved https://github.com/openai/CLIP.git to commit a9b1bf5920416aaeaec965c25dd9e8f98c864f16
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369370 sha256=ac62d72b7f827cbda353dd98a11b0532ece08e8a4c393553ca1d

In [11]:
import numpy as np
import torch
from pkg_resources import packaging
import clip
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Torch version:", torch.__version__)
model, preprocess = clip.load("RN50")
model.cuda().eval()
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

print("Model parameters:", f"{np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)

preprocess

Torch version: 2.0.1+cu118
Model parameters: 102,007,137
Input resolution: 224
Context length: 77
Vocab size: 49408


Compose(
    Resize(size=224, interpolation=bicubic, max_size=None, antialias=warn)
    CenterCrop(size=(224, 224))
    <function _convert_image_to_rgb at 0x7fedc237cd30>
    ToTensor()
    Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
)

In [40]:
import pandas as pd
import os
import gdown
import tarfile
from PIL import Image #, ImageDraw
import json
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader


class RefCOCOg:
    FILE_ID = "1wyyksgdLwnRMC9pQ-vjJnNUn47nWhyMD"
    ARCHIVE_NAME = "refcocog.tar.gz"
    NAME = "refcocog"
    ANNOTATIONS = "annotations/refs(umd).p"
    JSON = "annotations/instances.json"
    IMAGES = "images"
    IMAGE_NAME = "COCO_train2014_{}.jpg"

    def __init__(self, data_dir, split, transform=None):
        self.data_dir = data_dir
        self._check_dataset()
        self.split = split
        self._filter_annotation(
            os.path.join(self.data_dir, self.NAME, self.ANNOTATIONS)
        )
        self._load_json()
        self.transform = transform

    def _check_dataset(self):
        if not os.path.exists(os.path.join(self.data_dir, self.ARCHIVE_NAME)):
            if not os.path.exists(self.data_dir):
                os.mkdir(self.data_dir)
            print("Downloading dataset...")
            gdown.download(id=self.FILE_ID)
        if not os.path.exists(os.path.join(self.data_dir, self.NAME)):
            print("Extracting dataset...")
            with tarfile.open(
                os.path.join(self.data_dir, self.ARCHIVE_NAME), "r:gz"
            ) as tar:
                tar.extractall(path=self.data_dir)
        else:
            print("Dataset already extracted")

    def _load_json(self):
        with open(os.path.join(self.data_dir, self.NAME, self.JSON)) as f:
            self.json = json.load(f)
        self.json = pd.DataFrame(self.json["annotations"])

    def __len__(self):
        return len(self.annotation)

    def __getitem__(self, idx):
        # get line by index
        raw = self.annotation.iloc[idx]
        # get image
        image = self._get_image(raw)
        # get sentences
        sentences = self._get_sentences(raw)
        # get bbox
        bbox = self._get_bbox(raw)

        return self._get_vector(image, sentences, bbox)

    def _get_image(self, raw):
        # get image_id
        image_id = raw["image_id"]
        # pad image_id to 12 digits
        image_id = str(image_id).zfill(12)
        # convert image to tensor
        image = Image.open(
            os.path.join(
                self.data_dir, self.NAME, self.IMAGES, self.IMAGE_NAME.format(image_id)
            )
        )
        return image

    def _get_sentences(self, raw):
        # get sentences
        sentences = raw["sentences"]
        # get raw sentences
        sentences = [sentence["raw"] for sentence in sentences]
        return sentences

    def _get_bbox(self, raw):
        # get ref_id
        id = raw["ann_id"]
        bbox = self.json[self.json["id"] == id]["bbox"].values[0]
        return bbox

    def _filter_annotation(self, path):
        self.annotation = pd.read_pickle(path)
        self.annotation = pd.DataFrame(self.annotation)
        self.annotation = self.annotation[self.annotation["split"] == self.split]

    def _get_vector(self, image, sentences, bbox):
        image = preprocess(image).unsqueeze(0).to(device)
        text = clip.tokenize(sentences).to(device)
        with torch.no_grad():
            image_features = model.encode_image(image)
            text_features = model.encode_text(text)

        bbox = torch.tensor(bbox).unsqueeze(0).to(device)

        print(f"Image shape: {image_features.shape}, Text shape: {text_features.shape}, Bbox shape: {bbox.shape}")

        # Combine image and text features and normalize
        # product = np.multiply(image_features.cpu(), text_features.cpu())
        # out = np.divide(product, np.linalg.norm(product, axis=1).reshape(-1, 1))
        product = torch.mul(image_features, text_features)
        out = torch.div(product, torch.norm(product, dim=1).reshape(-1, 1))


        bboxes = bbox.repeat(text_features.shape[0],1)
        print(bboxes)
        # append bbox
        out = torch.cat((out, bboxes), dim=1)



        # append bbox
        # print("shape",product.shape)
        # out = np.append(out, bbox.cpu(), axis=1)
        print(f"Output shape: {out.shape}")
        return out

In [41]:
dataset = RefCOCOg('.', 'val')
for i in range(5):
  x = dataset[i]
  print(x)

Dataset already extracted
Image shape: torch.Size([1, 1024]), Text shape: torch.Size([2, 1024]), Bbox shape: torch.Size([1, 4])
tensor([[285.0400,  23.2900, 139.7400, 123.1100],
        [285.0400,  23.2900, 139.7400, 123.1100]], device='cuda:0')
Output shape: torch.Size([2, 1028])
tensor([[ 3.9077e-04,  1.1349e-03,  3.8624e-03,  ...,  2.3290e+01,
          1.3974e+02,  1.2311e+02],
        [-7.3051e-04,  2.1496e-03,  3.9139e-03,  ...,  2.3290e+01,
          1.3974e+02,  1.2311e+02]], device='cuda:0')
Image shape: torch.Size([1, 1024]), Text shape: torch.Size([2, 1024]), Bbox shape: torch.Size([1, 4])
tensor([[183.4000,  68.3400, 126.8100,  98.8100],
        [183.4000,  68.3400, 126.8100,  98.8100]], device='cuda:0')
Output shape: torch.Size([2, 1028])
tensor([[-2.8057e-03, -2.8992e-04, -6.5956e-03,  ...,  6.8340e+01,
          1.2681e+02,  9.8810e+01],
        [-1.7061e-03, -4.1056e-04, -6.4087e-03,  ...,  6.8340e+01,
          1.2681e+02,  9.8810e+01]], device='cuda:0')
Image shape: t

## Our approach

In [5]:
!apt-get install -y xvfb python-opengl swig x11-utils
!pip install gym[box2d]==0.17.* pyvirtualdisplay==0.2.* PyOpenGL==3.1.* PyOpenGL-accelerate==3.1.*

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  freeglut3 libfontenc1 libpython2-stdlib libxfont2 libxkbfile1 libxtst6
  libxxf86dga1 python2 python2-minimal swig4.0 x11-xkb-utils xfonts-base
  xfonts-encodings xfonts-utils xserver-common
Suggested packages:
  python-tk python-numpy libgle3 python2-doc swig-doc swig-examples
  swig4.0-examples swig4.0-doc mesa-utils
The following NEW packages will be installed:
  freeglut3 libfontenc1 libpython2-stdlib libxfont2 libxkbfile1 libxtst6
  libxxf86dga1 python-opengl python2 python2-minimal swig swig4.0 x11-utils
  x11-xkb-utils xfonts-base xfonts-encodings xfonts-utils xserver-common xvfb
0 upgraded, 19 newly installed, 0 to remove and 15 not upgraded.
Need to get 9,627 kB of archives.
After this operation, 24.2 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 python2-minimal amd64 2.7.1

In [6]:
from gym import spaces
import numpy as np
test = spaces.Box(low=np.array([0, 0, 1, 1]), high=np.array([499, 299, 500, 300]), dtype=int)
# test = spaces.Box([500, 300, 500, 300], dtype=int)
for i in range(10):
  print(test.sample())

[286 289 279 223]
[ 31 188 313 121]
[376  58  77 185]
[291 275 390 179]
[276 247  16  82]
[ 11 166  98  12]
[ 75 166 434  23]
[ 76 248 475 120]
[ 86 249 154 240]
[220  58 211  32]


In [None]:
from enum import Enum
# class syntax
class actions(Enum):
  ACT_RT = 0 #Right
  ACT_LT = 1 #Left
  ACT_UP = 2 #Up
  ACT_DN = 3 #Down
  ACT_TA = 4 #Taller
  ACT_FA = 5 #Fatter
  ACT_SR = 6 #Shorter
  ACT_TH = 7 #Thiner
  ACT_TR = 8 #Trigger


In [8]:
import gym
from gym import spaces
import pygame
import numpy as np

class VisualGroundingEnv(gym.Env):
    metadata = {"render_modes": ["human", "rgb_array"], "render_fps": 4}
    def __init__(self, width, height, move_factor=0.2, scale_factor=0.1, render_mode=None):
        self.width = width  # The width of the image
        self.height = height  # The height of the image
        self.window_size = 512  # The size of the PyGame window
        self.move_factor = move_factor
        self.scale_factor = scale_factor

        # Observations are dictionaries with the agent's and the target's location.
        # Each location is encoded as an element of {0, ..., `size`}^2, i.e. MultiDiscrete([size, size]).
        self.observation_space = spaces.Dict(
            {
                "agent": spaces.Box(low=np.array([0, 0, 1, 1]), high=np.array([self.width-1, self.height, self.width, self.height]), dtype=int),
                "target": spaces.Box(low=np.array([0, 0, 1, 1]), high=np.array([self.width-1, self.height, self.width, self.height]), dtype=int),
            }
        )

        # We have 9 actions, corresponding to "right", "up", "left", "down", "v-shrink", "v-stretch", "h-shrink", "h-stretch", "confirm"
        self.action_space = spaces.Discrete(9)

        """
        The following dictionary maps abstract actions from `self.action_space` to
        the direction we will walk in if that action is taken.
        I.e. 0 corresponds to "right", 1 to "up" etc.
        """

        #TODO: later when current data is available
        self._action_to_direction = {
            0: np.array([1, 0]),
            1: np.array([0, 1]),
            2: np.array([-1, 0]),
            3: np.array([0, -1]),
        }

        assert render_mode is None or render_mode in self.metadata["render_modes"]
        self.render_mode = render_mode

        """
        If human-rendering is used, `self.window` will be a reference
        to the window that we draw to. `self.clock` will be a clock that is used
        to ensure that the environment is rendered at the correct framerate in
        human-mode. They will remain `None` until human-mode is used for the
        first time.
        """
        self.window = None
        self.clock = None

    def _get_obs(self):
        return {"agent": self._agent_location, "target": self._target_location}

    def _get_info(self):
        # return {"distance": np.linalg.norm(self._agent_location - self._target_location, ord=1)}
        # TODO: maybe return current history of movement
        pass

    def reset(self, true_bbox: np.array, seed=None, options=None):
        self.x1 = 0
        self.y1 = 0
        self.bbox_width = self.width
        self.bbox_height = self.height

        # We need the following line to seed self.np_random
        super().reset(seed=seed)

        # Choose the agent's location uniformly at random
        self._agent_location = np.array([0,0, self.width, self.height])

        # We will sample the target's location randomly until it does not coincide with the agent's location
        self._target_location = #TODO: init with true bbox

        observation = self._get_obs()
        info = self._get_info()

        if self.render_mode == "human":
            self._render_frame()

        return observation, info

    def _update_bbox(self, action):
      ALPHA = 0.2
      BETA  = 0.1
      x2 = self.x1 + self.bbox_width
      y2 = self.y1 + self.bbox_height
      assert action >= actions.ACT_RT and action <= actions.ACT_TR
      self.action_history.append(action)

      if action <= actions.ACT_DN:
        delta_w = int(ALPHA * self.bbox_width())
        delta_h = int(ALPHA * self.bbox_height())
      else:
        delta_w = int(BETA * self.bbox_width())
        delta_h = int(BETA * self.bbox_height())

      # PREVENT_STUCK:
      if (delta_h == 0):
        delta_h = 1
      if (delta_w == 0):
        delta_w = 1

      #Do the corresponding action to the window
      if action == actions.ACT_RT:
        self.x1 += delta_w
        x2 += delta_w
      elif action == actions.ACT_LT:
        self.x1 -= delta_w
        x2 -= delta_w
      elif action == actions.ACT_UP:
        self.y1 -= delta_h
        y2 -= delta_h
      elif action == actions.ACT_DN:
        self.y1 += delta_h
        y2 += delta_h
      elif action == actions.ACT_TA:
        self.y1 -= delta_h
        y2 += delta_h
      elif action == actions.ACT_FA:
        self.x1 -= delta_w
        x2 += delta_w
      elif action == actions.ACT_SR:
        self.y1 += delta_h
        y2 -= delta_h
      elif action == actions.ACT_TH:
        self.x1 += delta_w
        x2 -= delta_w
      elif action == actions.ACT_TR:
        pass
      else:
        raise NotImplemented

      # ensure bbox inside image
      if self.x1 < 0:
        self.x1 = 0
      if self.y1 < 0:
        self.y1 = 0
      if self.x2 >= self.image_width:
        self.x2 = self.image_width - 1
      if self.y2 >= self.image_height:
        self.y2 = self.image_height - 1
      # ret x,y,w,h
      return  self.x1, self.y1, x2-self.x1, y2-self.y1


    def step(self, action):
            # Map the action (element of {0,1,2,3}) to the direction we walk in
            self._agent_location = self._update_bbox(action)
            # An episode is done iff the agent has reached the target
            terminated = np.array_equal(self._agent_location, self._target_location) #TODO: or quite close
            reward = 1 if terminated else 0  # TODO: change reward
            observation = self._get_obs()
            info = self._get_info()

            if self.render_mode == "human":
                self._render_frame()

            return observation, reward, terminated, False, info

    def render(self):
        if self.render_mode == "rgb_array":
            return self._render_frame()

    def _render_frame(self):
        if self.window is None and self.render_mode == "human":
            pygame.init()
            pygame.display.init()
            self.window = pygame.display.set_mode((self.window_size, self.window_size))
        if self.clock is None and self.render_mode == "human":
            self.clock = pygame.time.Clock()

        canvas = pygame.Surface((self.window_size, self.window_size))
        canvas.fill((255, 255, 255))
        pix_square_size = (
            self.window_size / self.size
        )  # The size of a single grid square in pixels

        # First we draw the target
        pygame.draw.rect(
            canvas,
            (255, 0, 0),
            pygame.Rect(
                pix_square_size * self._target_location,
                (pix_square_size, pix_square_size),
            ),
        )
        # Now we draw the agent
        pygame.draw.circle(
            canvas,
            (0, 0, 255),
            (self._agent_location + 0.5) * pix_square_size,
            pix_square_size / 3,
        )

        # Finally, add some gridlines
        for x in range(self.size + 1):
            pygame.draw.line(
                canvas,
                0,
                (0, pix_square_size * x),
                (self.window_size, pix_square_size * x),
                width=3,
            )
            pygame.draw.line(
                canvas,
                0,
                (pix_square_size * x, 0),
                (pix_square_size * x, self.window_size),
                width=3,
            )

        if self.render_mode == "human":
            # The following line copies our drawings from `canvas` to the visible window
            self.window.blit(canvas, canvas.get_rect())
            pygame.event.pump()
            pygame.display.update()

            # We need to ensure that human-rendering occurs at the predefined framerate.
            # The following line will automatically add a delay to keep the framerate stable.
            self.clock.tick(self.metadata["render_fps"])
        else:  # rgb_array
            return np.transpose(
                np.array(pygame.surfarray.pixels3d(canvas)), axes=(1, 0, 2)
            )

    def close(self):
        if self.window is not None:
            pygame.display.quit()
            pygame.quit()

SyntaxError: ignored

In [None]:
from gym.envs.registration import register

register(
    id='DeepLearningProject/VisualGrounding-v0',
    entry_point='DeepLearningProject.envs:VisualGrounding',
    max_episode_steps=300,
)

In [None]:
#TODO: move everything in a repo, but leave here for knowledge

from setuptools import setup

setup(
    name="DeepLearningProject",
    version="0.0.1",
    install_requires=["gym==0.26.0", "pygame==2.1.0"],
)

In [None]:
import gym_examples
env = gym.make('DeepLearningProject/VisualGrounding-v0')

### Network, Agent and Utils

In [None]:
import time
import collections

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

import gym
import gym.spaces

from torch.utils.tensorboard import SummaryWriter

In [None]:
#TODO: change

class DQN(nn.Module):
    def __init__(self, input_features, n_actions, features=24):
        super(DQN, self).__init__()

        # multi layer perceptron
        self.mlp = nn.Sequential(
            nn.Linear(input_features, features),
            nn.ReLU(),
            nn.Linear(features, features * 2),
            nn.ReLU(),
            nn.Linear(features * 2, features * 4),
            nn.ReLU(),
            nn.Linear(features * 4, features * 2),
            nn.ReLU(),
            nn.Linear(features * 2, features),
            nn.ReLU(),
            nn.Linear(features, n_actions)
        )

    def forward(self, x):
        return self.mlp(x)

In [None]:
# create a subclass of Tuple with named attributes representing experience
Experience = collections.namedtuple('Experience', field_names=['state', 'action', 'reward', 'done', 'new_state'])

class ExperienceBuffer:
    def __init__(self, capacity):

        # represent the buffer as a deque
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):

        # add the current experience to the buffer
        self.buffer.append(experience)

    def sample(self, batch_size):

        # sample an index for each element in the batch
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)

        # extract experience entries for each element in the batch
        # each value returned by zip is a list of length batch_size
        states, actions, rewards, dones, next_states = zip(*[self.buffer[idx] for idx in indices])

        # return results as numpy arrays
        return np.array(states), \
               np.array(actions), \
               np.array(rewards, dtype=np.float32), \
               np.array(dones, dtype=np.uint8), \
               np.array(next_states)

In [None]:
class Agent:
    def __init__(self, env, exp_buffer):
        self.env = env
        self.exp_buffer = exp_buffer

        self._reset()

    def _reset(self):

        # restarts the environment and reset the accumulated reward
        self.state = self.env.reset().astype(np.float32)
        self.total_reward = 0.0

    def play_step(self, net, epsilon=0.0, device="cpu"):

        # no need to create a computational graph when gathering experience
        with torch.no_grad():

            # will contain the total reward for the episode if the episode ends
            # or None otherwise
            done_reward = None

            # sample the action randomly with probability epsilon
            if np.random.random() < epsilon:
                action = self.env.action_space.sample()

            # otherwise, select action based on qvalues
            else:

                # creates a batch made of a single state
                # state_a = np.array([self.state], copy=False)
                state_tensor = torch.tensor(self.state).unsqueeze(0).to(device)

                # get qvalues and select the index of the maximum
                q_values = net(state_tensor)
                _, selected_action = torch.max(q_values, dim=1)
                action = int(selected_action.item())

            # perform a step in the environment
            new_state, reward, is_done, _ = self.env.step(action)
            new_state = new_state.astype(np.float32)
            self.total_reward += reward

            # save the new experience
            exp = Experience(self.state, action, reward, is_done, new_state)
            self.exp_buffer.append(exp)

            # registers the current state
            self.state = new_state

            # Gets the current representation of the environment
            # current_rgb_image = self.env.render(mode='rgb_array')

            # if the episode is finished, reset the environment
            if is_done:
                done_reward = self.total_reward
                self._reset()

            return done_reward #, current_rgb_image

In [None]:
def calc_loss(batch, net, target_net, device="cpu"):

    # unpack the batch
    states, actions, rewards, dones, next_states = batch

    #states_v = torch.tensor(np.array(states, copy=False)).to(device)
    #next_states_v = torch.tensor(np.array(next_states, copy=False)).to(device)

    # transform the batch elements to tensors
    states_v = torch.from_numpy(states).to(device)
    next_states_v = torch.from_numpy(next_states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.BoolTensor(dones).to(device)

    # infer the qvalues for the states
    state_qvalues = net(states_v)

    # extract the qvalues for the action that was selected
    state_action_qvalues = state_qvalues.gather(1, actions_v.unsqueeze(-1)).squeeze(-1)

    with torch.no_grad():

        # compute the qvalues for the next states using the target DQN
        next_state_qvalues = target_net(next_states_v)

        # extract the maximum one
        next_state_max_qvalue = next_state_qvalues.max(dim=1)[0]

        # if the next state refers to an ended episode, it has no value
        next_state_max_qvalue[done_mask] = 0.0

        next_state_max_qvalue = next_state_max_qvalue.detach()

    # Computes the expected qvalue using the Bellman equation
    expected_state_action_qvalues = rewards_v + GAMMA * next_state_max_qvalue

    # Penalizes the DQN for inferring a qvalue different from the one
    # computed with the target DQN using the Bellman equation
    return nn.MSELoss()(state_action_qvalues, expected_state_action_qvalues)

### Train Loop and Utils

In [None]:
DEFAULT_ENV_NAME = ""

# we terminate training if the model on average balances the pole
# for at least 195 steps
MEAN_REWARD_BOUND = 195

# discount factor
GAMMA = 0.99

BATCH_SIZE = 32

# size of the replay buffer
REPLAY_SIZE = 10000

# warmup frames for the replay buffer
REPLAY_START_SIZE = 10000

LEARNING_RATE = 1e-4

# frequency for transferring weights from the actor DQN to the target DQN
SYNC_TARGET_FRAMES = 1000

# epsilon
EPSILON_DECAY_LAST_FRAME = 15000
EPSILON_START = 1.0
EPSILON_FINAL = 0.01

def train(net, target_net, env, buffer, agent, device, writer):

    # epsilon starts from the initial value and is then annealed
    epsilon = EPSILON_START

    # instantiate the optimizer. Note that target_net is not optimized
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

    total_rewards = []
    frame_idx = 0

    # frame idx and time at which the last episode ended
    ts_frame = 0
    ts = time.time()
    best_m_reward = None

    while True:
        frame_idx += 1

        # compute the current epsilon with linear annealing
        epsilon = max(EPSILON_FINAL, EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)

        # perform a step in the environment to gather experience
        # reward, rbg_image = agent.play_step(net, epsilon, device=device)
        reward = agent.play_step(net, epsilon, device=device)

        # if the current episode has ended
        if reward is not None:

            # register the current total reward
            total_rewards.append(reward)

            # compute training speed
            speed = (frame_idx - ts_frame) / (time.time() - ts)
            ts_frame = frame_idx
            ts = time.time()

            # compute the mean reward over the last 100 episodes
            m_reward = np.mean(total_rewards[-100:])
            print("%d: done %d games, reward %.3f, eps %.2f, speed %.2f f/s" % (
                frame_idx, len(total_rewards), m_reward, epsilon, speed
            ))

            # log values
            writer.add_scalar("epsilon", epsilon, frame_idx)
            writer.add_scalar("speed", speed, frame_idx)
            writer.add_scalar("reward_100", m_reward, frame_idx)
            writer.add_scalar("reward", reward, frame_idx)

            # update best rewards
            if best_m_reward is None or best_m_reward < m_reward:
                #torch.save(net.state_dict(), args.env + "-best_%.0f.dat" % m_reward)
                if best_m_reward is not None:
                    print("Best reward updated %.3f -> %.3f" % (best_m_reward, m_reward))

                best_m_reward = m_reward

            # stop training when a certain reward is achieved
            if m_reward > MEAN_REWARD_BOUND:
                print("Solved in %d frames!" % frame_idx)
                break

        # continue to collect experience until the warmup finishes
        if len(buffer) < REPLAY_START_SIZE:
            continue

        # at regular intervals load the weights of the actor DQN into the target DQN
        if frame_idx % SYNC_TARGET_FRAMES == 0:
            target_net.load_state_dict(net.state_dict())

        # perform an optimization step
        optimizer.zero_grad()
        batch = buffer.sample(BATCH_SIZE)
        loss_t = calc_loss(batch, net, target_net, device=device)
        loss_t.backward()
        optimizer.step()

In [None]:
def save_episode(net, env, agent, device, writer):

    from IPython import display as ipythondisplay
    from pyvirtualdisplay import Display
    display = Display(visible=0, size=(400, 300))
    display.start()

    # reset the environment
    agent._reset()
    all_frames = []

    reward = None

    # play an episode
    while reward is None:

        reward, rgb_image = agent.play_step(net, epsilon=0.0, device=device)

        # change fromat from (H, W, C) to (C, H, W) and saves the image
        rgb_image = torch.from_numpy(np.copy(rgb_image)).permute(2, 0, 1)
        all_frames.append(rgb_image)

    # video must be put into (batch, time, C, H, W) format to be saved
    video = torch.stack(all_frames, dim=0).unsqueeze(0)
    # save the video
    writer.add_video("sample_episode", video, global_step=0, fps=10)

In [None]:
def main():

    device = torch.device("cuda:0")

    # build the environment
    env = gym.make(DEFAULT_ENV_NAME)

    # create actor and target DQN models
    net = DQN(env.observation_space.shape[0], env.action_space.n).to(device)
    target_net = DQN(env.observation_space.shape[0], env.action_space.n).to(device)

    # initialize the logger
    writer = SummaryWriter(log_dir="runs")
    print(net)

    # instantiate the experience buffer and the agent that collects experience
    buffer = ExperienceBuffer(REPLAY_SIZE)
    agent = Agent(env, buffer)

    # train the network
    train(net, target_net, env, buffer, agent, device, writer)

    # save a sample episode
    # save_episode(net, env, agent, device, writer)

    writer.close()


In [None]:
!rm -r runs

In [None]:
%load_ext tensorboard
%reload_ext tensorboard
%tensorboard --logdir=runs

In [None]:
main()

In [None]:
# plt.figure(figsize=(10, 10))
# plt.subplot(2, 2, 1)
# plt.plot(logs["reward"])
# plt.title("training rewards (average)")
# plt.subplot(2, 2, 2)
# plt.plot(logs["step_count"])
# plt.title("Max step count (training)")
# plt.subplot(2, 2, 3)
# plt.plot(logs["eval reward (sum)"])
# plt.title("Return (test)")
# plt.subplot(2, 2, 4)
# plt.plot(logs["eval step_count"])
# plt.title("Max step count (test)")
# plt.show()