In [None]:
# STUDENT NUMBERS
# 1886648
# 1851234 
# 1669326
from IPython.display import JSON
from google.colab import output
from subprocess import getoutput
import os

def shell(command):
  """Run one line typed into the Colab Shell terminal and return its output.

  `cd` is handled inside the kernel process with os.chdir so the directory
  change persists across later commands; every other command is passed to
  subprocess.getoutput.

  Args:
    command: the raw command line as a string.

  Returns:
    JSON wrapping a one-element list: '' after a successful `cd`, otherwise
    the command's combined output as returned by getoutput.

  Raises:
    OSError: if the `cd` target does not exist or is not accessible.
  """
  parts = command.strip().split(maxsplit=1)
  # Match the `cd` word exactly, so commands that merely start with "cd"
  # (e.g. `cdrecord`) are still executed as ordinary shell commands.
  if parts and parts[0] == 'cd':
    # A bare `cd` goes to the home directory, as in a regular shell;
    # expanduser also makes `cd ~` and `cd ~/sub` work.
    target = parts[1] if len(parts) > 1 else '~'
    os.chdir(os.path.expanduser(target))
    return JSON([''])
  return JSON([getoutput(command)])
# Expose `shell` to the notebook front end under the name 'shell'; the HTML
# terminal cell calls it via google.colab.kernel.invokeFunction('shell', ...).
output.register_callback('shell', shell)

In [None]:
#@title Colab Shell
%%html
<!-- Container element that jQuery Terminal renders into. -->
<div id=term_demo></div>
<script src="https://code.jquery.com/jquery-latest.js"></script>
<script src="https://cdn.jsdelivr.net/npm/jquery.terminal/js/jquery.terminal.min.js"></script>
<link href="https://cdn.jsdelivr.net/npm/jquery.terminal/css/jquery.terminal.min.css" rel="stylesheet"/>
<script>
  // Each non-empty line typed into the terminal is sent to the Python
  // callback registered as 'shell'; the first element of the JSON list it
  // returns is echoed back. Failures of the kernel call are shown as errors.
  $('#term_demo').terminal(async function(command) {
      if (command !== '') {
          try {
              let res = await google.colab.kernel.invokeFunction('shell', [command])
              let out = res.data['application/json'][0]
              this.echo(new String(out))
          } catch(e) {
              this.error(new String(e));
          }
      } else {
          this.echo('');
      }
  }, {
      greetings: 'Welcome to Colab Shell',
      name: 'colab_demo',
      height: 250,
      prompt: 'colab > '
  });
</script>

In [None]:
import random
import numpy as np
import gym
import torch


# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Put the Drive notebooks folder first on the import path; presumably this is
# where the local `dqn` package imported below lives -- verify the location.
import sys
sys.path.insert(0,"/content/drive/MyDrive/Colab Notebooks")

from dqn.agent import DQNAgent
from dqn.replay_buffer import ReplayBuffer
from dqn.wrappers import *

if __name__ == "__main__":
    # Training entry point. Builds the wrapped Atari environment, creates the
    # DQN agent and runs an epsilon-greedy training loop for "num-steps"
    # environment steps. `env`, `agent`, `episode_rewards`, `losses` and
    # `episode_losses` stay in the notebook namespace and are read by the
    # saving and plotting cells that follow.

    hyper_params = {
        "seed": 42,  # which seed to use
        "env": "PongNoFrameskip-v4",  # name of the game
        "replay-buffer-size": int(5e3),  # replay buffer size
        "learning-rate": 1e-4,  # learning rate for Adam optimizer
        "discount-factor": 0.99,  # discount factor
        "num-steps": int(1e6),  # total number of steps to run the environment for
        "batch-size": 256,  # number of transitions to optimize at the same time
        "learning-starts": 10000,  # 10000 number of steps before learning starts
        "learning-freq": 5,  # number of iterations between every optimization step
        "use-double-dqn": False,  # use double deep Q-learning
        "target-update-freq": 1000,  # 1000 number of iterations between every target network update
        "eps-start": 1.0,  # e-greedy start threshold
        "eps-end": 0.01,  # e-greedy end threshold
        "eps-fraction": 0.1,  # fraction of num-steps
        "print-freq": 10,
    }

    # Seed every random number generator in use. torch is seeded as well so
    # that any torch-side randomness (e.g. weight initialisation inside
    # DQNAgent, which is not visible in this notebook) is repeatable too.
    np.random.seed(hyper_params["seed"])
    random.seed(hyper_params["seed"])
    torch.manual_seed(hyper_params["seed"])

    assert "NoFrameskip" in hyper_params["env"], "Require environment with no frameskip"
    env = gym.make(hyper_params["env"])
    env.seed(hyper_params["seed"])

    # Environment wrappers; their exact semantics are defined in dqn.wrappers
    # (not shown here). Frame skipping is done by MaxAndSkipEnv (skip=4),
    # which is why the base environment must be a NoFrameskip variant.
    env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)
    env = EpisodicLifeEnv(env)
    env = FireResetEnv(env)
    env = WarpFrame(env)
    env = PyTorchFrame(env)
    # Outermost wrapper: records into Storage/recordings and provides the
    # per-episode lengths that the plotting cell reads back.
    env = gym.wrappers.Monitor(env, "Storage/recordings", force = True)

    replay_buffer = ReplayBuffer(hyper_params["replay-buffer-size"])

    agent = DQNAgent(
        env.observation_space,
        env.action_space,
        replay_buffer,
        hyper_params["use-double-dqn"],
        hyper_params["learning-rate"],
        hyper_params["batch-size"],
        hyper_params["discount-factor"],
    )

    # Number of steps over which epsilon is annealed from eps-start to eps-end.
    eps_timesteps = hyper_params["eps-fraction"] * float(hyper_params["num-steps"])
    episode_rewards = [0.0]  # one running total per episode; last entry is the current episode
    losses = []  # one entry per optimisation step
    episode_losses = [0.0]  # summed loss per episode, parallel to episode_rewards

    state = env.reset()
    for t in range(hyper_params["num-steps"]):
        # Linear epsilon schedule: interpolate from eps-start to eps-end over
        # the first eps_timesteps steps, then hold at eps-end (fraction is
        # capped at 1.0).
        fraction = min(1.0, float(t) / eps_timesteps)
        eps_threshold = hyper_params["eps-start"] + fraction * (
            hyper_params["eps-end"] - hyper_params["eps-start"]
        )
        sample = random.random()

        # The agent receives both the random draw and the threshold and makes
        # the explore/exploit decision itself.
        action = agent.act(state, sample, eps_threshold)
        next_state, reward, done, info = env.step(action)
        # Stored as a float in replay memory; 0.0 is falsy, so the `if done`
        # checks below still behave like the original boolean.
        done = float(done)
        agent.memory.add(state, action, reward, next_state, done)
        state = next_state

        episode_rewards[-1] += reward
        if done:
            # Start a new episode and open fresh accumulators for it.
            state = env.reset()
            episode_rewards.append(0.0)
            episode_losses.append(0.0)

        # Optimise every "learning-freq" steps once strictly more than
        # "learning-starts" steps have elapsed. Because the accumulators are
        # rolled over above, a loss computed on a terminal step is added to
        # the newly started episode's entry.
        if (
            t > hyper_params["learning-starts"]
            and t % hyper_params["learning-freq"] == 0
        ):
            losses.append(agent.optimise_td_loss())
            episode_losses[-1] += losses[-1]

        # Periodically refresh the target network, under the same warm-up rule.
        if (
            t > hyper_params["learning-starts"]
            and t % hyper_params["target-update-freq"] == 0
        ):
            agent.update_target_network()

        num_episodes = len(episode_rewards)

        # Progress report at the end of every "print-freq"-th episode.
        if (
            done
            and hyper_params["print-freq"] is not None
            and num_episodes % hyper_params["print-freq"] == 0
        ):
            # Mean over (up to) the last 100 finished episodes; the final list
            # entry is the episode that has only just started, so it is
            # excluded.
            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            print("********************************************************")
            print("steps: {}".format(t))
            print("episodes: {}".format(num_episodes))
            print("mean 100 episode reward: {}".format(mean_100ep_reward))
            print("% time spent exploring: {}".format(int(100 * eps_threshold)))
            print("********************************************************")


In [None]:
# Persist both network objects of the trained agent (the objects themselves
# are handed to torch.save, not only their state_dicts).
for network, destination in (
    (agent.policy_net, "Storage/policy_net.pb"),
    (agent.target_net, "Storage/target_net.pb"),
):
    torch.save(network, destination)

In [None]:
import matplotlib.pyplot as plt

# Plot three training diagnostics stacked vertically and save them to
# Storage/graphs.png: reward per episode, loss per optimisation step, and
# per-episode loss divided by the episode's length.
figure, plots = plt.subplots(3, 1, figsize=(20, 15))

episodeLengths = env.get_episode_lengths()
npEpisode_losses = np.array(episode_losses)
npEpisodeLengths = np.array(episodeLengths, dtype = np.float32)

# The training loop opens a new loss accumulator whenever an episode ends, so
# episode_losses can hold more entries than the monitor has recorded episode
# lengths. Align both arrays on their common prefix rather than assuming a
# fixed length difference, which would make the division fail (or silently
# broadcast) whenever the difference is anything else.
num_complete = min(len(npEpisode_losses), len(npEpisodeLengths))
averaged_episode_losses = (
    npEpisode_losses[:num_complete] / npEpisodeLengths[:num_complete]
)

figure.suptitle("Graphs")

plots[0].set_ylabel("Reward per Episode")
plots[0].set_xlabel("Episode")
plots[0].plot(np.arange(len(episode_rewards)), episode_rewards)
plots[1].set_ylabel("Loss per Step")
plots[1].set_xlabel("Step")
plots[1].plot(np.arange(len(losses)), losses)
plots[2].set_ylabel("Averaged Loss per Episode")
plots[2].set_xlabel("Episode")
plots[2].plot(np.arange(len(averaged_episode_losses)), averaged_episode_losses)
# Save before show(), while the figure is still the current one.
plt.savefig("Storage/graphs.png")
plt.show()