# Deep RL hands-on by Maxim Lapan
## Chapter 6 Deep Q-Networks

* conda activate gym
* [source github](https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On)
* from this part, it need modularized code. Some kind of code woudl be packed in lib directory. Be aware that

## DQN for Pong

In [1]:
from lib import wrappers
from lib import dqn_model

import os
import argparse
import time
import numpy as np
import collections

import torch
import torch.nn as nn
import torch.optim as optim

import tensorflow as tf
import datetime

In [2]:
# hyperparameters

DEFAULT_ENV_NAME = "PongNoFrameskip-v4"
MEAN_REWARD_BOUND = 19.5

GAMMA = 0.99
BATCH_SIZE = 32
REPLAY_SIZE = 10000
LEARNING_RATE = 1e-4
SYNC_TARGET_FRAMES = 1000
REPLAY_START_SIZE = 10000

EPSILON_DECAY_LAST_FRAME = 10**5
EPSILON_START = 1.0
EPSILON_FINAL = 0.02

In [3]:
Experience = collections.namedtuple('Experience', field_names=['state', 'action', 'reward', 'done', 'new_state'])

In [4]:
class ExperienceBuffer:
    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        states, actions, rewards, dones, next_states = zip(*[self.buffer[idx] for idx in indices])
        return np.array(states), np.array(actions), np.array(rewards, dtype=np.float32), \
               np.array(dones, dtype=np.bool), np.array(next_states)

In [5]:
class Agent:
    def __init__(self, env, exp_buffer):
        self.env = env
        self.exp_buffer = exp_buffer
        self._reset()

    def _reset(self):
        self.state = env.reset()
        self.total_reward = 0.0

    def play_step(self, net, epsilon=0.0, device="cpu"):
        done_reward = None

        if np.random.random() < epsilon:
            action = env.action_space.sample()
        else:
            state_a = np.array([self.state], copy=False)
            state_v = torch.tensor(state_a).to(device)
            q_vals_v = net(state_v)
            _, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())

        # do step in the environment
        new_state, reward, is_done, _ = self.env.step(action)
        self.total_reward += reward

        exp = Experience(self.state, action, reward, is_done, new_state)
        self.exp_buffer.append(exp)
        self.state = new_state
        if is_done:
            done_reward = self.total_reward
            self._reset()
        return done_reward

In [6]:
def calc_loss(batch, net, tgt_net, device="cpu"):
    states, actions, rewards, dones, next_states = batch

    states_v = torch.tensor(states).to(device)
    next_states_v = torch.tensor(next_states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.tensor(dones).to(device)

    state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
    next_state_values = tgt_net(next_states_v).max(1)[0]
    next_state_values[done_mask] = 0.0
    next_state_values = next_state_values.detach()

    expected_state_action_values = next_state_values * GAMMA + rewards_v
    return nn.MSELoss()(state_action_values, expected_state_action_values)

In [7]:
device = torch.device("cuda")
os.environ["CUDA_VISIBLE_DEVICES"] = '0, 1'
# !pip install gym[atari]
env = wrappers.make_env(DEFAULT_ENV_NAME) # "PongNoFrameskip-v4"

In [8]:
'''
pong env.observation space - (4, 84, 84)
action space - 6
'''
net = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
net = nn.DataParallel(net, output_device = 1).to(device)
tgt_net = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
tgt_net = nn.DataParallel(tgt_net, output_device = 1).to(device)
print(net)

DataParallel(
  (module): DQN(
    (conv): Sequential(
      (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
      (1): ReLU()
      (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
      (3): ReLU()
      (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
      (5): ReLU()
    )
    (fc): Sequential(
      (0): Linear(in_features=3136, out_features=512, bias=True)
      (1): ReLU()
      (2): Linear(in_features=512, out_features=6, bias=True)
    )
  )
)


In [9]:
torch.cuda.get_device_name()

'GeForce GTX 1080'

In [10]:
print(tgt_net)

DataParallel(
  (module): DQN(
    (conv): Sequential(
      (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
      (1): ReLU()
      (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
      (3): ReLU()
      (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
      (5): ReLU()
    )
    (fc): Sequential(
      (0): Linear(in_features=3136, out_features=512, bias=True)
      (1): ReLU()
      (2): Linear(in_features=512, out_features=6, bias=True)
    )
  )
)


In [11]:
buffer = ExperienceBuffer(REPLAY_SIZE)
agent = Agent(env, buffer)
epsilon = EPSILON_START

In [12]:
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
total_rewards = []
frame_idx = 0
ts_frame = 0
ts = time.time()
best_mean_reward = None

In [13]:
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # it's convenient to record the date with current time
LOGDIR = './tmp/' + DEFAULT_ENV_NAME + '/' + current_time + '/' 
writer = tf.summary.create_file_writer(LOGDIR)

while True:
    frame_idx += 1
    epsilon = max(EPSILON_FINAL, EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)

    reward = agent.play_step(net, epsilon, device=device)
    if reward is not None:
        total_rewards.append(reward)
        speed = (frame_idx - ts_frame) / (time.time() - ts)
        ts_frame = frame_idx
        ts = time.time()
        mean_reward = np.mean(total_rewards[-100:])
        print("%d: done %d games, mean reward %.3f, eps %.2f, speed %.2f f/s" % (
            frame_idx, len(total_rewards), mean_reward, epsilon,
            speed
        ))
        
        with writer.as_default():    
            tf.summary.scalar("epsilon", epsilon, frame_idx)
            tf.summary.scalar("speed", speed, frame_idx)
            tf.summary.scalar("reward_100", mean_reward, frame_idx)
            tf.summary.scalar("reward", reward, frame_idx)
        
        if best_mean_reward is None or best_mean_reward < mean_reward:
            torch.save(net.state_dict(), DEFAULT_ENV_NAME + "-best.dat")
            if best_mean_reward is not None:
                print("Best mean reward updated %.3f -> %.3f, model saved" % (best_mean_reward, mean_reward))
            best_mean_reward = mean_reward
        if mean_reward > MEAN_REWARD_BOUND:
            print("Solved in %d frames!" % frame_idx)
            break

    if len(buffer) < REPLAY_START_SIZE:
        continue

    if frame_idx % SYNC_TARGET_FRAMES == 0:
        tgt_net.load_state_dict(net.state_dict())

    optimizer.zero_grad()
    batch = buffer.sample(BATCH_SIZE)
    loss_t = calc_loss(batch, net, tgt_net, device = device)
    loss_t.backward()
    optimizer.step()

846: done 1 games, mean reward -21.000, eps 0.99, speed 207.23 f/s
1668: done 2 games, mean reward -21.000, eps 0.98, speed 659.63 f/s
2430: done 3 games, mean reward -21.000, eps 0.98, speed 1050.44 f/s
3280: done 4 games, mean reward -21.000, eps 0.97, speed 1039.71 f/s
4061: done 5 games, mean reward -21.000, eps 0.96, speed 1028.15 f/s
4823: done 6 games, mean reward -21.000, eps 0.95, speed 1031.01 f/s
5834: done 7 games, mean reward -20.714, eps 0.94, speed 1033.54 f/s
Best mean reward updated -21.000 -> -20.714, model saved
6596: done 8 games, mean reward -20.750, eps 0.93, speed 995.74 f/s
7418: done 9 games, mean reward -20.778, eps 0.93, speed 998.99 f/s
8378: done 10 games, mean reward -20.600, eps 0.92, speed 998.38 f/s
Best mean reward updated -20.714 -> -20.600, model saved
9297: done 11 games, mean reward -20.545, eps 0.91, speed 971.28 f/s
Best mean reward updated -20.600 -> -20.545, model saved


RuntimeError: arguments are located on different GPUs at /pytorch/aten/src/THC/generic/THCTensorScatterGather.cu:13