# Deep Q-Network (DQN)

The notebook is based on "Human-Level Control through deep reinforcement learning" by Mnih et. al.

In [1]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from tqdm.notebook import tqdm
from itertools import count
import os
import gym
from gym import wrappers

from PIL import Image

import matplotlib.pyplot as plt

## Replay Buffer

In [2]:
#replay buffer for memory replay
class ReplayBuffer():
    '''
    Stores memories up to a maximum of mem_size. The memories can be batched to use in training.
    '''
    
    def __init__(self, input_size, mem_size=10000, batch_size=64):
        self.mem_size = mem_size
        self.index = 0
        self.batch_size=batch_size
        
        self.obs_memory = np.empty((self.mem_size, *input_size), dtype=np.float32)
        self.action_memory = np.empty((self.mem_size), dtype=np.int64)
        self.reward_memory = np.empty((self.mem_size), dtype=np.float32)
        self.next_obs_memory = np.empty((self.mem_size, *input_size), dtype=np.float32)
        self.terminal_memory = np.empty((self.mem_size), dtype=np.bool)
        
    def add_memory(self, obs, action, reward, next_obs, done):
        self.obs_memory[self.index] = obs
        self.action_memory[self.index] = action
        self.reward_memory[self.index] = reward
        self.next_obs_memory[self.index] = next_obs
        self.terminal_memory[self.index] = done
        self.index += 1
        self.index %= self.mem_size

    def get_memory_batch(self):
        idxs = np.random.choice(len(self), self.batch_size, replace=False)
        
        obss = self.obs_memory[idxs]
        actions = self.action_memory[idxs]
        rewards = self.reward_memory[idxs]
        next_obss = self.next_obs_memory[idxs]
        dones = self.terminal_memory[idxs]
        
        return obss, actions, rewards, next_obss, dones
    
    def __len__(self):
        return min(self.index, self.mem_size)

In [3]:
# prioritized experience replay
class PrioritizedExperienceReplay():
    '''
    Prioritized Experience Replay by Tom Schaul, John Quan, Ioannis Antonoglou and David Silver.
    https://arxiv.org/pdf/1511.05952.pdf
    '''
    
    def __init__(self, input_size, mem_size=10000, batch_size=64, alpha=0.5, beta=0.4, beta_factor=0.9999):
        self.mem_size = mem_size
        self.index = 0
        self.batch_size=batch_size
        self.alpha = alpha
        self.beta = beta
        self.beta_factor = beta_factor
        self.idxs = None
        
        self.obs_memory = np.empty((self.mem_size, *input_size), dtype=np.float32)
        self.action_memory = np.empty((self.mem_size), dtype=np.int64)
        self.reward_memory = np.empty((self.mem_size), dtype=np.float32)
        self.next_obs_memory = np.empty((self.mem_size, *input_size), dtype=np.float32)
        self.terminal_memory = np.empty((self.mem_size), dtype=np.bool)
        self.priority_memory = np.empty((self.mem_size), dtype=np.float32)
        
    def add_memory(self, obs, action, reward, next_obs, done):
        if self.index == 0: 
            priority = 1
        else:
            # ensure that first time experience is drawn with the highest probability
            priority = self.priority_memory[:len(self)].max()
            
        self.obs_memory[self.index] = obs
        self.action_memory[self.index] = action
        self.reward_memory[self.index] = reward
        self.next_obs_memory[self.index] = next_obs
        self.terminal_memory[self.index] = done
        self.priority_memory[self.index] = priority
        self.index += 1
        self.index %= self.mem_size

    def get_memory_batch(self):
        numerator = self.priority_memory[:len(self)] ** self.alpha
        denominator = numerator.sum()
        
        p = numerator / denominator
        self.idxs = np.random.choice(len(self), self.batch_size, p=p, replace=False)
        
        weights = (p[self.idxs] * len(self))**(-self.beta)
        max_weight = weights.max()
        weights /= max_weight
        
        obss = self.obs_memory[self.idxs]
        actions = self.action_memory[self.idxs]
        rewards = self.reward_memory[self.idxs]
        next_obss = self.next_obs_memory[self.idxs]
        dones = self.terminal_memory[self.idxs]
        
        self.beta *= self.beta_factor
        
        return obss, actions, rewards, next_obss, dones, weights
    
    def update_memory_batch(self, priorities):
        self.priority_memory[self.idxs] = np.abs(priorities).squeeze()
    
    def __len__(self):
        return min(self.index, self.mem_size)

## Model

In [4]:
#function approximator of the Q-Function
class QNN(nn.Module):
    '''
    A PyTorch based neural network approximator of the Q-function.
    The network can include a flexible number of input, hidden and output nodes.
    No CNN layers are included in this architecture.
    '''
    
    def __init__(self, input_size, hidden_sizes, action_size, lr, save_dir, name):
        super(QNN, self).__init__()
        self.output_file = os.path.join(save_dir, name)
        self.input_layer = nn.Linear(*input_size, hidden_sizes[0])
        self.hidden_layers = nn.ModuleList([nn.Linear(hidden_sizes[i], hidden_sizes[i+1]) 
                                            for i in range(len(hidden_sizes)-1)])
        self.output_layer = nn.Linear(hidden_sizes[-1], action_size)
        
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        self.loss = nn.MSELoss()
        self.optimizer = optim.RMSprop(self.parameters(), lr=lr)
        self.to(self.device)
        
    
    def forward(self, state):
        x = state
        x = F.relu(self.input_layer(x))
        for hidden_layer in self.hidden_layers:
            x = F.relu(hidden_layer(x))
        x = self.output_layer(x)
        return x
        
    
    def save(self):
        torch.save(self.state_dict(), self.output_file)
    
    def load(self):
        self.load_state_dict(torch.load(self.output_file))
        

In [5]:
# duelling function approximator separating the Q function into the V and the A part
class Duell_QNN(nn.Module):
    '''
    A PyTorch based neural network approximator of the Q-function.
    The network can include a flexible number of input, hidden and output nodes.
    '''
    
    def __init__(self, input_size, hidden_sizes, action_size, lr, save_dir, name):
        super(Duell_QNN, self).__init__()
        self.output_file = os.path.join(save_dir, name)
        self.input_layer = nn.Linear(*input_size, hidden_sizes[0])
        self.hidden_layers = nn.ModuleList([nn.Linear(hidden_sizes[i], hidden_sizes[i+1]) 
                                            for i in range(len(hidden_sizes)-1)])
        
        
        self.v = nn.Linear(hidden_sizes[-1], 1)
        self.a = nn.Linear(hidden_sizes[-1], action_size)
                
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        self.loss = nn.MSELoss()
        self.optimizer = optim.RMSprop(self.parameters(), lr=lr)
        self.to(self.device)
        
    
    def forward(self, state):
        x = state
        x = F.relu(self.input_layer(x))
        for hidden_layer in self.hidden_layers:
            x = F.relu(hidden_layer(x))
        
        v = self.v(x)
        a = self.a(x)
        a_mean = a.mean(axis=1, keepdim=True)
              
        x = v + (a - a_mean)
        return x
        
    
    def save(self):
        torch.save(self.state_dict(), self.output_file)
    
    def load(self):
        self.load_state_dict(torch.load(self.output_file))

## Agent

In [6]:
class DQN:
    '''
    DQN Agent combining a memory buffer and separate online and target neural networks
    '''
    
    def __init__(self, net_type, duelling, memory_type, input_size, hidden_sizes, action_size, 
                 min_epsilon, max_epsilon, epsilon_decay, 
                 gamma, lr, mem_size, batch_size, save_dir, name):
        self.memory_type = memory_type
        
        if self.memory_type == 'UNIFORM':
            self.replay_buffer = ReplayBuffer(input_size, mem_size, batch_size)
        elif self.memory_type == 'PER':
            self.replay_buffer = PrioritizedExperienceReplay(input_size, mem_size, batch_size)
        else:
            raise ValueError('Invalid memory type for replay buffer')
        
        if duelling:
            self.qnn_target = Duell_QNN(input_size, hidden_sizes, action_size, lr, save_dir, 
                                  name=name+net_type+'duelling'+'_target.pt')
            self.qnn_online = Duell_QNN(input_size, hidden_sizes, action_size, lr, save_dir, 
                                  name=name+net_type+'duelling'+'_online.pt')
        else:
            self.qnn_target = QNN(input_size, hidden_sizes, action_size, lr, save_dir, 
                                  name=name+net_type+'_target.pt')
            self.qnn_online = QNN(input_size, hidden_sizes, action_size, lr, save_dir, 
                                  name=name+net_type+'_online.pt')
        
        self.replace_target_network()
        
        self.net_type = net_type
        self.epsilon = max_epsilon
        self.min_epsilon = min_epsilon
        self.epsilon_decay = epsilon_decay
        self.gamma = gamma
        self.action_size = action_size
        self.batch_size = batch_size
    
    def epsilon_greedy(self, obs):
        if np.random.random() > self.epsilon:
            action = self.greedy(obs)
        else:
            action = np.random.choice(self.action_size)
        return action
    
    def greedy(self, obs):
        with torch.no_grad():
            obs = torch.from_numpy(obs).to(self.qnn_online.device).float().unsqueeze(0)
            action = np.argmax(self.qnn_online.forward(obs).detach().numpy())
            return action
    
    def decrement_epsilon(self):
        if self.epsilon <= self.min_epsilon:
            return
        
        epsilon = self.epsilon - self.epsilon_decay
        self.epsilon = max(epsilon, self.min_epsilon)
    
    def add_memory(self, obs, action, reward, next_obs, done):
        self.replay_buffer.add_memory(obs, action, reward, next_obs, done)
    
    def get_memory_batch(self):
        if self.memory_type == 'UNIFORM':
            obss, actions, rewards, next_obss, dones = self.replay_buffer.get_memory_batch()
        elif self.memory_type == 'PER':
            obss, actions, rewards, next_obss, dones, weights = self.replay_buffer.get_memory_batch()
        device = self.qnn_online.device
        obss = torch.from_numpy(obss).to(device)
        actions = torch.from_numpy(actions).to(device)
        rewards = torch.from_numpy(rewards).to(device)
        next_obss = torch.from_numpy(next_obss).to(device)
        dones = torch.from_numpy(dones).to(device)
        if self.memory_type == 'UNIFORM':
            return obss, actions, rewards, next_obss, dones
        elif self.memory_type == 'PER':
            weights = torch.from_numpy(weights).to(device)
            return obss, actions, rewards, next_obss, dones, weights
        
    def learn(self):
        if len(self.replay_buffer) < self.batch_size:
            return
        
        self.qnn_online.optimizer.zero_grad()
        if self.memory_type == 'UNIFORM':
            obss, actions, rewards, next_obss, dones = self.get_memory_batch()
        if self.memory_type == 'PER':
            obss, actions, rewards, next_obss, dones, weights = self.get_memory_batch()
        with torch.no_grad():
            if self.net_type == 'DQN':
#                 target = rewards + self.gamma * torch.max(self.qnn_target.forward(next_obss).detach(), dim=1)[0] \
#                     * torch.logical_not(dones)
                indices = torch.max(self.qnn_target.forward(next_obss).detach(), dim=1)[1]
            elif self.net_type == 'DDQN':
                indices = torch.max(self.qnn_online.forward(next_obss).detach(), dim=1)[1]
            else:
                raise ValueError('net_type should be DQN or DDQN')
            

            next_values = self.qnn_target.forward(next_obss)[np.arange(self.batch_size), indices]
            target = rewards + self.gamma * next_values * torch.logical_not(dones)
            
        target = target.unsqueeze(1)
        online = self.qnn_online.forward(obss).gather(dim=1, index=actions.unsqueeze(1))

        if self.memory_type == 'UNIFORM':
            loss = self.qnn_online.loss(online, target)
        elif self.memory_type == 'PER':
            error = target - online
            loss = (weights*error).pow(2).mul(0.5).mean() 
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.qnn_online.parameters(), 1.0)
        
        self.qnn_online.optimizer.step()
        self.decrement_epsilon()
        
        if self.memory_type == 'PER':
            self.replay_buffer.update_memory_batch(error.detach().cpu().numpy())
    
    def save(self):
        self.qnn_online.save()
    
    def load(self):
        self.qnn_online.load()
        
    def replace_target_network(self):
        self.qnn_target.load_state_dict(self.qnn_online.state_dict())
    

## Environments config

In [7]:
ENVS = [
    {
        'NAME': 'CartPole-v1',
        'UNWRAPPED': False,
        'SOLVED_REWARD': 475.0,
        'NET_TYPE': 'DQN',
        'RESULTS': [],
        'DUELLING': False,
        'MEMORY_TYPE': 'UNIFORM'
    },
    {
        'NAME': 'CartPole-v1',
        'UNWRAPPED': False,
        'SOLVED_REWARD': 475.0,
        'NET_TYPE': 'DDQN',
        'RESULTS': [],
        'DUELLING': False,
        'MEMORY_TYPE': 'UNIFORM'
    },
    {
        'NAME': 'CartPole-v1',
        'UNWRAPPED': False,
        'SOLVED_REWARD': 475.0,
        'NET_TYPE': 'DDQN',
        'RESULTS': [],
        'DUELLING': True,
        'MEMORY_TYPE': 'UNIFORM'
    },
    {
        'NAME': 'CartPole-v1',
        'UNWRAPPED': False,
        'SOLVED_REWARD': 475.0,
        'NET_TYPE': 'DDQN',
        'RESULTS': [],
        'DUELLING': True,
        'MEMORY_TYPE': 'PER'
    },
#     {
#         'NAME': 'CartPole-v1',
#         'UNWRAPPED': False,
#         'SOLVED_REWARD': 475.0,
#         'NET_TYPE': 'DQN',
#         'RESULTS': []
#     },
#     {
#         'NAME': 'CartPole-v1',
#         'UNWRAPPED': False,
#         'SOLVED_REWARD': 475.0,
#         'NET_TYPE': 'DDQN',
#         'RESULTS': []
#     },
#     {
#         'NAME': 'MountainCar-v0',
#         'UNWRAPPED': True,
#         'SOLVED_REWARD': -110.0,
#         'NET_TYPE': 'DQN',
#         'RESULTS': []
#     },
#     {
#         'NAME': 'MountainCar-v0',
#         'UNWRAPPED': True,
#         'SOLVED_REWARD': -110.0,
#         'NET_TYPE': 'DDQN',
#         'RESULTS': []
#     },
#     {
#         'NAME': 'LunarLander-v2',
#         'UNWRAPPED': False,
#         'SOLVED_REWARD': 200,
#         'NET_TYPE': 'DQN',
#         'RESULTS': []
#     },
#     {
#         'NAME': 'LunarLander-v2',
#         'UNWRAPPED': False,
#         'SOLVED_REWARD': 200,
#         'NET_TYPE': 'DDQN',
#         'RESULTS': []
#     }
]

## Training Parameters

These parameters are used for all the environments defined above!

In [8]:
HIDDEN_SIZES = (512, 128)
EPISODES = 2000
MIN_EPSILON=0.3
MAX_EPSILON=1
EPSILON_DECAY=0.0001
GAMMA = 0.99
LEARNING_RATE = 0.00025
MEMORY_SIZE = 1000000
BATCH_SIZE = 32
REPLACE_TARGET = 1000
SAVE_DIR = './progress'
RESULTS_DIR = './results'

## Main Training Loop

In [9]:
#Main loop
for env in ENVS:
    
    NAME = env['NAME']
    UNWRAPPED = env['UNWRAPPED']
    SOLVED_REWARD = env['SOLVED_REWARD']
    NET_TYPE = env['NET_TYPE']
    DUELLING = env['DUELLING']
    MEMORY_TYPE = env['MEMORY_TYPE']
    
    print(f'--- TRAINING ENVIRONMENT {NAME}---\n')
    print(f'--- MODEL TYPE --- {NET_TYPE}---\n')
    print(f'--- DUELLING --- {DUELLING}---\n')
    print(f'--- MEMORY TYPE --- {MEMORY_TYPE}---\n')
    
    if UNWRAPPED:  
        ENV = gym.make(NAME).unwrapped
    else: 
        ENV = gym.make(NAME)
    
    ENV.seed(42)
    ACTION_SIZE = ENV.action_space.n
    INPUT_SIZE = ENV.observation_space.shape

    agent = DQN(NET_TYPE, DUELLING, MEMORY_TYPE, INPUT_SIZE, HIDDEN_SIZES, ACTION_SIZE, 
                min_epsilon=MIN_EPSILON, max_epsilon=MAX_EPSILON, epsilon_decay=EPSILON_DECAY,
                gamma=GAMMA, lr=LEARNING_RATE, 
                mem_size=MEMORY_SIZE, batch_size=BATCH_SIZE, 
                save_dir=SAVE_DIR, name=NAME)


    reward_tracking = []
    eval_tracking = env['RESULTS']
    best_mean = -1000
    best_eval_score = -1000
    reward_mean = -1000
    eval_mean = -1000
    counter = 0
    for episode in tqdm(range(EPISODES)):
        obs, done = ENV.reset(), False
        reward_sum = 0
        while not done:
            action = agent.epsilon_greedy(obs)
            next_obs, reward, done, info = ENV.step(action)
            is_truncated = 'TimeLimit.truncated' in info and info['TimeLimit.truncated']
            terminal = done and (not is_truncated)
            reward_sum += reward
            agent.add_memory(obs, action, reward, next_obs, terminal)
            obs = next_obs
            agent.learn()
            if(counter + 1) % REPLACE_TARGET == 0:
                agent.replace_target_network()
#                 counter = 0
            counter += 1
        reward_tracking.append(reward_sum)


        #testing agent
        obs, done = ENV.reset(), False
        eval_score = 0
        while not done:
            action = agent.greedy(obs)
            next_obs, reward, done, _ = ENV.step(action)
            eval_score += reward
            obs = next_obs

            # this is necessary for Mountain Car, as in earlier episodes the policy might be stuck during evaluation
            # this is the case because the evaluation policy is purely greedy and the environment is unwrapped
            if eval_score <= -200:
                done = True
        eval_tracking.append(eval_score)

        # OUTPUT INFO
        if (episode > 100):
            reward_mean = np.array(reward_tracking[-100:]).mean()
            eval_mean_10 = np.array(eval_tracking[-10:]).mean()
            eval_mean_100 = np.array(eval_tracking[-100:]).mean()
            if reward_mean > best_mean:
                best_mean = reward_mean
            if eval_mean_100 > best_eval_score:
                best_eval_score = eval_mean_100
                agent.save()

            print(f'step: {counter}, best_mean_100: {best_mean}, current_mean: {reward_mean}, \
    best_eval_100: {best_eval_score}, eval_100 {eval_mean_100}, eval_10 {eval_mean_10}', end='\r')

            if eval_mean_100 >= SOLVED_REWARD:
                print('\n', flush=True)
                print(f'---GOAL REACHED AFTER {episode} EPISODES---')
                print('\n')
                break
                
    print('-'*100)
    

--- TRAINING ENVIRONMENT CartPole-v1---

--- MODEL TYPE --- DQN---

--- DUELLING --- False---

--- MEMORY TYPE --- UNIFORM---



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2000.0), HTML(value='')))

step: 177477, best_mean_100: 229.85, current_mean: 167.13,     best_eval_100: 476.83, eval_100 476.83, eval_10 500.0

---GOAL REACHED AFTER 1542 EPISODES---



----------------------------------------------------------------------------------------------------
--- TRAINING ENVIRONMENT CartPole-v1---

--- MODEL TYPE --- DDQN---

--- DUELLING --- False---

--- MEMORY TYPE --- UNIFORM---



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2000.0), HTML(value='')))

step: 98705, best_mean_100: 150.55, current_mean: 74.4,     best_eval_100: 476.2, eval_100 476.2, eval_10 500.00.00

---GOAL REACHED AFTER 1142 EPISODES---



----------------------------------------------------------------------------------------------------
--- TRAINING ENVIRONMENT CartPole-v1---

--- MODEL TYPE --- DDQN---

--- DUELLING --- True---

--- MEMORY TYPE --- UNIFORM---



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2000.0), HTML(value='')))

step: 182059, best_mean_100: 170.77, current_mean: 72.45,     best_eval_100: 457.73, eval_100 242.18, eval_10 226.67
----------------------------------------------------------------------------------------------------
--- TRAINING ENVIRONMENT CartPole-v1---

--- MODEL TYPE --- DDQN---

--- DUELLING --- True---

--- MEMORY TYPE --- PER---



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2000.0), HTML(value='')))

step: 125657, best_mean_100: 219.48, current_mean: 212.29,     best_eval_100: 477.05, eval_100 477.05, eval_10 500.0

---GOAL REACHED AFTER 1164 EPISODES---



----------------------------------------------------------------------------------------------------


In [None]:
# fig = plt.figure(figsize=(800, 600))
# for idx, env in enumerate(ENVS):
#     NAME = env['NAME']
#     NET_TYPE = env['NET_TYPE']
#     RESULTS = env['RESULTS']
    
#     fig.add_axes([idx%2, idx//2, 1, 1]).set_title(NAME+'_'+NET_TYPE)
#     plt.plot(RESULTS)
# plt.show()

In [None]:
# play a game with the best agent and save the results as gif
for env in ENVS:
    frames = []
    
    NAME = env['NAME']
    NET_TYPE = env['NET_TYPE']
    DUELLING = env['DUELLING']
    
    ENV = gym.make(NAME)
    ACTION_SIZE = ENV.action_space.n
    INPUT_SIZE = ENV.observation_space.shape
    
    agent = DQN(NET_TYPE, DUELLING, INPUT_SIZE, HIDDEN_SIZES, ACTION_SIZE, 
                min_epsilon=MIN_EPSILON, max_epsilon=MAX_EPSILON, epsilon_decay=EPSILON_DECAY,
                gamma=GAMMA, lr=LEARNING_RATE, 
                mem_size=MEMORY_SIZE, batch_size=BATCH_SIZE, save_dir=SAVE_DIR, name=NAME)
    agent.load()
    ENV = gym.make(NAME)
    ENV = wrappers.Monitor(ENV, "./gym-results", force=True)
    obs, done = ENV.reset(), False
    frames.append(Image.fromarray(ENV.render(mode='rgb_array')))
    eval_score = 0
    while not done:
        action = agent.greedy(obs)
        next_obs, reward, done, _ = ENV.step(action)
        eval_score += reward
        obs = next_obs
        frames.append(Image.fromarray(ENV.render(mode='rgb_array')))
    ENV.close()
    
    path = os.path.join(RESULTS_DIR, NAME+NET_TYPE+'.gif')
    with open(path, 'wb') as f:
        im = Image.new('RGB', frames[0].size)
        im.save(f, save_all=True, append_images=frames, loop=0, duration=25) 

![SegmentLocal](results/CartPole-v1DDQN.gif "segment")

In [1]:
from IPython import display
display.HTML('<img src="{}">'.format('./results/MountainCar-v0DDQN.gif'))

In [2]:
from IPython import display
display.HTML('<img src="{}">'.format('./results/LunarLander-v2DDQN.gif'))