In [1]:
# Run-mode switches.
# NOTE(review): presumably read by later cells to decide whether the agent is
# trained and/or evaluated -- those cells are not visible here; confirm.
TRAIN = False
TEST = True

# Gym environment id of the Atari game the agent plays.
ENV_NAME = 'BreakoutDeterministic-v4'
# Alternative game: uncomment to play Pong instead of Breakout.
#ENV_NAME = 'PongDeterministic-v4'  

In [2]:
import os
import random
import gym
import tensorflow as tf
import numpy as np
import imageio
from skimage.transform import resize

In [3]:
class ProcessFrame:
    """Turns a raw RGB Atari frame into a small, cropped grayscale frame."""
    def __init__(self, frame_height=84, frame_width=84):
        """
        Builds the preprocessing ops once so they can be reused for every frame.

        Args:
            frame_height: Integer, height of the processed frame
            frame_width: Integer, width of the processed frame
        """
        self.frame_height = frame_height
        self.frame_width = frame_width

        # Raw emulator frame: 210 rows x 160 columns x RGB.
        self.frame = tf.placeholder(shape=[210, 160, 3], dtype=tf.uint8)

        grayscale = tf.image.rgb_to_grayscale(self.frame)
        # Keep a 160x160 window that starts at row 34, column 0.
        cropped = tf.image.crop_to_bounding_box(grayscale, 34, 0, 160, 160)
        target_size = [self.frame_height, self.frame_width]
        self.processed = tf.image.resize_images(
            cropped, target_size,
            method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)

    def process(self, session, frame):
        """
        Runs the preprocessing ops on a single frame.

        Args:
            session: A Tensorflow session object
            frame: A (210, 160, 3) frame of an Atari game in RGB
        Returns:
            The processed grayscale frame of shape
            (frame_height, frame_width, 1); (84, 84, 1) with the defaults
        """
        feed = {self.frame: frame}
        return session.run(self.processed, feed_dict=feed)

In [4]:
class DQN:
    """Dueling Q-network: four convolutional layers whose output is split into a
    value stream and an advantage stream, which are recombined into Q-values.

    Constructing an instance creates the input placeholders, the layers, the
    Huber loss and the Adam update op.

    NOTE(review): the layer names ('conv1', ..., 'advantage', 'value') are fixed,
    so presumably every instance is built inside its own tf.variable_scope --
    confirm against the code that instantiates the main and target networks.
    """
    # Learning rate can be increased to 0.00025 in Pong for quicker results
    def __init__(self, n_actions, hidden=1024, learning_rate=0.00001, 
                 frame_height=84, frame_width=84, agent_history_length=4):
        """
        Args:
            n_actions: Integer, number of possible actions
            hidden: Integer, number of filters in the final convolutional layer.
                The filters are split evenly between the value and the
                advantage stream. This is different from the DeepMind
                implementation
            learning_rate: Float, learning rate for the Adam optimizer
            frame_height: Integer, height of a frame of an Atari game
            frame_width: Integer, width of a frame of an Atari game
            agent_history_length: Integer, number of frames stacked together to create a state
        """
        self.n_actions = n_actions
        self.hidden = hidden
        self.learning_rate = learning_rate
        self.frame_height = frame_height
        self.frame_width = frame_width
        self.agent_history_length = agent_history_length
        
        # Batch of states: [batch, frame_height, frame_width, agent_history_length]
        self.input = tf.placeholder(shape=[None, self.frame_height, 
                                           self.frame_width, self.agent_history_length], 
                                    dtype=tf.float32)
        # Normalizing the input: pixel values are fed as 0..255 and scaled to 0..1 here
        self.input_scaled = self.input/255
        
        # Convolutional layers (no padding, no bias, ReLU activation).
        # With the default 84x84 input the spatial size shrinks 84 -> 20 -> 9 -> 7 -> 1,
        # so the 7x7 kernel of conv4 covers the whole conv3 feature map.
        self.conv1 = tf.layers.conv2d(
            inputs=self.input_scaled, filters=32, kernel_size=[8, 8], strides=4,
            kernel_initializer=tf.variance_scaling_initializer(scale=2),
            padding="valid", activation=tf.nn.relu, use_bias=False, name='conv1')
        self.conv2 = tf.layers.conv2d(
            inputs=self.conv1, filters=64, kernel_size=[4, 4], strides=2, 
            kernel_initializer=tf.variance_scaling_initializer(scale=2),
            padding="valid", activation=tf.nn.relu, use_bias=False, name='conv2')
        self.conv3 = tf.layers.conv2d(
            inputs=self.conv2, filters=64, kernel_size=[3, 3], strides=1, 
            kernel_initializer=tf.variance_scaling_initializer(scale=2),
            padding="valid", activation=tf.nn.relu, use_bias=False, name='conv3')
        self.conv4 = tf.layers.conv2d(
            inputs=self.conv3, filters=hidden, kernel_size=[7, 7], strides=1, 
            kernel_initializer=tf.variance_scaling_initializer(scale=2),
            padding="valid", activation=tf.nn.relu, use_bias=False, name='conv4')
        
        # Splitting into value and advantage stream: conv4 is cut into two equal
        # halves along the channel axis (axis 3)
        self.value_stream, self.advantagestream = tf.split(self.conv4, 2, 3)
        self.value_stream = tf.layers.flatten(self.value_stream)
        self.advantagestream = tf.layers.flatten(self.advantagestream)
        # One advantage output per action, a single state-value output
        self.advantage = tf.layers.dense(
            inputs=self.advantagestream, units=self.n_actions,
            kernel_initializer=tf.variance_scaling_initializer(scale=2), name="advantage")
        self.value = tf.layers.dense(
            inputs=self.value_stream, units=1, 
            kernel_initializer=tf.variance_scaling_initializer(scale=2), name='value')
        
        # Combining value and advantage into Q-values: Q = V + (A - mean over actions of A)
        self.q_values = self.value + tf.subtract(self.advantage, tf.reduce_mean(self.advantage, axis=1, keepdims=True))
        # Index of the action with the highest Q-value for every state in the batch
        self.best_action = tf.argmax(self.q_values, 1)
        
        ###################
        # Parameter update.
        ###################
        
        # targetQ according to Bellman equation: 
        # Q = r + gamma*max Q', calculated in the function learn()
        self.target_q = tf.placeholder(shape=[None], dtype=tf.float32)
        # Action that was performed in each state of the batch
        self.action = tf.placeholder(shape=[None], dtype=tf.int32)
        # Q-value of the performed action: the one-hot mask zeroes out all other actions
        self.Q = tf.reduce_sum(tf.multiply(self.q_values, tf.one_hot(self.action, self.n_actions, dtype=tf.float32)), axis=1)
        
        # Parameter updates
        # DeepMind uses the quadratic cost function with error clipping (see https://www.nature.com/articles/nature14236/)
        # See https://machinelearningmastery.com/exploding-gradients-in-neural-networks/
        # NOTE(review): tf.losses.huber_loss presumably already reduces to a scalar with
        # its default reduction, which would make the outer reduce_mean a no-op -- confirm.
        self.loss = tf.reduce_mean(tf.losses.huber_loss(labels=self.target_q, predictions=self.Q))
        
        # Consider using the RMS optimizer: https://arxiv.org/abs/1710.02298 - RMSProp was substituted for Adam with a learning rate of 0.0000625
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        # Running this op performs one gradient step on self.loss
        self.update = self.optimizer.minimize(self.loss)

In [5]:
# See https://blog.openai.com/openai-baselines-dqn/
class ActionGetter:
    """Epsilon greedy action strategy with annealing epsilon"""
    def __init__(self, n_actions, eps_initial=1, eps_final=0.1, eps_final_frame=0.01, 
                 eps_evaluation=0.0, eps_annealing_frames=1000000, 
                 replay_memory_start_size=50000, max_frames=25000000):
        """
        Params:
            n_actions: Integer, number of possible actions
            eps_initial: Float, Exploration probability for the first 
                replay_memory_start_size frames
            eps_final: Float, Exploration probability after 
                replay_memory_start_size + eps_annealing_frames frames
            eps_final_frame: Float, Exploration probability after max_frames frames
            eps_evaluation: Float, Exploration probability during evaluation
            eps_annealing_frames: Int, Number of frames over which the 
                exploration probability is annealed from eps_initial to eps_final
            replay_memory_start_size: Integer, Number of frames during 
                which the agent only explores
            max_frames: Integer, Total number of frames shown to the agent
        """
        self.n_actions = n_actions
        self.eps_initial = eps_initial
        self.eps_final = eps_final
        self.eps_final_frame = eps_final_frame
        self.eps_evaluation = eps_evaluation
        self.eps_annealing_frames = eps_annealing_frames
        self.replay_memory_start_size = replay_memory_start_size
        self.max_frames = max_frames
        
        # Slopes and intercepts for exploration decrease.
        # Segment 1: eps_initial -> eps_final over eps_annealing_frames frames,
        # starting at frame replay_memory_start_size.
        self.slope = -(self.eps_initial - self.eps_final)/self.eps_annealing_frames
        self.intercept = self.eps_initial - self.slope*self.replay_memory_start_size
        # Segment 2: eps_final -> eps_final_frame, reaching eps_final_frame at max_frames.
        self.slope_2 = -(self.eps_final - self.eps_final_frame)/(self.max_frames - self.eps_annealing_frames - self.replay_memory_start_size)
        self.intercept_2 = self.eps_final_frame - self.slope_2*self.max_frames

    def _get_epsilon(self, frame_number, evaluation=False):
        """Returns the exploration probability that applies at frame_number."""
        if evaluation:
            return self.eps_evaluation
        if frame_number < self.replay_memory_start_size:
            return self.eps_initial
        if frame_number < self.replay_memory_start_size + self.eps_annealing_frames:
            return self.slope*frame_number + self.intercept
        if frame_number < self.max_frames:
            return self.slope_2*frame_number + self.intercept_2
        # Past max_frames the second segment must not be extrapolated: it would
        # drop below eps_final_frame and eventually become negative.
        return self.eps_final_frame

    def get_action(self, session, frame_number, state, main_dqn, evaluation=False):
        """
        Args:
            session: A tensorflow session object
            frame_number: Integer, number of the current frame
            state: A (84, 84, 4) sequence of frames of an Atari game in grayscale
            main_dqn: A DQN object
            evaluation: A boolean saying whether the agent is being evaluated
        Returns:
            An integer between 0 and n_actions - 1 determining the action the agent performs next
        """
        eps = self._get_epsilon(frame_number, evaluation)

        # Explore with probability eps, otherwise act greedily w.r.t. the network
        if np.random.rand(1) < eps:
            return np.random.randint(0, self.n_actions)
        return session.run(main_dqn.best_action, feed_dict={main_dqn.input:[state]})[0]

We now know how the DQN predicts the best action and we have a simple answer to the exploration exploitation dilemma.
So, what else do we need to make it work? Let's take a look at the algorithm presented on page 7 in [Mnih et al. 2015](https://www.nature.com/articles/nature14236/).

![](pictures/DQN.png)

Let us go through the algorithm step by step:
* We do not know yet what *replay memory* D is.
* Action-value function Q is our DQN network, that we already implemented.
* We need to discuss why a second Q network called *target* action-value function is needed.
* At the beginning of each episode a sequence is initialized. This is implemented by stacking four (grayscale) frames together as discussed above.
* We discussed how the action is selected ($\epsilon$-greedy).
* When the action is performed, the environment returns the next frame and the reward for that action. `gym` additionally returns a boolean called `terminal` that states whether the game is over and a dictionary containing the number of lives the agent has left (`ale.lives`). 
* We do not know yet what it means to store a transition in the replay memory D. A list `[state, action, reward, terminal, new_state]` is called a transition. A `state` consists of four frames stacked together. `new_state` is produced by stacking the observed frame (after the action is performed) onto `state` and removing the oldest frame. You will see the implementation later. 
* We have to discuss how a minibatch is returned from the replay memory and how the gradient descent step is performed.
* Finally we have to look at why and how the target Q network is reset to the main Q network.

Let's continue with the replay memory

## 5. Replay memory

>Second, learning directly from consecutive samples is inefficient, due to the strong correlations between the samples; randomizing the samples breaks these correlations and therefore reduces the variance of the updates. Third, when learning on-policy the current parameters determine the next data sample that the parameters are trained on. For example, if the maximizing action is to move left then the training samples will be dominated by samples from the left-hand side; if the maximizing action then switches to the right then the training distribution will also switch. It is easy to see how unwanted feedback loops may arise and the parameters could get stuck in a poor local minimum, or even diverge catastrophically. ([page 5 of Mnih et al. 2013](https://arxiv.org/abs/1312.5602))

This means that when we choose an action and perform a step to receive a reward, the network does not learn from this last step but rather adds the transition to the replay memory. It then draws a random minibatch from the replay memory to perform a gradient descent step.

The replay memory stores the last one million transitions. Let's recall that a transition is `[state, action, reward, terminal, new_state]`. We therefore need to store the last one million `state`, `action`, `reward`, `terminal` and `new_state`. If you remember that `state` and `new_state` are four frames each, that would be eight million frames. However, since `new_state` is created by stacking the newest frame on top of `state` and deleting the oldest frame, `new_state` and `state` share three frames. Furthermore, `new_state` of transition i will be `state` of transition i+1. This means that it is sufficient to store the last one million frames (84*84 pixels) as a (1 million, 84, 84) tensor and then slice four frames out of this tensor when we need a `state` or `new_state`. 

With one million frames of 84 by 84 pixels that need to be stored in your computer's memory, we need to consider in what datatype we store them. The environment returns frames with pixel values stored as `uint8` which can have values ranging from 0 to 255. A `uint8` needs 8 bits. The network expects a `tf.float32` input with pixel values between 0 and 1 (which takes four times more space than a `uint8`). Since we want to reduce the memory requirements, we store the frames in `uint8` and divide them by 255 before passing them to the network.

When implementing this version of replay memory, we looked at this [code](https://github.com/tambetm/simple_dqn/blob/master/src/replay_memory.py) and ended up implementing the replay memory with some adjustments that make the code more understandable.

Let's look at the `ReplayMemory` class below. In the constructor, we pre-allocate the memory for the frames, the actions, the rewards, the terminal states and also for the states and new states of the minibatch. 

In the `add_experience` method the frames etc. are written into `self.frames` at index `self.current` which is then increased by 1. When `self.current` reaches the size of the replay memory (one million), it is reset to zero to overwrite the oldest frames. The method `_get_state` slices four frames out of `self.frames` and returns them as a `state`.

To understand what the method `_get_valid_indices` does, we need to understand what an invalid index is. We store all frames the agent sees in `self.frames`. When a game terminates (`terminal=True`) at index i, frame at index i belongs to a different episode than the frame at i+1. We want to avoid creating a `state` with frames from two different episodes. The same thing can happen at the index `self.current`. 

Finally, we need to make sure that an index is not smaller than the number of frames stacked together to create a `state` (`self.agent_history_length=4`), so that a `state` and `new_state` can be sliced out of the array. 

The method `_get_valid_indices` finds 32 (size of minibatch) valid indices.
The method `get_minibatch` returns the transitions for those indices. Pay attention that we need to transpose `self.states` and `self.new_states` before returning them: the DQN expects an input of the dimension `[None,84,84,4]` whereas `_get_state` returns a `state` of the dimension `[4,84,84]`

We now know 1) why a replay memory greatly improves the stability of the algorithm, 2) how to store a transition in the replay memory and 3) how a minibatch is returned.


In [6]:
class ReplayMemory:
    """Replay Memory that stores the last size=1,000,000 transitions"""
    def __init__(self, size=1000000, frame_height=84, frame_width=84, 
                 agent_history_length=4, batch_size=32):
        """
        Args:
            size: Integer, Number of stored transitions
            frame_height: Integer, Height of a frame of an Atari game
            frame_width: Integer, Width of a frame of an Atari game
            agent_history_length: Integer, Number of frames stacked together to create a state
            batch_size: Integer, Number of transitions returned in a minibatch
        """
        self.size = size
        self.frame_height = frame_height
        self.frame_width = frame_width
        self.agent_history_length = agent_history_length
        self.batch_size = batch_size
        self.count = 0    # number of slots that hold valid data (at most size)
        self.current = 0  # slot that the next experience is written to
        
        # Pre-allocate memory
        self.actions = np.empty(self.size, dtype=np.int32)
        self.rewards = np.empty(self.size, dtype=np.float32)
        self.frames = np.empty((self.size, self.frame_height, self.frame_width), dtype=np.uint8)
        # np.bool_ instead of the alias np.bool, which was removed in NumPy 1.24
        self.terminal_flags = np.empty(self.size, dtype=np.bool_)
        
        # Pre-allocate memory for the states and new_states in a minibatch
        self.states = np.empty((self.batch_size, self.agent_history_length, 
                                self.frame_height, self.frame_width), dtype=np.uint8)
        self.new_states = np.empty((self.batch_size, self.agent_history_length, 
                                    self.frame_height, self.frame_width), dtype=np.uint8)
        self.indices = np.empty(self.batch_size, dtype=np.int32)
        
    def add_experience(self, action, frame, reward, terminal):
        """
        Writes one experience at position self.current and advances the
        write position, wrapping around when the memory is full.

        Args:
            action: An integer between 0 and env.action_space.n - 1 
                determining the action the agent performed
            frame: A (frame_height, frame_width) frame of an Atari game in
                grayscale, i.e. (84, 84) with the defaults
            reward: A float determining the reward the agent received for performing an action
            terminal: A bool stating whether the episode terminated
        Raises:
            ValueError: If frame does not have shape (frame_height, frame_width)
        """
        if frame.shape != (self.frame_height, self.frame_width):
            raise ValueError('Dimension of frame is wrong!')
        self.actions[self.current] = action
        self.frames[self.current, ...] = frame
        self.rewards[self.current] = reward
        self.terminal_flags[self.current] = terminal
        self.count = max(self.count, self.current+1)
        self.current = (self.current + 1) % self.size
             
    def _get_state(self, index):
        """
        Returns the agent_history_length frames that end at index (inclusive),
        as a slice of shape (agent_history_length, frame_height, frame_width).
        """
        if self.count == 0:
            raise ValueError("The replay memory is empty!")
        if index < self.agent_history_length - 1:
            raise ValueError("Index must be min 3")
        return self.frames[index-self.agent_history_length+1:index+1, ...]
        
    def _get_valid_indices(self):
        """
        Fills self.indices with batch_size randomly drawn valid indices.

        An index is rejected when the frames needed for its state/new_state
        straddle the write position or contain the end of an episode. Sampling
        repeats until a valid index is found, so it does not terminate if the
        memory contains no valid index at all.
        """
        for i in range(self.batch_size):
            while True:
                index = random.randint(self.agent_history_length, self.count - 1)
                # Frames index-agent_history_length..index would mix the newest
                # and the oldest (about to be overwritten) data.
                if index >= self.current and index - self.agent_history_length <= self.current:
                    continue
                # A terminal flag before index means the frames come from two episodes.
                if self.terminal_flags[index - self.agent_history_length:index].any():
                    continue
                break
            self.indices[i] = index
            
    def get_minibatch(self):
        """
        Returns a minibatch of self.batch_size = 32 transitions as the tuple
        (states, actions, rewards, new_states, terminal_flags).

        states and new_states have shape
        (batch_size, frame_height, frame_width, agent_history_length). They are
        transposed views of buffers that are overwritten by the next call.

        Raises:
            ValueError: If the memory holds too few frames to build a transition
        """
        # A transition needs agent_history_length + 1 frames (state and new_state
        # overlap in all but one frame), so count must exceed agent_history_length.
        if self.count <= self.agent_history_length:
            raise ValueError('Not enough memories to get a minibatch')
        
        self._get_valid_indices()
            
        for i, idx in enumerate(self.indices):
            self.states[i] = self._get_state(idx - 1)
            self.new_states[i] = self._get_state(idx)
        
        return np.transpose(self.states, axes=(0, 2, 3, 1)), self.actions[self.indices], self.rewards[self.indices], np.transpose(self.new_states, axes=(0, 2, 3, 1)), self.terminal_flags[self.indices]
                

## 6. Target network and parameter update

Why do we need two networks, the action-value function and the *target* action-value function?

Remember that prior to updating the network's parameters, we draw a minibatch with 32 transitions. For simplicity we consider one transition now. It consists of a `state`, an `action` that was performed in the `state`, the received `reward`, the `new_state` and a bool saying whether the episode is over.

We perform a gradient descent step:
The main network looks at state and estimates the $Q_\text{prediction}$-values that say how good each action is. However, we want the $Q$-values to follow the Bellman equation we introduced above. Therefore we calculate the $Q_\text{target}$-values according to the Bellman equation (how we would like the $Q$-values to be) and then compare the estimates $Q_\text{prediction}$ to the targets $Q_\text{target}$. Let's consider the quadratic loss function instead of the Huber loss function for simplicity:

\begin{equation}
L = \frac{1}{2}\left(Q_\text{prediction} - Q_\text{target}\right)^2
\end{equation}

This ensures that we regress the current $Q_\text{prediction}$-values for `state` towards the $Q_\text{target}$-values given by the Bellman equation.

$Q_\text{prediction}$ is calculated in the `DQN` class (`self.q_values`). $Q_\text{prediction}$ depends on the current `state` in the minibatch we drew and on the parameters $\theta$ of the network that estimates it.

The $Q_\text{target}$ value is calculated according to the Bellman equation. It is the sum of the immediate reward $r$ received for performing action $a$ in state $s$ (`action` and `state` from the minibatch) and the maximum $Q$-value over all possible actions $a'$ in $s'$ (`new_state` from the minibatch):

\begin{equation}
Q_\text{target}(s,a) = r + \gamma \textrm{max} \left(Q(s',a')\right)
\end{equation}

This is not done in the `DQN` class but in the `learn` method below. The calculated value is then passed to the placeholder called `self.target_q` in the `DQN` class. There, the loss function is defined and the gradient descent step is performed.

So, now that we understand how the parameters are updated, why use two networks?

The problem is that both $Q_\text{prediction}$ and $Q_\text{target}$ depend on the same parameters $\theta$ if only one network is used. This can lead to instability when regressing $Q_\text{prediction}$ towards $Q_\text{target}$ because the "target is moving". We ensure a "fixed target" by introducing a second network with fixed and only occasionally updated parameters that estimates the target $Q$-values.

On page 1 of [Mnih et al. 2015](https://www.nature.com/articles/nature14236/) the authors explain:
>Reinforcement learning is known to be unstable or even to diverge when a nonlinear function approximator such as a neural network is used to represent the action-value (also known as Q) function. This instability has several causes: the correlations present in the sequence of observations, the fact that small updates to Q may significantly change the policy and therefore change the data distribution, and the correlations between the action-values [...] and the target values [...].
We address these instabilities with a novel variant of Q-learning, which uses two key ideas. First, we used a biologically inspired mechanism termed experience replay that randomizes over the data, thereby removing correlations in the observation sequence and smoothing over changes in the data distribution [...]. Second, we used an iterative update that adjusts the action-values (Q) towards target values that are only periodically updated, thereby reducing correlations with the target. 

Therefore they used one network to predict the $Q_\text{prediction}$-value and the other fixed network to predict the $Q_\text{target}$-value. The main network is optimized during the gradient descent step and every 10000 steps the main network's parameters are copied to the target network. Be aware that the network update frequency is measured in the number of chosen actions/frames seen (DeepMind code) and not the number of parameter updates which occur every four frames ([Mnih et al. 2015](https://www.nature.com/articles/nature14236/)).

There is one additional very powerful improvement called *Double Q-Learning*.

## 7. Double Q-Learning
DQN has been observed to estimate unrealistically high $Q$-values. The reason for this is that the Bellman equation *includes a maximization step over estimated action values, which tends to prefer overestimated to underestimated values* (see [van Hasselt et al. 2016, page 1](http://www.aaai.org/ocs/index.php/AAAI/AAAI16/paper/download/12389/11847)). 

The authors explain:

>If all values would be uniformly higher then the relative action preferences are preserved and we would not expect the resulting policy to be any worse. [...]
If, however, the overestimations are not uniform and not concentrated at states about which we wish to learn more, then they might negatively affect the quality of the resulting policy. [...]
We then show that this algorithm not only yields more accurate value estimates, but leads to much higher scores on several games. This demonstrates that the overestimations of DQN were indeed leading to poorer policies and that it is beneficial to reduce them 

The estimated $Q$-values are noisy. Assume that the true $Q$-value is 0 for all actions. But because of the noisy estimation, some $Q$-values might be slightly positive, others slightly negative. The max operation in the Bellman equation will however always choose the small positive values, despite the fact that those actions are not truly better. The estimation of $Q$-values is thus biased towards larger values. How do we fix this?
Instead of estimating the $Q$-values in the next state $Q(s',a')$ with only the target network, we use the main network to estimate which action is the best and then ask the target network how high the $Q$-value is for that action. This way, the main network will still prefer the action with the small positive $Q$-value but because of the noisy estimation, the target network will predict a small positive **or** small negative $Q$-value for that action and on average, the predicted $Q$-values will be closer to 0.

Mathematically, the reason for the overestimation is that the expectation of a maximum is greater than or equal to the maximum of an expectation ([van Hasselt 2013, Theorem 1](https://arxiv.org/abs/1302.7175)).

The Bellman equation changes from

\begin{align}
Q_\text{target}(s,a) &= r + \gamma \textrm{max} Q(s',a';\theta_\text{target}) &\text{Normal DQN}\\
\text{to}\qquad\qquad Q_\text{target}(s,a) &= r + \gamma Q\left(s',a'=\text{argmax} Q(s',a';\theta_\text{main});\theta_\text{target}\right)&\text{Double DQN}
\end{align}

The main network estimates which action $a'$ (in the next state $s'$) is best (that is the $\text{argmax} Q(s',a';\theta_\text{main})$ part). The target network then estimates what the $Q$-value for that action is. This $Q$-value has to be discounted with $\gamma$ and is then added to the reward $r$ the agent got for action $a$ (not $a'$).

I know that this equation might look discouraging. So let's describe it again in words:

Normal DQN: Ask the target network for the highest $Q$-Value. If the noisy $Q$-values are for example $(0.1,-0.1)$ for actions with index $0$ and $1$ respectively, the target $Q$-network will answer $0.1$.

Double DQN: Ask the main network which action has the highest $Q$-value. If the noisy $Q$-values are for example $(0.1,-0.1)$ for actions with index $0$ and $1$ respectively, the main network will answer that action with index $0$ has the highest $Q$-value. Then we ask the target network, which has a different noise, what the $Q$-value for the action with the chosen index ($0$ in this example) is. Let's assume the target network's noisy estimates are $(-0.05,0.3)$ it will answer $-0.05$.  

This solves the problem of overestimated $Q$-values because the two networks have different noise and the bias towards slightly larger noisy $Q$-values cancels.


One more thing:
If the game is over (`terminal=True`) because the agent lost or won, there is no next state and the $Q_\text{target}$-value is simply the reward $r$.

Look at the implementation in the cell below.

In [7]:
def learn(session, replay_memory, main_dqn, target_dqn, batch_size, gamma):
    """
    Performs one Double-DQN parameter update on the main network.

    A minibatch is drawn from the replay memory, the target Q-values are
    computed from it, and the main DQN is regressed towards those targets.

    Args:
        session: A tensorflow session object
        replay_memory: A ReplayMemory object
        main_dqn: A DQN object
        target_dqn: A DQN object
        batch_size: Integer, Batch size
        gamma: Float, discount factor for the Bellman equation
    Returns:
        loss: The loss of the minibatch, for tensorboard
    """
    batch = replay_memory.get_minibatch()
    states, actions, rewards, new_states, terminal_flags = batch

    # Action selection: the main network picks the best action in the next
    # state s' (new_states is passed!) for every transition in the minibatch.
    best_next_actions = session.run(
        main_dqn.best_action, feed_dict={main_dqn.input: new_states})

    # Action evaluation: the target network provides the Q-values in s', of
    # which only the one belonging to the selected action is kept.
    next_q_values = session.run(
        target_dqn.q_values, feed_dict={target_dqn.input: new_states})
    double_q = next_q_values[range(batch_size), best_next_actions]

    # Bellman equation. The mask is 0 for terminal transitions, so that
    # target_q equals the reward when the game is over.
    not_terminal = 1 - terminal_flags
    target_q = rewards + (gamma*double_q * not_terminal)

    # Gradient descent step that updates the parameters of the main network
    feed = {
        main_dqn.input: states,
        main_dqn.target_q: target_q,
        main_dqn.action: actions,
    }
    loss, _ = session.run([main_dqn.loss, main_dqn.update], feed_dict=feed)
    return loss

In [8]:
# main network periodically copied every 10,000 steps to the target network
class TargetNetworkUpdater:
    """Copies the parameters of the main DQN to the target DQN"""
    def __init__(self, main_dqn_vars, target_dqn_vars):
        """
        Args:
            main_dqn_vars: A list of tensorflow variables belonging to the main DQN network
            target_dqn_vars: A list of tensorflow variables belonging to the target DQN network.
                Variables are paired with main_dqn_vars by position.
        """
        self.main_dqn_vars = main_dqn_vars
        self.target_dqn_vars = target_dqn_vars
        # Assign ops are created lazily on the first update and then reused;
        # creating them on every update would add new ops to the graph each time.
        self._update_ops = None

    def _update_target_vars(self):
        """Creates one assign op per main variable that copies it to its target counterpart."""
        update_ops = []
        for i, var in enumerate(self.main_dqn_vars):
            copy_op = self.target_dqn_vars[i].assign(var.value())
            update_ops.append(copy_op)
        return update_ops
            
    def update_networks(self, sess):
        """
        Args:
            sess: A Tensorflow session object
        Assigns the values of the parameters of the main network to the 
        parameters of the target network
        """
        if self._update_ops is None:
            self._update_ops = self._update_target_vars()
        if self._update_ops:
            sess.run(self._update_ops)

In [9]:
def generate_gif(frame_number, frames_for_gif, reward, path):
    """
    Upscales the frames of an episode and saves them as an animated gif.

        Args:
            frame_number: Integer, determining the number of the current frame
            frames_for_gif: A sequence of (210, 160, 3) frames of an Atari game in RGB.
                The sequence is only read, it is not modified.
            reward: Integer, Total reward of the episode that is output as a gif
            path: String, prefix (e.g. a directory ending in '/') that the file name 
                "ATARI_frame_<frame_number>_reward_<reward>.gif" is appended to
    """
    # Build a new list instead of overwriting the caller's frames in place.
    # order=0 with a (420, 320, 3) target doubles the documented frame size 
    # without interpolating between pixels.
    upscaled_frames = [resize(frame, (420, 320, 3), 
                              preserve_range=True, order=0).astype(np.uint8)
                       for frame in frames_for_gif]
        
    file_name = "ATARI_frame_{0}_reward_{1}.gif".format(frame_number, reward)
    imageio.mimsave(f'{path}{file_name}', upscaled_frames, duration=1/30)


The learning environment is provided by OpenAI's `gym`. It is very important that you have the right version of the environments. `BreakoutDeterministic-v3`, for example, has six actions, whereas `BreakoutDeterministic-v4` has a minimal set of four actions, which is what DeepMind used in [xitari](https://github.com/deepmind/xitari/blob/master/games/supported/Breakout.cpp#L88-L91). Additional actions make the learning task harder for the agent, which can alter the evaluation score significantly. If you want to find out the number of actions and their meaning, type `env.action_space.n` and `env.unwrapped.get_action_meanings()`.

There are two additional small adjustments we need to discuss:

When a life is lost, we save `terminal_life_lost = True` in the replay memory. To see why this is necessary, create a new notebook, make a Breakout environment, repeat random or no-op actions in a loop and print the reward and the number of lives the agent has left:

```python
frame = env.reset()
for i in range(1000):
    new_frame, reward, terminal, info = env.step(0)
    print(reward, terminal, info['ale.lives'])
```

You will see that there is no punishment (the reward is 0) when a life is lost. Treating the loss of a life as the end of an episode helps the agent tremendously in learning to avoid losing lives. However, we only do this in the replay memory, as we do not want to reset the game once the first life is lost. Therefore two terminal states, `terminal` and `terminal_life_lost`, are needed: one to reset the game, the other for the replay memory. This adjustment helped the agent improve from an average reward slightly above 50 to approximately 140 in Breakout!

Let's wrap the `gym` environment in an `Atari` class which takes care of stacking frames on top of each other to create states, resetting the environment when an episode has ended and checking whether a life was lost after a step was taken. You will find the implementation in the cell below.

During evaluation, at the beginning of each episode, action 1 ('FIRE') is repeated for a random number of steps between 1 and `no_op_steps=10`. This ensures that the agent starts in a different situation every time and thus cannot simply learn a fixed sequence of actions. [Mnih et al. 2015](https://www.nature.com/articles/nature14236/) use a random number between 1 and 30 of 'NOOP' actions (see page 10, Table 1). However, in Breakout, nothing happens if you don't fire first. Once there is a ball in the game, 'FIRE' does nothing. Therefore I started with a random number of 'FIRE' actions. Furthermore, I limited the random number of initial 'FIRE' actions to 10. When experimenting with larger numbers, I found that the first life was usually already lost by the time the agent started moving. You might want to change this in case you want to experiment with another environment.

In [10]:
class Atari:
    """Wraps a gym environment and keeps a stack of processed frames as the current state"""
    def __init__(self, envName, no_op_steps=10, agent_history_length=4):
        """
        Args:
            envName: String, name of the gym environment that is created
            no_op_steps: Integer, upper bound for the random number of 'FIRE' 
                actions performed when an evaluation episode is reset
            agent_history_length: Integer, number of frames stacked to form a state
        """
        self.no_op_steps = no_op_steps
        self.agent_history_length = agent_history_length
        self.state = None
        self.last_lives = 0
        self.env = gym.make(envName)
        self.frame_processor = ProcessFrame()

    def reset(self, sess, evaluation=False):
        """
        Resets the environment and builds the first state by repeating the 
        first processed frame agent_history_length times along the last axis.

        Args:
            sess: A Tensorflow session object
            evaluation: A boolean saying whether the agent is evaluating or training
        Returns:
            Always True, so that the agent starts with a 'FIRE' action when evaluating
        """
        frame = self.env.reset()
        self.last_lives = 0
        if evaluation:
            # Start every evaluation episode with a random number of 'FIRE' actions
            fire_steps = random.randint(1, self.no_op_steps)
            for _ in range(fire_steps):
                frame, _reward, _terminal, _info = self.env.step(1)
        first_frame = self.frame_processor.process(sess, frame)   # (★★★)
        self.state = np.repeat(first_frame, self.agent_history_length, axis=2)

        return True

    def step(self, sess, action):
        """
        Performs an action, observes the reward and the terminal flags and 
        shifts the newly processed frame into the state.

        Args:
            sess: A Tensorflow session object
            action: Integer, action the agent performs
        Returns:
            processed_new_frame: The new frame after processing
            reward: The reward returned by the environment
            terminal: The terminal flag returned by the environment
            terminal_life_lost: True if the number of lives decreased during 
                this step, otherwise the same as terminal
            new_frame: The unprocessed frame returned by the environment
        """
        new_frame, reward, terminal, info = self.env.step(action)  # (5★)

        lives_left = info['ale.lives']
        terminal_life_lost = True if lives_left < self.last_lives else terminal
        self.last_lives = lives_left

        processed_new_frame = self.frame_processor.process(sess, new_frame)   # (6★)
        # Drop the oldest frame and append the newest one along the last axis (6★)
        self.state = np.concatenate((self.state[:, :, 1:], processed_new_frame), axis=2)

        return processed_new_frame, reward, terminal, terminal_life_lost, new_frame
    

In the cell below I declare some constants that define the learning behaviour of the agent:

In [11]:
# Start from an empty default graph so that re-running this cell does not 
# stack a second copy of the networks on top of the first one
tf.reset_default_graph()

# Control parameters
MAX_EPISODE_LENGTH = 18000       # Equivalent of 5 minutes of gameplay at 60 frames per second
EVAL_FREQUENCY = 200000          # Number of frames the agent sees between evaluations
EVAL_STEPS = 10000               # Number of frames for one evaluation
NETW_UPDATE_FREQ = 10000         # Number of chosen actions between updating the target network. 
                                 # According to Mnih et al. 2015 this is measured in the number of 
                                 # parameter updates (every four actions), however, in the 
                                 # DeepMind code, it is clearly measured in the number
                                 # of actions the agent chooses
DISCOUNT_FACTOR = 0.99           # gamma in the Bellman equation
REPLAY_MEMORY_START_SIZE = 50000 # Number of completely random actions, 
                                 # before the agent starts learning
MAX_FRAMES = 30000000            # Total number of frames the agent sees 
MEMORY_SIZE = 1000000            # Number of transitions stored in the replay memory
NO_OP_STEPS = 10                 # Maximum number of 'NOOP' or 'FIRE' actions at the beginning 
                                 # of an evaluation episode
UPDATE_FREQ = 4                  # Every four actions a gradient descent step is performed
HIDDEN = 1024                    # Number of filters in the final convolutional layer. The output 
                                 # has the shape (1,1,1024) which is split into two streams. Both 
                                 # the advantage stream and value stream have the shape 
                                 # (1,1,512). This is slightly different from the original 
                                 # implementation but tests I did with the environment Pong 
                                 # have shown that this way the score increases more quickly
LEARNING_RATE = 0.00001          # Set to 0.00025 in Pong for quicker results. 
                                 # Hessel et al. 2017 used 0.0000625
BS = 32                          # Batch size

# Output locations; the directories are created if they do not exist yet
PATH = "output/"                 # Gifs and checkpoints will be saved here
SUMMARIES = "summaries"          # logdir for tensorboard
RUNID = 'run_1'                  # Sub-directory of SUMMARIES used for this run
os.makedirs(PATH, exist_ok=True)
os.makedirs(os.path.join(SUMMARIES, RUNID), exist_ok=True)
SUMM_WRITER = tf.summary.FileWriter(os.path.join(SUMMARIES, RUNID))

# The environment wrapper is shared by the training and the test cells below
atari = Atari(ENV_NAME, NO_OP_STEPS)

print("The environment has the following {} actions: {}".format(atari.env.action_space.n, 
                                                                atari.env.unwrapped.get_action_meanings()))

[2019-03-07 23:27:00,056] Making new env: BreakoutDeterministic-v4
  result = entry_point.load(False)


The environment has the following 4 actions: ['NOOP', 'FIRE', 'RIGHT', 'LEFT']


Let's create the networks:

In [12]:
# Main DQN and target DQN networks, each built inside its own variable scope
with tf.variable_scope('mainDQN'):
    MAIN_DQN = DQN(atari.env.action_space.n, HIDDEN, LEARNING_RATE)   # (★★)
with tf.variable_scope('targetDQN'):
    # No learning rate is passed here, so the default of the DQN class is used
    TARGET_DQN = DQN(atari.env.action_space.n, HIDDEN)               # (★★)

# Both are created after the networks, so the variables of both networks exist already
init = tf.global_variables_initializer()
saver = tf.train.Saver()    

# Trainable variables of each network, selected by their variable scope
MAIN_DQN_VARS = tf.trainable_variables(scope='mainDQN')
TARGET_DQN_VARS = tf.trainable_variables(scope='targetDQN')

In [13]:
# Set up tensorboard
# Labels for the parameter histograms: entry i is used for MAIN_DQN_VARS[i].
# NOTE(review): this assumes that the order of MAIN_DQN_VARS matches the order 
# of these labels - confirm against the DQN class
LAYER_IDS = ["conv1", "conv2", "conv3", "conv4", "denseAdvantage", 
             "denseAdvantageBias", "denseValue", "denseValueBias"]

# Scalar summaries for tensorboard: loss, average reward and evaluation score.
# The values are computed in Python and fed in through the placeholders.
with tf.name_scope('Performance'):
    LOSS_PH = tf.placeholder(tf.float32, shape=None, name='loss_summary')
    LOSS_SUMMARY = tf.summary.scalar('loss', LOSS_PH)
    REWARD_PH = tf.placeholder(tf.float32, shape=None, name='reward_summary')
    REWARD_SUMMARY = tf.summary.scalar('reward', REWARD_PH)
    EVAL_SCORE_PH = tf.placeholder(tf.float32, shape=None, name='evaluation_summary')
    EVAL_SCORE_SUMMARY = tf.summary.scalar('evaluation_score', EVAL_SCORE_PH)

# Only loss and reward are merged; the evaluation score summary is run on its own
PERFORMANCE_SUMMARIES = tf.summary.merge([LOSS_SUMMARY, REWARD_SUMMARY])

# Histogram summaries for tensorboard: parameters
with tf.name_scope('Parameters'):
    ALL_PARAM_SUMMARIES = []
    for i, Id in enumerate(LAYER_IDS):
        with tf.name_scope('mainDQN/'):
            # Each variable is flattened to one dimension before it is summarized
            MAIN_DQN_KERNEL = tf.summary.histogram(Id, tf.reshape(MAIN_DQN_VARS[i], shape=[-1]))
        ALL_PARAM_SUMMARIES.extend([MAIN_DQN_KERNEL])
PARAM_SUMMARIES = tf.summary.merge(ALL_PARAM_SUMMARIES)

In [14]:
# Trains from scratch, or continues from a saved model if a trained_path is provided
def train(trained_path = None, save_file = None, model_name="my_model"):
    """Contains the training and evaluation loops

    Alternates between EVAL_FREQUENCY frames of training and EVAL_STEPS frames 
    of evaluation until MAX_FRAMES frames have been seen. After every evaluation 
    a gif of the first evaluation game and a checkpoint are written to PATH. 
    Progress is appended to 'rewards.dat' and 'rewardsEval.dat' and logged 
    through SUMM_WRITER.

    Args:
        trained_path: String or None, directory to look for a saved model in, 
            e.g. "trained/pong/". If None, all variables are freshly initialized.
        save_file: String or None, name of the .meta file inside trained_path. 
            Only used if trained_path is provided.
        model_name: String, file name prefix of the checkpoints saved in PATH
    """
    my_replay_memory = ReplayMemory(size=MEMORY_SIZE, batch_size=BS)   # (★)
    network_updater = TargetNetworkUpdater(MAIN_DQN_VARS, TARGET_DQN_VARS)
    action_getter = ActionGetter(atari.env.action_space.n, 
                                 replay_memory_start_size=REPLAY_MEMORY_START_SIZE, 
                                 max_frames=MAX_FRAMES)

    with tf.Session() as sess:
        
        # The saver gets its own local name. Assigning to `saver` inside this 
        # function would make `saver` a local variable that is unbound whenever 
        # no trained_path is given, so saving a checkpoint would fail.
        if trained_path is not None:
            checkpoint_saver = tf.train.import_meta_graph(trained_path+save_file)
            checkpoint_saver.restore(sess,tf.train.latest_checkpoint(trained_path))
        else:
            checkpoint_saver = saver  # Module-level saver created together with the networks
            sess.run(init)
        
        frame_number = 0
        rewards = []
        loss_list = []
        
        while frame_number < MAX_FRAMES:
            
            ########################
            ####### Training #######
            ########################
            epoch_frame = 0
            while epoch_frame < EVAL_FREQUENCY:
                atari.reset(sess)
                episode_reward_sum = 0
                for _ in range(MAX_EPISODE_LENGTH):
                    # (4★)
                    action = action_getter.get_action(sess, frame_number, atari.state, MAIN_DQN)   
                    # (5★)
                    processed_new_frame, reward, terminal, terminal_life_lost, _ = atari.step(sess, action)  
                    frame_number += 1
                    epoch_frame += 1
                    episode_reward_sum += reward
                    
                    # (7★) Store transition in the replay memory. The loss of a life 
                    # is stored as terminal, the game itself is only reset on `terminal`
                    my_replay_memory.add_experience(action=action, 
                                                    frame=processed_new_frame[:, :, 0],
                                                    reward=reward, 
                                                    terminal=terminal_life_lost)   
                    
                    if frame_number % UPDATE_FREQ == 0 and frame_number > REPLAY_MEMORY_START_SIZE:
                        loss = learn(sess, my_replay_memory, MAIN_DQN, TARGET_DQN,
                                     BS, gamma = DISCOUNT_FACTOR) # (8★)
                        loss_list.append(loss)
                    if frame_number % NETW_UPDATE_FREQ == 0 and frame_number > REPLAY_MEMORY_START_SIZE:
                        network_updater.update_networks(sess) # (9★)
                    
                    if terminal:
                        break

                rewards.append(episode_reward_sum)
                
                # Output the progress every 10 episodes:
                if len(rewards) % 10 == 0:
                    # Scalar summaries for tensorboard
                    if frame_number > REPLAY_MEMORY_START_SIZE:
                        summ = sess.run(PERFORMANCE_SUMMARIES, 
                                        feed_dict={LOSS_PH:np.mean(loss_list), 
                                                   REWARD_PH:np.mean(rewards[-100:])})
                        
                        SUMM_WRITER.add_summary(summ, frame_number)
                        loss_list = []
                    # Histogram summaries for tensorboard
                    summ_param = sess.run(PARAM_SUMMARIES)
                    SUMM_WRITER.add_summary(summ_param, frame_number)
                    
                    print(len(rewards), frame_number, np.mean(rewards[-100:]))
                    with open('rewards.dat', 'a') as reward_file:
                        print(len(rewards), frame_number, 
                              np.mean(rewards[-100:]), file=reward_file)
            
            ########################
            ###### Evaluation ######
            ########################
            terminal = True
            gif = True
            frames_for_gif = []
            eval_rewards = []
            
            for _ in range(EVAL_STEPS):
                if terminal:
                    terminal_life_lost = atari.reset(sess, evaluation=True)
                    episode_reward_sum = 0
                    terminal = False
               
                # Fire (action 1), when a life was lost or the game just started, 
                # so that the agent does not stand around doing nothing. When playing 
                # with other environments, you might want to change this...
                action = 1 if terminal_life_lost else action_getter.get_action(sess, frame_number,
                                                                               atari.state, 
                                                                               MAIN_DQN,
                                                                               evaluation=True)
                _, reward, terminal, terminal_life_lost, new_frame = atari.step(sess, action)
                episode_reward_sum += reward

                if gif: 
                    frames_for_gif.append(new_frame)
                if terminal:
                    eval_rewards.append(episode_reward_sum)
                    gif = False # Save only the first game of the evaluation as a gif
            
            # The score is nan if no evaluation game finished within EVAL_STEPS
            eval_score = np.mean(eval_rewards) if eval_rewards else float('nan')
            print("Evaluation score:\n", eval_score)       
            try:
                generate_gif(frame_number, frames_for_gif, eval_rewards[0], PATH)
            except IndexError:
                print("No evaluation game finished")
            
            #Save the network parameters
            checkpoint_saver.save(sess, PATH+'/'+model_name, global_step=frame_number)
            frames_for_gif = []
            
            # Show the evaluation score in tensorboard
            summ = sess.run(EVAL_SCORE_SUMMARY, feed_dict={EVAL_SCORE_PH:eval_score})
            SUMM_WRITER.add_summary(summ, frame_number)
            with open('rewardsEval.dat', 'a') as eval_reward_file:
                print(frame_number, eval_score, file=eval_reward_file)

`jupyter-nbconvert --to script DQN.ipynb` to generate python

`tensorboard --logdir=summaries` to set up tensorboard

In [15]:
if TRAIN:
    train()

In [16]:
# Load a trained network, let it play one episode and save that episode as a gif
if TEST:
    
    # Pick the saved model that belongs to the chosen environment
    if ENV_NAME == 'BreakoutDeterministic-v4':
        trained_path = "trained/breakout/"
        save_file = "my_model-15845555.meta"
    
    elif ENV_NAME == 'PongDeterministic-v4':
        trained_path = "trained/pong/"
        save_file = "my_model-3217770.meta"

    else:
        # Fail early with a clear message instead of a NameError further down
        raise ValueError("No trained model is configured for the environment {}".format(ENV_NAME))

    gif_path = "GIF/"
    os.makedirs(gif_path,exist_ok=True)

    action_getter = ActionGetter(atari.env.action_space.n, 
                                 replay_memory_start_size=REPLAY_MEMORY_START_SIZE, 
                                 max_frames=MAX_FRAMES)
    
    with tf.Session() as sess:
        saver = tf.train.import_meta_graph(trained_path+save_file)
        saver.restore(sess,tf.train.latest_checkpoint(trained_path))
        frames_for_gif = []
        terminal_live_lost = atari.reset(sess, evaluation = True)
        episode_reward_sum = 0
        # Play until the environment reports the end of the game
        while True:
            atari.env.render()
            # Fire (action 1) when a life was lost or the game just started
            action = 1 if terminal_live_lost else action_getter.get_action(sess, 0, atari.state, 
                                                                           MAIN_DQN, 
                                                                           evaluation = True)
            processed_new_frame, reward, terminal, terminal_live_lost, new_frame = atari.step(sess, action)
            episode_reward_sum += reward
            frames_for_gif.append(new_frame)
            if terminal:
                break
        
        atari.env.close()
        print("The total reward is {}".format(episode_reward_sum))
        print("Creating gif...")
        generate_gif(0, frames_for_gif, episode_reward_sum, gif_path)
        print("Gif created, check the folder {}".format(gif_path))

The total reward is 421.0
Creating gif...


  warn("The default mode, 'constant', will be changed to 'reflect' in "
  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


Gif created, check the folder GIF/


In [17]:
# Query the saved Breakout checkpoint for the variables it contains
atari_variables = tf.train.list_variables('trained/breakout/my_model-15845555')

In [19]:
# Open a session that stays available to the following cells, initialize all 
# variables and then load the saved Breakout model into it
trained_path = "trained/breakout/"
save_file = "my_model-15845555.meta"
sess = tf.Session()
# tf.initialize_all_variables is deprecated; the rest of the notebook 
# already uses tf.global_variables_initializer
sess.run(tf.global_variables_initializer())
saver = tf.train.import_meta_graph(trained_path+save_file)
saver.restore(sess,tf.train.latest_checkpoint(trained_path))

In [51]:
def _vars_named(name_fragment):
    """Returns the global variables whose name contains name_fragment but not "Adam" """
    return [v for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) 
            if name_fragment in v.name and "Adam" not in v.name]


# Ops that re-initialize the variables of the optimizers of both networks
optimizer = MAIN_DQN.optimizer
reset_main_optimizer_op = tf.variables_initializer(optimizer.variables())

optimizer = TARGET_DQN.optimizer
reset_target_optimizer_op = tf.variables_initializer(optimizer.variables())

# Ops that re-initialize the variables of one layer; the variables are 
# selected by a fragment of their name
conv1_vars = _vars_named("conv1")
reset_conv1_op = tf.variables_initializer(conv1_vars)

conv2_vars = _vars_named("conv2")
reset_conv2_op = tf.variables_initializer(conv2_vars)

conv3_vars = _vars_named("conv3")
reset_conv3_op = tf.variables_initializer(conv3_vars)

conv4_vars = _vars_named("conv4")
reset_conv4_op = tf.variables_initializer(conv4_vars)


advantage_vars = _vars_named("advantage")
reset_advantage_op = tf.variables_initializer(advantage_vars)

value_vars = _vars_named("value")
reset_value_op = tf.variables_initializer(value_vars)

In [None]:
# Trains starting from a saved model whose optimizer, value and advantage 
# variables are re-initialized, if a trained_path is provided
def transfer_initialized_train(trained_path = None, save_file = None, model_name="my_model"):
    """Contains the training and evaluation loops

    Works like a regular training run, except that after a saved model has been 
    restored, reset_main_optimizer_op, reset_target_optimizer_op, reset_value_op 
    and reset_advantage_op are run, so that only the remaining restored 
    variables are transferred. After every evaluation a gif of the first 
    evaluation game and a checkpoint are written to PATH. Progress is appended 
    to 'rewards.dat' and 'rewardsEval.dat' and logged through SUMM_WRITER.

    Args:
        trained_path: String or None, directory to look for a saved model in, 
            e.g. "trained/pong/". If None, all variables are freshly initialized.
        save_file: String or None, name of the .meta file inside trained_path. 
            Only used if trained_path is provided.
        model_name: String, file name prefix of the checkpoints saved in PATH
    """
    my_replay_memory = ReplayMemory(size=MEMORY_SIZE, batch_size=BS)   # (★)
    network_updater = TargetNetworkUpdater(MAIN_DQN_VARS, TARGET_DQN_VARS)
    action_getter = ActionGetter(atari.env.action_space.n, 
                                 replay_memory_start_size=REPLAY_MEMORY_START_SIZE, 
                                 max_frames=MAX_FRAMES)

    with tf.Session() as sess:
        
        # The saver gets its own local name. Assigning to `saver` inside this 
        # function would make `saver` a local variable that is unbound whenever 
        # no trained_path is given, so saving a checkpoint would fail.
        if trained_path is not None:
            # Load saver
            checkpoint_saver = tf.train.import_meta_graph(trained_path+save_file)
            # Restore the graph
            checkpoint_saver.restore(sess,tf.train.latest_checkpoint(trained_path))
            
            # Reset optimizer parameters
            sess.run(reset_main_optimizer_op)
            sess.run(reset_target_optimizer_op)
            
            # Reset value parameters
            sess.run(reset_value_op)
            sess.run(reset_advantage_op)
        else:
            checkpoint_saver = saver  # Module-level saver created together with the networks
            sess.run(init)
        
        frame_number = 0
        rewards = []
        loss_list = []
        
        while frame_number < MAX_FRAMES:
            ########################
            ####### Training #######
            ########################
            epoch_frame = 0
            while epoch_frame < EVAL_FREQUENCY:
                atari.reset(sess)
                episode_reward_sum = 0
                for _ in range(MAX_EPISODE_LENGTH):
                    # (4★)
                    action = action_getter.get_action(sess, frame_number, atari.state, MAIN_DQN)   
                    # (5★)
                    processed_new_frame, reward, terminal, terminal_life_lost, _ = atari.step(sess, action)  
                    frame_number += 1
                    epoch_frame += 1
                    episode_reward_sum += reward
                    
                    # (7★) Store transition in the replay memory. The loss of a life 
                    # is stored as terminal, the game itself is only reset on `terminal`
                    my_replay_memory.add_experience(action=action, 
                                                    frame=processed_new_frame[:, :, 0],
                                                    reward=reward, 
                                                    terminal=terminal_life_lost)   
                    
                    if frame_number % UPDATE_FREQ == 0 and frame_number > REPLAY_MEMORY_START_SIZE:
                        loss = learn(sess, my_replay_memory, MAIN_DQN, TARGET_DQN,
                                     BS, gamma = DISCOUNT_FACTOR) # (8★)
                        loss_list.append(loss)
                    if frame_number % NETW_UPDATE_FREQ == 0 and frame_number > REPLAY_MEMORY_START_SIZE:
                        network_updater.update_networks(sess) # (9★)
                    
                    if terminal:
                        break

                rewards.append(episode_reward_sum)
                
                # Output the progress every 10 episodes:
                if len(rewards) % 10 == 0:
                    # Scalar summaries for tensorboard
                    if frame_number > REPLAY_MEMORY_START_SIZE:
                        summ = sess.run(PERFORMANCE_SUMMARIES, 
                                        feed_dict={LOSS_PH:np.mean(loss_list), 
                                                   REWARD_PH:np.mean(rewards[-100:])})
                        
                        SUMM_WRITER.add_summary(summ, frame_number)
                        loss_list = []
                    # Histogram summaries for tensorboard
                    summ_param = sess.run(PARAM_SUMMARIES)
                    SUMM_WRITER.add_summary(summ_param, frame_number)
                    
                    print(len(rewards), frame_number, np.mean(rewards[-100:]))
                    with open('rewards.dat', 'a') as reward_file:
                        print(len(rewards), frame_number, 
                              np.mean(rewards[-100:]), file=reward_file)
            
            ########################
            ###### Evaluation ######
            ########################
            terminal = True
            gif = True
            frames_for_gif = []
            eval_rewards = []
            
            for _ in range(EVAL_STEPS):
                if terminal:
                    terminal_life_lost = atari.reset(sess, evaluation=True)
                    episode_reward_sum = 0
                    terminal = False
               
                # Fire (action 1), when a life was lost or the game just started, 
                # so that the agent does not stand around doing nothing. When playing 
                # with other environments, you might want to change this...
                action = 1 if terminal_life_lost else action_getter.get_action(sess, frame_number,
                                                                               atari.state, 
                                                                               MAIN_DQN,
                                                                               evaluation=True)
                _, reward, terminal, terminal_life_lost, new_frame = atari.step(sess, action)
                episode_reward_sum += reward

                if gif: 
                    frames_for_gif.append(new_frame)
                if terminal:
                    eval_rewards.append(episode_reward_sum)
                    gif = False # Save only the first game of the evaluation as a gif
            
            # The score is nan if no evaluation game finished within EVAL_STEPS
            eval_score = np.mean(eval_rewards) if eval_rewards else float('nan')
            print("Evaluation score:\n", eval_score)       
            try:
                generate_gif(frame_number, frames_for_gif, eval_rewards[0], PATH)
            except IndexError:
                print("No evaluation game finished")
            
            #Save the network parameters
            checkpoint_saver.save(sess, PATH+'/'+model_name, global_step=frame_number)
            frames_for_gif = []
            
            # Show the evaluation score in tensorboard
            summ = sess.run(EVAL_SCORE_SUMMARY, feed_dict={EVAL_SCORE_PH:eval_score})
            SUMM_WRITER.add_summary(summ, frame_number)
            with open('rewardsEval.dat', 'a') as eval_reward_file:
                print(frame_number, eval_score, file=eval_reward_file)