<a href="https://colab.research.google.com/github/Kopok/RL-Experiments/blob/master/DDQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [140]:
import itertools
import random
import gym
import numpy as np
from collections import deque
import tflearn
import tensorflow as tf
from skimage.transform import rescale, resize, downscale_local_mean
from skimage.color import rgb2gray
import matplotlib.pyplot as plt

# Mount Google Drive at /content/gdrive; train() writes its checkpoints
# underneath this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

# Reload the logging module before configuring it.
# NOTE(review): presumably done so that basicConfig takes effect even when the
# kernel already installed log handlers -- confirm.
from importlib import reload  # Not needed in Python 2
import logging
reload(logging)
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

Mounted at /content/gdrive


In [0]:
# Gym environment id passed to gym.make() in train()
ENV_NAME = "Pong-v0"

# Learning rate given to the main DQN in train().
# (Note carried over from the end of this cell: Hessel et al. 2017 used 0.0000625.)
LEARNING_RATE = 0.00025

# Capacity of the replay Memory created in train(), and size of a sampled minibatch
MEMORY_SIZE = 15000
BATCH_SIZE = 32

# Epsilon-greedy exploration: start value, lower bound, and the multiplicative
# decay that train() applies once per finished episode
EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.995
# Size of a preprocessed frame (see preprocess())
IMG_WIDTH = 84
IMG_HEIGHT = 84


# Clears the default TensorFlow graph; train() does this again before it
# builds the two networks.
tf.reset_default_graph()

# Control parameters
# NOTE(review): EVAL_FREQUENCY, EVAL_STEPS, MAX_FRAMES, NO_OP_STEPS and RUNID are
# not referenced by any of the visible notebook cells.
MAX_EPISODE_LENGTH = 18000       # Equivalent of 5 minutes of gameplay at 60 frames per second
EVAL_FREQUENCY = 200000          # Number of frames the agent sees between evaluations
EVAL_STEPS = 10000               # Number of frames for one evaluation
NETW_UPDATE_FREQ = 10000         # Number of chosen actions between updating the target network. 
                                 # According to Mnih et al. 2015 this is measured in the number of 
                                 # parameter updates (every four actions); however, in the 
                                 # DeepMind code, it is clearly measured in the number
                                 # of actions the agent chooses
DISCOUNT_FACTOR = 0.99           # gamma in the Bellman equation
REPLAY_MEMORY_START_SIZE = MEMORY_SIZE # Number of frames played before the agent starts 
                                 # learning. train() still picks these actions epsilon-greedily,
                                 # so they are only fully random while epsilon is 1.0
MAX_FRAMES = 30000000            # Total number of frames the agent sees 
NO_OP_STEPS = 10                 # Number of 'NOOP' or 'FIRE' actions at the beginning of an 
                                 # evaluation episode
UPDATE_FREQ = 4                  # Every four actions a gradient descent step is performed
HIDDEN = 1024                    # Number of filters in the final convolutional layer. The output 
                                 # has the shape (1,1,1024) which is split into two streams. Both 
                                 # the advantage stream and value stream have the shape 
                                 # (1,1,512). This is slightly different from the original 
                                 # implementation, but the author's tests on Pong showed that
                                 # the score increases more quickly this way

RUNID = 'run_1'

In [0]:
def preprocess(state):
    """Convert a raw RGB game frame into a grey, cropped, down-scaled batch of one.

    NOTE: a later cell of this notebook defines ``preprocess`` again (returning a
    plain 2-D frame); when the cells run in order that later definition is the
    one ``train()`` and ``Memory`` actually use.

    Args:
        state: RGB frame as returned by the environment (an image array that
            ``rgb2gray`` accepts).
    Returns:
        numpy array of shape ``[-1, IMG_HEIGHT, IMG_WIDTH, 1]``.
    """
    # Turn to grey
    state = rgb2gray(state)

    # Crop 26px from top & 16px from bottom
    state = state[26:-16]

    # Downscale the cropped frame (168x160 per the original note) to
    # IMG_HEIGHT x IMG_WIDTH.
    # The previous version additionally called tf.image.resize_images() here and
    # threw the result away; in graph mode that only added unused ops to the
    # default graph on every single frame, so the call has been removed.
    state = resize(state, [IMG_HEIGHT, IMG_WIDTH], anti_aliasing=False)

    # Add a leading batch axis and a trailing channel axis
    return np.array(state).reshape([-1, IMG_HEIGHT, IMG_WIDTH, 1])

In [0]:
class DQN(object):
    """Implements a Deep Q Network

    The constructor builds the complete TensorFlow graph for one network:
    four convolutional layers, a split into a value and an advantage stream
    (dueling architecture), the resulting Q-values, and the loss / Adam update
    op used for training. The instance also carries the epsilon-greedy
    exploration rate used by act().
    """
    
    # pylint: disable=too-many-instance-attributes
    
    def __init__(self, n_actions, hidden=1024, learning_rate=0.00001, 
                 frame_height=84, frame_width=84, agent_history_length=4, MEMORY_SIZE = 8000):
        """
        Args:
            n_actions: Integer, number of possible actions
            hidden: Integer, Number of filters in the final convolutional layer. 
                    This is different from the DeepMind implementation
            learning_rate: Float, Learning rate for the Adam optimizer
            frame_height: Integer, Height of a frame of an Atari game
            frame_width: Integer, Width of a frame of an Atari game
            agent_history_length: Integer, Number of frames stacked together to create a state
            MEMORY_SIZE: Not used anywhere in this class; it is accepted but ignored.
        """     
        
        self.n_actions = n_actions
        self.hidden = hidden
        self.learning_rate = learning_rate
        self.frame_height = frame_height
        self.frame_width = frame_width
        self.agent_history_length = agent_history_length
        # Epsilon for epsilon-greedy action selection, decayed by update_explore_rate()
        self.exploration_rate = EXPLORATION_MAX

        
        # Batch of states: [batch, frame_height, frame_width, agent_history_length]
        self.input = tf.placeholder(shape=[None, self.frame_height, 
                                           self.frame_width, self.agent_history_length], 
                                    dtype=tf.float32)
        # Normalizing the input
        # NOTE(review): the notebook's preprocess() goes through rgb2gray, which may
        # already yield floats in [0, 1]; if so this divides a second time -- confirm
        # the intended input range.
        self.inputscaled = self.input/255
        
        # Convolutional layers
        # All four use "valid" padding, ReLU and no bias. For an 84x84 input the
        # spatial size shrinks 84 -> 20 -> 9 -> 7 -> 1, so conv4 outputs (1, 1, hidden).
        self.conv1 = tf.layers.conv2d(
            inputs=self.inputscaled, filters=32, kernel_size=[8, 8], strides=4,
            kernel_initializer=tf.variance_scaling_initializer(scale=2),
            padding="valid", activation=tf.nn.relu, use_bias=False, name='conv1')
        self.conv2 = tf.layers.conv2d(
            inputs=self.conv1, filters=64, kernel_size=[4, 4], strides=2, 
            kernel_initializer=tf.variance_scaling_initializer(scale=2),
            padding="valid", activation=tf.nn.relu, use_bias=False, name='conv2')
        self.conv3 = tf.layers.conv2d(
            inputs=self.conv2, filters=64, kernel_size=[3, 3], strides=1, 
            kernel_initializer=tf.variance_scaling_initializer(scale=2),
            padding="valid", activation=tf.nn.relu, use_bias=False, name='conv3')
        self.conv4 = tf.layers.conv2d(
            inputs=self.conv3, filters=hidden, kernel_size=[7, 7], strides=1, 
            kernel_initializer=tf.variance_scaling_initializer(scale=2),
            padding="valid", activation=tf.nn.relu, use_bias=False, name='conv4')
        
        # Splitting into value and advantage stream
        # (two equal halves along the channel axis, i.e. axis 3)
        self.valuestream, self.advantagestream = tf.split(self.conv4, 2, 3)
        self.valuestream = tf.layers.flatten(self.valuestream)
        self.advantagestream = tf.layers.flatten(self.advantagestream)
        # One advantage output per action, a single state-value output
        self.advantage = tf.layers.dense(
            inputs=self.advantagestream, units=self.n_actions,
            kernel_initializer=tf.variance_scaling_initializer(scale=2), name="advantage")
        self.value = tf.layers.dense(
            inputs=self.valuestream, units=1, 
            kernel_initializer=tf.variance_scaling_initializer(scale=2), name='value')
        
        # Combining value and advantage into Q-values
        # Q(s, a) = V(s) + (A(s, a) - mean over a of A(s, a))
        self.q_values = self.value + tf.subtract(self.advantage, tf.reduce_mean(self.advantage, axis=1, keepdims=True))
        # Index of the highest Q-value for every state in the batch
        self.best_action = tf.argmax(self.q_values, 1)
        
        # targetQ according to Bellman equation: 
        # Q = r + gamma*max Q', calculated in the function learn()
        self.target_q = tf.placeholder(shape=[None], dtype=tf.float32)
        # Action that was performed
        self.action = tf.placeholder(shape=[None], dtype=tf.int32)
        # Q value of the action that was performed
        # (a one-hot mask zeroes every other action before summing over axis 1)
        self.Q = tf.reduce_sum(tf.multiply(self.q_values, tf.one_hot(self.action, self.n_actions, dtype=tf.float32)), axis=1)
        
        # Parameter updates
        # NOTE(review): tf.losses.huber_loss presumably already reduces to a scalar
        # with its default reduction, which would make reduce_mean a no-op -- confirm.
        self.loss = tf.reduce_mean(tf.losses.huber_loss(labels=self.target_q, predictions=self.Q))
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.update = self.optimizer.minimize(self.loss)
        
    def act(self,session, state):
        """Pick an action epsilon-greedily.

        Args:
            session: A tensorflow session object used to evaluate the network
            state: A single state; it is wrapped in a list to form a batch of one
        Returns:
            A uniformly random action index with probability exploration_rate,
            otherwise the network's best_action for the given state.
        """
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.n_actions)

        return session.run(self.best_action, feed_dict={self.input:[state]})[0]

    def update_explore_rate(self):
        """Multiply the exploration rate by EXPLORATION_DECAY, never going below EXPLORATION_MIN."""
        self.exploration_rate *= EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate)
        
class Memory(object):
    """Replay memory that stores single frames and rebuilds stacked states on demand.

    Layout: ``frames`` always holds ``agent_history_length`` more entries than
    ``actions`` / ``rewards`` / ``terminals``. For a transition index ``idx``

    * the state is       ``frames[idx : idx + agent_history_length]``
    * the next state is  ``frames[idx + 1 : idx + agent_history_length + 1]``
    * ``actions[idx]``, ``rewards[idx]`` and ``terminals[idx]`` belong to that step.

    Both states are stacked oldest-frame-first along the last axis, and
    ``get_curr_state`` returns its frames in the same order.

    ``reset_points`` records where episodes start (as indices into ``actions``).
    The ``agent_history_length`` placeholder entries written just before each
    reset point carry ``None`` for action/reward/terminal and must never be
    sampled; ``check_idx`` filters them out.
    """

    def __init__(self, init_state, MEMORY_SIZE = 8000, agent_history_length=4):
        """
        Args:
            init_state: First preprocessed frame, shape (84, 84).
            MEMORY_SIZE: Maximum number of frames kept.
            agent_history_length: Number of frames stacked into one state.
        """
        self.agent_history_length = agent_history_length
        self.frames = deque(maxlen=MEMORY_SIZE)
        # The per-step deques are shorter by one history window, so that they
        # fill up at exactly the same moment as ``frames``.
        self.actions = deque(maxlen=MEMORY_SIZE - agent_history_length)
        self.rewards = deque(maxlen=MEMORY_SIZE - agent_history_length)
        self.terminals = deque(maxlen=MEMORY_SIZE - agent_history_length)
        self.reset_points = []

        self.setup(init_state, False)

    @staticmethod
    def _validate_frame(frame):
        """Raise IndexError unless ``frame`` has the expected (84, 84) shape."""
        if frame.shape != (84, 84):
            raise IndexError(
                "Shape was: " + str(frame.shape) + " Expected: [ 84, 84 ]")

    def setup(self, state, balance = True):
        """Start a new episode by filling one history window with ``state``.

        Args:
            state: First frame of the episode, shape (84, 84).
            balance: True (episode reset in a running buffer) also appends
                ``None`` placeholders to actions/rewards/terminals so all
                deques stay aligned. False (initial fill) only adds frames.
        Raises:
            IndexError: if ``state`` does not have shape (84, 84).
        """
        self._validate_frame(state)

        if balance:
            for _ in range(self.agent_history_length):
                self.remember(state, None, None, None)
        else:
            # Previously hard-coded to four copies regardless of
            # agent_history_length.
            self.frames.extend([state] * self.agent_history_length)

        self.reset_points.append(len(self.actions))

    def remember(self, new_frame, action, reward, terminal):
        """Append one step (resulting frame, action, reward, terminal flag).

        Raises:
            IndexError: if ``new_frame`` does not have shape (84, 84).
        """
        # Validate first so a bad frame cannot leave reset_points shifted
        # without a matching append.
        self._validate_frame(new_frame)

        # The deques are full, so this append drops the oldest entry: every
        # stored index moves down by one, and reset points that fall off the
        # front are discarded.
        if len(self.frames) == self.frames.maxlen:
            self.reset_points[:] = [
                point - 1 for point in self.reset_points if point - 1 >= 0]

        self.frames.append(new_frame)
        self.actions.append(action)
        self.rewards.append(reward)
        self.terminals.append(terminal)

    def check_idx(self, idx):
        """Return False when ``idx`` is one of the placeholder steps before a reset point."""
        if not self.reset_points:
            return True

        # First reset point at or after idx; falls back to the last one.
        boundary = next(
            (point for point in self.reset_points if point >= idx),
            self.reset_points[-1])
        in_placeholder_window = (
            boundary - self.agent_history_length <= idx < boundary)
        return not in_placeholder_window

    def get_memory(self, idx):
        """Return the transition ``(state, action, reward, next_state, terminal)`` at ``idx``.

        Both states have shape (84, 84, agent_history_length), oldest frame first.

        Raises:
            IndexError: if ``idx`` is out of range or refers to a placeholder step.
        """
        history = self.agent_history_length
        if idx < 0 or idx >= len(self.actions) or not self.check_idx(idx):
            raise IndexError

        # One pass over the deque instead of 2 * history separate index lookups.
        window = list(itertools.islice(self.frames, idx, idx + history + 1))
        if len(window) != history + 1:
            raise IndexError

        s1 = np.stack(window[:-1], -1)
        s2 = np.stack(window[1:], -1)
        return s1, self.actions[idx], self.rewards[idx], s2, self.terminals[idx]

    def get_batch(self, size=1):
        """Sample ``size`` valid transitions uniformly at random (with replacement).

        Returns:
            Five numpy arrays: states, actions, rewards, next states, terminal flags.
        Raises:
            TypeError: if a sampled transition unexpectedly contains ``None``.
            ValueError: (from ``random.randrange``) if no step has been stored yet.
        """
        s, a, r, s_, t = [], [], [], [], []

        for _ in range(size):
            idx = random.randrange(len(self.actions))
            while not self.check_idx(idx):
                idx = random.randrange(len(self.actions))

            sI, aI, rI, s_I, tI = self.get_memory(idx)

            missing = [label
                       for label, value in (("Action", aI),
                                            ("reward", rI),
                                            ("terminal", tI))
                       if value is None]
            if missing:
                # islice rejects negative bounds, so clamp the lower end.
                lo, hi = max(0, idx - 5), idx + 5
                raise TypeError(
                    "%s missing at idx %d; reset_points=%s terminals=%s "
                    "actions=%s rewards=%s" % (
                        " ".join(missing), idx, self.reset_points,
                        list(itertools.islice(self.terminals, lo, hi)),
                        list(itertools.islice(self.actions, lo, hi)),
                        list(itertools.islice(self.rewards, lo, hi))))

            s.append(sI)
            a.append(aI)
            r.append(rI)
            s_.append(s_I)
            t.append(tI)

        return np.array(s), np.array(a), np.array(r), np.array(s_), np.array(t)

    def get_curr_state(self):
        """Return the newest ``agent_history_length`` frames, oldest first.

        The order matches the states produced by ``get_memory`` so the network
        sees the same channel layout when acting and when training (this
        method used to return the frames newest-first).
        """
        total = len(self.frames)
        return [self.frames[i]
                for i in range(total - self.agent_history_length, total)]
        

In [0]:
def learn(session, memory, main_dqn, target_dqn, batch_size, gamma):
    """Perform one Double-DQN gradient step on the main network.

    A minibatch is sampled from ``memory``. The main network chooses the best
    action in each next state, the target network supplies the Q-value of that
    action, and the main network is regressed towards
    ``reward + gamma * Q_target(s', argmax_a Q_main(s', a))``, with the
    bootstrap term dropped for terminal transitions.

    Args:
        session: A tensorflow session object
        memory: A Memory object to sample the minibatch from
        main_dqn: A DQN object (the network being trained)
        target_dqn: A DQN object (supplies the bootstrap Q-values)
        batch_size: Integer, Batch size
        gamma: Float, discount factor for the Bellman equation
    Returns:
        loss: The loss of the minibatch
    """
    states, actions, rewards, new_states, terminal_flags = memory.get_batch(batch_size)

    # Action selection by the main network, evaluated in the NEXT states
    next_best_actions = session.run(
        main_dqn.best_action, feed_dict={main_dqn.input: new_states})

    # Action evaluation by the target network, also in the NEXT states
    next_q_values = session.run(
        target_dqn.q_values, feed_dict={target_dqn.input: new_states})
    double_q = next_q_values[range(batch_size), next_best_actions]

    # (1 - terminal_flags) zeroes the bootstrap term when the episode ended,
    # leaving target_q == rewards for those transitions
    bootstrap = (gamma * double_q) * (1 - terminal_flags)

    # Gradient descent step on the main network
    fetches = [main_dqn.loss, main_dqn.update]
    feed = {main_dqn.input: states,
            main_dqn.target_q: rewards + bootstrap,
            main_dqn.action: actions}
    loss, _ = session.run(fetches, feed_dict=feed)

    return loss

In [0]:
class TargetNetworkUpdater(object):
    """Copies the parameters of the main DQN to the target DQN.

    The assign ops are created once, on the first call, and reused afterwards.
    Creating them on every call (as before) adds a fresh set of ops to the
    graph each time the target network is synced. An instance is therefore tied
    to the variables/graph it first ran with; build a new updater after the
    graph is reset.
    """
    def __init__(self, main_dqn_vars, target_dqn_vars):
        """
        Args:
            main_dqn_vars: A list of tensorflow variables belonging to the main DQN network
            target_dqn_vars: A list of tensorflow variables belonging to the target DQN network
                (paired with main_dqn_vars by position)
        """
        self.main_dqn_vars = main_dqn_vars
        self.target_dqn_vars = target_dqn_vars
        # Built lazily by __call__ and cached
        self._update_ops = None

    def _update_target_vars(self):
        """Build one assign op per (main, target) variable pair and return them as a list.

        Raises:
            IndexError: if there are fewer target variables than main variables.
        """
        main_vars = list(self.main_dqn_vars)
        if len(self.target_dqn_vars) < len(main_vars):
            raise IndexError(
                "target network has fewer variables than the main network")
        return [target_var.assign(main_var.value())
                for main_var, target_var in zip(main_vars, self.target_dqn_vars)]

    def __call__(self, sess):
        """
        Args:
            sess: A Tensorflow session object
        Assigns the values of the parameters of the main network to the 
        parameters of the target network
        """
        if self._update_ops is None:
            self._update_ops = self._update_target_vars()
        # A single run call executes all copies together
        sess.run(self._update_ops)

In [0]:

def clip_reward(reward):
    """Reduce a reward to its sign: 1 if positive, 0 if exactly zero, otherwise -1."""
    if reward == 0:
        return 0
    return 1 if reward > 0 else -1
      
def preprocess(state):
    """Turn a raw RGB game frame into a grey IMG_HEIGHT x IMG_WIDTH frame.

    Steps: grey-scale conversion, removal of the top 26 and bottom 16 pixel
    rows, then resizing without anti-aliasing. The result is returned as a
    plain 2-D frame (no batch or channel axis is added).
    """
    grey = rgb2gray(state)
    playfield = grey[26:-16]
    return resize(playfield, [IMG_HEIGHT, IMG_WIDTH], anti_aliasing=False)




  
def train():
    """Train a dueling Double-DQN agent on ENV_NAME for up to 3000 episodes.

    Builds the main and target networks in a fresh default graph, fills a
    replay Memory while playing epsilon-greedily, and once more than
    REPLAY_MEMORY_START_SIZE frames have been seen:

    * runs a gradient step every UPDATE_FREQ frames, and
    * copies the main weights into the target network every NETW_UPDATE_FREQ frames.

    The exploration rate decays once per finished episode. A checkpoint is
    written to Google Drive every 25 episodes. Returns nothing.
    """
    env = gym.make(ENV_NAME)

    tf.reset_default_graph()

    # main DQN and target DQN networks:
    with tf.variable_scope('mainDQN'):
        MAIN_DQN = DQN(env.action_space.n, HIDDEN, LEARNING_RATE)
    with tf.variable_scope('targetDQN'):
        TARGET_DQN = DQN(env.action_space.n, HIDDEN)

    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

    MAIN_DQN_VARS = tf.trainable_variables(scope='mainDQN')
    TARGET_DQN_VARS = tf.trainable_variables(scope='targetDQN')

    update_networks = TargetNetworkUpdater(MAIN_DQN_VARS, TARGET_DQN_VARS)
    memory = Memory(preprocess(env.reset()), MEMORY_SIZE)

    with tf.Session() as sess:
        sess.run(init)
        # Start with the target network equal to the main network. Without
        # this the target keeps its own unrelated random weights until the
        # first scheduled sync, so early Bellman targets are meaningless.
        update_networks(sess)

        run = 0
        frame_number = 0
        # To resume training, restore a checkpoint here with saver.restore()
        # and set `run` / `frame_number` to the stored values.

        rewards = []
        loss_list = []
        # Replay the per-episode decay for episodes completed before a resume
        # (a no-op when starting from run = 0).
        for _ in range(run):
            MAIN_DQN.update_explore_rate()

        while run < 3000:
            run += 1

            frame = preprocess(env.reset())

            # The memory was already seeded with a first frame when it was
            # created, so only later episodes need a new reset point.
            if run > 1:
                memory.setup(frame)

            episode_reward_sum = 0

            for _ in range(MAX_EPISODE_LENGTH):
                frame_number += 1

                state = np.stack(memory.get_curr_state(), -1)
                action = MAIN_DQN.act(sess, state)

                unprocessed_new_frame, reward, terminal, _info = env.step(action)

                # Store the clipped reward for learning (previously the clipped
                # value was computed but the raw reward was stored); the
                # logged episode score below still uses the raw reward.
                clipped_reward = clip_reward(reward)
                memory.remember(preprocess(unprocessed_new_frame), action,
                                clipped_reward, terminal)

                episode_reward_sum += reward

                learning_started = frame_number > REPLAY_MEMORY_START_SIZE

                if frame_number % UPDATE_FREQ == 0 and learning_started:
                    loss = learn(sess, memory, MAIN_DQN, TARGET_DQN,
                                 BATCH_SIZE, gamma = DISCOUNT_FACTOR)
                    loss_list.append(loss)

                if frame_number % NETW_UPDATE_FREQ == 0 and learning_started:
                    update_networks(sess)

                if terminal:
                    output = ("Run: " + str(run) + "  Reward: " + str(episode_reward_sum) + "  Explore Rate: " + str(MAIN_DQN.exploration_rate) + "  Frame Count: "+ str(frame_number))

                    logging.info(output)
                    # Only the main network acts, so only its epsilon matters
                    MAIN_DQN.update_explore_rate()
                    break

            rewards.append(episode_reward_sum)

            # Save the network parameters
            if run % 25 == 0:
                saver.save(sess, "/content/gdrive/My Drive/Models/DDQN/checkpoint"+str(run), global_step=frame_number)
            


In [148]:
train()

INFO:Run: 1  Reward: -16.0  Explore Rate: 1.0  Frame Count: 1803
INFO:Run: 2  Reward: -20.0  Explore Rate: 0.995  Frame Count: 2937
INFO:Run: 3  Reward: -21.0  Explore Rate: 0.990025  Frame Count: 4056
INFO:Run: 4  Reward: -21.0  Explore Rate: 0.985074875  Frame Count: 5186
INFO:Run: 5  Reward: -21.0  Explore Rate: 0.9801495006250001  Frame Count: 6323
INFO:Run: 6  Reward: -21.0  Explore Rate: 0.9752487531218751  Frame Count: 7656
INFO:Run: 7  Reward: -20.0  Explore Rate: 0.9703725093562657  Frame Count: 8892
INFO:Run: 8  Reward: -21.0  Explore Rate: 0.9655206468094844  Frame Count: 10180
INFO:Run: 9  Reward: -20.0  Explore Rate: 0.960693043575437  Frame Count: 11317
INFO:Run: 10  Reward: -20.0  Explore Rate: 0.9558895783575597  Frame Count: 12449
INFO:Run: 11  Reward: -20.0  Explore Rate: 0.9511101304657719  Frame Count: 13552
INFO:Run: 12  Reward: -20.0  Explore Rate: 0.946354579813443  Frame Count: 14909
INFO:Run: 13  Reward: -21.0  Explore Rate: 0.9416228069143757  Frame Count: 162

KeyboardInterrupt: ignored