In [None]:
import os
import random
import time
from collections import deque

import gym
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from skimage import transform
from skimage.color import rgb2gray
from tensorflow.keras.layers import Conv2D, Flatten, Dense

In [None]:
def preprocess_screen(screen, sz_to_process):
    """Turn a raw RGB screen into a grayscale, cropped and resized frame.

    Args:
        screen: RGB image as returned by the environment.
        sz_to_process: target (height, width) passed to ``transform.resize``.

    Returns:
        The resized grayscale frame.
    """
    # Crop margins: 8 rows off the top, 12 off the bottom, 5 columns off the
    # left and 12 off the right.
    # For Space Invaders, TODO make it cleaner
    kept_rows = slice(8, -12)
    kept_cols = slice(5, -12)

    grayscale = rgb2gray(screen)
    return transform.resize(grayscale[kept_rows, kept_cols],
                            sz_to_process,
                            mode='constant',
                            anti_aliasing=True)

In [None]:
def stack_frames(stacked_frames, last_2_screens, is_new_episode):
    """Push the newest preprocessed frame onto the frame stack.

    Args:
        stacked_frames: deque of already-preprocessed frames; the shape of its
            first element is used as the target size for the new frame.
        last_2_screens: indexable holding the two most recent raw screens.
        is_new_episode: when true the new frame is appended once per slot
            currently in the deque, otherwise it is appended once.

    Returns:
        (input_state, stacked_frames): the frames stacked along axis 2, and
        the (mutated) deque itself.
    """
    assert isinstance(stacked_frames, deque), "stacked_frames has not type deque"
    target_shape = stacked_frames[0].shape

    # Element-wise maximum over the two latest screens, then preprocess.
    older_screen, newer_screen = last_2_screens[0], last_2_screens[1]
    new_frame = preprocess_screen(np.maximum(older_screen, newer_screen), target_shape)

    nb_copies = len(stacked_frames) if is_new_episode else 1
    stacked_frames.extend([new_frame] * nb_copies)

    return np.stack(stacked_frames, axis=2), stacked_frames

In [None]:
class EnvWrapper:
    """Gym environment wrapper that handles frame skipping and frame stacking.

    Callers receive ready-to-use stacked states and do not have to handle
    stacked frames externally. When an episode terminates inside ``step`` the
    underlying environment is reset immediately, so the state returned with
    ``done=True`` is already the first stacked state of the next episode.
    """

    def __init__(self, game_name, state_size, frame_skipping):
        """Create the underlying gym environment and the frame buffers.

        Args:
            game_name: id passed to ``gym.make``.
            state_size: (height, width, stack_size); the last entry is the
                number of stacked frames, the leading entries the frame shape.
            frame_skipping: number of environment steps performed per call to
                ``step`` with the same action (must be >= 1).

        Raises:
            ValueError: if ``frame_skipping`` is smaller than 1.
        """
        if frame_skipping < 1:
            # With zero iterations `step` would never query the environment.
            raise ValueError("frame_skipping must be >= 1, got %r" % (frame_skipping,))

        self.env = gym.make(game_name)
        self.stacked_frames = deque([np.zeros(state_size[:-1]) for _ in range(state_size[-1])],
                                    maxlen=state_size[-1])
        self.nb_actions = self.env.action_space.n
        self.frame_skipping = frame_skipping
        self.last_2_screens = deque(maxlen=2)

    def step(self, action, render=False):
        """Repeat ``action`` for up to ``frame_skipping`` frames.

        Args:
            action: action forwarded to the underlying environment.
            render: when true, render after every underlying step.

        Returns:
            (stacked_state, reward, done, info) where ``reward`` is the sum of
            the rewards of all frames played during this call and ``info`` is
            the info of the last underlying step.
        """
        total_reward = 0.0
        for _ in range(self.frame_skipping):
            screen, reward, done, info = self.env.step(action)
            # Accumulate: rewards earned on skipped frames must not be lost.
            total_reward += reward
            self.last_2_screens.append(screen)
            if render:
                self.render()
            if done:
                break

        if done:
            # Reset env and state
            stacked_state = self.reset()
        else:
            stacked_state, self.stacked_frames = stack_frames(self.stacked_frames, self.last_2_screens, False)

        return stacked_state, total_reward, done, info

    def reset(self):
        """Reset the environment and refill the frame stack with the first screen.

        Returns:
            The initial stacked state of the new episode.
        """
        screen = self.env.reset()
        # Fill both slots so the pixel-wise max in stack_frames sees the same screen twice.
        for _ in range(2):
            self.last_2_screens.append(screen)
        stacked_state, self.stacked_frames = stack_frames(self.stacked_frames, self.last_2_screens, True)

        return stacked_state

    def render(self):
        """Render the underlying environment."""
        self.env.render()

    def close(self):
        """Close the underlying environment."""
        self.env.close()

In [None]:
class SumTree():
    """Binary sum tree stored in a flat array, used as a priority index.

    ``tree`` has ``2 * capacity - 1`` nodes: the first ``capacity - 1`` are
    internal nodes holding the sum of their two children, the last
    ``capacity`` are leaves holding one priority each. ``data[i]`` is the
    payload attached to leaf ``i + capacity - 1``. Writes go round-robin
    through the leaves, overwriting the oldest entry once full.
    """

    def __init__(self, capacity):
        """Allocate an empty tree.

        Args:
            capacity: number of leaves / stored items (must be >= 1).

        Raises:
            ValueError: if ``capacity`` is smaller than 1.
        """
        if capacity < 1:
            raise ValueError("capacity must be >= 1, got %r" % (capacity,))
        self.capacity = capacity
        self.data = np.zeros(capacity, dtype=object)
        self.tree = np.zeros(2 * capacity - 1)
        self.pointer = 0

    def add(self, data, p):
        """Store ``data`` with priority ``p`` in the next slot (round-robin)."""
        self.data[self.pointer] = data
        self.update(self.pointer, p)
        self.pointer = (self.pointer + 1) % self.capacity

    def update(self, idx_data, p):
        """Set the priority of item ``idx_data`` to ``p`` and fix the sums above it."""
        idx_tree = idx_data + self.capacity - 1

        diff_p = p - self.tree[idx_tree]
        # Write the leaf exactly instead of adding the difference, so leaves
        # never accumulate rounding error.
        self.tree[idx_tree] = p

        # Propagate the change up to the root.
        while idx_tree != 0:
            idx_tree = (idx_tree - 1) // 2
            self.tree[idx_tree] += diff_p

    def get(self, s):
        """Find the leaf whose cumulative-priority interval contains ``s``.

        Subtrees whose sum is zero (never-written leaves) are avoided whenever
        the sibling subtree holds priority mass, so a value of exactly 0 or a
        value marginally above ``total()`` still resolves to a written leaf.

        Args:
            s: value expected in ``[0, total()]``.

        Returns:
            (idx_data, data, priority) of the selected leaf.
        """
        cur_idx = 0
        while cur_idx < self.capacity - 1:
            left_idx = 2 * cur_idx + 1
            right_idx = left_idx + 1
            left_sum = self.tree[left_idx]
            right_sum = self.tree[right_idx]

            if left_sum > 0 and s <= left_sum:
                cur_idx = left_idx
            elif right_sum > 0:
                s -= left_sum
                cur_idx = right_idx
            else:
                # Nothing stored on the right: stay on the left side.
                cur_idx = left_idx

        idx_data = cur_idx - (self.capacity - 1)
        return idx_data, self.data[idx_data], self.tree[cur_idx]

    def max_p_leaf(self):
        """Return the largest priority stored in any leaf."""
        return np.max(self.tree[-self.capacity:])

    def total(self):
        """Return the sum of all priorities (value of the root node)."""
        return self.tree[0]

In [None]:
class Memory():
    """Prioritized experience replay buffer backed by a ``SumTree``.

    New experiences are stored with the current maximum leaf priority (or
    ``PER_epsilon`` while the tree is empty). Sampling is stratified over the
    total priority and returns importance-sampling weights; ``PER_beta`` is
    increased by ``PER_beta_increment`` on every call to ``sample`` and capped
    at 1.
    """

    def __init__(self, max_size, PER_epsilon, PER_alpha, PER_beta, PER_beta_increment):
        """Create the buffer.

        Args:
            max_size: capacity of the underlying sum tree.
            PER_epsilon: constant added to errors before they become priorities.
            PER_alpha: exponent applied to ``error + PER_epsilon``.
            PER_beta: initial exponent of the importance-sampling weights.
            PER_beta_increment: amount added to ``PER_beta`` per ``sample`` call.
        """
        self.sumtree = SumTree(max_size)

        self.PER_epsilon = PER_epsilon
        self.PER_alpha = PER_alpha
        self.PER_beta = PER_beta

        self.PER_beta_increment = PER_beta_increment

    def add(self, experience):
        """Store ``experience`` with the highest priority currently in the tree."""
        priority = self.sumtree.max_p_leaf()
        if priority == 0:
            # Empty tree: a zero priority could never be sampled.
            priority = self.PER_epsilon
        self.sumtree.add(experience, priority)

    def sample(self, batch_size):
        """Draw ``batch_size`` experiences, one per equal-width priority segment.

        Returns:
            (indices_data, experiences, is_weights): data indices to hand back
            to ``update_priorities``, an object array of experiences, and the
            importance-sampling weights normalised by their maximum.

        Raises:
            ValueError: if no priority mass is stored yet.
        """
        tot_sumtree = self.sumtree.total()
        if tot_sumtree <= 0:
            raise ValueError("Cannot sample from an empty memory")
        p_step = tot_sumtree / batch_size

        # Builtin int/float: the np.int / np.float aliases no longer exist in recent NumPy.
        indices_data = np.empty(batch_size, dtype=int)
        experiences = np.empty(batch_size, dtype=object)
        is_weights = np.empty(batch_size, dtype=float)

        for i in range(batch_size):
            # Sample value at random in segment
            a, b = i * p_step, (i + 1) * p_step
            value = np.random.uniform(a, b)

            # Get experience from the sumtree
            idx_data, experience, priority = self.sumtree.get(value)

            # Compute IS weight: (1/a)^b = a^(-b)
            is_w = (batch_size * (priority / tot_sumtree)) ** (-self.PER_beta)

            indices_data[i] = idx_data
            experiences[i] = experience
            is_weights[i] = is_w

        is_weights /= np.max(is_weights)

        # Anneal beta towards 1
        self.PER_beta = min(self.PER_beta + self.PER_beta_increment, 1)

        return indices_data, experiences, is_weights

    def update_priorities(self, indices_data, errors):
        """Set new priorities ``(error + PER_epsilon) ** PER_alpha``.

        ``errors`` is left untouched; the priorities are computed on a copy.

        Args:
            indices_data: data indices as returned by ``sample``.
            errors: one error per index.
        """
        priorities = (np.asarray(errors, dtype=float) + self.PER_epsilon) ** self.PER_alpha
        for idx_data, priority in zip(indices_data, priorities):
            self.sumtree.update(idx_data, priority)
        

In [None]:
class DQGraph:
    """TensorFlow 1.x graph of a dueling deep Q-network.

    All ops and variables are created inside ``tf.variable_scope(scope_name)``.
    The network is three convolutions (ELU) followed by two dense streams, one
    for the state value and one for the per-action advantages, combined into
    ``output``. The instance also exposes the placeholders, the per-sample
    absolute errors, the weighted loss and the training op.
    """
    
    def __init__(self, state_size, nb_actions, learning_rate, scope_name):
        """Build the graph.

        Args:
            state_size: shape of one input state; the ``inputs`` placeholder has
                shape ``(None, *state_size)``.
            nb_actions: number of units of the advantage head and depth of the
                one-hot action encoding.
            learning_rate: learning rate given to the Adam optimizer.
            scope_name: name of the variable scope holding everything built here.
        """
        self.state_size = state_size
        self.nb_actions = nb_actions
        self.learning_rate = learning_rate
        self.scope_name = scope_name
        
        with tf.variable_scope(self.scope_name):
            # Placeholders: batch of states, chosen actions (one-hot encoded below),
            # target Q values and per-sample loss weights.
            self.inputs = tf.placeholder(tf.float32, (None, *(self.state_size)), name="inputs")
            self.action = tf.placeholder(tf.uint8, (None,), name="action")
            self.action_OH = tf.one_hot(self.action, self.nb_actions, name="action_OH")
            
            self.target_Q = tf.placeholder(tf.float32, (None,), name="target_Q")
            
            self.is_weight = tf.placeholder(tf.float32, (None,), name="is_weight")
            
            # Neural net
            # Three "valid" convolutions, each followed by an ELU.
            self.conv1 = tf.layers.conv2d(inputs=self.inputs,
                                          filters=32,
                                          kernel_size=(8,8),
                                          strides=(4,4),
                                          padding="valid",
                                          kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                          name="conv1")
            self.conv1_out = tf.nn.elu(self.conv1, name="conv1_out")
            
            self.conv2 = tf.layers.conv2d(inputs=self.conv1_out,
                                          filters=64,
                                          kernel_size=(4,4),
                                          strides=(2,2),
                                          padding="valid",
                                          kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                          name="conv2")
            self.conv2_out = tf.nn.elu(self.conv2, name="conv2_out")            
            
            self.conv3 = tf.layers.conv2d(inputs=self.conv2_out,
                                          filters=64,
                                          kernel_size=(3,3),
                                          strides=(2,2),
                                          padding="valid",
                                          kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                          name="conv3")
            self.conv3_out = tf.nn.elu(self.conv3, name="conv3_out")
            
            self.flatten = tf.layers.flatten(self.conv3_out)
            
            # Dueling DQNet, we separate the network in 2 streams.
            # One computing V(s)
            self.fc_value = tf.layers.dense(self.flatten,
                                            units=512,
                                            activation=tf.nn.elu,
                                            kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                            name="fc_value")
            
            self.value = tf.layers.dense(self.fc_value,
                                         units=1,
                                         activation=None,
                                         kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                         name="value")
            
            # The other computing A(s,a)
            self.fc_advantages = tf.layers.dense(self.flatten,
                                                 units=512,
                                                 activation=tf.nn.elu,
                                                 kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                                 name="fc_advantages")
            
            self.advantages = tf.layers.dense(self.fc_advantages,
                                              units=self.nb_actions,
                                              activation=None,
                                              kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                              name="advantages")
            
            # Q(s,a) = V(s) + (A(s,a) - mean over actions of A(s,a))
            self.output = self.value + (self.advantages - tf.reduce_mean(self.advantages, axis=1, keepdims=True))
            
            # Prediction of the Q value
            # (the one-hot mask keeps only the Q value of the fed action)
            self.pred_Q = tf.reduce_sum(tf.multiply(self.output, self.action_OH), 1)
            
            # Error for PER
            self.errors = tf.abs(self.pred_Q - self.target_Q)
            
            # Mean of the squared errors, each scaled by its is_weight entry.
            self.loss = tf.reduce_mean(self.is_weight * tf.math.square(self.errors))
            
            # Adam wrapped so that gradients are clipped by norm (clip_norm=1.).
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
            self.optimizer = tf.contrib.estimator.clip_gradients_by_norm(self.optimizer, clip_norm=1.)
            
            #tf.summary.scalar("grad_norm", self.optimizer.compute_gradients(self.loss))
            
            self.train_op = self.optimizer.minimize(self.loss)

In [None]:
class DQTrainer:
    """Acting, learning, evaluation and checkpointing loop for a double DQN.

    The trainer only uses the networks, environments and session it is given:
    it does not rely on notebook-level globals.
    """

    def __init__(self,
                 dqnet, dqtarget,
                 env, eval_env,
                 nb_actions, gamma, exp_memory,
                 ckpt_file):
        """Store the collaborators and create the checkpoint saver.

        Args:
            dqnet: online network (provides inputs/output/train_op/... and scope_name).
            dqtarget: target network with the same structure as ``dqnet``.
            env: environment used for training.
            eval_env: environment used for periodic evaluation.
            nb_actions: number of discrete actions.
            gamma: discount factor of the Bellman target.
            exp_memory: replay memory offering add / sample / update_priorities.
            ckpt_file: checkpoint path used by both saving and ``restore``.
        """
        self.dqnet = dqnet
        self.dqtarget = dqtarget
        self.env = env
        self.eval_env = eval_env
        self.nb_actions = nb_actions
        self.gamma = gamma
        self.exp_memory = exp_memory
        self.ckpt_file = ckpt_file
        self.saver = tf.train.Saver()
        # Assign ops copying online -> target weights; built once by update_target().
        self._target_update_ops = None

    def choose_action(self, input_state, epsilon, session):
        """Pick an action with an epsilon-greedy strategy.

        Args:
            input_state: a single state (no batch dimension).
            epsilon: probability of taking a uniformly random action.
            session: session used to evaluate the online network.

        Returns:
            Index of the chosen action.
        """
        # Epsilon-greedy strategy
        if np.random.rand() > epsilon:
            outputs = session.run(self.dqnet.output,
                                  feed_dict={self.dqnet.inputs: np.expand_dims(input_state, axis=0)})
            action = np.argmax(outputs)
        else:
            action = np.random.choice(np.arange(self.nb_actions))

        return action

    def run_episode(self, env_to_run, max_step, session, render=False, store_in_memory=True, epsilon=0):
        """Play one episode (at most ``max_step`` steps) and print its reward.

        Args:
            env_to_run: environment to play in.
            max_step: maximum number of steps.
            session: session used to evaluate the online network.
            render: forwarded to ``env_to_run.step``.
            store_in_memory: when true, transitions (with the reward clipped to
                [-1, 1]) are added to the replay memory.
            epsilon: exploration rate for this episode.

        Returns:
            The total unclipped reward collected during the episode.
        """
        state = env_to_run.reset()

        total_reward = 0

        for step in range(max_step):
            action = self.choose_action(state, epsilon, session)

            # Apply action to env and get next state, reward, and done bool
            next_state, reward, done, _ = env_to_run.step(action, render)
            clipped_reward = np.clip(reward, -1, 1)

            if store_in_memory:
                self.exp_memory.add((state, action, clipped_reward, next_state, done))

            state = next_state
            total_reward += reward

            if done:
                break

        print("Reward on episode: %f" % total_reward)
        return total_reward

    def train_on_batch(self, batch_size, session):
        """Sample a batch, run one optimisation step and refresh the priorities.

        Targets follow the double DQN rule: the online network selects the best
        next action, the target network provides its value.

        Args:
            batch_size: number of experiences to sample.
            session: session used for every graph evaluation.

        Returns:
            The loss value of the optimisation step.
        """
        indices_data, experiences, is_weights = self.exp_memory.sample(batch_size)

        states = np.stack([el[0] for el in experiences])
        actions = np.array([el[1] for el in experiences])
        rewards = np.array([el[2] for el in experiences], dtype=float)
        next_states = np.stack([el[3] for el in experiences])
        done = np.array([el[4] for el in experiences], dtype=bool)

        # Double DQN mechanism
        # First run to compute Q values for next state with both networks
        Q_vals_next = session.run(self.dqnet.output,
                                  feed_dict={self.dqnet.inputs: next_states})

        Q_vals_next_target = session.run(self.dqtarget.output,
                                         feed_dict={self.dqtarget.inputs: next_states})

        # Compute target Q values (Bellman equation); terminal transitions keep the reward only.
        best_actions_net = np.argmax(Q_vals_next, axis=1)
        bootstrap = np.asarray(Q_vals_next_target)[np.arange(len(best_actions_net)), best_actions_net]
        target_Qs = np.where(done, rewards, rewards + self.gamma * bootstrap)

        # Second run to optimize
        _, loss, errors = session.run([self.dqnet.train_op, self.dqnet.loss, self.dqnet.errors],
                                      feed_dict={self.dqnet.inputs: states,
                                                 self.dqnet.action: actions,
                                                 self.dqnet.target_Q: target_Qs,
                                                 self.dqnet.is_weight: is_weights})

        # Update priorities in Memory
        self.exp_memory.update_priorities(indices_data, errors)

        return loss

    def play_and_learn(self, batch_size, learning_rate, total_steps,
                       epsilon_start, epsilon_decay, do_decay_espilon_every,
                       replay_period, t_upd_target,
                       frame_skipping,
                       evaluate_after, save_after,
                       session):
        """Interact with the training environment for ``total_steps`` steps and learn.

        Args:
            batch_size: batch size of each training step.
            learning_rate: unused here (the optimizer lives in the graph); kept
                for interface compatibility.
            total_steps: number of environment steps to play.
            epsilon_start: initial exploration rate.
            epsilon_decay: factor applied to epsilon every
                ``do_decay_espilon_every`` steps.
            do_decay_espilon_every: period (in steps) of the epsilon decay.
            replay_period: period (in steps) of the training steps.
            t_upd_target: period (in steps) of the target network update.
            frame_skipping: unused here (handled by the environment wrapper);
                kept for interface compatibility.
            evaluate_after: period (in steps) of the evaluation episodes.
            save_after: period (in steps) of the checkpoints.
            session: session used for every graph evaluation.
        """
        epsilon = epsilon_start

        # Reset state
        state = self.env.reset()

        start_time = time.time()
        for step in range(1, total_steps + 1):
            action = self.choose_action(state, epsilon, session)

            # Apply action to env and get next state, reward, and done bool
            next_state, reward, done, _ = self.env.step(action)
            clipped_reward = np.clip(reward, -1, 1)

            self.exp_memory.add((state, action, clipped_reward, next_state, done))

            state = next_state

            if step % do_decay_espilon_every == 0:
                epsilon *= epsilon_decay

            if step % replay_period == 0:
                self.train_on_batch(batch_size, session)

            if step % t_upd_target == 0:
                session.run(self.update_target())

            if step % evaluate_after == 0:
                self.run_episode(self.eval_env, 10000, session, store_in_memory=False)
                print("DEBUG:\n\tepsilon = %f" % epsilon)
                print("Time to play %i steps: %f" % (step, time.time() - start_time))

            if step % save_after == 0:
                self._save_checkpoint(session)
                print("Saved model after %i steps." % step)

    def _save_checkpoint(self, session):
        """Write a checkpoint to ``ckpt_file``, creating its directory if needed."""
        ckpt_dir = os.path.dirname(self.ckpt_file)
        if ckpt_dir:
            os.makedirs(ckpt_dir, exist_ok=True)
        self.saver.save(session, self.ckpt_file)

    def update_target(self):
        """Return the ops copying the online network weights into the target network.

        The ops are created on the first call and reused afterwards, so that
        repeated updates do not keep adding nodes to the graph.

        Returns:
            List of assign ops to pass to ``session.run``.
        """
        ops = getattr(self, "_target_update_ops", None)
        if ops is None:
            # Get the parameters of the DQNNet
            from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.dqnet.scope_name)

            # Get the parameters of the DQTarget
            to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.dqtarget.scope_name)

            # Pairing relies on both collections listing variables in the same order.
            ops = [to_var.assign(from_var) for from_var, to_var in zip(from_vars, to_vars)]
            self._target_update_ops = ops
        return ops

    def restore(self, session):
        """Restore the variables from ``ckpt_file`` into ``session``."""
        self.saver.restore(session, self.ckpt_file)

In [None]:
# --- Observations ---------------------------------------------------------
sz_to_process = (110,84)   # size of one preprocessed frame
stack_size = 4             # number of frames stacked into one state
state_size = (*sz_to_process, stack_size)

frame_skipping = 2         # environment steps played per agent action

# --- Environments ---------------------------------------------------------
game_name = 'SpaceInvaders-v0'

# Create the environments
env = EnvWrapper(game_name, state_size, frame_skipping)
eval_env = EnvWrapper(game_name, state_size, frame_skipping)

nb_actions = env.nb_actions

# --- Replay memory / learning ---------------------------------------------
max_mem_size = int(1e4)

gamma = 0.95

batch_size = 32
learning_rate = 0.00025
total_steps = int(1e6)

# --- Exploration ----------------------------------------------------------
epsilon_start = 1.
epsilon_decay = 0.997
# Integer period (used with `step % period`); never 0 even for short runs.
do_decay_espilon_every = max(1, total_steps // 10000)

# --- Schedules (all expressed in environment steps) -----------------------
replay_period = 4
t_upd_target = 5000
evaluate_after = 500
save_after = 20000

# --- Prioritized experience replay ----------------------------------------
PER_epsilon = 0.01
PER_alpha = 0.6
PER_beta = 0.5
# Memory adds this on every sample() call, i.e. once per training step:
# spread the remaining distance to 1 over the planned training steps.
PER_beta_increment = (1. - PER_beta) / (total_steps / replay_period)

ckpt_file = "./models/model.ckpt"

In [None]:
# Prioritized replay memory shared by the training and evaluation cells.
exp_memory = Memory(max_size=max_mem_size,
                    PER_epsilon=PER_epsilon,
                    PER_alpha=PER_alpha,
                    PER_beta=PER_beta,
                    PER_beta_increment=PER_beta_increment)

In [None]:
# Training: build both networks in a fresh graph, pre-fill the replay memory
# with one fully random episode, then run the learning loop.
tf.reset_default_graph()

with tf.Session() as sess:
    dqnet = DQGraph(state_size=state_size,
                    nb_actions=nb_actions,
                    learning_rate=learning_rate,
                    scope_name='DQNet')
    dqtarget = DQGraph(state_size=state_size,
                       nb_actions=nb_actions,
                       learning_rate=learning_rate,
                       scope_name='DQTarget')

    dqtrainer = DQTrainer(dqnet, dqtarget,
                          env=env,
                          eval_env=eval_env,
                          nb_actions=nb_actions,
                          gamma=gamma,
                          exp_memory=exp_memory,
                          ckpt_file=ckpt_file)

    # Setup TensorBoard
    writer = tf.summary.FileWriter("./tensorboard/", sess.graph)
    try:
        sess.run(tf.global_variables_initializer())

        # Fill memory
        dqtrainer.run_episode(env,
                              5000,
                              sess, render=False, epsilon=1.)
        dqtrainer.play_and_learn(batch_size, learning_rate, total_steps,
                                 epsilon_start, epsilon_decay, do_decay_espilon_every,
                                 replay_period, t_upd_target,
                                 frame_skipping,
                                 evaluate_after, save_after,
                                 session=sess)
    finally:
        # Flush and release the event file even if training is interrupted.
        writer.close()

In [None]:
# Evaluation: rebuild the graph, restore the checkpoint and watch one greedy episode.
tf.reset_default_graph()

with tf.Session() as sess:
    dqnet = DQGraph(state_size=state_size,
                    nb_actions=nb_actions,
                    learning_rate=learning_rate,
                    scope_name='DQNet')
    dqtarget = DQGraph(state_size=state_size,
                       nb_actions=nb_actions,
                       learning_rate=learning_rate,
                       scope_name='DQTarget')

    # Same constructor arguments as in the training cell.
    dqtrainer = DQTrainer(dqnet, dqtarget,
                          env=env,
                          eval_env=eval_env,
                          nb_actions=nb_actions,
                          gamma=gamma,
                          exp_memory=exp_memory,
                          ckpt_file=ckpt_file)

    dqtrainer.restore(sess)
    try:
        # Greedy play only: nothing is written to the replay memory.
        dqtrainer.run_episode(env, 10000, sess, render=True, store_in_memory=False, epsilon=0.)
    finally:
        # EnvWrapper keeps the gym environment in its `env` attribute.
        env.env.close()