In [1]:
import tensorflow as tf
import numpy as np
import retro
import gym

from skimage import transform
from skimage.color import rgb2gray

import matplotlib.pyplot as plt

from collections import deque
import random
import warnings

In [2]:
# Build the Atari Breakout environment used by the rest of the notebook.
env = gym.make('BreakoutDeterministic-v4')

# Read the action count once, from the same object everywhere, instead of
# mixing `env.env.action_space` and `env.action_space`.
action_size = env.action_space.n

# One-hot encoding of every action: row i is the encoding of action i.
possible_actions = np.identity(action_size, dtype=int)

print("size of frame: ", env.observation_space)
print("number of actions: ", action_size)
print("actions: ", possible_actions)

size of frame:  Box(210, 160, 3)
number of actions:  4
actions:  [[1 0 0 0]
 [0 1 0 0]
 [0 0 1 0]
 [0 0 0 1]]


Preprocess functions:

In [3]:
class FrameProcessor(object):
    """TensorFlow graph that turns one raw RGB frame into a small grayscale frame.

    The pipeline built in ``__init__`` is:
    RGB (210, 160, 3) uint8 -> grayscale -> crop a 160x160 window starting
    34 rows from the top -> scale to [0, 1] -> nearest-neighbour resize to
    (height, width).

    Args:
        height: Height of the preprocessed frame.
        width: Width of the preprocessed frame.

    NOTE(review): the output is float32 in [0, 1] with a trailing channel axis,
    i.e. shape (height, width, 1). ReplayMemory in this notebook stores frames
    as uint8 arrays of shape (height, width) - confirm that the caller drops
    the channel axis and that the values are not truncated on storage.
    """

    def __init__(self, height=84, width=84):
        self.height = height
        self.width = width

        # Placeholder for a single raw frame.
        self.frame = tf.placeholder(shape=[210, 160, 3], dtype=tf.uint8)

        self.gray_scaled = tf.image.rgb_to_grayscale(self.frame)
        self.cropped_frame = tf.image.crop_to_bounding_box(self.gray_scaled, 34, 0, 160, 160)
        # Cast first so the division is carried out in float32 rather than on
        # a uint8 tensor.
        self.normalized_cropped = tf.cast(self.cropped_frame, tf.float32) / 255.0
        # resize_images takes the target size as a single [height, width] argument.
        self.preprocessed = tf.image.resize_images(
            self.normalized_cropped,
            [self.height, self.width],
            method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)

    def preprocess_frame(self, frame, session):
        """Run the preprocessing graph on one frame.

        Args:
            frame: Raw frame fed to the (210, 160, 3) uint8 placeholder.
            session: ``tf.Session`` in which the graph is evaluated.

        Returns:
            The value of ``self.preprocessed`` for ``frame``.
        """
        return session.run(self.preprocessed, feed_dict={self.frame: frame})

In [4]:
class DDQN(object):
    """Convolutional Q-network with separate value and advantage streams.

    Three convolutional layers feed two dense streams: one produces a scalar
    V(s), the other produces A(s, a) for every action. They are combined as
    ``Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a))``.

    Args:
        number_actions: Number of discrete actions (width of the Q output).
        hidden: Units in each of the two hidden dense layers.
        learning_rate: Learning rate passed to the Adam optimizer.
        height: Height of one input frame.
        width: Width of one input frame.
        history_length: Number of stacked frames per state (input channels).

    Attributes used for feeding / fetching:
        input: float32 placeholder, shape (None, height, width, history_length).
        target_q: float32 placeholder, shape (None,), regression targets.
        action: int32 placeholder, shape (None,), indices of the actions taken.
        q_values, best_action, loss, update: output tensors / training op.

    NOTE(review): every layer has a fixed name ("conv1", "value_fc", ...).
    Building a second instance in the same graph presumably needs a distinct
    ``tf.variable_scope`` around each instance - confirm at the call site.
    """

    def __init__(self, number_actions, hidden=512, learning_rate=0.0000625, height=84, width=84, history_length=4):
        self.number_actions = number_actions
        self.hidden = hidden
        self.learning_rate = learning_rate
        self.height = height
        self.width = width
        self.history_length = history_length

        # tf.placeholder requires a dtype; the conv layers need a float input.
        self.input = tf.placeholder(shape=[None, self.height, self.width, self.history_length],
                                    dtype=tf.float32)

        #CONV LAYERS:

        #CONV 1:
        self.conv1 = tf.layers.conv2d(inputs=self.input, filters=32, kernel_size=[8, 8], strides=4,
                                      kernel_initializer=tf.variance_scaling_initializer(scale=2),
                                      padding="valid", activation=tf.nn.elu, use_bias=False, name="conv1")

        #CONV 2:
        self.conv2 = tf.layers.conv2d(inputs=self.conv1, filters=64, kernel_size=[4, 4], strides=2,
                                      kernel_initializer=tf.variance_scaling_initializer(scale=2),
                                      padding="valid", activation=tf.nn.elu, use_bias=False, name="conv2")

        #CONV 3:
        self.conv3 = tf.layers.conv2d(inputs=self.conv2, filters=64, kernel_size=[3, 3], strides=1,
                                      kernel_initializer=tf.variance_scaling_initializer(scale=2),
                                      padding="valid", activation=tf.nn.elu, use_bias=False, name="conv3")

        # Stored under the same name the dense layers below read.
        self.flatten = tf.layers.flatten(self.conv3)

        #Calculate V(s)
        self.value_fc = tf.layers.dense(inputs=self.flatten, units=hidden, activation=tf.nn.elu,
                                        kernel_initializer=tf.variance_scaling_initializer(scale=2), name="value_fc")

        self.value = tf.layers.dense(inputs=self.value_fc, units=1, activation=None,
                                     kernel_initializer=tf.variance_scaling_initializer(scale=2), name="value")

        #Calculate A(s,a)
        self.advantage_fc = tf.layers.dense(inputs=self.flatten, units=hidden, activation=tf.nn.elu,
                                            kernel_initializer=tf.variance_scaling_initializer(scale=2), name="advantage_fc")

        self.advantage = tf.layers.dense(inputs=self.advantage_fc, units=self.number_actions, activation=None,
                                         kernel_initializer=tf.variance_scaling_initializer(scale=2), name="advantages")

        #Combine the two: the (batch, 1) value broadcasts over the mean-centred advantages.
        self.q_values = self.value + tf.subtract(self.advantage,
                                                 tf.reduce_mean(self.advantage, axis=1, keepdims=True))
        self.best_action = tf.argmax(self.q_values, 1)

        #target q:
        self.target_q = tf.placeholder(shape=[None], dtype=tf.float32)

        #action we took: integer indices, because tf.one_hot does not accept float indices.
        self.action = tf.placeholder(shape=[None], dtype=tf.int32)

        #Q value of the action above: mask q_values with the one-hot action and sum per row.
        self.Q = tf.reduce_sum(
            tf.multiply(self.q_values, tf.one_hot(self.action, self.number_actions, dtype=tf.float32)),
            axis=1)

        self.loss = tf.reduce_mean(tf.losses.huber_loss(labels=self.target_q, predictions=self.Q))
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.update = self.optimizer.minimize(self.loss)
        

In [5]:
def predict_action(explore_start, explore_stop, decay_rate, decay_step, state, actions, session, DDQN):
    """Choose an action with an exponentially decaying epsilon-greedy policy.

    The exploration probability is
    ``explore_stop + (explore_start - explore_stop) * exp(-decay_rate * decay_step)``.

    Args:
        explore_start: Exploration probability at ``decay_step == 0``.
        explore_stop: Value the exploration probability decays towards.
        decay_rate: Exponential decay rate.
        decay_step: Number of decay steps taken so far.
        state: Current state; wrapped in a one-element batch and fed to
            ``DDQN.input`` when exploiting.
        actions: Sized collection with one entry per available action; its
            length bounds the random draw.
        session: Object whose ``run(fetches, feed_dict=...)`` evaluates the
            network (a ``tf.Session``).
        DDQN: Network instance exposing ``best_action`` and ``input``.

    Returns:
        Tuple ``(action, explore_probability)`` where ``action`` is the index
        of the chosen action.
    """
    exp_tradeoff = np.random.rand()

    explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)

    if explore_probability > exp_tradeoff:
        # Exploration: uniform random action index. The bound comes from the
        # `actions` argument rather than from a notebook-level global.
        action = np.random.randint(0, len(actions))
    else:
        # Exploitation: greedy action for a batch containing only `state`.
        action = session.run(DDQN.best_action, feed_dict={DDQN.input: [state]})[0]

    return action, explore_probability

In [6]:
class ReplayMemory(object):
    """Fixed-size circular buffer of transitions for experience replay.

    Individual frames are stored once; a state is the run of
    ``history_length`` consecutive frames ending at a given index.

    Args:
        size: Maximum number of transitions kept.
        height: Height of one stored frame.
        width: Width of one stored frame.
        history_length: Number of consecutive frames that form a state.
        batch_size: Number of transitions returned by ``get_minibatch``.

    NOTE(review): frames are stored as uint8. Float frames scaled to [0, 1]
    would be truncated to 0 on assignment - confirm that the producer hands
    over 0-255 values.
    """

    def __init__(self, size=100000, height=84, width=84, history_length=4, batch_size=32):
        self.size = size
        self.height = height
        self.width = width
        self.history_length = history_length
        self.batch_size = batch_size
        self.count = 0    # number of valid entries (saturates at `size`)
        self.current = 0  # next write position

        #where the transitions will be stored:
        self.actions = np.empty(self.size, dtype=np.int32)
        self.rewards = np.empty(self.size, dtype=np.int32)
        self.frames = np.empty((self.size, self.height, self.width), dtype=np.uint8)
        self.terminal_flags = np.empty(self.size, dtype=np.int32)

        #memory for the minibatch (reused, and overwritten, on every get_minibatch call):
        self.states = np.empty((self.batch_size, self.history_length, self.height, self.width),
                               dtype=np.uint8)
        self.new_states = np.empty((self.batch_size, self.history_length, self.height, self.width),
                                   dtype=np.uint8)
        self.indices = np.empty(self.batch_size, dtype=np.int32)

    def add_experience(self, action, frame, reward, terminal):
        """Store one transition at the current write position.

        Args:
            action: Index of the action taken.
            frame: Array of shape (height, width).
            reward: Reward received.
            terminal: Truthy when the episode ended on this transition.

        Raises:
            ValueError: If ``frame.shape`` is not (height, width).
        """
        if frame.shape != (self.height, self.width):
            raise ValueError("Dimensions of frame do not match %dx%d" % (self.height, self.width))

        self.actions[self.current] = action
        self.frames[self.current, ...] = frame
        self.rewards[self.current] = reward
        self.terminal_flags[self.current] = terminal
        self.count = max(self.count, self.current + 1)
        # Wrap around: once the buffer is full the oldest entries are overwritten.
        self.current = (self.current + 1) % self.size

    def _get_state(self, index):
        """Return the ``history_length`` frames ending at ``index`` (inclusive).

        Raises:
            ValueError: If the memory is empty or ``index`` is too small to
                have ``history_length`` frames before and including it.
        """
        if self.count == 0:
            raise ValueError("the memory is empty")
        if index < self.history_length - 1:
            raise ValueError("index must be at least %d" % (self.history_length - 1))
        return self.frames[index - self.history_length + 1:index + 1, ...]

    def _get_valid_indices(self):
        """Fill ``self.indices`` with ``batch_size`` randomly drawn valid indices.

        An index is rejected when the frames it needs straddle the write
        position (old and new data would be mixed) or when a terminal flag is
        set in the ``history_length`` entries before it.

        NOTE(review): the loop retries until a valid index is drawn; it does
        not terminate if no valid index exists.
        """
        for i in range(self.batch_size):
            while True:
                # The lower bound guarantees enough earlier frames for both states.
                index = random.randint(self.history_length, self.count - 1)
                if index >= self.current and index - self.history_length <= self.current:
                    continue
                if self.terminal_flags[index - self.history_length:index].any():
                    continue
                break
            self.indices[i] = index

    def get_minibatch(self):
        """Sample a minibatch of transitions.

        Returns:
            Tuple ``(states, actions, rewards, new_states, terminal_flags)``.
            ``states`` and ``new_states`` have shape
            (batch_size, height, width, history_length) and are views of
            internal buffers that the next call overwrites.

        Raises:
            ValueError: If fewer than ``history_length + 1`` transitions are stored.
        """
        # Sampling draws from [history_length, count - 1], so count must exceed history_length.
        if self.count <= self.history_length:
            raise ValueError("not enough memories to get a minibatch")

        self._get_valid_indices()

        for i, idx in enumerate(self.indices):
            self.states[i] = self._get_state(idx - 1)
            self.new_states[i] = self._get_state(idx)

        # Move the history axis last: (batch, history, h, w) -> (batch, h, w, history).
        return (np.transpose(self.states, axes=(0, 2, 3, 1)),
                self.actions[self.indices],
                self.rewards[self.indices],
                np.transpose(self.new_states, axes=(0, 2, 3, 1)),
                self.terminal_flags[self.indices])
    