In [1]:
import tensorflow as tf
import numpy as np
import retro
import gym
import os

from skimage import transform
from skimage.color import rgb2gray

import matplotlib.pyplot as plt

from collections import deque
import random
import warnings

# suppress deprecation messages for tf.layers
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False

In [2]:
# Build one Breakout environment just to inspect its observation and action spaces.
env = gym.make('BreakoutDeterministic-v4')

# Number of discrete actions, read from the action space of the wrapped environment.
ACTION_SIZE = env.env.action_space.n
# Human-readable action names as reported by the unwrapped environment.
possible_actions = env.unwrapped.get_action_meanings()

print("size of frame: ", env.observation_space)
print("number of actions: ", ACTION_SIZE)
print("actions: ", possible_actions)

size of frame:  Box(210, 160, 3)
number of actions:  4
actions:  ['NOOP', 'FIRE', 'RIGHT', 'LEFT']


In [3]:
# Smoke test: reset the environment, then repeat action 1 (listed as 'FIRE' in the
# action names printed above) for 30 steps, printing the reward, the terminal flag
# and the remaining lives reported under info['ale.lives'] after each step.
frame = env.reset()
for i in range(30):
    new_frame, reward, terminal, info = env.step(1)
    print(reward, terminal, info['ale.lives'])


0.0 False 5
0.0 False 5
0.0 False 5
0.0 False 5
0.0 False 5
0.0 False 5
0.0 False 5
0.0 False 5
0.0 False 5
0.0 False 5
0.0 False 5
0.0 False 5
0.0 False 5
0.0 False 5
0.0 False 5
0.0 False 5
0.0 False 5
0.0 False 5
0.0 False 5
0.0 False 5
0.0 False 5
0.0 False 5
0.0 False 5
0.0 False 5
0.0 False 4
0.0 False 4
0.0 False 4
0.0 False 4
0.0 False 4
0.0 False 4


Preprocessing functions:

In [4]:
class FrameProcessor(object):
    """Graph-mode preprocessing of a raw 210x160x3 uint8 frame.

    The pipeline converts the frame to grayscale, crops a 160x160 box whose
    top-left corner is at row 34 / column 0, and resizes the crop to
    ``height`` x ``width`` with nearest-neighbour interpolation.

    Attributes:
        frame: uint8 placeholder of shape [210, 160, 3] that receives the raw frame.
        processed: output tensor of the pipeline.
    """

    def __init__(self, height=84, width = 84):
        self.height = height
        self.width = width
        self.frame = tf.placeholder(shape=[210,160,3], dtype=tf.uint8)

        # Build the three stages with named intermediates; only the last one is kept.
        gray = tf.image.rgb_to_grayscale(self.frame)
        cropped = tf.image.crop_to_bounding_box(gray, 34, 0, 160, 160)
        target_size = [self.height, self.width]
        self.processed = tf.image.resize_images(
            cropped, target_size, method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)

    def process(self, session, frame):
        """Run the preprocessing graph on ``frame`` in ``session`` and return the result."""
        feeds = {self.frame: frame}
        return session.run(self.processed, feed_dict=feeds)

In [5]:
class DDQN(object):
    """Dueling Q-network built in TF1 graph mode.

    Four bias-free convolutions feed a value stream and an advantage stream,
    which are combined as ``Q = V + (A - mean(A))``. The class also builds the
    Huber loss against externally supplied targets and an Adam update op.

    Args:
        number_actions: number of discrete actions (width of the Q output).
        hidden: number of filters of the last convolution; it is split in two
            halves along the channel axis, so it must be even.
        learning_rate: Adam learning rate.
        height, width: spatial size of one input frame.
        history_length: number of stacked frames (input channels).

    Attributes:
        input: float32 placeholder [None, height, width, history_length] for
            raw pixel values in 0..255. This is the tensor callers must feed.
        inputscaled: ``input / 255``, the tensor actually consumed by conv1.
        q_values: [None, number_actions] Q-value estimates.
        best_action: argmax of ``q_values`` along axis 1.
        target_q, action: placeholders for the regression targets and the
            actions that were taken.
        Q: Q-value of ``action`` for every row.
        loss, update: Huber loss and the Adam op that minimises it.
    """

    def __init__(self, number_actions, hidden=1024, learning_rate=0.0000625, height=84, width=84, history_length=4):
        self.number_actions = number_actions
        self.hidden = hidden
        self.learning_rate = learning_rate
        self.height = height
        self.width = width
        self.history_length = history_length

        self.input = tf.placeholder(shape=[None, self.height, self.width, self.history_length], dtype=tf.float32)
        # Keep the placeholder and the scaled tensor under different names.
        # If `self.input` were rebound to the division result, feeding
        # `self.input` would override the scaled tensor itself and the network
        # would silently receive unscaled 0..255 pixels.
        self.inputscaled = self.input/255

        #CONV LAYERS:

        #CONV 1:
        self.conv1 = tf.layers.conv2d(inputs=self.inputscaled, filters=32, kernel_size=[8, 8], strides=4,
                                     kernel_initializer=tf.variance_scaling_initializer(scale=2),
                                     padding="valid", activation=tf.nn.relu, use_bias=False, name="conv1")

        #CONV 2:
        self.conv2 = tf.layers.conv2d(inputs=self.conv1, filters=64, kernel_size=[4, 4], strides=2,
                                     kernel_initializer=tf.variance_scaling_initializer(scale=2),
                                     padding="valid", activation=tf.nn.relu, use_bias=False, name="conv2")

        #CONV 3:
        self.conv3 = tf.layers.conv2d(inputs=self.conv2, filters=64, kernel_size=[3, 3], strides=1,
                                     kernel_initializer=tf.variance_scaling_initializer(scale=2),
                                     padding="valid", activation=tf.nn.relu, use_bias=False, name="conv3")

        #CONV 4:
        self.conv4 = tf.layers.conv2d(inputs = self.conv3, filters=self.hidden, kernel_size=[7, 7], strides=1,
                                     kernel_initializer=tf.variance_scaling_initializer(scale=2),
                                     padding="valid", activation=tf.nn.relu, use_bias=False, name="conv4")

        #Split the channel axis into two equal halves, one per stream:
        self.valuestream, self.advantagestream = tf.split(self.conv4, 2, 3)

        #value stream: one scalar V(s) per sample
        self.valuestream = tf.layers.flatten(self.valuestream)
        self.value = tf.layers.dense(inputs=self.valuestream, units=1,
                                     kernel_initializer=tf.variance_scaling_initializer(scale=2), name="value")

        #advantage stream: one A(s, a) per action. With a single unit the
        #mean-centred advantage is identically zero and every action gets the
        #same Q-value, so the head must be `number_actions` wide.
        self.advantagestream = tf.layers.flatten(self.advantagestream)
        self.advantage = tf.layers.dense(inputs=self.advantagestream, units=self.number_actions,
                                     kernel_initializer=tf.variance_scaling_initializer(scale=2), name="advantage")

        #Combine the two: Q = V + (A - mean_a A)
        self.q_values = self.value + tf.subtract(self.advantage, tf.reduce_mean(self.advantage, axis=1, keepdims=True))
        self.best_action = tf.argmax(self.q_values, 1)

        #target q:
        self.target_q = tf.placeholder(shape=[None], dtype=tf.float32)

        #action we took:
        self.action = tf.placeholder(shape=[None], dtype=tf.int32)

        #Q value of the action above (one-hot mask, then sum over the action axis):
        self.Q = tf.reduce_sum(tf.multiply(self.q_values, tf.one_hot(self.action,
                                                                     self.number_actions, dtype=tf.float32)), axis=1)

        self.loss = tf.reduce_mean(tf.losses.huber_loss(labels=self.target_q, predictions= self.Q))
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.update = self.optimizer.minimize(self.loss)
        

In [6]:
class ActionGetter:
    """Epsilon-greedy action selection with a piecewise-linear epsilon schedule.

    Epsilon is ``eps_initial`` for the first ``replay_memory_start_size``
    frames, then follows a line that reaches ``eps_final`` after
    ``eps_annealing_frames`` further frames, then follows a second line that
    reaches ``eps_final_frame`` at ``max_frames``. In evaluation mode epsilon
    is ``eps_evaluation`` regardless of the frame number.

    The second slope divides by
    ``max_frames - eps_annealing_frames - replay_memory_start_size``, so that
    difference must not be zero.
    """

    def __init__(self, n_actions, eps_initial=1, eps_final=0.1, eps_final_frame=0.01,
                 eps_evaluation=0.0, eps_annealing_frames=1000000,
                 replay_memory_start_size=50000, max_frames = 1000000):
        self.n_actions = n_actions
        self.eps_initial = eps_initial
        self.eps_final = eps_final
        self.eps_final_frame = eps_final_frame
        self.eps_evaluation = eps_evaluation
        self.eps_annealing_frames = eps_annealing_frames
        self.replay_memory_start_size = replay_memory_start_size
        self.max_frames = max_frames

        # First segment: eps_initial -> eps_final over eps_annealing_frames frames.
        self.slope = -(self.eps_initial - self.eps_final)/self.eps_annealing_frames
        self.intercept = self.eps_initial - self.slope*self.replay_memory_start_size

        # Second segment: eps_final -> eps_final_frame over the remaining frames.
        remaining_frames = self.max_frames - self.eps_annealing_frames - self.replay_memory_start_size
        self.slope_2 = -(self.eps_final - self.eps_final_frame)/remaining_frames
        self.intercept_2 = self.eps_final_frame - self.slope_2*self.max_frames

    def _epsilon(self, frame_number, evaluation):
        """Return the exploration rate that applies at ``frame_number``."""
        if evaluation:
            return self.eps_evaluation
        if frame_number < self.replay_memory_start_size:
            return self.eps_initial
        annealing_end = self.replay_memory_start_size + self.eps_annealing_frames
        if frame_number < annealing_end:
            return self.slope*frame_number + self.intercept
        if frame_number >= annealing_end:
            return self.slope_2*frame_number + self.intercept_2

    def get_action(self, session, frame_number, state, main_ddqn, evaluation=False):
        """Pick a random action with probability epsilon, else the network's best action.

        Args:
            session: session used to evaluate ``main_ddqn.best_action``.
            frame_number: global frame counter that drives the schedule.
            state: a single state; it is wrapped in a batch of one when fed.
            main_ddqn: network exposing ``input`` and ``best_action``.
            evaluation: use ``eps_evaluation`` instead of the schedule.
        """
        eps = self._epsilon(frame_number, evaluation)

        explore = np.random.rand(1) < eps
        if explore:
            return np.random.randint(0, self.n_actions)
        greedy = session.run(main_ddqn.best_action, feed_dict={main_ddqn.input:[state]})
        return greedy[0]

In [7]:
class ReplayMemory(object):
    """Fixed-size ring buffer of single frames from which stacked states are rebuilt.

    Each slot stores one action, one reward, one ``height`` x ``width`` frame
    and one terminal flag. A state is the ``history_length`` consecutive
    frames ending at a given index.

    Args:
        size: number of transitions the buffer can hold.
        height, width: shape of a stored frame.
        history_length: number of frames that make up one state.
        batch_size: number of transitions returned by ``get_minibatch``.
    """

    def __init__(self, size=100000, height=84, width= 84, history_length=4, batch_size=32):
        self.size = size
        self.height = height
        self.width = width
        self.history_length = history_length
        self.batch_size = batch_size
        self.count = 0      # number of filled slots
        self.current = 0    # next slot to write

        #where the transitions will be stored:
        self.actions = np.empty(self.size, dtype=np.int32)
        self.rewards = np.empty(self.size, dtype=np.int32)
        self.frames = np.empty((self.size, self.height, self.width), dtype=np.uint8)
        # The builtin `bool` is used because the `np.bool` alias no longer
        # exists in current NumPy releases.
        self.terminal_flags = np.empty(self.size, dtype=bool)

        #pre-allocated memory for the minibatch:
        self.states = np.empty((self.batch_size, self.history_length, self.height, self.width),
                               dtype=np.uint8)
        self.new_states = np.empty((self.batch_size, self.history_length, self.height, self.width),
                               dtype=np.uint8)
        self.indices = np.empty(self.batch_size, dtype=np.int32)

    def add_experience(self, action, frame, reward, terminal):
        """Store one transition, overwriting the oldest slot once the buffer is full.

        Raises:
            ValueError: if ``frame`` is not of shape (height, width).
        """
        if frame.shape != (self.height, self.width):
            raise ValueError("Dimensions of frame do not match {}x{}".format(self.height, self.width))

        #add the experience:
        self.actions[self.current] = action
        self.frames[self.current, ...] = frame
        self.rewards[self.current] = reward
        self.terminal_flags[self.current] = terminal
        self.count = max(self.count, self.current+1)
        self.current = (self.current + 1) % self.size # wrap around and overwrite the oldest entries

    def _get_state(self, index):
        """Return the ``history_length`` frames ending at ``index`` (inclusive).

        Raises:
            ValueError: if the memory is empty or ``index`` leaves too few
                preceding frames to build a full state.
        """
        if self.count == 0:
            raise ValueError("the memory is empty")
        if index < self.history_length - 1:
            raise ValueError("index must be at least {}".format(self.history_length - 1))
        return self.frames[index-self.history_length+1:index+1, ...]

    def _get_valid_indices(self):
        """Fill ``self.indices`` with ``batch_size`` indices drawn by rejection sampling."""
        for i in range(self.batch_size):
            while True:
                index = random.randint(self.history_length, self.count - 1)
                if index < self.history_length: # not enough frames before the index
                    continue
                # reject windows that touch the write pointer: frames on either
                # side of `current` were not recorded consecutively
                if index >= self.current and index - self.history_length <= self.current:
                    continue
                # reject windows that contain a terminal flag
                if self.terminal_flags[index - self.history_length:index].any():
                    continue
                break
            self.indices[i] = index

    def get_minibatch(self):
        """Sample a minibatch of transitions.

        Returns:
            Tuple ``(states, actions, rewards, new_states, terminal_flags)``.
            Both state arrays are transposed to
            (batch, height, width, history_length) and are views of buffers
            that the next call overwrites.

        Raises:
            ValueError: if fewer than ``history_length + 1`` transitions are stored.
        """
        # Sampling draws from [history_length, count - 1], so at least
        # history_length + 1 entries are required.
        if self.count <= self.history_length:
            raise ValueError("not enough memories to get a minibatch")

        self._get_valid_indices()

        for i, idx in enumerate(self.indices):
            self.states[i] = self._get_state(idx - 1)
            self.new_states[i] = self._get_state(idx)

        return np.transpose(self.states, axes=(0,2,3,1)), self.actions[self.indices], self.rewards[self.indices], np.transpose(self.new_states, axes=(0,2,3,1)), self.terminal_flags[self.indices]
    

In [8]:
def learn(session, replay_memory, main_ddqn, target_ddqn, batch_size, gamma):
    """Run one double-Q-learning update on the main network and return its loss.

    The main network chooses the best action for every next state, the target
    network supplies the value of that action, and the main network is then
    trained towards ``reward + gamma * value``, with the bootstrap term zeroed
    for transitions flagged as terminal.

    Args:
        session: session in which both networks live.
        replay_memory: object whose ``get_minibatch()`` returns
            (states, actions, rewards, new_states, terminal_flags).
        main_ddqn: network being trained.
        target_ddqn: network used to evaluate the chosen next actions.
        batch_size: number of rows in the minibatch.
        gamma: discount factor.
    """
    minibatch = replay_memory.get_minibatch()
    states, actions, rewards, new_states, terminal_flags = minibatch

    # Action selection by the main network ...
    greedy_next = session.run(main_ddqn.best_action, feed_dict={main_ddqn.input:new_states})

    # ... action evaluation by the target network.
    next_q = session.run(target_ddqn.q_values, feed_dict={target_ddqn.input:new_states})
    bootstrap = next_q[range(batch_size), greedy_next]

    keep_bootstrap = 1 - terminal_flags
    targets = rewards + (gamma*bootstrap * keep_bootstrap)

    train_feed = {main_ddqn.input:states, main_ddqn.target_q:targets, main_ddqn.action:actions}
    loss_value, _ = session.run([main_ddqn.loss, main_ddqn.update], feed_dict=train_feed)
    return loss_value

In [9]:
class TargetNetworkUpdater(object):
    """Copies the main network's variables into the target network.

    The assign ops are created once, on first use, and reused afterwards.
    Creating them on every sync would add new nodes to the graph each time.

    Args:
        main_ddqn_vars: variables to copy from.
        target_ddqn_vars: variables to copy into, in the same order.
    """

    def __init__(self, main_ddqn_vars, target_ddqn_vars):
        self.main_ddqn_vars = main_ddqn_vars
        self.target_ddqn_vars = target_ddqn_vars
        self._update_ops = None  # built lazily by _update_target_vars

    def _update_target_vars(self):
        """Return the list of assign ops, building it on the first call.

        Raises:
            ValueError: if the two variable lists differ in length.
        """
        if self._update_ops is None:
            if len(self.main_ddqn_vars) != len(self.target_ddqn_vars):
                raise ValueError("main and target networks must have the same number of variables")
            self._update_ops = [
                target_var.assign(main_var.value())
                for main_var, target_var in zip(self.main_ddqn_vars, self.target_ddqn_vars)
            ]
        return self._update_ops

    def update_networks(self, sess):
        """Run all copy ops in ``sess`` with a single ``run`` call."""
        sess.run(self._update_target_vars())

    def __call__(self, sess):
        """Same as ``update_networks``; kept so the instance stays callable."""
        self.update_networks(sess)
            

In [10]:
class BreakoutGame(object):
    """Gym environment wrapper that maintains a stack of preprocessed frames.

    Attributes:
        env: the underlying gym environment.
        frame_processor: ``FrameProcessor`` used on every raw frame.
        state: current stacked state (``history_length`` frames on axis 2),
            or None before the first ``reset``.
        last_lives: lives reported by the most recent ``step`` (0 after ``reset``).
    """

    def __init__(self, env_name="BreakoutDeterministic-v4", no_op_steps=10, history_length=4):
        self.env = gym.make(env_name)
        self.frame_processor = FrameProcessor()
        self.no_op_steps = no_op_steps
        self.history_length = history_length
        self.state = None
        self.last_lives = 0

    def reset(self, sess, evaluation=False):
        """Reset the environment and rebuild the state from a single frame.

        In evaluation mode action 1 is applied a random number of times
        (between 1 and ``no_op_steps``) before the state is built.

        Returns:
            True, always: the caller treats a fresh episode as "life lost".
        """
        frame = self.env.reset()
        self.last_lives = 0

        if evaluation:
            fire_steps = random.randint(1, self.no_op_steps)
            for _ in range(fire_steps):
                # fire the ball (only the first time it works)
                frame, _reward, _done, _info = self.env.step(1)

        first_frame = self.frame_processor.process(sess, frame)
        # The initial state is the same frame repeated along the channel axis.
        self.state = np.repeat(first_frame, self.history_length, axis=2)

        terminal_life_lost = True
        return terminal_life_lost

    def step(self, sess, action):
        """Apply ``action`` and push the new frame onto the state stack.

        Returns:
            Tuple ``(processed_new_frame, reward, terminal, terminal_life_lost,
            new_frame)``. ``terminal_life_lost`` is True when the reported
            lives dropped below the previous value, otherwise it equals
            ``terminal``.
        """
        new_frame, reward, terminal, info = self.env.step(action)

        lives = info['ale.lives']
        terminal_life_lost = True if lives < self.last_lives else terminal
        self.last_lives = lives

        processed_new_frame = self.frame_processor.process(sess, new_frame)
        # Drop the oldest frame and append the newest one on the channel axis.
        older_frames = self.state[:, :, 1:]
        self.state = np.concatenate((older_frames, processed_new_frame), axis=2)

        return processed_new_frame, reward, terminal, terminal_life_lost, new_frame
        

In [11]:
def clip_reward(reward):
    """Clip a reward to its sign: 1 if positive, 0 if exactly zero, -1 otherwise.

    Reward clipping has been reported to be beneficial for Breakout.
    """
    if reward > 0:
        return 1
    return 0 if reward == 0 else -1

In [12]:
# Start from an empty default graph so that re-running the cells below does not
# stack a second copy of the networks on top of the first.
tf.reset_default_graph()
#define the hyper-params:

ENV_NAME = 'BreakoutDeterministic-v4'

MAX_EPISODE_LENGTH = 18000         # upper bound on steps per training episode
EVAL_FREQUENCY = 200000            # training frames between two evaluation phases
EVAL_STEPS = 10000                 # environment steps per evaluation phase
TARGET_NETWORK_UPDATE_FREQ = 10000 # frames between two copies main -> target network

DISCOUNT_FACTOR = 0.99 # Gamma in the Q learning equation
REPLAY_MEMORY_START_SIZE = 50000 # frames of purely random actions before learning starts

MAX_FRAMES = 7000000   # total number of training frames
MEMORY_SIZE = 1000000  # capacity of the replay memory (transitions)
NO_OP_STEPS = 10       # upper bound on the random number of FIRE steps at an evaluation reset

UPDATE_FREQ = 4        # frames between two gradient steps
HIDDEN = 1024          # number of filters of the last convolution (split between both streams)

LEARNING_RATE = 0.0000625

BATCH_SIZE = 32

# NOTE(review): the three constants below are not referenced anywhere else in the
# visible notebook; exploration is configured through ActionGetter's own arguments.
EXPLORE_START = 1.0
EXPLORE_STOP = 0.01
DECAY_RATE = 0.00005

#IMPORTANT FOR TRAINING OR TESTING:
PATH = "output/"   # root directory for model checkpoints
INFO = "info"      # root directory for the summary (TensorBoard) logs
RUNID = "run1"     # sub-directory name of this run
PATH = PATH + RUNID 
TRAIN = True       # True: run train(); False: run the playback cell at the end

# Create the output directories up front and open the summary writer for this run.
os.makedirs(PATH, exist_ok=True)
os.makedirs(os.path.join(INFO, RUNID), exist_ok=True)
SUMM_WRITER = tf.summary.FileWriter(os.path.join(INFO, RUNID))


In [13]:
# Environment wrapper; constructing it also adds the frame-preprocessing ops to the graph.
atari = BreakoutGame(ENV_NAME, NO_OP_STEPS)

#create main DQN:
with tf.variable_scope('mainDDQN'):
    MAIN_DDQN = DDQN(atari.env.action_space.n, HIDDEN, LEARNING_RATE)

# Trainable variables of the main network, in creation order.
MAIN_DDQN_VARS = tf.trainable_variables(scope='mainDDQN')


#create target DQN:
# (no learning rate is passed, so the constructor default is used; the visible
# training code never runs the target network's update op)
with tf.variable_scope('targetDDQN'):
    TARGET_DDQN = DDQN(atari.env.action_space.n, HIDDEN)

# Trainable variables of the target network, in the same order as MAIN_DDQN_VARS.
TARGET_DDQN_VARS = tf.trainable_variables(scope='targetDDQN')


# Both ops are created after the two networks so that they cover all variables
# that exist in the graph at this point.
init= tf.global_variables_initializer()
saver = tf.train.Saver()

In [14]:
# One label per trainable variable of the main network, in creation order:
# four bias-free conv kernels, then kernel and bias of the value head, then
# kernel and bias of the advantage head.
LAYERS_IDS = ["conv1", "conv2", "conv3", "conv4",
              "value", "value_bias", "advantage", "advantage_bias"]

# Fail loudly if the network definition and the labels drift apart; a silent
# mismatch would log histograms under the wrong layer names.
assert len(LAYERS_IDS) == len(MAIN_DDQN_VARS), \
    "LAYERS_IDS must name every trainable variable of the main network"

# Scalar summaries are fed through placeholders because the values are
# computed in Python, outside the graph.
with tf.name_scope("Performace"):
    LOSS_PH = tf.placeholder(tf.float32, shape=None, name="loss_summary")
    LOSS_SUMMARY = tf.summary.scalar("loss", LOSS_PH)
    REWARD_PH = tf.placeholder(tf.float32, shape=None, name="reward_summary")
    REWARD_SUMMARY = tf.summary.scalar("reward", REWARD_PH)
    EVAL_SCORE_PH = tf.placeholder(tf.float32, shape=None, name="eval_summary")
    EVAL_SCORE_SUMMARY = tf.summary.scalar("eval_score", EVAL_SCORE_PH)

# The evaluation score is written on its own schedule, so it is not merged here.
PERFORMANCE_SUMMARIES = tf.summary.merge([LOSS_SUMMARY, REWARD_SUMMARY])

# Histogram of every trainable variable of the main network, flattened to 1-D.
with tf.name_scope("Parameters"):
    ALL_PARAM_SUMMARIES = []
    for i, Id in enumerate(LAYERS_IDS):
        with tf.name_scope("mainDDQN/"):
            MAIN_DDQN_KERNEL = tf.summary.histogram(Id, tf.reshape(MAIN_DDQN_VARS[i], shape=[-1]))
        ALL_PARAM_SUMMARIES.extend([MAIN_DDQN_KERNEL])
PARAM_SUMMARIES = tf.summary.merge(ALL_PARAM_SUMMARIES)
    

In [17]:
def train():
    """Train the main network, alternating training and evaluation phases.

    Each outer iteration plays training episodes until at least
    EVAL_FREQUENCY frames have been seen, then evaluates the greedy policy for
    EVAL_STEPS environment steps, saves a checkpoint under PATH and logs the
    evaluation score. Training stops once MAX_FRAMES frames have been played.

    Side effects: writes summaries through SUMM_WRITER, appends to
    'rewards.dat' and 'rewardsEval.dat' in the working directory, and saves
    checkpoints with the module-level ``saver``.
    """
    my_replay_memory = ReplayMemory(size=MEMORY_SIZE, batch_size=BATCH_SIZE)
    network_updater = TargetNetworkUpdater(MAIN_DDQN_VARS, TARGET_DDQN_VARS)
    action_getter = ActionGetter(atari.env.action_space.n,
                                 replay_memory_start_size=REPLAY_MEMORY_START_SIZE,
                                 max_frames=MAX_FRAMES)

    with tf.Session() as sess:
        sess.run(init)

        frame_number = 0
        rewards = []     # unclipped reward sum of every finished training episode
        loss_list = []   # losses collected since the last scalar summary

        while frame_number < MAX_FRAMES:
            #training:
            epoch_frame = 0
            print(frame_number)
            while epoch_frame < EVAL_FREQUENCY:

                atari.reset(sess)
                episode_reward_sum = 0

                for _ in range(MAX_EPISODE_LENGTH):

                    #take the action:
                    action = action_getter.get_action(sess, frame_number, atari.state, MAIN_DDQN)

                    processed_new_frame, reward, terminal, terminal_life_lost, _ = atari.step(sess, action)

                    frame_number += 1
                    epoch_frame += 1
                    episode_reward_sum += reward

                    clipped_reward = clip_reward(reward)

                    # A lost life is stored as terminal so that values are not
                    # bootstrapped across it; the episode itself only ends on
                    # the environment's own terminal flag.
                    my_replay_memory.add_experience(action=action, frame=processed_new_frame[:, :, 0], reward=clipped_reward,
                                                   terminal=terminal_life_lost)

                    if frame_number % UPDATE_FREQ == 0 and frame_number > REPLAY_MEMORY_START_SIZE:
                        loss = learn(sess, my_replay_memory, MAIN_DDQN, TARGET_DDQN, BATCH_SIZE, gamma = DISCOUNT_FACTOR)
                        # Record the loss; otherwise the summary below averages an empty list.
                        loss_list.append(loss)

                    if frame_number % TARGET_NETWORK_UPDATE_FREQ == 0 and frame_number > REPLAY_MEMORY_START_SIZE:
                        # The updater is invoked through __call__.
                        network_updater(sess)

                    if terminal:
                        break

                rewards.append(episode_reward_sum)

                if len(rewards) % 10 == 0:
                    # Scalar summaries for tensorboard (only once losses exist)
                    if frame_number > REPLAY_MEMORY_START_SIZE and loss_list:
                        summ = sess.run(PERFORMANCE_SUMMARIES,
                                        feed_dict={LOSS_PH:np.mean(loss_list),
                                                    REWARD_PH:np.mean(rewards[-100:])})

                        SUMM_WRITER.add_summary(summ, frame_number)
                        loss_list = []
                    # Histogram summaries for tensorboard
                    summ_param = sess.run(PARAM_SUMMARIES)
                    SUMM_WRITER.add_summary(summ_param, frame_number)

                    print(len(rewards), frame_number, np.mean(rewards[-100:]))
                    with open('rewards.dat', 'a') as reward_file:
                        print(len(rewards), frame_number,
                              np.mean(rewards[-100:]), file=reward_file)

            #evaluation:

            terminal = True
            eval_rewards = []

            for _ in range(EVAL_STEPS):
                if terminal:
                    terminal_life_lost = atari.reset(sess, evaluation=True)
                    episode_reward_sum = 0
                    terminal = False

                # After a reset or a lost life action 1 is forced to relaunch the ball.
                action = 1 if terminal_life_lost else action_getter.get_action(sess, frame_number, atari.state,
                                                                               MAIN_DDQN, evaluation = True)

                processed_new_frame, reward, terminal, terminal_life_lost, _ = atari.step(sess, action)
                episode_reward_sum += reward

                if terminal:
                    eval_rewards.append(episode_reward_sum)

            # If no episode finished within EVAL_STEPS, score the partial
            # episode instead of averaging an empty list (which yields NaN).
            if not eval_rewards:
                eval_rewards.append(episode_reward_sum)
            eval_score = np.mean(eval_rewards)

            print("Evaluation score:\n", eval_score)

            #save network params:
            saver.save(sess, PATH+"/my_model", global_step=frame_number)

            summ = sess.run(EVAL_SCORE_SUMMARY, feed_dict={EVAL_SCORE_PH:eval_score})
            SUMM_WRITER.add_summary(summ, frame_number)

            with open("rewardsEval.dat", "a") as eval_reward_file:
                print(frame_number, eval_score, file=eval_reward_file)
                
                

In [16]:
# Run the training loop only when the TRAIN flag of the hyper-parameter cell is set.
if TRAIN:
    train()

0
10 1635 0.7
20 3446 0.95
30 5214 0.9666666666666667
40 7194 1.1


KeyboardInterrupt: 

In [None]:
# Play one rendered episode with a trained network (runs only when TRAIN is False).
if not TRAIN:
    # train() keeps its ActionGetter local, so this cell needs its own.
    # With evaluation=True the frame number passed to get_action has no effect
    # on epsilon, so a constant 0 is used.
    action_getter = ActionGetter(atari.env.action_space.n,
                                 replay_memory_start_size=REPLAY_MEMORY_START_SIZE,
                                 max_frames=MAX_FRAMES)
    frame_number = 0

    with tf.Session() as sess:
        # Restore into the graph that is already built, using the module-level
        # `saver`. Importing the meta graph would create a second copy of the
        # network and leave the MAIN_DDQN variables used below unrestored.
        checkpoint = tf.train.latest_checkpoint("trained")
        if checkpoint is None:
            raise FileNotFoundError("no checkpoint found in directory 'trained'")
        saver.restore(sess, checkpoint)

        terminal_life_lost = atari.reset(sess, evaluation = True)
        episode_reward_sum = 0
        try:
            while True:
                atari.env.render()

                # After a reset or a lost life action 1 is forced to relaunch the ball.
                action = 1 if terminal_life_lost else action_getter.get_action(sess, frame_number, atari.state,
                                                                               MAIN_DDQN, evaluation = True)

                processed_new_frame, reward, terminal, terminal_life_lost, new_frame = atari.step(sess ,action)
                episode_reward_sum += reward
                if terminal == True:
                    break
        finally:
            # Close the render window even if the episode is interrupted.
            atari.env.close()
        print("total reward: {}".format(episode_reward_sum))