In [1]:
import tensorflow as tf
import gym
import numpy as np 
import warnings
import matplotlib.pyplot as plt
from collections import deque
from skimage import transform
from skimage.color import rgb2gray # Help us to gray our frames
from SumTree import SumTree #SumTree implementation by Jaromír 
warnings.filterwarnings('ignore')

In [2]:
class GameEnv:
    
    """
      This Class creates Atari Game Enivroment and provides some preprocessing functions.
    """
  
    def __init__(self, game = 'SpaceInvaders-v0'):
        self.env = gym.make(game)
        self.n_actions = self.env.action_space.n
        self.frame_size = self.env.observation_space.shape
        self.hot_enc_actions = np.array(np.identity(self.n_actions).tolist()) 
        self.stack_size = 4
        self.stacked_frames = deque([np.zeros((110,84), dtype=np.int) for i in range(self.stack_size)], maxlen=self.stack_size)
        self.hyperparameters = {
                               'learning_rate' : 0.00025,
                               'total_episodes' : 50,
                               'max_steps' : 50000,
                               'btach_size': 64,
                               'explore_start' : 1,
                               'explore_end' : 0.01,
                               'decay_rate' : 0.00001,
                               'gamma' : 0.9,
                               'pretrain_length' : 64,
                               'memory_size' : 1000000,
                               'state_size' : [110, 84, 4]
                               }
        self.training = False
        self.render = False
    
    
    
    
    def _preprocess_frame(self,frame):
        gray_frame = rgb2gray(frame)
        cropped_frame = gray_frame[8:-12,4:-12]

        # Normalize Pixel Values
        normalized_frame = cropped_frame/255.0

        # Resize
        # Thanks to Mikołaj Walkowiak
        preprocessed_frame = transform.resize(normalized_frame, [110,84])

        return preprocessed_frame # 110x84x1 frame
  
    def stack_frame(self, state, new_epis = False):
    
        frame = self._preprocess_frame(state)

        if new_epis:
            self.stacked_frames  =  deque([frame for _ in range(self.stack_size)], maxlen=self.stack_size)
        else:
            self.stacked_frames.append(frame)

        self.stacked_state = np.stack(self.stacked_frames, axis=2)
        return self.stacked_state  



In [3]:
class DDQNN:
    
  
    def __init__(self, gamenv, name):
        
        self.gamenv = gamenv
        self.decay_step = 0
        with tf.variable_scope(name):
            self._inputs = tf.placeholder(tf.float32, [None, *self.gamenv.hyperparameters['state_size']], name='inputs')
            self._ISWeights = tf.placeholder(tf.float32, [None,1], name='IS_weights')
            self._actions = tf.placeholder(tf.float32, [None, self.gamenv.n_actions], name='actions')
            self.target_Q = tf.placeholder(tf.float32, [None], name="target")

            self.conv1 = tf.layers.conv2d(inputs = self._inputs, 
                                        filters = 32,
                                        kernel_size = [8,8],
                                        strides = [4,4],
                                        padding = 'VALID',
                                        kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                        name = 'Conv1')
            self.actvf1 = tf.nn.elu(self.conv1, name='Elu1')

            self.conv2 = tf.layers.conv2d(inputs = self.conv1, 
                                        filters = 64,
                                        kernel_size = [4,4],
                                        strides = [2,2],
                                        padding = 'VALID',
                                        kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                        name = 'Conv2')
            self.actvf2 = tf.nn.elu(self.conv2, name='Elu2')

            self.conv3 = tf.layers.conv2d(inputs = self.conv2, 
                                        filters = 64,
                                        kernel_size = [3,3],
                                        strides = [2,2],
                                        padding = 'VALID',
                                        kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                        name = 'Conv3')
            self.actvf3 = tf.nn.elu(self.conv3, name='Elu3')

            self.flatten = tf.contrib.layers.flatten(self.actvf3)
            self.value_fc = tf.layers.dense(inputs = self.flatten,
                                            units = 512,
                                            activation = tf.nn.elu,
                                            kernel_initializer = tf.contrib.layers.xavier_initializer(),
                                            name = 'Value_fc')
            self.value = tf.layers.dense(inputs = self.value_fc,
                                        units = 1,
                                        activation = None,
                                        kernel_initializer = tf.contrib.layers.xavier_initializer(),
                                        name="value")
            self.advantage_fc = tf.layers.dense(inputs = self.flatten,
                                      units = 512,
                                      activation = tf.nn.elu,
                                           kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                    name="advantage_fc")

            self.advantage = tf.layers.dense(inputs = self.advantage_fc,
                                            units = self.gamenv.n_actions,
                                            activation = None,
                                            kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                    name="advantages")


            self.output = self.value + tf.subtract(self.advantage, tf.reduce_mean(self.advantage, axis=1, keepdims=True))
            self.Q = tf.reduce_sum(tf.multiply(self.output, self._actions), axis=1)

            self.absolute_errors = tf.abs(self.target_Q - self.Q)# for updating Sumtree

            self.loss = tf.reduce_mean(self._ISWeights * tf.squared_difference(self.target_Q, self.Q))

            self.optimizer = tf.train.RMSPropOptimizer(self.gamenv.hyperparameters['learning_rate']).minimize(self.loss)
      
      
    def predict_action(self, state, sess):
        
        hyperp = self.gamenv.hyperparameters
        explore_probability = hyperp['explore_end'] + (hyperp['explore_start'] - hyperp['explore_end']) * np.exp(-hyperp['decay_rate'] * self.decay_step)

        if explore_probability > np.random.rand():
            action = self.gamenv.hot_enc_actions[self.gamenv.env.action_space.sample()]

        else:
            Qs = sess.run(self.output,feed_dict = {self._inputs:state.reshape((1,*state.shape))})
            action = self.gamenv.hot_enc_actions[np.argmax(Qs)]

        return action, explore_probability


    
    
    
    
    
    

In [4]:
def update_target_graph():
    
    # Get the parameters of our DQNNetwork
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "DQNetwork")
    
    # Get the parameters of our Target_network
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "TargetNetwork")

    op_holder = []
    
    # Update our target_network parameters with DQNNetwork parameters
    for from_var,to_var in zip(from_vars,to_vars):
        op_holder.append(to_var.assign(from_var))
    return op_holder

In [5]:
class Memory(object):  # stored as ( s, a, r, s_ ) in SumTree
    """
    This SumTree code is modified version and the original code is from:
    https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py
    """
    PER_e = 0.01  # Hyperparameter that we use to avoid some experiences to have 0 probability of being taken
    PER_a = 0.6  # Hyperparameter that we use to make a tradeoff between taking only exp with high priority and sampling randomly
    PER_b = 0.4  # importance-sampling, from initial value increasing to 1
    
    PER_b_increment_per_sampling = 0.001
    
    absolute_error_upper = 1.  # clipped abs error
    
    def __init__(self, capacity):
        self.tree = SumTree(capacity)
        
    def store(self, experience):
        # Find the max priority
        max_priority = np.max(self.tree.tree[-self.tree.capacity:])
        
        # If the max priority = 0 we can't put priority = 0 since this exp will never have a chance to be selected
        # So we use a minimum priority
        if max_priority == 0:
            max_priority = self.absolute_error_upper
        
        self.tree.add(max_priority, experience)   # set the max p for new p
        
    def sample(self, n):
        # Create a sample array that will contains the minibatch
        memory_b = []
        
        b_idx, b_ISWeights = np.empty((n,), dtype=np.int32), np.empty((n, 1), dtype=np.float32)
        
        # Calculate the priority segment
        # Here, as explained in the paper, we divide the Range[0, ptotal] into n ranges
        priority_segment = self.tree.total_priority / n       # priority segment
    
        # Here we increasing the PER_b each time we sample a new minibatch
        self.PER_b = np.min([1., self.PER_b + self.PER_b_increment_per_sampling])  # max = 1
        
        # Calculating the max_weight
        p_min = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.total_priority
        max_weight = (p_min * n) ** (-self.PER_b)
        
        for i in range(n):
            """
            A value is uniformly sample from each range
            """
            a, b = priority_segment * i, priority_segment * (i + 1)
            value = np.random.uniform(a, b)
            
            """
            Experience that correspond to each value is retrieved
            """
            index, priority, data = self.tree.get_leaf(value)
            
            #P(j)
            sampling_probabilities = priority / self.tree.total_priority
            
            #  IS = (1/N * 1/P(i))**b /max wi == (N*P(i))**-b  /max wi
            b_ISWeights[i, 0] = np.power(n * sampling_probabilities, -self.PER_b)/ max_weight
                                   
            b_idx[i]= index
            
            experience = [data]
            
            memory_b.append(experience)
        
        return b_idx, memory_b, b_ISWeights
    
    """
    Update the priorities on the tree
    """
    def batch_update(self, tree_idx, abs_errors):
        abs_errors += self.PER_e  # convert to abs and avoid 0
        clipped_errors = np.minimum(abs_errors, self.absolute_error_upper)
        ps = np.power(clipped_errors, self.PER_a)

        for ti, p in zip(tree_idx, ps):
            self.tree.update(ti, p)
        

In [6]:
def pre_populate_memory(memory, gamenv):
    
    
    state = gamenv.env.reset()
    state = gamenv.stack_frame(state,new_epis = True)
    for i in range(gamenv.hyperparameters['pretrain_length']):
        
        action = gamenv.hot_enc_actions[gamenv.env.action_space.sample()]
        next_state, reward, done, info = gamenv.env.step(np.argmax(action))
        next_state = gamenv.stack_frame(next_state, new_epis = False)
        if done:
            
            next_state = np.zeros(next_state.shape)
            experience = state, action, reward, next_state, done
            memory.store(experience)
            state = gamenv.env.reset()
            state = gamenv.stack_frame(state,new_epis = True)
        else:
            experience = state, action, reward, next_state, done
            memory.store(experience)
            state = next_state
    return memory

In [7]:
# Reset the graph
tf.reset_default_graph()
# Make a new Game Object
spaceinvaders = GameEnv()
# Instantiate the DQNetwork
DQNetwork = DDQNN(spaceinvaders, name="DQNetwork")

# Instantiate the target network
TargetNetwork = DDQNN(spaceinvaders, name="TargetNetwork")

In [8]:
memory = Memory(spaceinvaders.hyperparameters['memory_size'])
memory = pre_populate_memory(memory, spaceinvaders)

In [9]:
# Setup TensorBoard Writer
writer = tf.summary.FileWriter("./tensorboard/dddqn/1")

## Losses
tf.summary.scalar("Loss", DQNetwork.loss)

write_op = tf.summary.merge_all()

In [10]:
spaceinvaders.training = True

In [12]:
# Saver will help us to save our model
saver = tf.train.Saver()
hyperp = spaceinvaders.hyperparameters
max_tau = 10000
rewards_list = []
if spaceinvaders.training == True:
    with tf.Session() as sess:
#         saver.restore(sess, "./models/model.ckpt")
        # Initialize the variables
        sess.run(tf.global_variables_initializer())
        tau = 0 # 
        update_target = update_target_graph()
        sess.run(update_target)
        for episode in range(hyperp['total_episodes']):
            # Set step to 0
            step = 0
            
           
            episode_rewards = []
            
           
            state = spaceinvaders.env.reset()
            state = spaceinvaders.stack_frame(state, True)
            
            while step < hyperp['max_steps']:
                step += 1
                tau += 1
                               
                #Increase decay_step
                DQNetwork.decay_step +=1
                
                # Predict the action to take and take it
                action, explore_probability = DQNetwork.predict_action(state, sess)
                
                #Perform the action and get the next_state, reward, and done information
                next_state, reward, done, _ = spaceinvaders.env.step(np.argmax(action))
                
                if spaceinvaders.render:
                    spaceinvaders.env.render()
                
                # Add the reward to total reward
                episode_rewards.append(reward)
                
                # If the game is finished
                if done:
                    # The episode ends so no next state
                    next_state = np.zeros((110,84), dtype=np.int)
                    
                    next_state = spaceinvaders.stack_frame(next_state, False)

                    # Set step = max_steps to end the episode
                    step = hyperp['max_steps']

                    # Get the total reward of the episode
                    total_reward = np.sum(episode_rewards)

                    print('Episode: {}'.format(episode),
                                  'Total reward: {}'.format(total_reward),
                                  'Explore P: {:.4f}'.format(explore_probability),
                                'Training Loss {:.4f}'.format(loss))

                    rewards_list.append((episode, total_reward))

                    # Store transition <st,at,rt+1,st+1> in memory D
                    experience = state, action, reward, next_state, done
                    memory.store(experience)

                else:
                    # Stack the frame of the next_state
                    next_state = spaceinvaders.stack_frame(next_state, False)
                
                    # Add experience to memory
                    experience = state, action, reward, next_state, done
                    memory.store(experience)

                    # st+1 is now our current state
                    state = next_state
                    

                ### LEARNING PART            
                tree_idx, batch, ISWeights_mb = memory.sample(hyperp['btach_size'])
                
                
                states_mb = np.array([each[0][0] for each in batch], ndmin=3)
                actions_mb = np.array([each[0][1] for each in batch])
                rewards_mb = np.array([each[0][2] for each in batch]) 
                next_states_mb = np.array([each[0][3] for each in batch], ndmin=3)
                dones_mb = np.array([each[0][4] for each in batch])

                target_Qs_batch = []

                # Get Q values for next_state 
                q_next_state = sess.run(DQNetwork.output, feed_dict = {DQNetwork._inputs: next_states_mb})
                q_target_next_state = sess.run(TargetNetwork.output, feed_dict = {TargetNetwork._inputs: next_states_mb})
                # Set Q_target = r if the episode ends at s+1, otherwise set Q_target = r + gamma*maxQ(s', a')
                for i in range(0, len(batch)):
                    terminal = dones_mb[i]
                    action = np.argmax(q_next_state[i])
                    # If we are in a terminal state, only equals reward
                    if terminal:
                        target_Qs_batch.append(rewards_mb[i])
                        
                    else:
                        target = rewards_mb[i] + hyperp['gamma'] * np.max(q_target_next_state[i])
                        target_Qs_batch.append(target)
                        

                targets_mb = np.array([each for each in target_Qs_batch])

                
                _, loss, absolute_errors = sess.run([DQNetwork.optimizer, DQNetwork.loss, DQNetwork.absolute_errors],
                                    feed_dict={DQNetwork._inputs: states_mb,
                                               DQNetwork.target_Q: targets_mb,
                                               DQNetwork._actions: actions_mb,
                                              DQNetwork._ISWeights: ISWeights_mb})
                memory.batch_update(tree_idx, absolute_errors)

                summary = sess.run(write_op, feed_dict={DQNetwork._inputs: states_mb,
                                                   DQNetwork.target_Q: targets_mb,
                                                   DQNetwork._actions: actions_mb,
                                              DQNetwork._ISWeights: ISWeights_mb})
                writer.add_summary(summary, episode)
                writer.flush()
                
                if tau > max_tau:
                    # Update the parameters of our TargetNetwork with DQN_weights
                    update_target = update_target_graph()
                    sess.run(update_target)
                    tau = 0
                    print("Model updated")

            # Save model every 5 episodes
            if episode % 5 == 0:
                save_path = saver.save(sess, "./models/model.ckpt")
                print("Model Saved")

Episode: 0 Total reward: 190.0 Explore P: 0.9899 Training Loss 0.0000
Model Saved
Episode: 1 Total reward: 180.0 Explore P: 0.9831 Training Loss 0.0000
Episode: 2 Total reward: 135.0 Explore P: 0.9759 Training Loss 0.0000
Episode: 3 Total reward: 110.0 Explore P: 0.9708 Training Loss 0.0000


KeyboardInterrupt: 

In [None]:
for batc in batch:
    print(len(batc[0][1]))

In [None]:
batch[1][0]