<a href="https://colab.research.google.com/github/HemaZ/Deep-Reinforcement-Learning/blob/master/DQN_SpaceInvaders.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
## Let's install all the requirements and prepare the Atari Environment 

! wget http://www.atarimania.com/roms/Roms.rar && unrar x Roms.rar && unzip Roms/ROMS.zip
! pip3 install gym-retro
! python3 -m retro.import ROMS/

--2019-01-05 12:58:41--  http://www.atarimania.com/roms/Roms.rar
Resolving www.atarimania.com (www.atarimania.com)... 195.154.81.199
Connecting to www.atarimania.com (www.atarimania.com)|195.154.81.199|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10823448 (10M) [application/x-rar-compressed]
Saving to: ‘Roms.rar.3’


2019-01-05 12:59:05 (445 KB/s) - ‘Roms.rar.3’ saved [10823448/10823448]


UNRAR 5.50 freeware      Copyright (c) 1993-2017 Alexander Roshal


Extracting from Roms.rar


Would you like to replace the existing file Roms/HC ROMS.zip
11729845 bytes, modified on 2018-12-24 16:58
with a new one
11729845 bytes, modified on 2018-12-24 16:58

[Y]es, [N]o, [A]ll, n[E]ver, [R]ename, [Q]uit E

All OK
Archive:  Roms/ROMS.zip
replace ROMS/128 in 1 Game Select ROM (128 in 1) (Unknown) ~.bin? [y]es, [n]o, [A]ll, [N]one, [r]ename: N
Importing 1904 potential games...
Importing Freeway-Atari2600
Importing Pitfall-Atari2600
Importing NameThisGame-Atari2600
Importin

In [1]:
import tensorflow as tf
import numpy as np
import retro                 # Retro Environment
import random
from skimage import transform # Help us to preprocess the frames
from skimage.color import rgb2gray # Help us to gray our frames

import matplotlib.pyplot as plt # Display graphs

from collections import deque # Ordered collection with ends



In [2]:
class GameEnv:
  """
  This Class creates Atari Game Enivroment and provides some preprocessing functions.
  """
  
  def __init__(self, game = 'SpaceInvaders-Atari2600'):
    self.env = retro.make(game)
    self.n_actions = self.env.action_space.n
    self.frame_size = self.env.observation_space.shape
    self.hot_enc_actions = np.array(np.identity(self.n_actions).tolist()) 
    self.stack_size = 4
    self.stacked_frames = deque([np.zeros((110,84), dtype=np.int) for i in range(self.stack_size)], maxlen=self.stack_size)
    self.hyperparameters = {
                           'learning_rate' : 0.00025,
                           'total_episodes' : 50,
                           'max_steps' : 50000,
                           'btach_size': 64,
                           'explore_start' : 1,
                           'explore_end' : 0.01,
                           'decay_rate' : 0.00001,
                           'gamma' : 0.9,
                           'pretrain_length' : 64,
                           'memory_size' : 1000000,
                           'state_size' : [110, 84, 4]
                           }
    self.training = False
    self.render = False
    
    
    
    
  def _preprocess_frame(self,frame):
    gray_frame = rgb2gray(frame)
    cropped_frame = gray_frame[8:-12,4:-12]
    
    # Normalize Pixel Values
    normalized_frame = cropped_frame/255.0
    
    # Resize
    # Thanks to Mikołaj Walkowiak
    preprocessed_frame = transform.resize(normalized_frame, [110,84])
    
    return preprocessed_frame # 110x84x1 frame
  
  def stack_frame(self, state, new_epis = False):
    
    frame = self._preprocess_frame(state)
    
    if new_epis:
      self.stacked_frames  =  deque([frame for _ in range(self.stack_size)], maxlen=self.stack_size)
    else:
      self.stacked_frames.append(frame)
    
    self.stacked_state = np.stack(self.stacked_frames, axis=2)
    return self.stacked_state  
    
    

In [3]:
class DeepQNN:
  
  def __init__(self, gamenv):
    self.gamenv = gamenv
    self.decay_step = 0
    with tf.variable_scope('DQNN'):
      self._inputs = tf.placeholder(tf.float32, [None, *self.gamenv.hyperparameters['state_size']], name='inputs')
      self._actions = tf.placeholder(tf.float32, [None, self.gamenv.n_actions], name='actions')
      self.target_Q = tf.placeholder(tf.float32, [None], name="target")
      
      self.conv1 = tf.layers.conv2d(inputs = self._inputs, 
                                    filters = 32,
                                    kernel_size = [8,8],
                                    strides = [4,4],
                                    padding = 'VALID',
                                    kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                    name = 'Conv1')
      self.actvf1 = tf.nn.elu(self.conv1, name='Elu1')
      
      self.conv2 = tf.layers.conv2d(inputs = self.conv1, 
                                    filters = 64,
                                    kernel_size = [4,4],
                                    strides = [2,2],
                                    padding = 'VALID',
                                    kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                    name = 'Conv2')
      self.actvf2 = tf.nn.elu(self.conv2, name='Elu2')
      
      self.conv3 = tf.layers.conv2d(inputs = self.conv2, 
                                    filters = 64,
                                    kernel_size = [3,3],
                                    strides = [2,2],
                                    padding = 'VALID',
                                    kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                    name = 'Conv3')
      self.actvf3 = tf.nn.elu(self.conv3, name='Elu3')
      
      self.flatten = tf.contrib.layers.flatten(self.actvf3)
      self.fc = tf.layers.dense(inputs = self.flatten,
                                units = 512,
                                activation = tf.nn.elu,
                                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                name="fc1")
            
      self.output = tf.layers.dense(inputs = self.fc, 
                                   kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                   units = self.gamenv.n_actions, 
                                   activation=None)
      self.Q = tf.reduce_sum(tf.multiply(self.output, self._actions))
            
            # The loss is the difference between our predicted Q_values and the Q_target
            # Sum(Qtarget - Q)^2
      self.loss = tf.reduce_mean(tf.square(self.target_Q - self.Q))
            
      self.optimizer = tf.train.AdamOptimizer(self.gamenv.hyperparameters['learning_rate']).minimize(self.loss)
      
      
  def predict_action(self, state, sess):
    hyperp = self.gamenv.hyperparameters
    explore_probability = hyperp['explore_end'] + (hyperp['explore_start'] - hyperp['explore_end']) * np.exp(-hyperp['decay_rate'] * self.decay_step)
    
    if explore_probability > np.random.rand():
      action = self.gamenv.hot_enc_actions[random.randint(0,self.gamenv.n_actions-1)]
      
    else:
      Qs = sess.run(self.output,feed_dict = {self._inputs:state.reshape((1,*state.shape))})
      action = self.gamenv.hot_enc_actions[np.argmax(Qs)]
      
    return action, explore_probability
    
    
    
    
    
    
    
    

In [4]:
class Memory:
  def __init__(self, max_size):
    self.buffer = deque(maxlen = max_size)
    
  def add(self, experience):
    self.buffer.append(experience)
    
  def sample(self, batch_size):
    buffer_len = len(self.buffer)
    index = np.random.choice(np.arange(buffer_len), size = batch_size, replace = False)
    return [self.buffer[i] for i in index]

In [5]:
def pre_populate_memory(memory, gamenv):
  state = gamenv.env.reset()
  state = gamenv.stack_frame(state,new_epis = True)
  for i in range(gamenv.hyperparameters['btach_size']):
    action = gamenv.hot_enc_actions[random.randint(0,gamenv.n_actions-1)]
    next_state, reward, done, info = gamenv.env.step(action)
    next_state = gamenv.stack_frame(next_state, new_epis = False)
    if done:
      next_state = np.zeros(next_state.shape)
      memory.add((state, action, reward, next_state, done))
      state = gamenv.env.reset()
      state = gamenv.stack_frame(state,new_epis = True)
    else:
      memory.add((state, action, reward, next_state, done))
      state = next_state
  return memory
    

In [6]:
tf.reset_default_graph()
spaceinvaders = GameEnv()
# spaceinvaders.training = True
dqnn = DeepQNN(spaceinvaders)
memory = Memory(spaceinvaders.hyperparameters['memory_size'])


In [7]:
memory = pre_populate_memory(memory, spaceinvaders)

  warn("The default mode, 'constant', will be changed to 'reflect' in "
  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


In [8]:
# Setup TensorBoard Writer
writer = tf.summary.FileWriter("./tensorboard/dqn/1")

## Losses
tf.summary.scalar("Loss", dqnn.loss)

write_op = tf.summary.merge_all()

In [18]:
sys.getsizeof(memory)

56

In [0]:
# Saver will help us to save our model
saver = tf.train.Saver()
hyperp = spaceinvaders.hyperparameters
rewards_list = []
if spaceinvaders.training == True:
    with tf.Session() as sess:
        saver.restore(sess, "./models/model.ckpt")
        # Initialize the variables
        sess.run(tf.global_variables_initializer())

        # Initialize the decay rate (that will use to reduce epsilon) 
#         decay_step = 0
        
        for episode in range(hyperp['total_episodes']):
            # Set step to 0
            step = 0
            
            # Initialize the rewards of the episode
            episode_rewards = []
            
            # Make a new episode and observe the first state
            state = spaceinvaders.env.reset()
            
            # Remember that stack frame function also call our preprocess function.
            state = spaceinvaders.stack_frame(state, True)
            
            while step < hyperp['max_steps']:
                step += 1
                
                #Increase decay_step
                dqnn.decay_step +=1
                
                # Predict the action to take and take it
                action, explore_probability = dqnn.predict_action(state, sess)
                
                #Perform the action and get the next_state, reward, and done information
                next_state, reward, done, _ = spaceinvaders.env.step(action)
                
                if spaceinvaders.render:
                    env.render()
                
                # Add the reward to total reward
                episode_rewards.append(reward)
                
                # If the game is finished
                if done:
                    # The episode ends so no next state
                    next_state = np.zeros((110,84), dtype=np.int)
                    
                    next_state = spaceinvaders.stack_frame(next_state, False)

                    # Set step = max_steps to end the episode
                    step = hyperp['max_steps']

                    # Get the total reward of the episode
                    total_reward = np.sum(episode_rewards)

                    print('Episode: {}'.format(episode),
                                  'Total reward: {}'.format(total_reward),
                                  'Explore P: {:.4f}'.format(explore_probability),
                                'Training Loss {:.4f}'.format(loss))

                    rewards_list.append((episode, total_reward))

                    # Store transition <st,at,rt+1,st+1> in memory D
                    memory.add((state, action, reward, next_state, done))

                else:
                    # Stack the frame of the next_state
                    next_state = spaceinvaders.stack_frame(next_state, False)
                
                    # Add experience to memory
                    memory.add((state, action, reward, next_state, done))

                    # st+1 is now our current state
                    state = next_state
                    

                ### LEARNING PART            
                # Obtain random mini-batch from memory
                batch = memory.sample(hyperp['btach_size'])
                states_mb = np.array([each[0] for each in batch], ndmin=3)
                actions_mb = np.array([each[1] for each in batch])
                rewards_mb = np.array([each[2] for each in batch]) 
                next_states_mb = np.array([each[3] for each in batch], ndmin=3)
                dones_mb = np.array([each[4] for each in batch])

                target_Qs_batch = []

                # Get Q values for next_state 
                Qs_next_state = sess.run(dqnn.output, feed_dict = {dqnn._inputs: next_states_mb})
                
                # Set Q_target = r if the episode ends at s+1, otherwise set Q_target = r + gamma*maxQ(s', a')
                for i in range(0, len(batch)):
                    terminal = dones_mb[i]

                    # If we are in a terminal state, only equals reward
                    if terminal:
                        target_Qs_batch.append(rewards_mb[i])
                        
                    else:
                        target = rewards_mb[i] + hyperp['gamma'] * np.max(Qs_next_state[i])
                        target_Qs_batch.append(target)
                        

                targets_mb = np.array([each for each in target_Qs_batch])

                loss, _ = sess.run([dqnn.loss, dqnn.optimizer],
                                        feed_dict={dqnn._inputs: states_mb,
                                                   dqnn.target_Q: targets_mb,
                                                   dqnn._actions: actions_mb})

                # Write TF Summaries
                summary = sess.run(write_op, feed_dict={dqnn._inputs: states_mb,
                                                       dqnn.target_Q: targets_mb,
                                                       dqnn._actions: actions_mb})
                writer.add_summary(summary, episode)
                writer.flush()

            # Save model every 5 episodes
            if episode % 5 == 0:
                save_path = saver.save(sess, "./models/model.ckpt")
                print("Model Saved")

In [9]:
saver = tf.train.Saver()
with tf.Session() as sess:
    total_test_rewards = []
    
    # Load the model
    saver.restore(sess, "../models/model.ckpt")
    
    for episode in range(1):
        total_rewards = 0
        
        state = spaceinvaders.env.reset()
        state = spaceinvaders.stack_frame(state, True)
        
        print("****************************************************")
        print("EPISODE ", episode)
        
        while True:
            # Reshape the state
            state = state.reshape((1, *spaceinvaders.hyperparameters['state_size']))
            # Get action from Q-network 
            # Estimate the Qs values state
            Qs = sess.run(dqnn.output, feed_dict = {dqnn._inputs: state})
            
            # Take the biggest Q value (= the best action)
            choice = np.argmax(Qs)
            action = spaceinvaders.hot_enc_actions[choice]
            
            #Perform the action and get the next_state, reward, and done information
            next_state, reward, done, _ = spaceinvaders.env.step(action)
            spaceinvaders.env.render()
            
            total_rewards += reward

            if done:
                print ("Score", total_rewards)
                total_test_rewards.append(total_rewards)
                break
                
                
            next_state = spaceinvaders.stack_frame(next_state, False)
            state = next_state
            
    spaceinvaders.env.close()


INFO:tensorflow:Restoring parameters from ../models/model.ckpt


NotFoundError: Restoring from checkpoint failed. This is most likely due to a Variable name or other graph key that is missing from the checkpoint. Please ensure that you have not altered the graph expected based on the checkpoint. Original error:

Key DQNN/Conv1/bias not found in checkpoint
	 [[{{node save/RestoreV2}} = RestoreV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, ..., DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_save/Const_0_0, save/RestoreV2/tensor_names, save/RestoreV2/shape_and_slices)]]

Caused by op 'save/RestoreV2', defined at:
  File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.6/dist-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelapp.py", line 486, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.6/dist-packages/tornado/platform/asyncio.py", line 127, in start
    self.asyncio_loop.run_forever()
  File "/usr/lib/python3.6/asyncio/base_events.py", line 427, in run_forever
    self._run_once()
  File "/usr/lib/python3.6/asyncio/base_events.py", line 1440, in _run_once
    handle._run()
  File "/usr/lib/python3.6/asyncio/events.py", line 145, in _run
    self._callback(*self._args)
  File "/usr/local/lib/python3.6/dist-packages/tornado/platform/asyncio.py", line 117, in _handle_events
    handler_func(fileobj, events)
  File "/usr/local/lib/python3.6/dist-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/usr/local/lib/python3.6/dist-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/usr/local/lib/python3.6/dist-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2662, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2785, in _run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2903, in run_ast_nodes
    if self.run_code(code, result):
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-9-2f8ae1760a39>", line 1, in <module>
    saver = tf.train.Saver()
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py", line 1094, in __init__
    self.build()
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py", line 1106, in build
    self._build(self._filename, build_save=True, build_restore=True)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py", line 1143, in _build
    build_save=build_save, build_restore=build_restore)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py", line 787, in _build_internal
    restore_sequentially, reshape)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py", line 406, in _AddRestoreOps
    restore_sequentially)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py", line 854, in bulk_restore
    return io_ops.restore_v2(filename_tensor, names, slices, dtypes)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gen_io_ops.py", line 1466, in restore_v2
    shape_and_slices=shape_and_slices, dtypes=dtypes, name=name)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 3263, in create_op
    op_def=op_def)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 1751, in __init__
    self._traceback = tf_stack.extract_stack()

NotFoundError (see above for traceback): Restoring from checkpoint failed. This is most likely due to a Variable name or other graph key that is missing from the checkpoint. Please ensure that you have not altered the graph expected based on the checkpoint. Original error:

Key DQNN/Conv1/bias not found in checkpoint
	 [[{{node save/RestoreV2}} = RestoreV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, ..., DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_save/Const_0_0, save/RestoreV2/tensor_names, save/RestoreV2/shape_and_slices)]]
