<a href="https://colab.research.google.com/github/LucasColas/AI-Space-Invaders/blob/master/SI_DQLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install dependencies




In [0]:
# Install gym and pyvirtualdisplay with pip, then the xvfb, python-opengl and
# ffmpeg system packages with apt-get.
# Remove " > /dev/null 2>&1" from a line to see its output.
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

Atari dependencies (needed for Space Invaders)

In [0]:
# Refresh the apt package index and install cmake, then upgrade setuptools and
# install ez_setup and gym's "atari" extra.
# Every command is silenced except the setuptools upgrade, which only merges
# stderr into stdout and therefore still prints its output.
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1

In [0]:
%%bash

# Install the required system packages (xvfb and x11-utils).
apt-get install -y xvfb x11-utils

# Install the required Python packages at pinned versions (additional gym extras
# might be needed depending on the environment).
# NOTE(review): this pins gym to 0.17.* although the earlier cells installed gym
# unpinned - confirm which version the rest of the notebook is meant to run on.
pip install gym[box2d]==0.17.* pyvirtualdisplay==0.2.* PyOpenGL==3.1.* PyOpenGL-accelerate==3.1.*

# Imports and Helper functions


In [0]:
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only
import tensorflow as tf
import datetime
from tensorflow import keras
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import random
import math
import glob
import io
import base64
from IPython.display import HTML

from skimage import transform
from skimage.color import rgb2gray
from collections import deque

from IPython import display as ipythondisplay

In [0]:
# Create a 1400x900 pyvirtualdisplay Display with visible=0 and start it.
from pyvirtualdisplay import Display

display = Display(size=(1400, 900), visible=0)
display.start()

In [0]:
"""
Utility functions to enable video recording of gym environment and displaying it
To enable video, just do "env = wrap_env(env)"
"""

def show_video():
  """Display the first recorded video found in ``video/`` inside the notebook.

  Searches for ``video/*.mp4``. When at least one file matches, the first match
  is read, base64-encoded and embedded in an HTML ``<video>`` element that is
  rendered with IPython's display machinery. When nothing matches, the message
  "Could not find video" is printed instead.
  """
  mp4list = glob.glob('video/*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    # Read-only binary mode is all that is needed here, and the context manager
    # guarantees the file handle is closed once the bytes are in memory.
    with io.open(mp4, 'rb') as video_file:
      video = video_file.read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")
    

def wrap_env(env):
  """Return `env` wrapped in a gym `Monitor` that writes to ./video (force=True)."""
  return Monitor(env, './video', force=True)

# Space Invaders !


In [0]:
# Build the "SpaceInvaders-v0" gym environment and pass it through wrap_env.
env = wrap_env(gym.make("SpaceInvaders-v0")) 

In [0]:

# Show the environment's action space and observation (frame) space.
print("action space : ", env.action_space)
print("frame size : ",env.observation_space)

# One row per action: row i is the one-hot encoding of action i.
possible_actions = np.eye(env.action_space.n, dtype=int)
type(possible_actions)


In [0]:
# Play one full episode with randomly sampled actions, rendering every step,
# then close the environment and show the video.
observation = env.reset()
done = False

while not done:
    env.render()

    # your agent goes here
    action = env.action_space.sample()

    observation, reward, done, info = env.step(action)

env.close()
show_video()

In [0]:
def pre_processing(frame):
  """Turn a raw RGB game frame into a cropped, resized grayscale image.

  Args:
    frame: RGB frame from the environment (height x width x 3). The
      environment's integer (uint8) frames are the expected input.

  Returns:
    2-D float array of shape (110, 84).
  """
  # rgb2gray converts integer images to floats in [0, 1] on its own, so the
  # result is already normalised. Dividing by 255 again would squash every
  # pixel into [0, 1/255] and leave the network with almost no signal.
  frame_grey = rgb2gray(frame)

  # Drop 8 rows at the top, 12 at the bottom, 4 columns on the left and 12 on
  # the right before resizing.
  crop_frame = frame_grey[8:-12, 4:-12]

  preprocessed_frame = transform.resize(crop_frame, [110,84])

  return preprocessed_frame

In [0]:
# Number of consecutive frames that make up one state.
stack_size = 4

# Rolling buffer holding the most recent `stack_size` frames, pre-filled with
# blank 110x84 frames. `np.int` was removed in NumPy 1.24; the builtin `int` is
# the dtype it used to alias. `maxlen` follows `stack_size` instead of a
# separate hard-coded 4.
stacked_frames = deque([np.zeros((110, 84), dtype=int) for _ in range(stack_size)], maxlen=stack_size)

def stack_frames(stacked_frames, state, is_new_episode):
  """Pre-process a raw frame and merge it into the rolling frame stack.

  Args:
    stacked_frames: deque with the most recent pre-processed frames.
    state: raw frame to pre-process and add.
    is_new_episode: when True the buffer is restarted and filled with copies of
      this frame; otherwise the frame is appended and the oldest one drops out.

  Returns:
    (stacked_state, stacked_frames): the frames stacked along a new last axis,
    and the updated deque.
  """
  frame = pre_processing(state)

  if is_new_episode:
    # A new episode has no history yet, so the first frame is repeated to fill
    # the whole stack.
    stacked_frames = deque([frame for _ in range(stack_size)], maxlen=stack_size)
  else:
    stacked_frames.append(frame)

  # Stack the buffered frames (the deque, not this function) along axis 2.
  stacked_state = np.stack(stacked_frames, axis=2)

  return stacked_state, stacked_frames


In [0]:
# HYPERPARAMETERS

# --- Model ---
state_size = [110, 84, 4]           # network input: 110x84 frames, 4 stacked per state
action_size = env.action_space.n    # number of discrete actions in the environment
Alpha = 0.0003                      # learning rate passed to the network

# --- Training ---
total_episodes = 100
max_steps = 60000
batch_size = 64

# --- Q-learning ---
Gamma = 0.9                         # presumably the discount factor for future rewards

# --- Exploration (presumably the epsilon-greedy schedule: start, decay rate, floor) ---
Epsilon_starte = 1.0
Epsilon_decay = 0.01
Epsilon_min = 0.001

# --- Replay memory ---
pretrain_length = batch_size        # transitions collected before training starts
memory_size = 1000000               # maximum number of stored transitions

# --- Frame stacking ---
stack_size = 4

# --- Run switches ---
training = False                    # the training cell only runs when this is True
rendering = True                    # render the environment during training


In [0]:
class DQNetwork:
    """Convolutional Deep Q-Network built as a TF1-style (graph mode) model.

    Three ELU-activated convolutions feed a 512-unit dense layer and a linear
    output layer with one Q-value per action.

    Attributes:
        inputs_: float32 placeholder of shape [None, *state_size].
        actions_: float32 placeholder of shape [None, action_size]; used as a
            mask over `output`, so it is expected to hold one-hot actions.
        target_Q: float32 placeholder of shape [None] with the training targets.
        output: Q-values for every action, shape [None, action_size].
        Q: Q-value of the action selected by `actions_`, shape [None].
        loss: mean squared difference between `target_Q` and `Q`.
        optimizer: training op performing one Adam step on `loss`.
    """
    def __init__(self, state_size, action_size, learning_rate, name='DQNetwork'):
        """Build the graph inside the variable scope `name`.

        Args:
            state_size: shape of one state without the batch dimension.
            action_size: number of discrete actions.
            learning_rate: learning rate of the Adam optimizer.
            name: variable scope under which all ops and variables are created.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        # Placeholders and sessions only exist in graph mode.
        tf.compat.v1.disable_eager_execution()
        
        with tf.compat.v1.variable_scope(name):
            # We create the placeholders.
            # *state_size unpacks the elements of state_size, so with the
            # notebook's state_size this is the same as [None, 110, 84, 4].
            self.inputs_ = tf.compat.v1.placeholder(tf.float32, [None, *state_size], name="inputs")
            self.actions_ = tf.compat.v1.placeholder(tf.float32, [None, self.action_size], name="actions_")
            
            # Remember that target_Q is R(s,a) + gamma * max Qhat(s', a')
            self.target_Q = tf.compat.v1.placeholder(tf.float32, [None], name="target")
            
            # First convnet: CNN + ELU.
            # With a 110x84x4 input the output is 26x20x32.
            self.conv1 = tf.compat.v1.layers.conv2d(inputs = self.inputs_,
                                         filters = 32,
                                         kernel_size = [8,8],
                                         strides = [4,4],
                                         padding = "VALID",
                                          kernel_initializer=tf.keras.initializers.GlorotNormal(),
                                         name = "conv1")
            
            self.conv1_out = tf.nn.elu(self.conv1, name="conv1_out")
            
            # Second convnet: CNN + ELU (26x20x32 -> 12x9x64).
            self.conv2 = tf.compat.v1.layers.conv2d(inputs = self.conv1_out,
                                 filters = 64,
                                 kernel_size = [4,4],
                                 strides = [2,2],
                                 padding = "VALID",
                                kernel_initializer=tf.keras.initializers.GlorotNormal(),
                                 name = "conv2")

            self.conv2_out = tf.nn.elu(self.conv2, name="conv2_out")            
            
            # Third convnet: CNN + ELU (12x9x64 -> 5x4x64).
            self.conv3 = tf.compat.v1.layers.conv2d(inputs = self.conv2_out,
                                 filters = 64,
                                 kernel_size = [3,3],
                                 strides = [2,2],
                                 padding = "VALID",
                                kernel_initializer=tf.keras.initializers.GlorotNormal(),
                                 name = "conv3")

            self.conv3_out = tf.nn.elu(self.conv3, name="conv3_out")
            
            self.flatten = tf.compat.v1.layers.flatten(self.conv3_out)
            
            self.fc = tf.compat.v1.layers.dense(inputs = self.flatten,
                                  units = 512,
                                  activation = tf.nn.elu,
                                       kernel_initializer=tf.keras.initializers.GlorotNormal(),
                                name="fc1")
            
            # Linear output layer: one Q-value per action.
            self.output = tf.compat.v1.layers.dense(inputs = self.fc, 
                                           kernel_initializer=tf.keras.initializers.GlorotNormal(),
                                          units = self.action_size, 
                                        activation=None)
            
            # Q is our predicted Q value for the action that was taken.
            # Summing over axis 1 keeps one value per sample; without the axis
            # the whole batch collapses into a single scalar and every target
            # is compared against the same number.
            self.Q = tf.reduce_sum(tf.multiply(self.output, self.actions_), axis=1)
            
            # The loss is the difference between our predicted Q_values and the Q_target
            # mean((Qtarget - Q)^2)
            self.loss = tf.reduce_mean(tf.square(self.target_Q - self.Q))
            
            # `.minimize(loss)` is what turns the optimizer into a training op
            # that `sess.run` can execute; a bare optimizer object cannot be
            # fetched from a session.
            self.optimizer = tf.compat.v1.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

In [0]:
# Start from an empty default graph before building the network.
tf.compat.v1.reset_default_graph()

# NOTE: this rebinds the name `DQNetwork` from the class to the instance, so
# this cell cannot be run a second time without re-running the class cell.
DQNetwork = DQNetwork(state_size=state_size, action_size=action_size, learning_rate=Alpha)


In [0]:
class Memory():
  """Bounded experience store backed by a deque.

  Once `max_size` items are held, adding another one discards the oldest.
  """

  def __init__(self, max_size):
    """Create an empty buffer that keeps at most `max_size` experiences."""
    self.buffer = deque(maxlen=max_size)

  def add(self, experience):
    """Append one experience to the buffer."""
    self.buffer.append(experience)

  def sample(self, batch_size):
    """Return `batch_size` distinct experiences picked at random.

    Sampling is without replacement, so numpy raises ValueError when
    `batch_size` is larger than the number of stored experiences.
    """
    picked = np.random.choice(len(self.buffer), size=batch_size, replace=False)
    return [self.buffer[position] for position in picked]


In [0]:
# Replay memory holding at most `memory_size` experiences.
memory = Memory(memory_size)

In [0]:
# Pre-fill the replay memory with `pretrain_length` random-action transitions
# so that a first batch can be sampled.
for i in range(pretrain_length):

  if i == 0:
    # Very first step: start an episode and build the initial frame stack.
    state = env.reset()
    state, stacked_frames = stack_frames(stacked_frames, state, True)

  # Pick a random action. The environment is stepped with the integer index,
  # while the memory stores the matching one-hot row, which is the format the
  # network's `actions_` placeholder expects.
  # (Indexing possible_actions[choice][choice] always yields 1.)
  choice = random.randint(0, len(possible_actions) - 1)
  action = possible_actions[choice]

  next_state, reward, done, info = env.step(choice)

  if done:
    # The episode is over: the next state is an all-zero stack.
    next_state = np.zeros(state.shape)

    memory.add((state, action, reward, next_state, done))

    # Start a new episode.
    state = env.reset()
    state, stacked_frames = stack_frames(stacked_frames, state, True)

  else:
    # Add the new frame to the running stack. Stacking `next_state` here keeps
    # both sides of the stored transition in the same stacked format.
    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

    memory.add((state, action, reward, next_state, done))

    state = next_state


In [0]:
def choose_action(explore_start, explore_stop, decay_rate, decat_step, state, actions):
  """Pick an action with an exponentially decaying epsilon-greedy strategy.

  Args:
    explore_start: exploration probability at step 0.
    explore_stop: value the exploration probability decays towards.
    decay_rate: exponential decay rate of the exploration probability.
    decat_step: number of decay steps taken so far. The misspelled name is
      kept so that existing keyword callers keep working.
    state: current stacked state; only used when exploiting.
    actions: sequence of one-hot action vectors, one row per action.

  Returns:
    (action, explore_probability): the chosen row of `actions` and the
    exploration probability that was used for this decision.
  """
  explo = np.random.rand()

  # Use the step that was passed in rather than whatever global happens to be
  # called `decay_step`.
  explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decat_step)

  if explore_probability > explo:
    # Explore: uniformly random action.
    choice = random.randint(0, len(actions) - 1)

  else:
    # Exploit: action with the highest predicted Q-value. `sess` and
    # `DQNetwork` are looked up as globals at call time.
    Qs = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: state.reshape((1, *state.shape))})

    choice = int(np.argmax(Qs))

  # Return the whole one-hot row. Indexing actions[choice][choice] would always
  # give 1, whatever action was chosen.
  action = actions[choice]

  return action, explore_probability

In [0]:
# Saver used to checkpoint the model during training.
saver = tf.compat.v1.train.Saver()

if training == True:
  # TensorBoard logging of the training loss.
  writer = tf.compat.v1.summary.FileWriter("./tensorboard/dqn/1")
  tf.compat.v1.summary.scalar("Loss", DQNetwork.loss)
  write_op = tf.compat.v1.summary.merge_all()

  # Make sure the checkpoint directory exists before the first save.
  tf.io.gfile.makedirs("./models")

  # (episode, total reward) for every finished episode.
  rewards_list = []

  with tf.compat.v1.Session() as sess:
    sess.run(tf.compat.v1.global_variables_initializer())

    # Global step counter driving the exploration decay.
    decay_step = 0

    # Printed in the episode summary; NaN until the first update has run.
    loss = float("nan")

    for episode in range(total_episodes):
      step = 0

      episode_rewards = []

      state = env.reset()

      # A new episode starts with a stack made of the first frame only.
      state, stacked_frames = stack_frames(stacked_frames, state, True)

      while step < max_steps:

        step += 1

        decay_step += 1

        action, explore_probability = choose_action(Epsilon_starte, Epsilon_min, Epsilon_decay, decay_step, state, possible_actions)

        # Accept either an action index or a one-hot vector from choose_action:
        # the environment is stepped with the index, the memory stores the
        # one-hot row expected by the network's `actions_` placeholder.
        action_index = int(np.argmax(action)) if np.ndim(action) > 0 else int(action)
        action = possible_actions[action_index]

        next_state, reward, done, info = env.step(action_index)

        if rendering:
          env.render()

        episode_rewards.append(reward)

        if done:
          # The episode is over: push a blank frame onto the stack.
          next_state = np.zeros_like(next_state)

          next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

          # Setting step to max_steps ends the episode loop.
          step = max_steps

          total_reward = np.sum(episode_rewards)

          print('Episode: {}'.format(episode),
                'Total reward: {}'.format(total_reward),
                'Explore P: {:.4f}'.format(explore_probability),
                'Training Loss {:.4f}'.format(loss))
          
          rewards_list.append((episode, total_reward))

          memory.add((state, action, reward, next_state, done))
        else:

          next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

          memory.add((state, action, reward, next_state, done))

          state = next_state

        # ---- Learning step: sample a random mini-batch from memory ----
        batch = memory.sample(batch_size)

        states_mb = np.array([each[0] for each in batch], ndmin=3)

        actions_mb = np.array([each[1] for each in batch])
        rewards_mb = np.array([each[2] for each in batch]) 
        next_states_mb = np.array([each[3] for each in batch], ndmin=3)
        dones_mb = np.array([each[4] for each in batch])

        # Q-values of the next states, used to build the targets.
        Qs_next_state = sess.run(DQNetwork.output, feed_dict={DQNetwork.inputs_: next_states_mb})

        # Target is r for terminal transitions and r + Gamma * max_a' Q(s', a')
        # otherwise.
        targets_mb = np.where(dones_mb,
                              rewards_mb,
                              rewards_mb + Gamma * np.max(Qs_next_state, axis=1))

        # NOTE: DQNetwork.optimizer has to be a graph training op (the result of
        # `.minimize(loss)`) for sess.run to accept it.
        loss, _ = sess.run([DQNetwork.loss, DQNetwork.optimizer],feed_dict={DQNetwork.inputs_: states_mb,
                                                                            DQNetwork.target_Q: targets_mb,
                                                                            DQNetwork.actions_: actions_mb})

        # Write the TensorBoard summaries for this step.
        summary = sess.run(write_op, feed_dict={DQNetwork.inputs_: states_mb,
                                                DQNetwork.target_Q: targets_mb,
                                                DQNetwork.actions_: actions_mb})
        writer.add_summary(summary, episode)
        writer.flush()

      # Save model every 5 episodes
      if episode % 5 == 0:
        save_path = saver.save(sess, "./models/model.ckpt")
        print("Model Saved")

  writer.close()


