<a href="https://colab.research.google.com/github/KCL-Machine-Learning/dqn_atari/blob/main/Playing_Atari_Games_with_Neural_Networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title
import urllib.request
urllib.request.urlretrieve('http://www.atarimania.com/roms/Roms.rar','Roms.rar')
!pip install unrar
!unrar x Roms.rar
!mkdir rars
!mv HC\ ROMS.zip   rars
!mv ROMS.zip  rars
!python -m atari_py.import_roms rars

# Playing Atari Games with Neural Networks

### Papers: 
- [Playing Atari with Deep Reinforcement Learning](https://arxiv.org/pdf/1312.5602.pdf)
- [Human-level control through deep reinforcement learning](https://www.nature.com/articles/nature14236.pdf)

In [None]:
# Mount Google Drive at /content/drive (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Work inside a Drive folder: the checkpoint and replay-buffer files written
# later in this notebook use relative paths.
# NOTE(review): the DQN folder presumably has to exist in Drive already — confirm.
%cd /content/drive/My\ Drive/DQN

In [None]:
import gym
import numpy as np
import tensorflow as tf

In [None]:
import random

from matplotlib import pyplot as plt
from collections import deque, namedtuple

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display

def show_state(env, step=0, info=""):
    """Draw the environment's current frame inline, replacing the previous output.

    Arguments:
        env: Environment whose ``render(mode='rgb_array')`` returns an image
        step: Unused; kept so existing callers keep working
        info: Unused; kept so existing callers keep working
    """
    # Always reuse figure number 3 and wipe it before drawing the new frame.
    figure = plt.figure(3)
    figure.clf()
    axes = figure.gca()
    axes.imshow(env.render(mode='rgb_array'))
    axes.axis('off')

    # wait=True postpones clearing until the replacement output is available.
    display.clear_output(wait=True)
    display.display(figure)

In [None]:

# Smoke test: play one episode (at most 200 steps) with uniformly random
# actions, rendering every frame and printing each step's reward/done/info.
# The globals `env` and `observation` left behind here are read by later cells.
env = gym.make('Breakout-v0')
for i_episode in range(1):
    observation = env.reset()
    for t in range(200):
        show_state(env)
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        print(reward, done, info)
        
        if done:
            print("Episode finished after {} timesteps".format(t+1))
            break
env.close()

In [None]:
# Inspect the last raw observation from the rollout and its maximum value.
observation, observation.max()

In [None]:
# Number of discrete actions the Q-network scores. Valid action ids run from
# 0 to env.action_space.n - 1 and are passed to env.step() unchanged, so the
# output layer needs n units; sizing it with n - 1 made the highest-numbered
# action unselectable.
# NOTE: checkpoints saved with the old n - 1 output layer will not load.
ACTION_SIZE = env.action_space.n
STACK = 4                      # consecutive frames stacked into one state
INPUT_DIM = (64, 64, STACK)    # (height, width, stacked frames) of a network input

In [None]:
# Display the number of actions the agent will choose from.
ACTION_SIZE

In [None]:
# Preprocessing preview on the last raw observation: divide by 255, keep only
# the luminance (Y) channel of the YUV conversion and resize to the network's
# input resolution.
processed_obs = tf.cast(observation, tf.float32)
processed_obs /= 255.0
processed_obs = tf.image.rgb_to_yuv(processed_obs)[:, :, :1]
# Use INPUT_DIM rather than a second hard-coded (64, 64) so the preview cannot
# drift away from what the training loop feeds the network.
processed_obs = tf.image.resize(processed_obs, INPUT_DIM[:2])
plt.imshow(processed_obs[..., 0])
processed_obs, processed_obs.numpy().max()

In [None]:
# Draw one random action id from the environment's action space.
env.action_space.sample()

In [None]:
# Ask the underlying (unwrapped) environment for the meaning of each action id.
env.unwrapped.get_action_meanings()

In [None]:
# Training hyperparameters.
# NOTE(review): ReplayBuffer pre-allocates two float32 arrays of shape
# (BUFFER_SIZE, 64, 64, 1); at 1e6 entries each is roughly 16 GB once filled,
# and ReplayBuffer.save() writes both to disk — confirm the runtime can hold
# this or lower BUFFER_SIZE.
BUFFER_SIZE = int(1e6)         # replay buffer size (number of transitions)
BATCH_SIZE = 32                # minibatch size
GAMMA = 0.99                   # discount factor
PARAM_UPDATE_EVERY = 1         # run a gradient step every this many agent steps
TARGET_UPDATE_EVERY = 10000    # copy local -> target weights every this many agent steps
START_LEARNING = 15000         # number of agent steps collected before learning starts

In [None]:
class ReplayBuffer:
    """Fixed-size circular buffer of single-frame transitions.

    Frames are stored one per transition; `sample` rebuilds stacked states by
    walking backwards from each sampled index.
    """

    def __init__(self, size=BUFFER_SIZE, input_shape=INPUT_DIM[:2], history_length=STACK):
        """
        Arguments:
            size: Integer, Number of stored transitions
            input_shape: (height, width) of the preprocessed frame
            history_length: Integer, Number of frames stacked together to create a state for the agent
        """
        self.size = size
        self.input_shape = input_shape
        self.history_length = history_length
        self.count = 0  # number of valid entries, never more than self.size
        self.current = 0  # index to write to

        # Pre-allocate memory (np.empty: contents are undefined until written)
        self.actions = np.empty(self.size, dtype=np.int32)
        self.rewards = np.empty(self.size, dtype=np.float32)
        self.frames = np.empty((self.size, self.input_shape[0], self.input_shape[1], 1), dtype=np.float32)
        self.next_frames = np.empty((self.size, self.input_shape[0], self.input_shape[1], 1), dtype=np.float32)
        # np.bool_ rather than np.bool: the np.bool alias was removed in NumPy 1.24
        self.dones = np.empty(self.size, dtype=np.bool_)


    def add(self, frame, action, reward, next_frame, done):
        """Saves a transition to the replay buffer, overwriting the oldest one when full
        Arguments:
            frame: A frame of shape input_shape + (1,) observed before the action
            action: An integer identifying the action the agent performed
            reward: A float determining the reward the agent received for performing an action
            next_frame: A frame of shape input_shape + (1,) observed after the action
            done: A bool stating whether the episode terminated
        """

        self.actions[self.current] = action
        self.frames[self.current, ...] = frame
        self.next_frames[self.current, ...] = next_frame
        self.rewards[self.current] = reward
        self.dones[self.current] = done
        self.count = max(self.count, self.current+1)
        self.current = (self.current + 1) % self.size

    def sample(self, batch_size=BATCH_SIZE):
        """Returns a random minibatch of transitions with stacked frame histories
        Arguments:
            batch_size: How many samples to return
        Returns:
            A tuple of states, actions, rewards, new_states, and dones (all tensors).
            Channel 0 of a state is the sampled frame, channel j the frame j steps
            earlier; channels without a valid predecessor stay zero.
        Raises:
            ValueError: if batch_size exceeds the number of stored transitions
        """

        # Get a list of valid indices
        indices = random.sample(range(0, self.count), batch_size)
        stacked_shape = (batch_size, self.input_shape[0], self.input_shape[1], self.history_length)
        states = np.zeros(stacked_shape, dtype=np.float32)
        next_states = np.zeros(stacked_shape, dtype=np.float32)

        newest = (self.current - 1) % self.size  # most recently written slot
        for i, idx in enumerate(indices):
            for j in range(self.history_length): # 0, 1, 2, 3
                src = (idx - j) % self.size
                # Stop extending the history when the earlier slot
                #  - was never written (buffer not full yet),
                #  - is the newest entry, i.e. we wrapped past the write head, or
                #  - ended an episode, so it belongs to a different episode.
                if j > 0 and (src >= self.count or src == newest or self.dones[src]):
                    break
                states[i, :, :, j] = self.frames[src, :, :, 0]
                next_states[i, :, :, j] = self.next_frames[src, :, :, 0]

        states = tf.convert_to_tensor(states, dtype=tf.float32)
        actions = tf.convert_to_tensor(self.actions[indices], dtype=tf.int32)
        rewards = tf.convert_to_tensor(self.rewards[indices], dtype=tf.float32)
        next_states = tf.convert_to_tensor(next_states, dtype=tf.float32)
        # Cast in NumPy first so the conversion never depends on TensorFlow
        # accepting a bool array for a float32 tensor.
        dones = tf.convert_to_tensor(self.dones[indices].astype(np.float32), dtype=tf.float32)
        return states,  actions, rewards, next_states, dones

    def save(self):
        """Save the replay buffer as .npy files in the current working directory"""

        np.save('actions.npy', self.actions)
        np.save('frames.npy', self.frames)
        np.save('next_frames.npy', self.next_frames)
        np.save('rewards.npy', self.rewards)
        np.save('dones.npy', self.dones)
        # Persist the fill level and write position; without them a reloaded
        # buffer would report length 0 and could not be sampled.
        np.save('buffer_meta.npy', np.array([self.count, self.current], dtype=np.int64))

    def load(self):
        """Loads the replay buffer from .npy files in the current working directory"""
        self.actions = np.load('actions.npy')
        self.frames = np.load('frames.npy')
        self.next_frames = np.load('next_frames.npy')
        self.rewards = np.load('rewards.npy')
        self.dones = np.load('dones.npy')
        # Keep the capacity consistent with the arrays that were actually loaded.
        self.size = self.actions.shape[0]
        try:
            self.count, self.current = (int(v) for v in np.load('buffer_meta.npy'))
        except FileNotFoundError:
            # Files written by an older save() carry no metadata; the counters
            # keep their current values in that case.
            pass
    
    def __len__(self):
        """Number of transitions currently stored"""
        return self.count

In [None]:
class Agent():
    """DQN agent with an online ("local") network, a target network and a replay buffer."""

    def __init__(self, input_dim, action_size=9, seed=123):
        """
        Arguments:
            input_dim: (height, width, stacked frames) shape of one state
            action_size: Integer, number of discrete actions (units of the Q-value output)
            seed: Integer passed to tf.random.set_seed
        """
        tf.random.set_seed(seed)
        self.local_model = self.__build_model__(input_dim, action_size)
        # compile() is only used to attach the optimizer; gradients are applied manually in learn()
        self.local_model.compile(optimizer=tf.keras.optimizers.RMSprop(
            learning_rate=0.00025, rho=0.95, momentum=0.95))
        self.target_model = self.__build_model__(input_dim, action_size)
        self.target_model.set_weights(self.local_model.get_weights())

        self.action_size = action_size
        self.input_dim = input_dim
        self.start_learning = False
        # Size the buffer from input_dim so stored frames always match the network input
        self.memory = ReplayBuffer(input_shape=tuple(input_dim[:2]), history_length=input_dim[2])
        self.t_step = 0
        # Built once instead of on every learning step
        self.loss_fn = tf.keras.losses.Huber()

    def __build_model__(self, input_dim, action_size):
        """Builds the Q-network: one conv layer, a 512-unit dense layer and a linear Q-value head"""
        input_layer = tf.keras.layers.Input(shape=input_dim)
        # NOTE(review): a 3x3 kernel with stride 4 leaves one row/column in four
        # uncovered by any window. Changing it alters the weight shapes, so saved
        # checkpoints would no longer load — confirm before touching.
        x = tf.keras.layers.Conv2D(filters=32, kernel_size=(3,3), strides=4, kernel_initializer=tf.keras.initializers.VarianceScaling(scale=2.), activation="relu")(input_layer)
        x = tf.keras.layers.Flatten()(x)
        x = tf.keras.layers.Dense(512, activation="relu",kernel_initializer=tf.keras.initializers.VarianceScaling(scale=2.))(x)
        output = tf.keras.layers.Dense(action_size,kernel_initializer=tf.keras.initializers.VarianceScaling(scale=2.))(x)
    
        return tf.keras.Model(inputs=input_layer, outputs=output)
    
    
    def step(self, state, action, reward, next_state, done):
        """Stores one transition and, once learning has started, trains on a sampled minibatch

        Arguments:
            state: Preprocessed frame observed before the action
            action: Integer id of the action taken
            reward: Float reward received
            next_state: Preprocessed frame observed after the action
            done: Bool, whether the transition ended an episode
        """
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        self.t_step = self.t_step + 1 
        
        # Learning switches on the first time the step counter reaches START_LEARNING
        if not self.start_learning and self.t_step % START_LEARNING == 0:
            self.start_learning = True
            print("Starting learning from", self.t_step)
            

        # If enough samples are available in memory, get random subset and learn
        if len(self.memory) > BATCH_SIZE and self.start_learning and self.t_step % PARAM_UPDATE_EVERY == 0:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

        
        # Sync the target network, then restart the step counter
        if self.start_learning and self.t_step % TARGET_UPDATE_EVERY == 0:
            self.update_target_network()
            self.t_step = 0

    def act(self, state, eps=0.):
        """Epsilon-greedy action selection

        Arguments:
            state: Batch of one stacked state, shape (1,) + input_dim
            eps: Probability of picking a uniformly random action
        Returns:
            Integer action id
        """
        # Decide first, so exploration steps skip the forward pass entirely.
        if random.random() > eps:
            # Direct call instead of predict(): Keras recommends __call__ for
            # inputs that fit in one batch, predict() adds per-call overhead.
            q_values = self.local_model(tf.cast(state, tf.float32), training=False)
            return np.argmax(q_values.numpy())
        return random.randrange(self.action_size)
    
    def learn(self, experiences, gamma):
        """Runs one gradient step on the Huber loss between predicted and bootstrapped Q-values

        Arguments:
            experiences: Tuple (states, actions, rewards, next_states, dones) as returned by ReplayBuffer.sample
            gamma: Float discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        # Get max predicted Q values (for next states) from target model.
        # Computed outside the tape, so no gradient flows into the target.
        next_q_values = self.target_model(next_states, training=False)
        target_q = rewards + (gamma * tf.reduce_max(next_q_values, axis=1) * (1-dones))
        one_hot_actions = tf.one_hot(actions, self.action_size, dtype=tf.float32)
        with tf.GradientTape() as tape:
            q_values = self.local_model(states, training=True)
            # Keep only the Q-value of the action that was actually taken
            predicted_q = tf.reduce_sum(tf.multiply(q_values, one_hot_actions), axis=1)
            loss = self.loss_fn(target_q, predicted_q)
        
        model_gradients = tape.gradient(loss, self.local_model.trainable_weights)

        self.local_model.optimizer.apply_gradients(zip(model_gradients, self.local_model.trainable_weights))

    def update_target_network(self):
        """Copies the local network's weights into the target network"""
        self.target_model.set_weights(self.local_model.get_weights())


In [None]:
def dqn(agent, n_episodes=2000, max_t=2000, eps_start=1.0, eps_end=0.01, eps_decay=0.9969, train=True, window_size=10):
    """Deep Q-Learning.
    
    Params
    ======
        agent: object exposing act(), step(), memory, t_step, local_model and target_model
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        train (bool): to update agent or not
        window_size (int): number of recent episodes in the running statistics; when
            training, evaluate() runs and the replay buffer is saved every
            window_size episodes

    Returns
    =======
        list with the (shaped) score of every episode

    Note: evaluate() is looked up when it is called, so it must be defined
    before training reaches the first evaluation.
    """
    def _preprocess(raw_frame):
        """Divide an RGB frame by 255, keep the luminance channel, resize to INPUT_DIM[:2]."""
        luminance = tf.image.rgb_to_yuv(tf.cast(raw_frame, tf.float32) / 255.0)[:, :, :1]
        return tf.image.resize(luminance, size=INPUT_DIM[:2])

    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=window_size)  # last `window_size` scores
    eps = eps_start                    # initialize epsilon
    print("Starting training")
    
    env = gym.make('Breakout-v0')
    try:
        for i_episode in range(1, n_episodes+1):
            frame = _preprocess(env.reset())

            # Channel 0 holds the newest frame, higher channels older ones
            stack = np.zeros(INPUT_DIM)
            stack[:, :, 0] = frame[:, :, 0]

            score = 0
            lives = 5
            for t in range(max_t):
                state = tf.expand_dims(stack, axis=0)
                action = int(agent.act(state, eps))
                next_frame, reward, done, info = env.step(action)
                # Keep the environment's own terminal flag: `done` is overwritten
                # below and the env must not be stepped again after it finished.
                game_over = done

                if info['ale.lives'] < lives:
                    # Losing a life is penalised and treated as terminal for learning
                    reward = -1.0
                    lives = info['ale.lives']
                    done = True
                else:
                    # Reward shaping: every other step costs 0.5
                    reward -= 0.5

                next_frame = _preprocess(next_frame)

                if train:
                    agent.step(frame, action, reward, next_frame, done)

                # Shift the history one channel back, then insert the newest frame
                stack[:, :, 1:] = stack[:, :, :-1].copy()
                stack[:, :, 0] = next_frame[:, :, 0]
                frame = next_frame

                score += reward
                if game_over or (done and lives == 0):
                    break

            scores_window.append(score)       # save most recent score
            scores.append(score)              # save most recent score
            agent.local_model.save_weights('breakout_local_model_checkpoint')
            agent.target_model.save_weights('breakout_target_model_checkpoint')
            print('\rEpisode {}\tT_Step: {}\tAverage Score: {:.2f}\tMax score: {:.2f}\tMin score: {:.2f}\teps: {:.2f}'.format(i_episode, agent.t_step, np.mean(scores_window), np.max(scores_window), np.min(scores_window), eps), end="")
            if train:
                eps = max(eps_end, eps_decay*eps) # decrease epsilon
                if i_episode % window_size == 0:
                    eval_score = evaluate(agent)
                    agent.memory.save()
                    print('\rEpisode {}\tAverage Score: {:.2f}\tMax score: {:.2f}\tMin score: {:.2f}\teps: {:.2f}\tEval Score: {:.2f}'.format(i_episode, np.mean(scores_window), np.max(scores_window), np.min(scores_window), eps, eval_score))
    finally:
        # Release the emulator even if training is interrupted or raises
        env.close()
    return scores

In [None]:
# Build a fresh agent and train it. The three commented-out lines resume from
# the saved checkpoints and replay buffer instead when uncommented.
# NOTE(review): dqn() calls evaluate() every `window_size` episodes, but
# evaluate() is defined in a later cell — on a fresh kernel that cell has to be
# run first, otherwise this raises NameError at the first evaluation.
agent = Agent(INPUT_DIM, ACTION_SIZE)
# agent.local_model.load_weights('breakout_local_model_checkpoint')
# agent.target_model.load_weights('breakout_target_model_checkpoint')
# agent.memory.load()
scores = dqn(agent, n_episodes=1000, max_t=800, eps_start=1.0, eps_end=0.1, eps_decay=0.99, train=True)

In [None]:
import time
def evaluate(agent, render=False, max_t=200):
    """Plays one greedy (eps = 0) episode of Breakout and returns the summed reward.

    Arguments:
        agent: object exposing act(state, eps)
        render: Bool, draw every frame with show_state() and pause one second per step
        max_t: Integer, maximum number of agent steps to play
    Returns:
        Sum of the rewards returned by the environment during the episode
    """
    def _preprocess(raw_frame):
        """Divide an RGB frame by 255, keep the luminance channel, resize to INPUT_DIM[:2]."""
        luminance = tf.image.rgb_to_yuv(tf.cast(raw_frame, tf.float32) / 255.0)[:, :, :1]
        return tf.image.resize(luminance, size=INPUT_DIM[:2])

    env = gym.make('Breakout-v0')
    try:
        # Channel 0 holds the newest frame, higher channels older ones
        stack = np.zeros(INPUT_DIM)
        stack[:, :, 0] = _preprocess(env.reset())[:, :, 0]
        state = tf.expand_dims(stack, axis=0)
        # presumably action 1 is FIRE, launching the ball — confirm with
        # env.unwrapped.get_action_meanings(); its observation is discarded
        env.step(1)
        score = 0
        for t in range(max_t):
            if render:
                show_state(env)
            action = int(agent.act(state, 0.))
            frame, reward, done, info = env.step(action)
            frame = _preprocess(frame)
            # Shift the history one channel back, then insert the newest frame
            stack[:, :, 1:] = stack[:, :, :-1].copy()
            stack[:, :, 0] = frame[:, :, 0]

            state = tf.expand_dims(stack, axis=0)
            score += reward
            if render:
                time.sleep(1)
            if done:
                break
    finally:
        # Release the emulator even if the agent or the renderer raises
        env.close()
    return score

In [None]:
# Close the notebook-level environment created by the random-rollout cell.
env.close()

In [None]:
# Watch one greedy episode with rendering switched on.
evaluate(agent, True)

In [None]:
# Rebuild the agent and restore both networks from the checkpoints that dqn()
# writes after every episode.
agent = Agent(INPUT_DIM, ACTION_SIZE)
agent.local_model.load_weights('breakout_local_model_checkpoint')
agent.target_model.load_weights('breakout_target_model_checkpoint')

env.close()

In [None]:
# NOTE(review): `state` is never assigned at notebook scope in the visible
# cells (it only exists as a local inside evaluate/dqn), so this presumably
# relied on leftover kernel state — confirm what should be plotted here.
plt.imshow(tf.image.rgb_to_yuv(state)[0, :, :, 0])
plt.show()