# Notebook Setup

In [None]:
!apt update && apt install -y libpq-dev libsdl2-dev swig xorg-dev xvfb
%pip install -U tf-agents pyvirtualdisplay
%pip install -U gym>=0.21.0
%pip install -U gym[box2d,atari,accept-rom-license]

In [None]:
# Import Libraries

# TensorFlow
import tensorflow as tf
from tensorflow import keras

# Maths
import numpy as np
import matplotlib.pyplot as plt

# Gym
import gym

# Virtual Display
import pyvirtualdisplay

# Common
import os
import sys

# Saving
import pickle

# Other
from collections import deque

In [None]:
# Global Variables
# Folder the notebook loads checkpoints from and writes weights, variables
# and result files to.
PROJECT_ROOT_DIR = "./drive/MyDrive/ML/FRAMES/"

# SEEDS
# Fix the TensorFlow and NumPy global random generators.
tf.random.set_seed(420)
np.random.seed(69)

In [None]:
# Mount Google Drive at /content/drive (Colab only)
from google.colab import drive
drive.mount("/content/drive")

# Helper Functions

In [None]:
# Pre-Processing
def preprocess_observation(observation):
    """Crop, downsample and grey-scale a frame into a (98, 80, 1) float32 array.

    Rows 14..209 and all columns are sampled at every second pixel, the last
    (channel) axis is averaged away, and 128 is subtracted from every value.
    """
    # Drop the top 14 rows and keep every other pixel in both directions
    cropped = observation[14:210:2, ::2]

    # Collapse the channel axis to a single grey value, then shift by 128
    centred = np.mean(cropped, axis=2) - 128

    return np.reshape(centred.astype(np.float32), (98, 80, 1))

In [None]:
# Epsilon Greedy Policy
def epsilon_greedy_policy(state, epsilon=0):
    """Choose an action: random with probability `epsilon`, otherwise greedy.

    Args:
        state: a single pre-processed observation, without a batch axis.
        epsilon: exploration probability; the default 0 always acts greedily.

    Returns:
        The index of the chosen action.

    Reads the notebook-level `model` and `n_outputs`.
    """
    if np.random.rand() < epsilon:
        return np.random.randint(n_outputs)

    # Call the model directly instead of model.predict(): predict() is meant
    # for large batched inputs and adds significant overhead per call, while
    # this function runs once per environment step on a single state.
    Q_values = np.asarray(model(np.array([state]), training=False))
    return np.argmax(Q_values[0])

In [None]:
# Sample Experiences
def sample_experiences(batch_size):
    """Draw `batch_size` experiences (with replacement) from REPLAY_MEMORY.

    Returns:
        A 5-tuple of arrays (states, actions, rewards, next_states, dones),
        each stacked along a leading batch axis.
    """
    picks = np.random.randint(len(REPLAY_MEMORY), size=batch_size)
    sampled = [REPLAY_MEMORY[pick] for pick in picks]

    # Transpose the list of 5-field experiences into one array per field
    columns = []
    for field in range(5):
        columns.append(np.array([entry[field] for entry in sampled]))

    states, actions, rewards, next_states, dones = columns
    return states, actions, rewards, next_states, dones

In [None]:
# One Iteration
def play(env, state, epsilon, frame_skip):
    """Pick one action, repeat it for up to `frame_skip` frames and store the result.

    Args:
        env: environment whose step() returns (observation, reward, done, info).
        state: current pre-processed observation, passed to the policy.
        epsilon: exploration probability forwarded to epsilon_greedy_policy().
        frame_skip: number of consecutive frames the action is repeated for
            (must be at least 1).

    Returns:
        (next_state, iter_reward, done, info), where next_state is the
        pre-processed last frame and iter_reward is the reward summed over
        the frames that were actually played.

    Raises:
        ValueError: if frame_skip is smaller than 1.

    Side effect: appends (state, action, iter_reward, next_state, done) to the
    notebook-level REPLAY_MEMORY.
    """
    if frame_skip < 1:
        raise ValueError("frame_skip must be at least 1")

    # Get Action
    action = epsilon_greedy_policy(state, epsilon)

    # Do Action frame_skip Times
    iter_reward = 0
    for _ in range(frame_skip):
        next_state, reward, done, info = env.step(action)
        iter_reward += reward
        if done:
            # Never step an environment that has already terminated; the
            # remaining skipped frames are simply dropped.
            break

    next_state = preprocess_observation(next_state)

    # Add Last Frame to Buffer
    REPLAY_MEMORY.append((state, action, iter_reward, next_state, done))
    return next_state, iter_reward, done, info

In [None]:
# Config
# Hyper-parameters for the Q-learning updates
batch_size = 32
discount_rate = 0.99
learning_rate = 0.00025
momentum = 0.95

# Mean-squared-error loss, minimised with Nesterov-momentum SGD
loss_fn = keras.losses.mean_squared_error
optimizer = keras.optimizers.SGD(
    learning_rate=learning_rate,
    momentum=momentum,
    nesterov=True,
)

# Train from Memory
def training_step(batch_size):
    """Run one gradient update of `model` on a sampled batch of experiences.

    Targets are reward + discount_rate * max Q(next_state), with the
    bootstrap term zeroed for terminal transitions. Only the Q-value of the
    action that was actually taken contributes to the loss.
    """
    states, actions, rewards, next_states, dones = sample_experiences(batch_size)

    # Best Q-value obtainable from each next state
    best_next_q = model.predict(next_states).max(axis=1)

    # (1 - dones) switches the bootstrap term off for finished episodes
    keep_future = 1 - dones
    targets = rewards + keep_future * discount_rate * best_next_q
    targets = targets.reshape(-1, 1)

    # One-hot mask that selects the Q-value of the action taken
    action_mask = tf.one_hot(actions, n_outputs)

    with tf.GradientTape() as tape:
        predicted_q = model(states)
        chosen_q = tf.reduce_sum(predicted_q * action_mask, axis=1, keepdims=True)
        loss = tf.reduce_mean(loss_fn(targets, chosen_q))

    trainable = model.trainable_variables
    gradients = tape.gradient(loss, trainable)
    optimizer.apply_gradients(zip(gradients, trainable))

# Q-Learning with Frames

In [None]:
# Create Environment
keras.backend.clear_session()

env = gym.make("AssaultNoFrameskip-v4")
input_shape = (98, 80, 1)       # shape returned by preprocess_observation()
n_outputs = env.action_space.n  # one Q-value per discrete action

initializer = keras.initializers.VarianceScaling()

# Create Model
# Three convolutional layers followed by a dense head that emits one Q-value
# per action.
model = keras.models.Sequential([
    keras.layers.Conv2D(filters=32, kernel_size=8, strides=4,
                        padding="same", activation="relu",
                        kernel_initializer=initializer,
                        input_shape=input_shape),
    keras.layers.Conv2D(filters=16, kernel_size=4, strides=2,
                        padding="same", activation="relu",
                        kernel_initializer=initializer),
    keras.layers.Conv2D(filters=32, kernel_size=3, strides=1,
                        padding="same", activation="relu",
                        kernel_initializer=initializer),
    keras.layers.Flatten(),
    keras.layers.Dense(units=512, activation="relu",
                       kernel_initializer=initializer),
    # Linear output: Q-values are regression targets. A ReLU here would clip
    # every estimate at zero and pass no gradient to any action whose
    # pre-activation is negative, so that output could never recover.
    # NOTE(review): weights saved from the earlier ReLU-output model still
    # load (same shapes) but were trained under the old activation.
    keras.layers.Dense(n_outputs, kernel_initializer=initializer),
])

# model.summary()

In [None]:
# Train Convolutional Model
#
# Plays EPISODES episodes with an epsilon-greedy policy whose epsilon decays
# geometrically per episode, runs one training_step() after each episode, and
# checkpoints the weights and bookkeeping variables every 1000 steps so an
# interrupted run can be resumed.

# CONFIG
env.seed(710)

REPLAY_MEMORY = deque(maxlen=2000)

FRAMESKIP = 4
START_EPISODE = 0
EPISODES = 1750

EPSILON = 1
EPSILON_MAX = 1
DECAY = 0.99884936993651

# All persisted files live under PROJECT_ROOT_DIR
WEIGHTS_PATH = PROJECT_ROOT_DIR + "WEIGHTS/checkpoint.ckpt"
VARIABLES_PATH = PROJECT_ROOT_DIR + "VARIABLES/" + "vars.pickle"
RESULTS_PATH = PROJECT_ROOT_DIR + "RESULTS/" + "res.csv"

# Make sure the output folders exist before the first save is attempted
for sub_dir in ("WEIGHTS/", "VARIABLES/", "RESULTS/"):
    os.makedirs(PROJECT_ROOT_DIR + sub_dir, exist_ok=True)

# Load Existing Model (if it exists)
if os.path.isfile(WEIGHTS_PATH + ".index"):
    model.load_weights(WEIGHTS_PATH)
    print("Successfully Loaded Previous Weights")
else:
    print("Weights not loaded.")

episode_rewards = []
best_score = 0
step = 0

# Load Previous Values
try:
    with open(VARIABLES_PATH, 'rb') as v:
        # NOTE: pickle.load can execute arbitrary code; only load files that
        # this notebook wrote itself.
        dic = pickle.load(v)

    # Read every field first so a malformed file cannot leave the globals
    # half-updated.
    saved_episode = dic["episode"]
    saved_eps = dic["eps"]
    saved_rewards = dic["rewards"]
    saved_step = dic["step"]

    START_EPISODE = saved_episode
    EPSILON = saved_eps
    step = saved_step
    # The checkpoint is written mid-episode, so the last saved entry is a
    # partial total for the episode that is about to be replayed from its
    # start. Drop it so every list index keeps matching its episode number.
    episode_rewards = list(saved_rewards[:START_EPISODE])
    print("Successfully Loaded Previous Variables")
except FileNotFoundError:
    # Nothing saved yet: start from scratch
    print("No Files Loaded")
except (OSError, EOFError, KeyError, pickle.UnpicklingError) as err:
    # Unreadable or incomplete save file: start from scratch, but say why
    print("No Files Loaded")
    print("Could not restore saved variables: {!r}".format(err))

for episode in range(START_EPISODE, EPISODES):

    # Reset Env
    obs = preprocess_observation(env.reset())
    episode_rewards.append(0)

    # Decay Epsilon
    EPSILON = EPSILON_MAX * (DECAY ** episode)

    # Each Episode
    while True:

        step += 1
        obs, reward, done, info = play(env, obs, EPSILON, FRAMESKIP)

        # Always accumulate into the entry appended for this episode
        episode_rewards[-1] += reward

        if step % 1000 == 0:

            # Save Model
            model.save_weights(WEIGHTS_PATH)

            # Save Variables
            dic = {"rewards":episode_rewards, "eps":EPSILON, "step":step, "episode":episode}
            with open(VARIABLES_PATH, "wb") as v:
                pickle.dump(dic, v)

        if done:
            # Episode is Finished
            break

        print("\rEpisode: {}, Steps: {}, eps: {:.3f}, current_reward: {}, percentage: {:.2f}".format(episode, step + 1, EPSILON, episode_rewards[-1], episode/EPISODES*100), end="")

    # Train Model on Buffer Sample
    training_step(batch_size)

np.savetxt(RESULTS_PATH, np.asarray(episode_rewards), delimiter=",")

In [None]:
# Plot Results

EPISODES_PER_EPOCH = 9

# Trailing mean over the current episode plus up to EPISODES_PER_EPOCH earlier
# ones, i.e. at most 10 episodes per point, as the y-axis label states.
# The window now includes episode i itself; previously full windows stopped at
# i - 1 and averaged only 9 episodes.
average_rewards = [
    np.mean(episode_rewards[max(0, i - EPISODES_PER_EPOCH): i + 1])
    for i in range(len(episode_rewards))
]


# x axis in epochs: one epoch = 10 episodes
bins = [x/10 for x in range(len(average_rewards))]
plt.figure(figsize=(10, 5))
plt.plot(bins, average_rewards)

# plt.xticks(list(range(0, 18)))
plt.xlabel("Epoch")
plt.ylabel("Mean Reward Over Last 10 Episodes")
plt.show()

In [None]:
# Plot Results

EPISODES_PER_EPOCH = 99

# Trailing mean over the current episode plus up to EPISODES_PER_EPOCH earlier
# ones, i.e. at most 100 episodes per point, as the y-axis label states.
# The window now includes episode i itself; previously full windows stopped at
# i - 1 and averaged only 99 episodes.
average_rewards = [
    np.mean(episode_rewards[max(0, i - EPISODES_PER_EPOCH): i + 1])
    for i in range(len(episode_rewards))
]


# x axis in epochs: one epoch = 100 episodes
bins = [x/100 for x in range(len(average_rewards))]
plt.figure(figsize=(10, 5))
plt.plot(bins, average_rewards)

# plt.xticks(list(range(0, 18)))
plt.xlabel("Epoch")
plt.ylabel("Mean Reward Over Last 100 Episodes")
plt.show()

In [None]:
# Play 10 evaluation episodes and record the total reward of each.
test_rewards = []

env.seed(6969)

# Testing
# NOTE(review): evaluation still explores with the current EPSILON, and play()
# keeps appending to REPLAY_MEMORY — confirm both are intended.
for episode in range(0, 10):

    # Reset Env
    obs = preprocess_observation(env.reset())
    test_rewards.append(0)

    # Each Episode
    while True:

        step += 1
        # Feed the newest observation back into the policy; the result used to
        # be stored in a different variable, so every action was chosen from
        # the reset frame.
        obs, reward, done, info = play(env, obs, EPSILON, FRAMESKIP)
        test_rewards[episode] += reward

        if done:
            # Episode finished
            break

    print("\nEpisode: {}, Reward: {}".format(episode + 1, test_rewards[episode]), end="")

In [None]:
# Write the per-episode test rewards to a CSV file under RESULTS/
np.savetxt(
    fname=PROJECT_ROOT_DIR + "RESULTS/" + "frames_test.csv",
    X=np.asarray(test_rewards),
    delimiter=",",
)

In [None]:
# Visualise One Episode

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# To get smooth animations
import matplotlib.animation as animation
mpl.rc('animation', html='jshtml')


# Animation Helpers
def update_scene(num, frames, patch):
    """Show frame number `num` on `patch` and return it in a 1-tuple."""
    current_frame = frames[num]
    patch.set_data(current_frame)
    return (patch,)

def plot_animation(frames, repeat=False, interval=40):
    """Build a matplotlib animation that steps through `frames`.

    Args:
        frames: sequence of images; the first one initialises the display.
        repeat: forwarded to FuncAnimation.
        interval: forwarded to FuncAnimation.

    Returns:
        The FuncAnimation object.
    """
    figure = plt.figure()
    image = plt.imshow(frames[0])
    plt.axis('off')

    movie = animation.FuncAnimation(
        figure,
        update_scene,
        frames=len(frames),
        fargs=(frames, image),
        repeat=repeat,
        interval=interval,
    )

    # Close the current figure; only the returned animation is kept
    plt.close()
    return movie


# Start a fresh episode
next_step_img = preprocess_observation(env.reset())

# Rendered RGB frames, one per play() call
frames = []

# Play until the environment reports the episode is over
done = False
while not done:

    step += 1
    next_step_img, reward, done, info = play(env, next_step_img, EPSILON, FRAMESKIP)

    # Capture what the screen looks like after this step
    img = env.render(mode="rgb_array")
    frames.append(img)

plot_animation(frames)