In [1]:
%matplotlib ipympl

import tensorflow as tf
from tensorflow import keras
import gymnasium
from gymnasium.wrappers import AtariPreprocessing
from gymnasium.wrappers import FrameStackObservation, TimeLimit
from collections import deque
import ale_py
import matplotlib.pyplot as plt
import numpy as np


import matplotlib.animation as animation
import matplotlib.pyplot as plt



tf.keras.utils.disable_interactive_logging()

2024-11-20 12:00:28.628719: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-20 12:00:30.141177: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/nick/miniconda3/envs/tf_env/lib/python3.9/site-packages/nvidia/cudnn/lib:/home/nick/miniconda3/envs/tf_env/lib/
2024-11-20 12:00:30.141897: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/nick

In [2]:
env = gymnasium.make("BreakoutNoFrameskip-v4", render_mode="rgb_array")

A.L.E: Arcade Learning Environment (version 0.10.1+unknown)
[Powered by Stella]


In [3]:
class AtariPreprocessingFire(AtariPreprocessing):
    def reset(self, **kwargs):
        obs, reset_info = super().reset(**kwargs)
        super().step(1)
        return obs, reset_info
    def step(self, action):
        self.lives_before_action = self.ale.lives()
        obs, rewards, terminated, truncated, info = super().step(action)
        done = terminated or truncated
        if not done and self.ale.lives() < self.lives_before_action:
            super().step(1)
        return obs, rewards, terminated, truncated, info

env = AtariPreprocessingFire(env)

In [4]:
env = AtariPreprocessingFire(env)
env = FrameStackObservation(env, stack_size=4)



In [5]:
def plot_observation(obs):
    obs = obs.astype(np.float32)
    img = obs[:3]
    current_frame_delta = np.maximum(obs[3] - obs[:3].mean(axis=0), 0.)
    img[0] += current_frame_delta
    img[2] += current_frame_delta
    img = np.clip(img / 150, 0, 1)
    img = np.transpose(img, (1,2,0))
    plt.imshow(img)

In [6]:
update_period = 4
optimizer = keras.optimizers.Adam(learning_rate=1e-4)
epsilon_fn = keras.optimizers.schedules.PolynomialDecay(initial_learning_rate=1.0,
                                          decay_steps=250000 // update_period,
                                          end_learning_rate=0.01)
loss_fn = keras.losses.huber
discount_factor = 0.99
replay_buffer = deque(maxlen=50000)

In [7]:
q_net = keras.models.Sequential([
    keras.layers.Input(shape=env.observation_space.shape),
    keras.layers.Lambda(lambda obs: tf.cast(obs, np.float32) / 255.),
    keras.layers.Conv2D(32, (8,8), strides=4, activation="relu", data_format="channels_first"),
    keras.layers.Conv2D(64, (4,4), strides=2, activation="relu", data_format="channels_first"),
    keras.layers.Conv2D(64, (3,3), strides=1, activation="relu", data_format="channels_first"),
    keras.layers.Flatten(),
    keras.layers.Dense(512, activation="relu"),
    keras.layers.Dense(4)
])

target_net = keras.models.clone_model(q_net)
target_net.set_weights(q_net.get_weights())

In [8]:
def epsilon_greedy_policy(obs, action_space, epsilon):
    if np.random.rand() < epsilon:
        return action_space.sample()
    else:
        q_values = q_net.predict(obs[np.newaxis])
        return np.argmax(q_values[0])    

In [9]:

class DqnAgent:
    def __init__(self, q_network, target_network, replay_buffer, discount_factor ,**kwargs):
        self.n_train_step = 0
        self.q_network = q_network
        self.target_network = target_network
        self.replay_buffer = replay_buffer
        self.discount_factor = discount_factor

        for key, value in kwargs.items():
            setattr(self, key, value)
        
        # metrics
        self.episodes = 0
        self.environment_steps = 0
        
    
    def initialization(self, num_steps):
        """
        Collect the initial experiences, before training
        """
        state, _ = env.reset()
        for _ in range(num_steps):
            action = env.action_space.sample()
            next_state, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            self.replay_buffer.append((state, action, reward, next_state, done))
            state = next_state
            if done:
                state, _ = env.reset()

    def train_step(self):
        rnd_indices = np.random.randint(0, len(self.replay_buffer), 32)
        samples = [self.replay_buffer[index] for index in rnd_indices]
        states, actions, rewards, next_states, dones = [np.array([sample[index_field] for sample in samples]) 
                                                        for index_field in range(5)]

        target_q_value = self.q_network.predict(next_states)
        target_mask = tf.one_hot(np.argmax(target_q_value, axis=1), 4).numpy()
        max_target_q_value = (self.target_network.predict(next_states) * target_mask).sum(axis=1)
        max_target_q_value = (max_target_q_value * self.discount_factor * (1 - dones) + rewards)
        max_target_q_value = max_target_q_value.reshape(-1,1)
        q_mask = tf.one_hot(actions, 4)
        
        with tf.GradientTape() as tape:
            q_value = self.q_network(states)
            # We use the tf.reduce_sum instead of .sum() because self.q_network(states)
            # outputs a tensor object and not an array
            q_value = tf.reduce_sum(q_value * q_mask, axis=1, keepdims=True)
            loss = tf.reduce_mean(loss_fn(max_target_q_value, q_value))

        grads = tape.gradient(loss, self.q_network.trainable_variables)
        grad_norm = tf.reduce_mean([tf.norm(g) for g in grads if g is not None]).numpy()
        optimizer.apply_gradients(zip(grads, self.q_network.trainable_variables))
        return loss, grad_norm
        
    def collect_step(self, state, env, update_period=4):
        
        epsilon = epsilon_fn(self.n_train_step)
        
        for step in range(update_period):
            self.environment_steps += 1
            
            action = epsilon_greedy_policy(state, env.action_space, epsilon)
            next_state, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            self.replay_buffer.append((state, action, reward, next_state, done))
            state = next_state
            
            if done:
                state, _ = env.reset()
                self.episodes += 1

        loss, grad = self.train_step()
        self.n_train_step += 1
        if self.n_train_step % 2000 == 0:
            self.target_network.set_weights(self.q_network.get_weights())
        epsilon = epsilon_fn(self.n_train_step)
        return state, loss, grad



In [10]:
agent = DqnAgent(q_network=q_net,
                target_network= target_net,
                replay_buffer=replay_buffer,
                discount_factor=discount_factor
                )
agent.initialization(20000)

In [11]:
n_iterations = 10000


def train_agent(n_iterations):
    state, _ = env.reset()
    for iteration in range(n_iterations):
        state, loss, grad = agent.collect_step(state, env, update_period)
    
        if iteration % 1000 == 0:
            print(f"\nNumberOfEpisodes = {agent.episodes} \nEnvironmentSteps = {agent.environment_steps} \nLoss = {loss} \nGrad = {grad}")

In [12]:
train_agent(10000)










NumberOfEpisodes = 0 
EnvironmentSteps = 4 
Loss = 0.00028323911828920245 
Grad = 0.010298940353095531

NumberOfEpisodes = 25 
EnvironmentSteps = 4004 
Loss = 0.014929721131920815 
Grad = 0.007817496545612812

NumberOfEpisodes = 49 
EnvironmentSteps = 8004 
Loss = 4.557302872854052e-06 
Grad = 0.00021296701743267477

NumberOfEpisodes = 70 
EnvironmentSteps = 12004 
Loss = 7.859755714889616e-05 
Grad = 0.0022250935435295105

NumberOfEpisodes = 94 
EnvironmentSteps = 16004 
Loss = 0.005851211491972208 
Grad = 0.012458881363272667

NumberOfEpisodes = 117 
EnvironmentSteps = 20004 
Loss = 0.002285025082528591 
Grad = 0.03544222190976143

NumberOfEpisodes = 141 
EnvironmentSteps = 24004 
Loss = 0.00012176336895208806 
Grad = 0.0036012395285069942

NumberOfEpisodes = 166 
EnvironmentSteps = 28004 
Loss = 0.0002436703653074801 
Grad = 0.012821783311665058


KeyboardInterrupt: 

In [None]:
def update_scene(num, frames, patch):
    patch.set_data(frames[num])
    return patch,

def plot_animation(frames, repeat=False, interval=40):
    fig = plt.figure()
    patch = plt.imshow(frames[0])
    plt.axis('off')
    anim = animation.FuncAnimation(
        fig, update_scene, fargs=(frames, patch),
        frames=len(frames), repeat=repeat, interval=interval)
    plt.show()
    return anim

In [None]:
frames = []

state, _ = env.reset()
for _ in range(1000):
    action = q_net.predict(state[np.newaxis])
    state, rewards, terminated, truncated, info = env.step(np.argmax(action))
    img = env.render()
    done = terminated or truncated
    frames.append(img)
    
    if done:
        break



In [None]:
plot_animation(frames)