In [1]:
%matplotlib ipympl

In [2]:
from collections import deque

import ale_py
import gymnasium
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from gymnasium.wrappers import AtariPreprocessing, FrameStackObservation
from matplotlib import animation
from tensorflow import keras

tf.keras.utils.disable_interactive_logging()

2024-12-19 15:25:45.061419: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-12-19 15:25:45.882111: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/nick/miniconda3/envs/tf_env/lib/python3.9/site-packages/nvidia/cudnn/lib:/home/nick/miniconda3/envs/tf_env/lib/
2024-12-19 15:25:45.882236: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/nick

In [3]:
# Base Space Invaders environment; frames are rendered as RGB arrays.
env = gymnasium.make(
    "SpaceInvadersNoFrameskip-v4",
    render_mode="rgb_array",
)

A.L.E: Arcade Learning Environment (version 0.10.1+unknown)
[Powered by Stella]


In [4]:
def update_scene(num, frames, patch):
    """Per-frame animation callback: show ``frames[num]`` on the image artist.

    Args:
        num: Index of the frame to display.
        frames: Indexable sequence of frames.
        patch: Artist exposing ``set_data`` (the object returned by ``imshow``).

    Returns:
        A one-element tuple holding the updated artist.
    """
    current_frame = frames[num]
    patch.set_data(current_frame)
    return (patch,)

def plot_animation(frames, repeat=False, interval=40):
    """Display a sequence of image frames as a matplotlib animation.

    Args:
        frames: Non-empty, indexable sequence of images accepted by ``imshow``.
        repeat: Whether the animation restarts after the last frame.
        interval: Delay between consecutive frames, in milliseconds.

    Returns:
        The ``FuncAnimation`` instance. Keep a reference to it: matplotlib
        stops the animation if the object is garbage-collected.
    """
    # Explicit figure/axes instead of the pyplot state machine.
    fig, ax = plt.subplots()
    patch = ax.imshow(frames[0])
    ax.axis('off')
    # `animation` is matplotlib.animation, imported in the notebook's import cell.
    anim = animation.FuncAnimation(
        fig, update_scene, fargs=(frames, patch),
        frames=len(frames), repeat=repeat, interval=interval)
    plt.show()
    return anim

In [5]:
class AtariPreprocessingFrameSkip(AtariPreprocessing):
    """``AtariPreprocessing`` that fast-forwards after each reset and each lost life.

    Right after ``reset`` and right after a step in which the life counter
    drops, the wrapper issues ``SKIPPED_STEPS`` extra steps with
    ``NOOP_ACTION`` so the caller never has to act during that stretch
    (presumably the spawn / death animation -- TODO confirm the 40-step figure).

    The observation and info returned to the caller are the ones produced by
    the *last* skipped step, so they describe the state the next action will
    actually be applied to.
    """

    # Action index sent while fast-forwarding (index 0 is NOOP in ALE action sets).
    NOOP_ACTION = 0
    # Number of wrapped-env steps to skip; each one may itself cover several
    # emulator frames depending on the parent's frame_skip setting.
    SKIPPED_STEPS = 40

    def _fast_forward(self, obs, info):
        """Run up to ``SKIPPED_STEPS`` no-op steps through the parent wrapper.

        Args:
            obs: Observation to return unchanged if no step is executed.
            info: Info dict to return unchanged if no step is executed.

        Returns:
            ``(obs, reward, terminated, truncated, info)`` where ``reward`` is
            the sum collected during the skipped steps and the other values
            come from the last executed step. Stops early if the episode ends.
        """
        skipped_reward = 0.0
        terminated = truncated = False
        for _ in range(self.SKIPPED_STEPS):
            # super().step bypasses this class's own step(), so a life lost
            # while skipping cannot trigger a nested fast-forward.
            obs, reward, terminated, truncated, info = super().step(self.NOOP_ACTION)
            skipped_reward += reward
            if terminated or truncated:
                break
        return obs, skipped_reward, terminated, truncated, info

    def reset(self, **kwargs):
        """Reset the environment, then skip the initial no-op stretch.

        Returns:
            ``(obs, info)`` as observed after the skipped steps.
        """
        obs, info = super().reset(**kwargs)
        # Return the post-skip observation: the pre-skip one would be stale by
        # the time the agent's first action is applied.
        obs, _, _, _, info = self._fast_forward(obs, info)
        return obs, info

    def step(self, action):
        """Apply ``action``; if it cost a life and the game goes on, fast-forward.

        Returns:
            The usual ``(obs, reward, terminated, truncated, info)`` tuple. When
            a fast-forward happened, ``reward`` also includes anything earned
            during it, and the remaining values reflect the last skipped step.
        """
        self.lives_before_action = self.ale.lives()
        obs, reward, terminated, truncated, info = super().step(action)
        life_lost = self.ale.lives() < self.lives_before_action
        if life_lost and not (terminated or truncated):
            obs, skipped_reward, terminated, truncated, info = self._fast_forward(obs, info)
            reward += skipped_reward
        return obs, reward, terminated, truncated, info

In [6]:
# Preprocess each frame first, then stack 4 consecutive observations on top.
env = FrameStackObservation(
    AtariPreprocessingFrameSkip(env),
    stack_size=4,
)



In [7]:
# Network input/output sizes are read from the wrapped environment.
n_inputs = env.observation_space.shape
n_outputs = env.action_space.n

# Online Q-network, assembled layer by layer. The observation's leading axis is
# used as the channel axis (channels_first), and raw pixel values are scaled to
# [0, 1] inside the model so callers can feed observations unchanged.
q_net = keras.models.Sequential()
q_net.add(keras.layers.Input(shape=n_inputs))
q_net.add(keras.layers.Lambda(lambda obs: tf.cast(obs, tf.float32) / 255.))
q_net.add(keras.layers.Conv2D(32, kernel_size=(8,8), strides=4, activation="relu", data_format="channels_first"))
q_net.add(keras.layers.Conv2D(64, kernel_size=(4,4), strides=2, activation="relu", data_format="channels_first"))
q_net.add(keras.layers.Conv2D(64, kernel_size=(3,3), strides=1, activation="relu", data_format="channels_first"))
q_net.add(keras.layers.Flatten())
q_net.add(keras.layers.Dense(512, activation="relu"))
# One linear output per action: the estimated Q-value.
q_net.add(keras.layers.Dense(n_outputs))

# Target network: same architecture, starting from identical weights.
target_net = keras.models.clone_model(q_net)
target_net.set_weights(q_net.get_weights())

In [8]:
class DqnAgent:
    """Double-DQN agent: gathers experience and trains ``q_net`` from a replay buffer.

    Actions are chosen epsilon-greedily with ``q_net``. Training targets use
    ``q_net`` to pick the best next action and ``target_net`` to evaluate it.
    Synchronising ``target_net`` with ``q_net`` is left to the caller.
    """

    def __init__(self, env, q_net, target_net, discount_rate, replay_buffer, loss_fn, optimizer,
                 epsilon_start=1.0, epsilon_end=0.01, epsilon_decay_steps=20000):
        """Store collaborators and build the epsilon schedule.

        Args:
            env: Gymnasium-style environment (``reset``, ``step``, ``action_space``).
            q_net: Online Q-network; maps a batch of states to per-action Q-values.
            target_net: Target Q-network with the same output layout as ``q_net``.
            discount_rate: Discount factor applied to next-state values.
            replay_buffer: Indexable, appendable container of transitions
                ``(state, action, reward, next_state, done)``.
            loss_fn: Callable ``loss_fn(y_true, y_pred)`` returning a loss tensor.
            optimizer: Keras optimizer used to update ``q_net``.
            epsilon_start: Exploration rate at iteration 0.
            epsilon_end: Exploration rate once the decay has finished.
            epsilon_decay_steps: Number of iterations over which epsilon decays.
        """
        self.env = env
        self.q_net = q_net
        self.target_net = target_net
        self.discount_rate = discount_rate
        self.replay_buffer = replay_buffer
        self.loss_fn = loss_fn
        self.optimizer = optimizer

        # A learning-rate schedule is reused here purely as a decaying function
        # of the iteration number; its output is the exploration rate.
        self.epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
            initial_learning_rate=epsilon_start,
            decay_steps=epsilon_decay_steps,
            end_learning_rate=epsilon_end,
        )

    def greedy_policy(self, state, epsilon):
        """Pick an action epsilon-greedily.

        Args:
            state: A single observation (no batch axis).
            epsilon: Probability of returning a random action instead of the
                action with the highest predicted Q-value.

        Returns:
            The chosen action index.
        """
        if np.random.rand() < epsilon:
            return self.env.action_space.sample()

        # Direct call instead of Model.predict: predict() carries per-call
        # overhead that is wasted on a single observation.
        q_values = np.asarray(self.q_net(state[np.newaxis], training=False))
        return np.argmax(q_values[0])

    def initialize_buffer(self, n_max_steps):
        """Fill the replay buffer with ``n_max_steps`` random-action transitions.

        The environment is reset at the start and whenever an episode ends.
        """
        state, _ = self.env.reset()
        for _ in range(n_max_steps):
            action = self.env.action_space.sample()
            next_state, reward, terminated, truncated, info = self.env.step(action)
            done = terminated or truncated
            self.replay_buffer.append((state, action, reward, next_state, done))
            state = next_state
            if done:
                state, _ = self.env.reset()

    def collect_step(self, state, iteration):
        """Play one environment step from ``state`` and record the transition.

        Args:
            state: Current observation.
            iteration: Training iteration, used to look up the exploration rate.

        Returns:
            ``(next_state, reward, done)``. The caller is responsible for
            resetting the environment when ``done`` is true.
        """
        # The schedule may return a scalar tensor; use a plain float for the comparison.
        epsilon = float(self.epsilon_fn(iteration))
        action = self.greedy_policy(state, epsilon)
        next_state, reward, terminated, truncated, info = self.env.step(action)
        done = terminated or truncated
        self.replay_buffer.append((state, action, reward, next_state, done))
        return next_state, reward, done

    def train_step(self, batch_size):
        """Run one Double-DQN gradient step on a random batch from the buffer.

        Args:
            batch_size: Number of transitions to sample (with replacement).

        Raises:
            ValueError: If the replay buffer is empty.
        """
        buffer_size = len(self.replay_buffer)
        if buffer_size == 0:
            raise ValueError(
                "Cannot train on an empty replay buffer; call initialize_buffer() first.")

        random_indices = np.random.randint(0, buffer_size, batch_size)
        samples = [self.replay_buffer[index] for index in random_indices]
        states, actions, rewards, next_states, dones = (
            np.array(field) for field in zip(*samples))

        # Online network selects the best next action ...
        online_next_q = np.asarray(self.q_net(next_states, training=False))
        # Number of actions comes from the network output itself rather than a
        # notebook-level global, so the class works for any environment.
        n_actions = online_next_q.shape[1]
        best_next_actions = np.argmax(online_next_q, axis=1)

        # ... and the target network evaluates that action.
        target_next_q = np.asarray(self.target_net(next_states, training=False))
        next_values = target_next_q[np.arange(len(best_next_actions)), best_next_actions]

        # No bootstrapping from states that ended the episode.
        targets = rewards + (1.0 - dones) * self.discount_rate * next_values
        # The reshape result must be kept: targets need the same (batch, 1)
        # shape as the predictions below, otherwise the element-wise loss can
        # broadcast to (batch, batch) and compare every target with every prediction.
        targets = targets.reshape(-1, 1).astype(np.float32)

        action_mask = tf.one_hot(actions, n_actions)

        with tf.GradientTape() as tape:
            all_q_values = self.q_net(states)
            # Keep only the Q-value of the action that was actually taken.
            q_values = tf.reduce_sum(all_q_values * action_mask, axis=1, keepdims=True)
            loss = tf.reduce_mean(self.loss_fn(targets, q_values))

        gradients = tape.gradient(loss, self.q_net.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.q_net.trainable_variables))
            
        
        

In [9]:
# Schedule lengths and reward discounting.
n_iterations, n_max_steps = 50_000, 1_000
discount_rate = 0.99

# Bounded replay memory: once full, the oldest transitions are dropped.
replay_buffer = deque(maxlen=50_000)

# Optimisation set-up for the Q-network.
optimizer = keras.optimizers.Adam(learning_rate=1e-4)
loss_fn = keras.losses.Huber()

In [10]:
# Bundle the environment, both networks and the training utilities into one agent.
dqn_agent = DqnAgent(
    env=env, q_net=q_net, target_net=target_net,
    discount_rate=discount_rate, replay_buffer=replay_buffer,
    loss_fn=loss_fn, optimizer=optimizer,
)

In [11]:
# Seed the replay buffer with random-action transitions before any training.
dqn_agent.initialize_buffer(n_max_steps=20_000)

In [None]:
# TensorBoard writer for per-episode rewards.
writer = tf.summary.create_file_writer("logs")

total_rewards = 0        # reward accumulated in the episode currently being played
episode_rewards = [0]    # index 0 is a placeholder so [-1] works before any episode ends
state, _ = env.reset()
for iteration in range(n_iterations):
    # NOTE(review): presumably a workaround for memory growth inside the loop -- confirm
    # whether it is still needed before removing.
    keras.backend.clear_session()
    # Collect 4 environment steps for every gradient step.
    for step in range(4):
        state, reward, done = dqn_agent.collect_step(state, iteration)
        total_rewards += reward
        if done:
            episode_rewards.append(total_rewards)
            # Log exactly once per finished episode; writing on every iteration
            # would emit many duplicate points that share the same step value.
            with writer.as_default():
                tf.summary.scalar("Reward", episode_rewards[-1], step=len(episode_rewards))
            total_rewards = 0
            state,_ = env.reset()
    print(f"\rIteration: {iteration}\tEpisode reward: {episode_rewards[-1]}", end="")
    dqn_agent.train_step(batch_size=32)
    # Periodically copy the online weights into the target network.
    if iteration % 2000 == 0:
        dqn_agent.target_net.set_weights(dqn_agent.q_net.get_weights())

# Make sure buffered summaries reach disk when training ends.
writer.flush()









Iteration: 20490	Episode reward: 90.00