In [1]:
%matplotlib ipympl

import numpy as np
import tensorflow as tf
from tensorflow import keras
import gymnasium

import ale_py
import matplotlib.animation as animation
import matplotlib.pyplot as plt

from collections import deque
# Turn off Keras interactive logging and TensorFlow device-placement logging
# so the training cell's own progress line is not drowned out.
tf.keras.utils.disable_interactive_logging()
tf.debugging.set_log_device_placement(False)
# Reset Keras global state so re-running this cell starts from a clean slate.
tf.keras.backend.clear_session()

# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation, epsilon-greedy exploration (np.random.rand) and replay
# sampling (np.random.randint) are repeatable across Restart & Run All.
# NOTE(review): the Gymnasium environment and its action space keep their own
# RNGs and are not covered by this call — seed them separately if fully
# deterministic runs are required.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

2024-11-27 15:04:58.871689: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-27 15:05:00.896570: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/nick/miniconda3/envs/tf_env/lib/python3.9/site-packages/nvidia/cudnn/lib:/home/nick/miniconda3/envs/tf_env/lib/
2024-11-27 15:05:00.896671: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/nick

In [2]:
# Shared environment used by every cell below.
# NOTE(review): render_mode="rgb_array" is presumably chosen so rendered frames
# can be fed to plot_animation — confirm against the cells that call env.render().
env = gymnasium.make("LunarLander-v3", render_mode="rgb_array")
# Observation vector layout:
# obs space = [lander_x , lander_y, vel_x, vel_y, angle, angular_vel, l_contact, r_contact]

In [3]:
def update_scene(num, frames, patch):
    """Animation callback that displays frame ``num`` on ``patch``.

    Args:
        num: Index of the frame to show.
        frames: Indexable collection of frames.
        patch: Object exposing ``set_data``; it receives ``frames[num]``.

    Returns:
        A one-element tuple containing ``patch``.
    """
    current_frame = frames[num]
    patch.set_data(current_frame)
    return (patch,)

def plot_animation(frames, repeat=False, interval=40):
    """Show ``frames`` as a matplotlib animation and return the animation object.

    Args:
        frames: Sequence of images; ``frames[0]`` seeds the displayed image and
            ``len(frames)`` sets how many animation frames are produced.
        repeat: Forwarded to ``FuncAnimation`` as ``repeat``.
        interval: Forwarded to ``FuncAnimation`` as ``interval``.

    Returns:
        The ``FuncAnimation`` instance. Callers should keep a reference to it.
    """
    figure = plt.figure()
    first_image = plt.imshow(frames[0])
    plt.axis('off')

    # update_scene(num, frames, patch) is invoked once per frame index.
    animation_options = dict(frames=len(frames), repeat=repeat, interval=interval)
    anim = animation.FuncAnimation(
        figure,
        update_scene,
        fargs=(frames, first_image),
        **animation_options,
    )
    plt.show()
    return anim

In [4]:
# Online Q-network: maps an observation vector to one Q-value per action.
# The output width is taken from the environment's discrete action space
# instead of being hard-coded, so the network follows the environment.
n_actions = int(env.action_space.n)

inputs = keras.layers.Input(shape=env.observation_space.shape)
h_1 = keras.layers.Dense(32, activation="relu")(inputs)
h_2 = keras.layers.Dense(32, activation="relu")(h_1)
# Linear output layer: Q-values are unbounded, so no activation is applied.
outputs = keras.layers.Dense(n_actions)(h_2)
q_net = keras.Model(inputs,outputs)

# Target network: same architecture, starting from the online network's weights.
# clone_model builds fresh weights, hence the explicit copy.
target_net = keras.models.clone_model(q_net)
target_net.set_weights(q_net.get_weights())

In [5]:
def epsilon_greedy_policy(epsilon, state):
    """Choose an action for ``state`` with an epsilon-greedy rule.

    With probability ``epsilon`` an action is sampled from the global
    ``env.action_space``; otherwise the action with the largest Q-value
    according to the global ``q_net`` is returned.

    Args:
        epsilon: Exploration probability; compared against ``np.random.rand()``.
        state: A single observation (no batch dimension).

    Returns:
        The selected action index.
    """
    if np.random.rand() < epsilon:
        return env.action_space.sample()

    # Call the model directly instead of q_net.predict(): predict() is meant
    # for large batched inputs and is slow when invoked on a single-row batch
    # at every environment step.
    q_values = q_net(state[np.newaxis], training=False)
    return np.argmax(q_values[0].numpy())

In [6]:
# Outer training iterations; the training cell runs one gradient update per iteration.
n_iterations = 10_000
# env.spec.max_episode_steps=1000
# Environment steps collected between consecutive gradient updates.
n_steps = 4

# Bounded replay memory: once full, appending drops the oldest transition.
replay_buffer = deque(maxlen=50_000)
loss_fn = keras.losses.huber
optimizer = keras.optimizers.Adam(learning_rate=1e-4)

# A learning-rate schedule reused as the exploration schedule:
# epsilon goes from 1.0 down to 0.01 over the first 12,000 steps.
epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1.0,
    decay_steps=12_000,
    end_learning_rate=0.01,
)

In [7]:
def initialize_buffer(env, n_steps, replay_buffer):
    """Fill ``replay_buffer`` with ``n_steps`` transitions from random actions.

    Each stored transition is the tuple
    ``(state, action, reward, next_state, done)`` where ``done`` is
    ``terminated or truncated``. The environment is reset once at the start
    and again whenever an episode ends.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` and
            ``action_space.sample()``.
        n_steps: Number of environment steps (and transitions) to collect.
        replay_buffer: Container with an ``append`` method; mutated in place.
    """
    current_obs, _ = env.reset()
    for _ in range(n_steps):
        random_action = env.action_space.sample()
        following_obs, reward, terminated, truncated, _ = env.step(random_action)
        episode_over = terminated or truncated
        replay_buffer.append(
            (current_obs, random_action, reward, following_obs, episode_over)
        )
        # Continue from the next observation, or from a fresh episode if this one ended.
        current_obs = env.reset()[0] if episode_over else following_obs

In [8]:
def train_step(q_net, target_net, replay_buffer, batch_size=32, discount_factor=0.99):
    """Run one Double-DQN gradient update on a random replay minibatch.

    The online network (``q_net``) selects the best next action and the
    target network (``target_net``) evaluates it. The resulting TD target is
    regressed against the online network's Q-value for the action actually
    taken. Uses the module-level ``loss_fn`` and ``optimizer``.

    Args:
        q_net: Online Q-network; its trainable variables are updated.
        target_net: Target Q-network; only evaluated, never updated here.
        replay_buffer: Indexable sequence of
            ``(state, action, reward, next_state, done)`` tuples.
        batch_size: Number of transitions sampled (with replacement).
        discount_factor: Discount applied to the bootstrapped next-state value.

    Returns:
        ``(loss, grad_norm)``: the scalar loss tensor and the mean of the
        per-variable gradient norms as a NumPy scalar.

    Raises:
        ValueError: If ``replay_buffer`` is empty.
    """
    if len(replay_buffer) == 0:
        raise ValueError("replay_buffer is empty; collect transitions before training")

    batch_indices = np.random.randint(0, len(replay_buffer), batch_size)
    batch = [replay_buffer[index] for index in batch_indices]
    states, actions, rewards, next_states, dones = [
        np.array([transition[field] for transition in batch]) for field in range(5)
    ]

    # Direct model calls instead of predict(): the batch is small, and
    # predict() adds per-call overhead intended for large datasets.
    next_q_online = q_net(next_states, training=False).numpy()
    n_actions = next_q_online.shape[-1]  # inferred, not hard-coded
    best_next_actions = np.argmax(next_q_online, axis=1)

    # Double DQN: evaluate the online network's chosen action with the target network.
    next_q_target = target_net(next_states, training=False).numpy()
    best_next_q = next_q_target[np.arange(batch_size), best_next_actions]

    # No bootstrapping from next_state when the transition ended the episode.
    not_done = 1.0 - dones.astype(np.float32)
    td_targets = rewards + discount_factor * not_done * best_next_q
    td_targets = td_targets.reshape(-1, 1).astype(np.float32)

    action_mask = tf.one_hot(actions, n_actions)

    with tf.GradientTape() as tape:
        all_q_values = q_net(states)
        # Keep only the Q-value of the action that was actually taken.
        taken_q_values = tf.reduce_sum(all_q_values * action_mask, axis=1, keepdims=True)
        loss = tf.reduce_mean(loss_fn(td_targets, taken_q_values))

    gradients = tape.gradient(loss, q_net.trainable_variables)
    grad_norm = tf.reduce_mean([tf.norm(g) for g in gradients if g is not None]).numpy()
    optimizer.apply_gradients(zip(gradients, q_net.trainable_variables))
    return loss, grad_norm

In [9]:
# Warm-up: pre-fill the replay buffer with 20,000 random-action transitions
# so the first training minibatches are drawn from a varied set of experience.
initialize_buffer(env, 20000, replay_buffer)

In [11]:
# NOTE(review): this cell's execution count (11) is later than the training
# cell's (10) even though it appears first, and the training cell performs its
# own env.reset(). This looks like a leftover from out-of-order execution —
# confirm whether it can be removed.
state,_ = env.reset()

In [10]:
# Main training loop: alternate between collecting `n_steps` environment
# transitions with the epsilon-greedy policy and one gradient update.
# `global_step` counts environment steps and drives the epsilon schedule.
global_step = 0

state, _ = env.reset()
for iteration in range(n_iterations):
    # --- Experience collection -------------------------------------------
    for step in range(n_steps):
        epsilon = epsilon_fn(global_step)
        action = epsilon_greedy_policy(epsilon, state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        global_step += 1
        # Both natural termination and time-limit truncation end the episode here.
        done = terminated or truncated
        replay_buffer.append((state, action, reward, next_state, done))
        state = next_state

        if done:
            state, _ = env.reset()
            
    # --- Learning ---------------------------------------------------------
    # NOTE(review): loss and grad_norm are returned but not used in this cell.
    loss, grad_norm  = train_step(q_net, target_net, replay_buffer)
    print(f"\riteration:{iteration}",end="")
    # global_step advances by n_steps (4) per iteration, so the two checks
    # below fire every 25 and every 100 iterations respectively.
    if global_step % 100 == 0:
        tf.keras.backend.clear_session()  # Clear any lingering computation graphs
    if global_step % 400 == 0:
        # Periodic hard update: copy the online weights into the target network.
        target_net.set_weights(q_net.get_weights())









iteration:9999