In [1]:
import gymnasium as gym 
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
from keras import layers
from keras.models import Model
from keras.layers import Input, Dense, Lambda
from keras.layers import Concatenate
from collections import deque
import random

In [2]:
# Build the Pendulum-v1 environment with on-screen rendering requested and
# the gravity constant `g` set explicitly.
env = gym.make(
    "Pendulum-v1",
    render_mode="human",
    g=9.81,
)
# List the GPU devices TensorFlow can see.
print(tf.config.list_physical_devices(device_type='GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
# --- Hyper-parameters ------------------------------------------------------
state_size = env.observation_space.shape[0] # x, y, angular velocity
action_size = env.action_space.shape[0] # torque [-2, 2]
gamma = 0.89 # discount rate
learning_rate = 0.001 # learning rate (shared by the actor and critic optimizers)

# --- Actor: maps a state to a deterministic action -------------------------
# Define the actor model
states_inputs = Input(shape=(state_size,))
dense = Dense(400, activation='relu')(states_inputs)
dense = Dense(300, activation='relu')(dense)
outputs = Dense(action_size, activation='tanh')(dense)
# The scale factor is kept as an inline literal on purpose: the model is
# cloned below, and a literal keeps the lambda self-contained.
outputs = keras.layers.Lambda(lambda x: x * 2.0)(outputs)  # Scale action to [-2, 2]
actor_model = Model(inputs=states_inputs, outputs=outputs)

# --- Critic: maps a (state, action) pair to a scalar Q-value ---------------
# Define the critic model
state_input = Input(shape=(state_size,))
action_input = Input(shape=(action_size,))
concat = Concatenate()([state_input, action_input])
dense = Dense(400, activation='relu')(concat)
dense = Dense(300, activation='relu')(dense)
output = Dense(1, activation='linear')(dense)
critic_model = Model(inputs=[state_input, action_input], outputs=output)

# Resume from previously saved weights when they can be read. Only the
# failures expected from a missing/unreadable file (OSError) or an
# incompatible weight file (ValueError) are tolerated, and they are reported
# instead of being silently discarded. Anything else (including a
# KeyboardInterrupt) now propagates.
try:
    actor_model.load_weights('actor_model.weights.h5')
    critic_model.load_weights('critic_model.weights.h5')
except (OSError, ValueError) as load_error:
    print(f"Could not load saved weights ({load_error}); "
          "any network that was not loaded keeps its initial weights.")

actor_optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
critic_optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Target networks: structural clones that start as exact copies of the online
# networks (the copy happens after the optional weight loading above).
target_actor = keras.models.clone_model(actor_model)
target_actor.set_weights(actor_model.get_weights())
target_critic = keras.models.clone_model(critic_model)
target_critic.set_weights(critic_model.get_weights())

ckpt = tf.train.Checkpoint(actor_optimizer=actor_optimizer,
                           critic_optimizer=critic_optimizer)

# Restore the latest checkpoint with optimizer states
# expect_partial() silences warnings about checkpoint values that are not
# matched to an object.
ckpt.restore(tf.train.latest_checkpoint("optimizers_ckpt")).expect_partial()



<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x28d95051090>

In [4]:
class ReplayBuffer:
    """Bounded FIFO store of (state, action, reward, next_state, done) tuples.

    Once `capacity` transitions are held, appending a new one discards the
    oldest (behaviour of `deque(maxlen=capacity)`).
    """

    def __init__(self, capacity=100000):
        self.buffer = deque(maxlen=capacity)

    def store(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw `batch_size` distinct transitions uniformly at random.

        Returns:
            Five NumPy arrays (states, actions, rewards, next_states, dones),
            each stacked along a leading batch axis.
        """
        picked = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one array per field.
        columns = [np.array(field) for field in zip(*picked)]
        states, actions, rewards, next_states, dones = columns
        return states, actions, rewards, next_states, dones

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

class OUActionNoise: # Ornstein-Uhlenbeck Process
    """Temporally correlated noise from a discretised Ornstein-Uhlenbeck process.

    Every call advances the internal state by one step:

        x_next = x + theta * (mean - x) * dt + std_dev * sqrt(dt) * N(0, 1)

    Args:
        mean: Value the process is pulled towards. May be a scalar, a
            sequence or an array; its shape determines the noise shape.
        std_dev: Scale of the random term (broadcast against `mean`).
        theta: Strength of the pull towards `mean`.
        dt: Step size used in the update.
        x_initial: Optional starting state. When None the process starts at
            zeros shaped like `mean`.
    """
    def __init__(self, mean, std_dev, theta=0.15, dt=1e-2, x_initial=None):
        self.theta = theta
        # Coerce to arrays so scalars and plain lists work too (`.shape` is
        # needed when drawing the random term).
        self.mu = np.asarray(mean, dtype=float)
        self.sigma = np.asarray(std_dev, dtype=float)
        self.dt = dt
        self.x_initial = x_initial
        self.reset()

    def __call__(self):
        """Advance the process by one step and return the new state."""
        x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \
            self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x_prev = x
        return x

    def reset(self):
        """Return the process to its starting state (e.g. between episodes)."""
        if self.x_initial is None:
            self.x_prev = np.zeros_like(self.mu)
        else:
            # Copy so later steps never alias the caller's object.
            self.x_prev = np.array(self.x_initial, dtype=float)
    
def soft_update(target_weights, online_weights, tau=0.005):
    """Blend each online weight into its target counterpart, in place.

    For every pair the target becomes `(1 - tau) * target + tau * online`.
    Pairs are formed positionally with `zip`, so extra entries in the longer
    sequence are ignored.

    Args:
        target_weights: Iterable of variables to update (must support
            `* scalar` and `.assign(value)`).
        online_weights: Iterable of variables supplying the new values.
        tau: Fraction of the online value mixed in per call.
    """
    keep = 1 - tau  # share of the old target value that is retained
    for pair in zip(target_weights, online_weights):
        slow, fast = pair
        slow.assign(slow * keep + fast * tau)

# Exploration noise: zero mean and a standard deviation of 0.01 for every
# action dimension.
noise = OUActionNoise(
    mean=np.zeros(action_size),
    std_dev=np.full(action_size, 0.01),
)
# Experience replay memory with the class's default capacity.
replay_buffer = ReplayBuffer()

In [5]:
epochs = 1000    # number of episodes to run
batch_size = 64  # transitions sampled per gradient step
history = []     # total reward of every completed episode

# Optional video recording (disabled):
# trigger = lambda t: t % 10 == 0
# env = RecordVideo(env, './pendulum_video', episode_trigger=trigger, disable_logger=True)

for epoch in range(epochs):
    total_reward = 0
    state, _ = env.reset()
    done = False
    while not done:
        # Action proposed by the online actor for the current observation.
        action = actor_model(tf.convert_to_tensor([state], dtype=tf.float32))[0].numpy()
        # action += noise()  # exploration noise (currently disabled)
        # Clip to the bounds declared by the environment rather than to
        # hard-coded numbers.
        action = np.clip(action, env.action_space.low, env.action_space.high)

        next_state, reward, terminated, truncated, _ = env.step(action)
        # The episode loop stops on either signal ...
        done = terminated or truncated

        # Alternative hand-written reward (disabled):
        # cos_theta = next_state[0] 
        # sin_theta = next_state[1]
        # theta = np.arctan2(sin_theta, cos_theta)  # [-pi, pi]
        # theta_error = ((theta - np.pi) + np.pi) % (2 * np.pi) - np.pi
        # theta_dt2 = next_state[2]
        # reward = float( -(theta_error**2 + 0.1 * theta_dt2**2 + 0.001 * (action**2)))

        # ... but only a true termination may switch off bootstrapping in the
        # TD target. A time-limit truncation is not a terminal state, so the
        # stored flag is `terminated`, not `done`.
        replay_buffer.store(state, action, reward, next_state, terminated)

        total_reward += reward
        state = next_state

        if replay_buffer.size() >= batch_size:
            states_b, actions_b, rewards_b, next_states_b, dones_b = replay_buffer.sample(batch_size)

            states_b = tf.convert_to_tensor(states_b, dtype=tf.float32)
            actions_b = tf.convert_to_tensor(actions_b, dtype=tf.float32)
            rewards_b = tf.convert_to_tensor(rewards_b, dtype=tf.float32)
            next_states_b = tf.convert_to_tensor(next_states_b, dtype=tf.float32)
            dones_b = tf.convert_to_tensor(dones_b, dtype=tf.float32)

            # TD target built from the target networks. It is computed outside
            # the gradient tape because no gradient is taken through it.
            next_actions = target_actor(next_states_b)
            next_q = tf.squeeze(target_critic([next_states_b, next_actions]), axis=1)
            target_q = rewards_b + gamma * (1 - dones_b) * next_q

            # Critic: regress Q(s, a) towards the TD target.
            with tf.GradientTape() as tape_critic:
                current_q = tf.squeeze(critic_model([states_b, actions_b]), axis=1)
                critic_loss = tf.keras.losses.MSE(target_q, current_q)
            # Actor: maximise the critic's value of the actor's own actions.
            with tf.GradientTape() as tape_actor:
                action_pred = actor_model(states_b, training=True)
                actor_loss = -tf.reduce_mean(critic_model([states_b, action_pred]))
            critic_grads = tape_critic.gradient(critic_loss, critic_model.trainable_variables)
            actor_grads = tape_actor.gradient(actor_loss, actor_model.trainable_variables)

            critic_optimizer.apply_gradients(zip(critic_grads, critic_model.trainable_variables))
            actor_optimizer.apply_gradients(zip(actor_grads, actor_model.trainable_variables))

            # Let the target networks slowly track the online networks.
            soft_update(target_actor.variables, actor_model.variables, tau=0.005)
            soft_update(target_critic.variables, critic_model.variables, tau=0.005)
    history.append(total_reward)
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch + 1}/{epochs}, Total Reward: {float(total_reward):.4f}")
    

KeyboardInterrupt: 

In [None]:
# epochs = 40
# batch_size = 32
# history = []

# # trigger = lambda t: t % 10 == 0
# # env = RecordVideo(env, './pendulum_video', episode_trigger=trigger, disable_logger=True)

# for epoch in range(epochs):
#     total_reward = 0
#     state, _ = env.reset()
#     episode_memory = []
#     done = False
#     while not done:
#         action = actor_model(tf.convert_to_tensor([state], dtype=tf.float32))[0].numpy()
#         action += noise()
#         action = np.clip(action, -2.0, 2.0)

#         next_state, reward, terminated, truncated, _ = env.step(action)
#         done = terminated or truncated

#         # cos_theta = next_state[0] 
#         # sin_theta = next_state[1]
#         # theta = np.arctan2(sin_theta, cos_theta)  # [-pi, pi]
#         # theta_error = ((theta - np.pi) + np.pi) % (2 * np.pi) - np.pi
#         # theta_dt2 = next_state[2]
#         # reward = float( -(theta_error**2 + 0.1 * theta_dt2**2 + 0.001 * (action**2)))

#         replay_buffer.store(state, action, reward, next_state, done)

#         total_reward += reward
#         state = next_state

#         if replay_buffer.size() >= batch_size:
#             states_b, actions_b, rewards_b, next_states_b, dones_b = replay_buffer.sample(batch_size)

#             states_b = tf.convert_to_tensor(states_b, dtype=tf.float32)
#             actions_b = tf.convert_to_tensor(actions_b, dtype=tf.float32)
#             rewards_b = tf.convert_to_tensor(rewards_b, dtype=tf.float32)
#             next_states_b = tf.convert_to_tensor(next_states_b, dtype=tf.float32)
#             dones_b = tf.convert_to_tensor(dones_b, dtype=tf.float32)

#             with tf.GradientTape() as tape_critic:
#                 next_actions = target_actor(next_states_b)
#                 next_q = tf.squeeze(target_critic([next_states_b, next_actions]), axis=1)
#                 target_q = rewards_b + gamma * (1 - dones_b) * next_q
#                 current_q = tf.squeeze(critic_model([states_b, actions_b]), axis=1)
#                 critic_loss = tf.keras.losses.MSE(target_q, current_q)
#             with tf.GradientTape() as tape_actor:
#                 action_pred = actor_model(states_b, training=True)
#                 actor_loss = -tf.reduce_mean(critic_model([states_b, action_pred]))
#             critic_grads = tape_critic.gradient(critic_loss, critic_model.trainable_variables)
#             actor_grads = tape_actor.gradient(actor_loss, actor_model.trainable_variables)

#             critic_optimizer.apply_gradients(zip(critic_grads, critic_model.trainable_variables))
#             actor_optimizer.apply_gradients(zip(actor_grads, actor_model.trainable_variables))

#             soft_update(target_actor.variables, actor_model.variables, tau=0.005)
#             soft_update(target_critic.variables, critic_model.variables, tau=0.005)
#     history.append(total_reward)
#     # if terminated or truncated:
#     #     break
#     if (epoch + 1) % 10 == 0:
#         print(f"Epoch {epoch + 1}/{epochs}, Total Reward: {float(total_reward):.4f}")
    

In [None]:
# Write the online actor and critic weights to disk.
actor_model.save_weights(filepath="actor_model.weights.h5")
critic_model.save_weights(filepath="critic_model.weights.h5")

# Store both optimizers' state through a TensorFlow object checkpoint.
ckpt = tf.train.Checkpoint(
    actor_optimizer=actor_optimizer,
    critic_optimizer=critic_optimizer,
)
ckpt.save(file_prefix="optimizers_ckpt/ckpt")


'optimizers_ckpt/ckpt-1'