In [None]:
import os
import random
from collections import deque

import gymnasium as gym
import keras
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from keras import layers
from keras.layers import Concatenate
from keras.layers import Input, Dense, Lambda
from keras.models import Model

In [None]:
# Create the Pendulum-v1 environment; `g` is passed through to the environment as its gravity setting.
# NOTE(review): render_mode="human" presumably draws every step in a live window, which would
# slow the training loop considerably -- confirm that rendering is wanted while training.
env = gym.make("Pendulum-v1", render_mode= "human", g = 9.81)
# Ask TensorFlow which GPUs it can see; as the last expression this becomes the cell output.
tf.config.list_physical_devices("GPU")

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [None]:
# Problem dimensions are read from the environment's spaces; the hyper-parameters below are
# shared by the network definitions and the training loop.
state_size = env.observation_space.shape[0] # observation length; for Pendulum-v1: cos(theta), sin(theta), angular velocity
action_size = env.action_space.shape[0] # action length; a single torque in [-2, 2]
gamma = 0.99 # discount rate applied to the bootstrapped next-state value
learning_rate = 0.001 # Adam learning rate used for the actor and both critics

# Define the actor model: a deterministic policy that maps a state to one action.
# Two ReLU hidden layers (400 and 300 units) feed a tanh head, so the raw output lies in [-1, 1].
states_inputs = Input(shape=(state_size,))
dense = Dense(400, activation='relu')(states_inputs)
dense = Dense(300, activation='relu')(dense)
outputs = Dense(action_size, activation='tanh')(dense)
# NOTE(review): the 2.0 bound is hard-coded here (and again where actions are clipped during
# training); it is not read from env.action_space, so the two must be kept in sync by hand.
outputs = keras.layers.Lambda(lambda x: x * 2.0)(outputs)  # Scale action to [-2, 2]
actor_model = Model(inputs=states_inputs, outputs=outputs)

def build_critic():
    """Build one Q-network that scores a (state, action) pair.

    The state and action inputs are concatenated and passed through two ReLU
    hidden layers (400 and 300 units) into a single linear output unit.

    Returns:
        A Keras ``Model`` taking ``[state, action]`` and returning a tensor of
        shape ``(batch, 1)``.
    """
    state_input = Input(shape=(state_size,))
    action_input = Input(shape=(action_size,))
    features = Concatenate()([state_input, action_input])
    features = Dense(400, activation='relu')(features)
    features = Dense(300, activation='relu')(features)
    q_value = Dense(1)(features)
    return Model([state_input, action_input], q_value)


# Critic 1 and Critic 2: identical architectures with independently initialised
# weights. TD3 keeps two critics so the smaller of their estimates can be used
# when forming the learning target.
critic_model1 = build_critic()
critic_model2 = build_critic()

# Resume from previously saved weights when they are available. Loading stays
# best-effort (a first run has no files yet), but the reason for a failure is
# reported instead of being silently discarded, and KeyboardInterrupt /
# SystemExit are no longer swallowed as they were by the bare `except`.
try:
    actor_model.load_weights('saves/td3/actor_model.weights.h5')
    critic_model1.load_weights('saves/td3/critic_model1.weights.h5')
    critic_model2.load_weights('saves/td3/critic_model2.weights.h5')
except Exception as load_error:
    # NOTE: a failure part-way through leaves the networks loaded so far restored
    # and the remaining ones at their fresh initialisation.
    print(f"Saved TD3 weights not loaded ({type(load_error).__name__}: {load_error}); "
          "continuing with the current weights.")

# One independent Adam optimizer per trained network (actor, critic 1, critic 2),
# all created with the shared learning rate and in that order.
actor_optimizer, critic_optimizer1, critic_optimizer2 = (
    tf.keras.optimizers.Adam(learning_rate=learning_rate) for _ in range(3)
)

def _clone_with_weights(online_model):
    """Return a structural copy of `online_model` that starts with the same weights."""
    target_model = keras.models.clone_model(online_model)
    target_model.set_weights(online_model.get_weights())
    return target_model


# Target networks start as exact copies of their online counterparts.
target_actor = _clone_with_weights(actor_model)

target_critic1 = _clone_with_weights(critic_model1)
target_critic2 = _clone_with_weights(critic_model2)

# Track the three optimizers in one checkpoint object so their state can be
# saved and restored together.
ckpt = tf.train.Checkpoint(
    actor_optimizer=actor_optimizer,
    critic_optimizer1=critic_optimizer1,
    critic_optimizer2=critic_optimizer2,
)

# Restore the latest checkpoint with optimizer states; expect_partial() is
# called on the returned status object, which is also the cell's displayed value.
latest_optimizer_ckpt = tf.train.latest_checkpoint("saves/td3/optimizers_ckpt")
ckpt.restore(latest_optimizer_ckpt).expect_partial()



<tensorflow.python.checkpoint.checkpoint.InitializationOnlyStatus at 0x24914cda740>

In [None]:
class ReplayBuffer:
    """Bounded FIFO store of transitions with uniform random sampling."""

    def __init__(self, capacity=100000):
        # A deque with `maxlen` discards its oldest entry once it is full.
        self.buffer = deque(maxlen=capacity)

    def store(self, state, action, reward, next_state, done):
        """Append one transition as a (state, action, reward, next_state, done) tuple."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw `batch_size` distinct transitions and return them column-wise.

        Returns:
            A 5-tuple of NumPy arrays (states, actions, rewards, next_states,
            dones), each stacking the corresponding field of the drawn
            transitions.

        Raises:
            ValueError: propagated from `random.sample` when `batch_size`
                exceeds the number of stored transitions.
        """
        transitions = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one array per field.
        columns = [np.array(column) for column in zip(*transitions)]
        states, actions, rewards, next_states, dones = columns
        return states, actions, rewards, next_states, dones

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)
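# TD3 (as in the original paper) explores with uncorrelated Gaussian noise, so the
# Ornstein-Uhlenbeck process below is not needed and is kept commented out for reference only.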
# class OUActionNoise: # Ornstein-Uhlenbeck Process
#     def __init__(self, mean, std_dev, theta=0.15, dt=1e-2):
#         self.theta = theta
#         self.mu = mean
#         self.sigma = std_dev
#         self.dt = dt
#         self.x_prev = np.zeros_like(self.mu)
#     def __call__(self):
#         x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \
#             self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
#         self.x_prev = x
#         return x
    
def soft_update(target_weights, online_weights, tau=0.005):
    """Move each target variable a fraction `tau` of the way toward its online counterpart.

    Every target is overwritten in place, via its `assign` method, with
    `target * (1 - tau) + online * tau`. The two sequences are paired
    positionally; pairing stops at the shorter one.
    """
    keep_fraction = 1 - tau
    for target_var, online_var in zip(target_weights, online_weights):
        blended = target_var * keep_fraction + online_var * tau
        target_var.assign(blended)

# noise = OUActionNoise(mean=np.zeros(action_size), std_dev=0.01 * np.ones(action_size))
# Experience buffer filled and sampled by the training loop; created with the
# default capacity of 100000 transitions.
replay_buffer = ReplayBuffer()

In [None]:
epoch = 200        # number of training episodes
batch_size = 64    # transitions sampled per gradient step
history = []       # total reward collected in each episode
policy_delay = 2   # Delayed policy updates: actor/targets move once per this many steps

max_action = 2.0                       # torque bound; matches the x2.0 scaling on the actor output
exploration_noise_std = 0.1            # std of the Gaussian noise added to actions while acting
target_noise_std = 0.2 * max_action    # std of the target policy smoothing noise
target_noise_clip = 0.5 * max_action   # smoothing noise is clipped to +/- this value


def _update_critics(states, actions, rewards, next_states, dones):
    """Take one gradient step on both critics toward the clipped double-Q target.

    Args:
        states, next_states: float32 tensors of shape (batch, state_size).
        actions: float32 tensor of shape (batch, action_size).
        rewards: float32 tensor of shape (batch,).
        dones: float32 tensor of shape (batch,); 1.0 where the transition ended
            in a true terminal state, which disables bootstrapping for it.
    """
    # Target policy smoothing: perturb the target action with clipped Gaussian
    # noise so the critics are not fit to sharp peaks of the value estimate.
    next_actions = target_actor(next_states)
    smoothing_noise = tf.random.normal(tf.shape(next_actions), stddev=target_noise_std)
    smoothing_noise = tf.clip_by_value(smoothing_noise, -target_noise_clip, target_noise_clip)
    next_actions = tf.clip_by_value(next_actions + smoothing_noise, -max_action, max_action)

    # Clipped double-Q learning: bootstrap from the smaller of the two target estimates.
    target1 = tf.squeeze(target_critic1([next_states, next_actions]), axis=1)
    target2 = tf.squeeze(target_critic2([next_states, next_actions]), axis=1)
    target_q = rewards + gamma * (1.0 - dones) * tf.minimum(target1, target2)

    with tf.GradientTape() as tape_critic1, tf.GradientTape() as tape_critic2:
        # Squeeze the (batch, 1) critic outputs to (batch,) so they line up
        # element-wise with target_q. Depending on the Keras version, mixing the
        # two ranks can broadcast to a (batch, batch) error matrix.
        q1 = tf.squeeze(critic_model1([states, actions], training=True), axis=1)
        q2 = tf.squeeze(critic_model2([states, actions], training=True), axis=1)

        # Scalar mean-squared TD error for each critic.
        loss1 = tf.reduce_mean(tf.square(target_q - q1))
        loss2 = tf.reduce_mean(tf.square(target_q - q2))

    # Get gradients for each critic once
    critic_grad1 = tape_critic1.gradient(loss1, critic_model1.trainable_variables)
    critic_grad2 = tape_critic2.gradient(loss2, critic_model2.trainable_variables)

    # Apply gradients
    critic_optimizer1.apply_gradients(zip(critic_grad1, critic_model1.trainable_variables))
    critic_optimizer2.apply_gradients(zip(critic_grad2, critic_model2.trainable_variables))


def _update_actor_and_targets(states):
    """Take one policy-gradient step on the actor, then soft-update all target networks."""
    with tf.GradientTape() as tape_actor:
        # Named differently from the environment `action` so the loop variable is not shadowed.
        policy_actions = actor_model(states)
        # Ascend critic 1's value of the policy's own actions.
        actor_loss = -tf.reduce_mean(critic_model1([states, policy_actions]))

    actor_grad = tape_actor.gradient(actor_loss, actor_model.trainable_variables)
    actor_optimizer.apply_gradients(zip(actor_grad, actor_model.trainable_variables))

    soft_update(target_actor.variables, actor_model.variables, tau=0.005)
    soft_update(target_critic1.variables, critic_model1.variables, tau=0.005)
    soft_update(target_critic2.variables, critic_model2.variables, tau=0.005)


for e in range(epoch):
    state, _ = env.reset()
    done = False
    total_reward = 0
    step = 0
    while not done:
        step += 1
        # Act with the current policy plus Gaussian exploration noise.
        action = actor_model(tf.convert_to_tensor([state], dtype=tf.float32)).numpy()[0]
        action = action + np.random.normal(0, exploration_noise_std, size=action_size)
        action = np.clip(action, -max_action, max_action).astype(np.float32)  # clip action to [-2, 2]

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # Store `terminated`, not `done`: a time-limit truncation ends the episode
        # but is not a terminal state, so the target must still bootstrap from
        # next_state for those transitions.
        replay_buffer.store(state, action, reward, next_state, terminated)

        total_reward += reward
        state = next_state

        if replay_buffer.size() >= batch_size:
            states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)
            states = tf.convert_to_tensor(states, dtype=tf.float32)
            actions = tf.convert_to_tensor(actions, dtype=tf.float32)
            rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
            next_states = tf.convert_to_tensor(next_states, dtype=tf.float32)
            dones = tf.convert_to_tensor(dones, dtype=tf.float32)

            _update_critics(states, actions, rewards, next_states, dones)
            if step % policy_delay == 0:  # Delayed policy updates
                _update_actor_and_targets(states)
    history.append(total_reward)
    if (e + 1) % 10 == 0:
        print(f"Epoch {e + 1}/{epoch}, Total Reward: {float(total_reward):.4f}")

        

NameError: name 'env' is not defined

In [None]:
# Save model weights
# The weight files are written straight to these paths, and a missing parent
# folder is not guaranteed to be created for them, so make sure it exists first
# (a fresh checkout has no saves/ directory yet).
os.makedirs("saves/td3", exist_ok=True)
actor_model.save_weights("saves/td3/actor_model.weights.h5")
critic_model1.save_weights("saves/td3/critic_model1.weights.h5")
critic_model2.save_weights("saves/td3/critic_model2.weights.h5")

# Save optimizers with tf.train.Checkpoint
ckpt = tf.train.Checkpoint(actor_optimizer=actor_optimizer,
                           critic_optimizer1=critic_optimizer1,
                           critic_optimizer2=critic_optimizer2)
# The returned checkpoint path is the cell's displayed value.
ckpt.save("saves/td3/optimizers_ckpt/ckpt")


'saves/td3/optimizers_ckpt/ckpt-1'