In [47]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
from keras import layers
from keras.models import Model
from keras.layers import Input, Dense, Lambda
from keras.layers import Concatenate
from collections import deque
import random
from quanser.hardware import HIL 
from pal.products.qube import QubeServo3
from pal.utilities.math import SignalGenerator, ddt_filter
import time

In [48]:
# # board.close()
# # Open connection to QUBE
# # board = HIL("qube_servo3_usb", "0")

# encoder_channels = np.array([0, 1], dtype=np.uint32)
# motor_channels = np.array([0], dtype=np.uint32)
# counts = np.zeros(2, dtype=np.int32)

# ENCODER_RES = 2048
# ARM_RAD_PER_COUNT = 2*np.pi / ENCODER_RES
# PEND_RAD_PER_COUNT = 2*np.pi / ENCODER_RES

# dt = 0.01  # 10 ms
# theta_arm_prev  = counts[0] * ARM_RAD_PER_COUNT
# theta_pend_prev = counts[1] * PEND_RAD_PER_COUNT

In [49]:
global KILL_THREAD
KILL_THREAD = False

class ReplayBuffer:
    def __init__(self, capacity=100000):
        self.buffer = deque(maxlen=capacity)
    def store(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))
    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        return (np.array(states, dtype=np.float32),
            np.array(actions, dtype=np.float32),
            np.array(rewards, dtype=np.float32),
            np.array(next_states, dtype=np.float32),
            np.array(dones, dtype=np.float32))
    def size(self):
        return len(self.buffer)
    
def soft_update(target_weights, online_weights, tau=0.005):
    for (target, online) in zip(target_weights, online_weights):
        target.assign(target * (1 - tau) + online * tau) 

def sig_handler(*args): 
    global KILL_THREAD
    KILL_THREAD = True

# signal.signal(signal.SIGINT, sig_handler)

replay_buffer = ReplayBuffer()

In [50]:
state_size = 4
action_size = 1
gamma = 0.99 # discount rate
learning_rate = 0.001 # learning rate

# Define the actor model
states_inputs = Input(shape=(state_size,))
dense = Dense(400, activation='relu')(states_inputs)
dense = Dense(300, activation='relu')(dense)
outputs = Dense(action_size, activation='tanh')(dense)
outputs = keras.layers.Lambda(lambda x: x * 2.0)(outputs)  # Scale action to [-2, 2]
actor_model = Model(inputs=states_inputs, outputs=outputs)

# Critic 1
state_input1 = Input(shape=(state_size,))
action_input1 = Input(shape=(action_size,))
concat1 = Concatenate()([state_input1, action_input1])
dense1 = Dense(400, activation='relu')(concat1)
dense1 = Dense(300, activation='relu')(dense1)
output1 = Dense(1)(dense1)
critic_model1 = Model([state_input1, action_input1], output1)

# Critic 2
state_input2 = Input(shape=(state_size,))
action_input2 = Input(shape=(action_size,))
concat2 = Concatenate()([state_input2, action_input2])
dense2 = Dense(400, activation='relu')(concat2)
dense2 = Dense(300, activation='relu')(dense2)
output2 = Dense(1)(dense2)
critic_model2 = Model([state_input2, action_input2], output2)

try:
    actor_model.load_weights('saves/quanser/actor_model.weights.h5')
    critic_model1.load_weights('saves/quanser/critic_model1.weights.h5')
    critic_model2.load_weights('saves/quanser/critic_model2.weights.h5')
except:
    pass

actor_optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
critic_optimizer1 = tf.keras.optimizers.Adam(learning_rate=learning_rate)
critic_optimizer2 = tf.keras.optimizers.Adam(learning_rate=learning_rate)

target_actor = keras.models.clone_model(actor_model)
target_actor.set_weights(actor_model.get_weights())

target_critic1 = keras.models.clone_model(critic_model1)
target_critic1.set_weights(critic_model1.get_weights())
target_critic2 = keras.models.clone_model(critic_model2)
target_critic2.set_weights(critic_model2.get_weights())

ckpt = tf.train.Checkpoint(actor_optimizer=actor_optimizer,
                           critic_optimizer1=critic_optimizer1, 
                           critic_optimizer2=critic_optimizer2)

# Restore the latest checkpoint with optimizer states
ckpt.restore(tf.train.latest_checkpoint("saves/td3_quanser/optimizers_ckpt")).expect_partial()



<tensorflow.python.checkpoint.checkpoint.InitializationOnlyStatus at 0x1a1e0664c40>

In [62]:
frequency = 500  # Hz
state_theta_dot = np.array([0,0], dtype=np.float64)
state_alpha_dot = np.array([0,0], dtype=np.float64)
with QubeServo3(hardware = 1, pendulum = 1, frequency=frequency) as board:
    while True:
        # Have to initialize the board first before reading motorPosition or it won't read
        board.read_outputs()
        theta = board.motorPosition 
        alpha = board.pendulumPosition 
        theta = np.clip(theta, -np.pi/2, np.pi/2)
        theta_dot, state_theta_dot = ddt_filter(theta, state_theta_dot, 50, 1/frequency)
        # u - input
        # state - previous state returned by this function -- initialize to np.array([0,0], dtype=np.float64)
        # Ts - sample time in seconds
        # A - filter bandwidth in rad/s
        alpha_dot, state_alpha_dot = ddt_filter(alpha, state_alpha_dot, 100, 1/frequency)
        print(f"Theta: {theta:.3f}, Theta dot: {theta_dot:.3f}, Alpha: {alpha:.3f}, Alpha dot: {alpha_dot:.3f}")
        time.sleep(0.5)


Theta: 0.000, Theta dot: 0.000, Alpha: 0.000, Alpha dot: 0.000
Theta: 0.000, Theta dot: 0.000, Alpha: 0.000, Alpha dot: 0.000
Theta: 0.000, Theta dot: 0.000, Alpha: 0.000, Alpha dot: 0.000
Theta: 0.000, Theta dot: 0.000, Alpha: 0.000, Alpha dot: 0.000
Theta: 0.000, Theta dot: 0.000, Alpha: 0.000, Alpha dot: 0.000
Theta: 0.000, Theta dot: 0.000, Alpha: 0.000, Alpha dot: 0.000
Theta: 0.914, Theta dot: 43.536, Alpha: 0.120, Alpha dot: 10.877
Theta: 1.331, Theta dot: 59.258, Alpha: 0.291, Alpha dot: 24.518
Theta: 1.405, Theta dot: 57.121, Alpha: 0.221, Alpha dot: 13.646
Theta: 1.485, Theta dot: 55.479, Alpha: 0.141, Alpha dot: 3.913
Theta: 1.491, Theta dot: 50.488, Alpha: 0.015, Alpha dot: -8.234
Theta: 1.571, Theta dot: 49.478, Alpha: 0.095, Alpha dot: 0.515
Theta: 1.571, Theta dot: 44.766, Alpha: 0.334, Alpha dot: 22.176
Theta: 1.571, Theta dot: 40.502, Alpha: 0.285, Alpha dot: 13.682
Theta: 1.571, Theta dot: 36.645, Alpha: 0.123, Alpha dot: -3.588
Theta: 1.571, Theta dot: 33.155, Alpha:

KeyboardInterrupt: 

In [61]:
batch_size = 32
history = []
policy_delay = 2  # Delayed policy updates
step = 0
total_reward = 0.0
frequency = 500  # Hz
state_theta_dot = np.array([0,0], dtype=np.float64)
state_alpha_dot = np.array([0,0], dtype=np.float64)

with QubeServo3(hardware = 1, pendulum = 1, frequency=10) as board:
    while True: 
        avg_q1, avg_q2, avg_target_q = 0.0, 0.0, 0.0
        step += 1 
        board.read_outputs()
        theta = board.motorPosition * -1
        alpha = board.pendulumPosition 
        theta = np.clip(theta, -np.pi/2, np.pi/2)

        theta_dot, state_theta_dot = ddt_filter(theta, state_theta_dot, 50, 1/frequency)
        alpha_dot, state_alpha_dot = ddt_filter(alpha, state_alpha_dot, 100, 1/frequency)

        state = np.array([theta, theta_dot, alpha, alpha_dot], dtype=np.float32)

        action = actor_model(tf.convert_to_tensor([state], dtype=tf.float32)).numpy()[0]
        action = action + np.random.normal(0, 0.1, size=action_size)  # Add exploration noise
        action = np.clip(action, -2.0, 2.0) 
        board.write_voltage(action)

        board.read_outputs()
        next_theta = board.motorPosition * -1
        next_alpha = board.pendulumPosition
        next_theta = np.clip(next_theta, -np.pi/2, np.pi/2)
        next_theta_dot, state_theta_dot = ddt_filter(next_theta, state_theta_dot, 50, 1/frequency)
        next_alpha_dot, state_alpha_dot = ddt_filter(next_alpha, state_alpha_dot, 100, 1/frequency)
        next_state = np.array([next_theta, next_theta_dot, next_alpha, next_alpha_dot], dtype=np.float32)

        wrapped_alpha = ((alpha - np.pi + np.pi) % (2*np.pi)) - np.pi
        reward = -(wrapped_alpha**2 + 0.1*alpha_dot**2 + 0.1*action**2)
        total_reward += reward

        replay_buffer.store(state, action, reward, next_state, False)
        state = next_state
        
        if replay_buffer.size() >= batch_size:
            states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)
            states      = tf.convert_to_tensor(states, dtype=tf.float32)
            actions     = tf.convert_to_tensor(actions.reshape(-1,1), dtype=tf.float32)
            rewards     = tf.convert_to_tensor(rewards, dtype=tf.float32)
            next_states = tf.convert_to_tensor(next_states, dtype=tf.float32)
            dones       = tf.convert_to_tensor(dones, dtype=tf.float32)

            # add clipped noise to target action
            noise = np.clip(np.random.normal(0, 0.2, size=actions.shape), -0.5, 0.5)
            next_actions = target_actor(next_states) + noise
            next_actions = tf.clip_by_value(next_actions, -2.0, 2.0)  # Pendulum action bounds

            # Compute target Q-values with both critics
            target1 = tf.squeeze(target_critic1([next_states, next_actions]), axis=1)
            target2 = tf.squeeze(target_critic2([next_states, next_actions]), axis=1)
            target_q = rewards + gamma * (1 - dones) * tf.minimum(target1, target2)

            with tf.GradientTape() as tape_critic1, tf.GradientTape() as tape_critic2:
                q1 = critic_model1([states, actions], training=True)
                q2 = critic_model2([states, actions], training=True)

                # Compute losses
                loss1 = tf.keras.losses.MSE(target_q, q1)
                loss2 = tf.keras.losses.MSE(target_q, q2)

            avg_q1 = tf.reduce_mean(q1).numpy().item()
            avg_q2 = tf.reduce_mean(q2).numpy().item()
            avg_target_q = tf.reduce_mean(target_q).numpy().item()

            # Get gradients for each critic once
            critic_grad1 = tape_critic1.gradient(loss1, critic_model1.trainable_variables)
            critic_grad2 = tape_critic2.gradient(loss2, critic_model2.trainable_variables)

            # Apply gradients
            critic_optimizer1.apply_gradients(zip(critic_grad1, critic_model1.trainable_variables))
            critic_optimizer2.apply_gradients(zip(critic_grad2, critic_model2.trainable_variables))
            if step % policy_delay == 0:  # Delayed policy updates
                with tf.GradientTape() as tape_actor: 
                    action = actor_model(states)
                    actor_loss = -tf.reduce_mean(critic_model1([states, action]))

                actor_grad = tape_actor.gradient(actor_loss, actor_model.trainable_variables)
                actor_optimizer.apply_gradients(zip(actor_grad, actor_model.trainable_variables))

                soft_update(target_actor.variables, actor_model.variables, tau=0.005)
                soft_update(target_critic1.variables, critic_model1.variables, tau=0.005)
                soft_update(target_critic2.variables, critic_model2.variables, tau=0.005)
        history.append(total_reward)
        if step % 1 == 0:
            print(f"Epoch {step + 1}/{step}, Total Reward: {float(reward):.4f}, "
            f"Q1: {avg_q1:.4f}, Q2: {avg_q2:.4f}, TargetQ: {avg_target_q:.4f}")


  print(f"Epoch {step + 1}/{step}, Total Reward: {float(reward):.4f}, "


Epoch 2/1, Total Reward: -10.0835, Q1: -913.3600, Q2: -909.9741, TargetQ: -846.1288
Epoch 3/2, Total Reward: -106.5498, Q1: -952.4332, Q2: -948.6178, TargetQ: -960.0959
Epoch 4/3, Total Reward: -841.1502, Q1: -859.0465, Q2: -859.4873, TargetQ: -915.0922
Epoch 5/4, Total Reward: -1184.3095, Q1: -889.6292, Q2: -888.7556, TargetQ: -949.9143
Epoch 6/5, Total Reward: -8.8291, Q1: -910.0326, Q2: -911.4404, TargetQ: -853.1587
Epoch 7/6, Total Reward: -1755.6012, Q1: -841.2726, Q2: -848.4001, TargetQ: -812.2518
Epoch 8/7, Total Reward: -2680.9104, Q1: -887.2628, Q2: -893.1108, TargetQ: -952.3666
Epoch 9/8, Total Reward: -662.6842, Q1: -928.9559, Q2: -935.4340, TargetQ: -1000.8057
Epoch 10/9, Total Reward: -5411.6230, Q1: -1046.9744, Q2: -1057.1564, TargetQ: -1055.8726
Epoch 11/10, Total Reward: -2834.5914, Q1: -770.9133, Q2: -779.5778, TargetQ: -739.4778
Epoch 12/11, Total Reward: -663.2492, Q1: -823.7369, Q2: -832.7930, TargetQ: -891.7587
Epoch 13/12, Total Reward: -12.7601, Q1: -923.7491, Q2

KeyboardInterrupt: 

In [None]:
# batch_size = 32
# history = []
# policy_delay = 2  # Delayed policy updates
# step = 0
# total_reward = 0.0

# try:
#     total_reward = 0.0
#     while True:
#         step += 1
    
#         # 1) read state
#         board.read_encoder(encoder_channels, len(encoder_channels), counts)
#         theta_arm  = counts[0] * ARM_RAD_PER_COUNT
#         theta_pend = counts[1] * PEND_RAD_PER_COUNT
#         theta_arm_dot  = (theta_arm  - theta_arm_prev)  / dt
#         theta_pend_dot = (theta_pend - theta_pend_prev) / dt
#         state = np.array([theta_arm, theta_pend, theta_arm_dot, theta_pend_dot], dtype=np.float32)

#         # 2) select action
#         action_vec = actor_model(tf.convert_to_tensor([state], dtype=tf.float32)).numpy()[0]
#         action_val = float(np.clip(action_vec[0], -2.0, 2.0))  # scalar in [-2,2]; tune to your safe V range

#         # 3) apply action (analog write wants numpy float64 buffer)
#         voltages = np.array([action_val], dtype=np.float64)
#         board.write_analog(motor_channels, len(motor_channels), voltages)

#         # 4) get next_state after action
#         time.sleep(dt)  # maintain loop timing around the actuation
#         board.read_encoder(encoder_channels, len(encoder_channels), counts)
#         next_theta_arm  = counts[0] * ARM_RAD_PER_COUNT
#         next_theta_pend = counts[1] * PEND_RAD_PER_COUNT
#         next_theta_arm_dot  = (next_theta_arm  - theta_arm)  / dt
#         next_theta_pend_dot = (next_theta_pend - theta_pend) / dt
#         next_state = np.array([next_theta_arm, next_theta_pend, next_theta_arm_dot, next_theta_pend_dot], dtype=np.float32)

#         # 5) reward (example: upright pendulum, gentle motion)
#         reward = - ( (np.angle(np.exp(1j*(next_theta_pend - np.pi))))**2
#                      + 0.1*next_theta_pend_dot**2 + 0.01*action_val**2 )
#         total_reward += reward

#         # 6) store
#         replay_buffer.store(state, action_val, reward, next_state, False)

#         # 7) train if enough samples
#         if replay_buffer.size() >= batch_size:
#             states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)
#             states      = tf.convert_to_tensor(states, dtype=tf.float32)
#             actions     = tf.convert_to_tensor(actions.reshape(-1,1), dtype=tf.float32)
#             rewards     = tf.convert_to_tensor(rewards, dtype=tf.float32)
#             next_states = tf.convert_to_tensor(next_states, dtype=tf.float32)
#             dones       = tf.convert_to_tensor(dones, dtype=tf.float32)

#             # target policy smoothing
#             noise = np.clip(np.random.normal(0, 0.2, size=(actions.shape[0], 1)), -0.5, 0.5)
#             target_act = tf.clip_by_value(target_actor(next_states) + noise, -2.0, 2.0)

#             # twin critics target
#             t1 = tf.squeeze(target_critic1([next_states, target_act]), axis=1)
#             t2 = tf.squeeze(target_critic2([next_states, target_act]), axis=1)
#             target_q = rewards + gamma * (1.0 - dones) * tf.minimum(t1, t2)

#             # critic updates
#             with tf.GradientTape() as tape1, tf.GradientTape() as tape2:
#                 q1 = tf.squeeze(critic_model1([states, actions]), axis=1)
#                 q2 = tf.squeeze(critic_model2([states, actions]), axis=1)
#                 loss1 = tf.keras.losses.MSE(target_q, q1)
#                 loss2 = tf.keras.losses.MSE(target_q, q2)
#             critic_optimizer1.apply_gradients(zip(tape1.gradient(loss1, critic_model1.trainable_variables),
#                                                   critic_model1.trainable_variables))
#             critic_optimizer2.apply_gradients(zip(tape2.gradient(loss2, critic_model2.trainable_variables),
#                                                   critic_model2.trainable_variables))

#             # delayed actor + target updates
#             if step % policy_delay == 0:
#                 with tf.GradientTape() as tape_actor:
#                     pi = actor_model(states)
#                     actor_loss = -tf.reduce_mean(critic_model1([states, pi]))
#                 actor_optimizer.apply_gradients(zip(tape_actor.gradient(actor_loss, actor_model.trainable_variables),
#                                                     actor_model.trainable_variables))
#                 soft_update(target_actor.variables,   actor_model.variables,   tau=0.005)
#                 soft_update(target_critic1.variables, critic_model1.variables, tau=0.005)
#                 soft_update(target_critic2.variables, critic_model2.variables, tau=0.005)

#         if step % 100 == 0:
#             print(f"Step {step}  reward_sum: {total_reward:.2f}")

#         # update prev angles for next derivative
#         theta_arm_prev, theta_pend_prev = next_theta_arm, next_theta_pend

# except KeyboardInterrupt:
#     print("\nStopping (Ctrl+C). Saving…")
# finally:
#     # save weights (use .save_weights if you prefer checkpoint style)
#     actor_model.save_weights("saves/quanser/actor_model.weights.h5")
#     critic_model1.save_weights("saves/quanser/critic_model1.weights.h5")
#     critic_model2.save_weights("saves/quanser/critic_model2.weights.h5")
#     # set motor to 0V and close safely
#     board.write_analog(motor_channels, 1, np.array([0.0], dtype=np.float64))
#     board.close()
#     print("Done.")


Stopping (Ctrl+C). Saving…
Done.


In [None]:
# while True:
#     # read state
#     # board.close()
#     board.read_encoder(encoder_channels, len(encoder_channels), counts)
#     theta_arm = counts[0] * ARM_RAD_PER_COUNT
#     theta_pend = counts[1] * PEND_RAD_PER_COUNT

#     # compute action from policy
#     action = actor_model(tf.convert_to_tensor([state], dtype=tf.float32)).numpy()[0]
#     u = float(action)

#     # send to motor
#     board.write_analog(motor_channels, 1, [u])

#     time.sleep(dt)  # ~0.01s


  u = float(action)


TypeError: a bytes-like object is required, not 'list'

In [None]:
import numpy as np
import time
import tensorflow as tf
from collections import deque

# --- HIL/QUBE setup ---
board.close()
board = HIL("qube_servo3_usb", "0")
encoder_channels = np.array([0, 1], dtype=np.uint32)
motor_channels = np.array([0], dtype=np.uint32)
counts = np.zeros(2, dtype=np.int32)

ENCODER_RES = 2048
ARM_RAD_PER_COUNT = 2*np.pi / ENCODER_RES
PEND_RAD_PER_COUNT = 2*np.pi / ENCODER_RES
dt = 0.01  # 10 ms loop

# --- Replay Buffer ---
class ReplayBuffer:
    def __init__(self, capacity=100000):
        self.buffer = deque(maxlen=capacity)
    def store(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))
    def sample(self, batch_size):
        batch = np.array(random.sample(self.buffer, batch_size))
        states, actions, rewards, next_states, dones = map(np.stack, zip(*batch))
        return states, actions, rewards, next_states, dones
    def size(self):
        return len(self.buffer)

replay_buffer = ReplayBuffer()

# --- Soft update ---
def soft_update(target_weights, online_weights, tau=0.005):
    for (target, online) in zip(target_weights, online_weights):
        target.assign(target * (1 - tau) + online * tau)

# --- TD3 models already defined: actor_model, critic_model1, critic_model2, 
# target_actor, target_critic1, target_critic2
# optimizers: actor_optimizer, critic_optimizer1, critic_optimizer2

state_size = 4
action_size = 1
gamma = 0.99
batch_size = 32
policy_delay = 2
step = 0

theta_arm_prev = 0.0
theta_pend_prev = 0.0

try:
    while True:
        step += 1

        # --- 1. Read state ---
        board.read_encoder(encoder_channels, len(encoder_channels), counts)
        theta_arm = counts[0] * ARM_RAD_PER_COUNT
        theta_pend = counts[1] * PEND_RAD_PER_COUNT

        theta_arm_dot = (theta_arm - theta_arm_prev) / dt
        theta_pend_dot = (theta_pend - theta_pend_prev) / dt
        theta_arm_prev, theta_pend_prev = theta_arm, theta_pend

        state = np.array([theta_arm, theta_pend, theta_arm_dot, theta_pend_dot], dtype=np.float32)

        # --- 2. Compute action ---
        action = actor_model(tf.convert_to_tensor([state], dtype=tf.float32)).numpy()[0]
        u_array = np.array([float(action)], dtype=np.float64)

        # --- 3. Apply action ---
        board.write_analog(motor_channels, 1, u_array)

        # --- 4. Read next state ---
        board.read_encoder(encoder_channels, len(encoder_channels), counts)
        next_theta_arm = counts[0] * ARM_RAD_PER_COUNT
        next_theta_pend = counts[1] * PEND_RAD_PER_COUNT
        next_theta_arm_dot = (next_theta_arm - theta_arm) / dt
        next_theta_pend_dot = (next_theta_pend - theta_pend) / dt
        next_state = np.array([next_theta_arm, next_theta_pend, next_theta_arm_dot, next_theta_pend_dot], dtype=np.float32)

        # --- 5. Compute reward ---
        reward = - (next_theta_pend**2 + 0.1 * next_theta_pend_dot**2)

        # --- 6. Store transition ---
        replay_buffer.store(state, action, reward, next_state, False)

        # --- 7. Train TD3 ---
        if replay_buffer.size() >= batch_size:
            states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)
            states = tf.convert_to_tensor(states, dtype=tf.float32)
            actions = tf.convert_to_tensor(actions, dtype=tf.float32)
            rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
            next_states = tf.convert_to_tensor(next_states, dtype=tf.float32)
            dones = tf.convert_to_tensor(dones, dtype=tf.float32)

            # Target actions with clipped noise
            noise = np.clip(np.random.normal(0, 0.2, size=actions.shape), -0.5, 0.5)
            next_actions = tf.clip_by_value(target_actor(next_states) + noise, -2.0, 2.0)

            # Target Q-values
            target1 = tf.squeeze(target_critic1([next_states, next_actions]), axis=1)
            target2 = tf.squeeze(target_critic2([next_states, next_actions]), axis=1)
            target_q = rewards + gamma * (1 - dones) * tf.minimum(target1, target2)

            # Critic updates
            with tf.GradientTape() as tape1, tf.GradientTape() as tape2:
                q1 = critic_model1([states, actions], training=True)
                q2 = critic_model2([states, actions], training=True)
                loss1 = tf.keras.losses.MSE(target_q, q1)
                loss2 = tf.keras.losses.MSE(target_q, q2)

            critic_grad1 = tape1.gradient(loss1, critic_model1.trainable_variables)
            critic_grad2 = tape2.gradient(loss2, critic_model2.trainable_variables)
            critic_optimizer1.apply_gradients(zip(critic_grad1, critic_model1.trainable_variables))
            critic_optimizer2.apply_gradients(zip(critic_grad2, critic_model2.trainable_variables))

            # Delayed actor update
            if step % policy_delay == 0:
                with tf.GradientTape() as tape_actor:
                    act = actor_model(states)
                    actor_loss = -tf.reduce_mean(critic_model1([states, act]))
                actor_grad = tape_actor.gradient(actor_loss, actor_model.trainable_variables)
                actor_optimizer.apply_gradients(zip(actor_grad, actor_model.trainable_variables))

                soft_update(target_actor.variables, actor_model.variables)
                soft_update(target_critic1.variables, critic_model1.variables)
                soft_update(target_critic2.variables, critic_model2.variables)

        # --- 8. Sleep to maintain loop ---
        time.sleep(dt)

except KeyboardInterrupt:
    print("Stopping (Ctrl+C) and saving models...")

finally:
    # Save models
    actor_model.save("td3_actor.h5")
    critic_model1.save("td3_critic1.h5")
    critic_model2.save("td3_critic2.h5")
    board.close()
    print("Training finished and models saved.")


HILError: -1410