In [3]:
import tensorflow as tf 
import numpy as np 
import matplotlib.pyplot as plt
from keras.models import Model 
from keras.layers import Input, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score 

In [4]:
def system_dynamics(x_t, u_t):
    """
    System dynamics: x_t+1 = x_t + 0.5*u_t
    """
    return x_t + 0.5 * u_t

In [None]:
# --- Minimal Actor-Critic for Linear System ---

# 1. Define small actor and critic networks
actor_in = Input(shape=(1,))
actor_h = Dense(32, activation='relu')(actor_in)
actor_out = Dense(1, activation='linear')(actor_h)
actor = Model(actor_in, actor_out)
actor_optimizer = tf.keras.optimizers.Adam(1e-3)

critic_in = Input(shape=(2,))  # [x, u]
critic_h = Dense(32, activation='relu')(critic_in)
critic_out = Dense(1, activation='linear')(critic_h)
critic = Model(critic_in, critic_out)
critic_optimizer = tf.keras.optimizers.Adam(1e-3)

# 2. Training loop
n_episodes = 200
n_steps = 20
gamma = 1.0

history = []

for episode in range(n_episodes):
    x = np.random.uniform(-5, 5)  # random initial state
    episode_reward = 0
    for t in range(n_steps):
        x_tensor = tf.convert_to_tensor([[x]], dtype=tf.float32)
        with tf.GradientTape() as tape_actor, tf.GradientTape() as tape_critic:
            u = actor(x_tensor)
            u_np = u.numpy()[0, 0]
            x_next = system_dynamics(x, u_np)
            r = x**2 + 0.001 * u_np**2

            # Critic Q(s,a)
            q_pred = critic(tf.convert_to_tensor([[x, u_np]], dtype=tf.float32))
            # Target: r + gamma * Q(s', a')
            u_next = actor(tf.convert_to_tensor([[x_next]], dtype=tf.float32))
            q_next = critic(tf.convert_to_tensor([[x_next, u_next.numpy()[0,0]]], dtype=tf.float32))
            q_target = r + q_next

            # Critic loss (MSE)
            critic_loss = tf.reduce_mean((q_pred - q_target)**2)

        # Critic update
        critic_grads = tape_critic.gradient(critic_loss, critic.trainable_variables)
        critic_optimizer.apply_gradients(zip(critic_grads, critic.trainable_variables))

        # Actor loss: maximize Q, so minimize -Q
        with tf.GradientTape() as tape_actor:
            u = actor(x_tensor)
            q_val = critic(tf.concat([x_tensor, u], axis=1))
            actor_loss = -tf.reduce_mean(q_val)
        actor_grads = tape_actor.gradient(actor_loss, actor.trainable_variables)
        actor_optimizer.apply_gradients(zip(actor_grads, actor.trainable_variables))

        x = x_next
        episode_reward += r

    history.append(episode_reward)
    if (episode+1) % 10 == 0:
        print(f"Episode {episode+1}, Total Reward: {episode_reward:.2f}")

# 3. Plot learning curve
plt.plot(history)
plt.xlabel('Episode')
plt.ylabel('Total Cost')
plt.title('Actor-Critic Learning Curve')
plt.grid()
plt.show()

# 4. Visualize learned policy vs analytical
x_test = np.linspace(-5, 5, 100).reshape(-1, 1)
u_actor = actor.predict(x_test)
u_analytical = -2 * x_test
plt.plot(x_test, u_actor, label='Learned Policy (Actor NN)')
plt.plot(x_test, u_analytical, '--', label='Analytical Policy')
plt.xlabel('x')
plt.ylabel('u')
plt.legend()
plt.title('Learned Policy vs Analytical')
plt.grid()
plt.show()

Episode 10, Total Reward: 60293.19
Episode 20, Total Reward: 11.37
Episode 30, Total Reward: 10.84
Episode 40, Total Reward: 41.27
Episode 50, Total Reward: 0.13
Episode 60, Total Reward: 5.55
Episode 70, Total Reward: 30.37
Episode 80, Total Reward: 10.83
Episode 90, Total Reward: 1003231379456.00
Episode 100, Total Reward: nan
Episode 110, Total Reward: nan
Episode 120, Total Reward: nan
