## **Actor-Critic Methods**

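Actor-Critic methods combine the advantages of value-based and policy-based methods. The actor learns the policy (which action to take in each state), while the critic estimates the state-value function; the critic's one-step temporal-difference (TD) error is then used to judge whether the action the actor chose turned out better or worse than expected.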


**Imports**

In [3]:
import numpy as np
import gym
import tensorflow as tf
from tensorflow.keras import layers


**Environment and Network Definitions**

In [None]:
# Create the CartPole-v1 environment. The network builders defined below read
# its observation space shape and action count to size their layers.
env = gym.make('CartPole-v1')

# Define the actor and critic networks
def create_actor_network():
    """Build the policy (actor) network.

    The model is a 24-unit ReLU hidden layer followed by a softmax layer with
    one output per discrete action, so its output is a probability for each
    action. Input shape and action count are read from the module-level
    ``env``.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    obs_shape = env.observation_space.shape
    num_actions = env.action_space.n

    actor = tf.keras.Sequential()
    actor.add(layers.Dense(24, activation='relu', input_shape=obs_shape))
    actor.add(layers.Dense(num_actions, activation='softmax'))
    return actor

def create_critic_network():
    """Build the state-value (critic) network.

    The model is a 24-unit ReLU hidden layer followed by a single linear
    output unit that serves as the value prediction for the input state.
    The input shape is read from the module-level ``env``.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    obs_shape = env.observation_space.shape

    critic = tf.keras.Sequential()
    critic.add(layers.Dense(24, activation='relu', input_shape=obs_shape))
    critic.add(layers.Dense(1))  # linear output: scalar state value
    return critic


**Model Building**

In [None]:
# Hyperparameters
gamma = 0.99  # Discount factor applied to the bootstrapped next-state value
learning_rate = 0.01  # Step size passed to the Adam optimizer below

# Initialize models
actor_model = create_actor_network()  # policy: state -> action probabilities
critic_model = create_critic_network()  # value function: state -> scalar estimate
# One optimizer instance is shared: train_step applies gradients for both
# networks' variables through it.
optimizer = tf.keras.optimizers.Adam(learning_rate)

# Define a single actor-critic update step (one transition per call)
def train_step(state, action, reward, next_state, done):
    """Apply one one-step TD actor-critic update for a single transition.

    Uses the module-level ``actor_model``, ``critic_model``, ``optimizer``
    and ``gamma``.

    Args:
        state: Observation with a leading batch axis of size 1.
        action: Index of the action that was taken in ``state``.
        reward: Scalar reward received for the transition.
        next_state: Observation after the transition. It may be passed with
            or without the leading batch axis; the axis is added if missing.
        done: Truthy when the episode ended on this transition, in which
            case the next-state value is not bootstrapped.

    Returns:
        None. The actor and critic variables are updated in place.
    """
    state = tf.cast(state, tf.float32)
    next_state = tf.cast(next_state, tf.float32)
    # The networks need a batch axis; tolerate callers that pass the raw
    # observation returned by env.step() for next_state.
    if next_state.shape.rank < state.shape.rank:
        next_state = tf.expand_dims(next_state, axis=0)

    action = int(action)
    reward = float(reward)
    not_done = 1.0 - float(done)  # 0.0 on terminal transitions

    variables = actor_model.trainable_variables + critic_model.trainable_variables

    with tf.GradientTape() as tape:
        action_probs = actor_model(state)  # softmax output, not logits
        action_prob = action_probs[0, action]
        value = critic_model(state)
        next_value = critic_model(next_state)

        # Semi-gradient TD: the target is treated as a constant so the critic
        # is only pulled towards it through `value`.
        td_target = tf.stop_gradient(reward + gamma * next_value * not_done)
        td_error = td_target - value

        # Actor loss (policy gradient). The TD error acts as a fixed advantage
        # estimate here; without stop_gradient the actor loss would also push
        # gradients into the critic. The epsilon guards against log(0).
        advantage = tf.stop_gradient(td_error)
        actor_loss = -tf.math.log(action_prob + 1e-8) * advantage
        # Critic loss (squared TD error)
        critic_loss = tf.square(td_error)

        total_loss = tf.reduce_sum(actor_loss + critic_loss)

    grads = tape.gradient(total_loss, variables)
    optimizer.apply_gradients(zip(grads, variables))


**Training Loop**

In [None]:
def actor_critic(env, n_episodes=1000):
    """Train the module-level actor and critic online on ``env``.

    For each episode, actions are sampled from the actor's output
    distribution and ``train_step`` is called once per environment step.

    Args:
        env: Environment with a discrete action space. Both the legacy Gym
            API (``reset() -> obs``, ``step() -> 4-tuple``) and the newer one
            (``reset() -> (obs, info)``, ``step() -> 5-tuple``) are accepted.
        n_episodes: Number of episodes to run.

    Returns:
        None. ``actor_model`` and ``critic_model`` are updated in place.
    """
    n_actions = env.action_space.n

    for _ in range(n_episodes):
        reset_result = env.reset()
        # Newer Gym returns (observation, info); older Gym returns the
        # observation alone.
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        state = np.expand_dims(state, axis=0)  # add batch axis for the networks

        done = False
        while not done:
            # The softmax output is float32 and may not sum to 1 within
            # np.random.choice's tolerance, so renormalise in float64.
            action_probs = actor_model(state)[0].numpy().astype(np.float64)
            action_probs /= action_probs.sum()
            action = int(np.random.choice(n_actions, p=action_probs))

            step_result = env.step(action)
            if len(step_result) == 5:
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_result
                terminated = done
            # The critic is evaluated on next_state, so it needs the same
            # batch axis as state.
            next_state = np.expand_dims(next_state, axis=0)

            # Only a true termination should stop bootstrapping; a time-limit
            # truncation still ends the episode loop via `done`.
            train_step(state, action, reward, next_state, terminated)

            state = next_state

# Run training on the environment created above for 1000 episodes.
actor_critic(env, n_episodes=1000)
