## **Actor-Critic**

Actor-critic combines policy-based and value-based methods by using two neural networks: an actor that learns the policy (a probability distribution over actions) and a critic that estimates the value of the current state.

**Imports**

In [3]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim

**Environment Setup**

In [None]:
# Create the CartPole-v1 environment used for training.
env = gym.make("CartPole-v1")
# Size of the first axis of the observation space; used as the network input width.
state_dim = env.observation_space.shape[0]
# `n` of the action space; used as the width of the actor's output layer.
action_dim = env.action_space.n

**Actor-Critic Model**

In [None]:
class ActorCritic(nn.Module):
    """Two independent MLP heads: a softmax policy (actor) and a scalar value (critic).

    Both heads take the same state as input and each has a single hidden
    layer of 128 ReLU units; no parameters are shared between them.
    """

    def __init__(self, state_dim, action_dim):
        """Build the actor and critic heads.

        Args:
            state_dim: Number of input features per state.
            action_dim: Number of outputs of the actor (one probability per action).
        """
        super().__init__()
        hidden_units = 128

        # Policy head: state -> probabilities over actions (softmax on the last axis).
        policy_layers = [
            nn.Linear(state_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, action_dim),
            nn.Softmax(dim=-1),
        ]
        self.actor = nn.Sequential(*policy_layers)

        # Value head: state -> single scalar estimate.
        value_layers = [
            nn.Linear(state_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 1),
        ]
        self.critic = nn.Sequential(*value_layers)

    def forward(self, state):
        """Return ``(action_probs, value)`` for ``state``."""
        return self.actor(state), self.critic(state)

# Instantiate the network with the dimensions read from the environment.
model = ActorCritic(state_dim, action_dim)
# A single Adam optimizer updates the actor and critic parameters together.
optimizer = optim.Adam(model.parameters(), lr=0.001)

**Training Loop**

In [None]:
def _reset_observation(reset_result):
    """Return the observation from ``env.reset()`` under either Gym API."""
    # Gym >= 0.26 / Gymnasium return ``(observation, info)``; older versions
    # return the bare observation.
    if (
        isinstance(reset_result, tuple)
        and len(reset_result) == 2
        and isinstance(reset_result[1], dict)
    ):
        return reset_result[0]
    return reset_result


def _step_transition(step_result):
    """Return ``(observation, reward, done)`` from ``env.step()`` under either Gym API."""
    if len(step_result) == 5:
        # Newer API: the episode is over when it is terminated OR truncated.
        next_state, reward, terminated, truncated, _ = step_result
        return next_state, reward, bool(terminated or truncated)
    next_state, reward, done, _ = step_result
    return next_state, reward, done


def _discounted_returns(rewards, gamma):
    """Return a float tensor with the discounted return-to-go of every step."""
    returns = []
    running = 0
    # Accumulate backwards, then reverse once; avoids the O(T^2) cost of
    # repeatedly inserting at the front of a list.
    for reward in reversed(rewards):
        running = reward + gamma * running
        returns.append(running)
    returns.reverse()
    return torch.tensor(returns, dtype=torch.float32)


def train_actor_critic(env, model, optimizer, episodes=500, gamma=0.99):
    """Train an actor-critic model with one Monte Carlo update per episode.

    For every episode the policy is rolled out until the environment reports
    the episode is over, discounted returns are computed from the collected
    rewards, and a single optimizer step is taken on the sum of the policy
    loss (``-log_prob * advantage``) and the critic's squared error against
    the returns.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``. Both the
            4-tuple ``step`` API and the 5-tuple ``(obs, reward, terminated,
            truncated, info)`` API are accepted, as are ``reset()`` results
            of either ``obs`` or ``(obs, info)``.
        model: Callable mapping a 1-D float state tensor to
            ``(action_probs, value)``.
        optimizer: Optimizer over the model's parameters.
        episodes: Number of episodes to run.
        gamma: Discount factor applied to future rewards.

    Returns:
        None. Progress is printed every 50 episodes.
    """
    for episode in range(episodes):
        state = torch.tensor(_reset_observation(env.reset()), dtype=torch.float32)
        log_probs = []
        values = []
        rewards = []

        # Generate an episode
        while True:
            action_probs, value = model(state)
            action = torch.multinomial(action_probs, 1).item()
            next_state, reward, done = _step_transition(env.step(action))
            log_probs.append(torch.log(action_probs[action]))
            values.append(value)
            rewards.append(reward)

            state = torch.tensor(next_state, dtype=torch.float32)
            if done:
                break

        # Compute returns and losses
        returns = _discounted_returns(rewards, gamma)
        log_probs = torch.stack(log_probs)
        # Flatten so per-step values line up with `returns` whether the critic
        # emits a 0-d or a single-element tensor.
        values = torch.stack(values).reshape(-1)

        # The advantage is treated as a constant for the actor: detaching keeps
        # the policy loss from back-propagating into the critic.
        advantages = returns - values.detach()
        policy_loss = -(log_probs * advantages).sum()
        value_loss = ((values - returns) ** 2).sum()

        optimizer.zero_grad()
        loss = policy_loss + value_loss
        loss.backward()
        optimizer.step()

        if (episode + 1) % 50 == 0:
            print(f"Episode {episode+1}/{episodes}, Loss: {loss.item():.4f}")

# Run training with the function's default settings (500 episodes).
train_actor_critic(env, model, optimizer)
