# Actor-Critic: Combining Policy and Value Methods

> **"Actor-Critic methods combine the best of both policy and value approaches."**

## Learning Objectives
- Understand the actor-critic architecture and its advantages
- Implement A2C (Advantage Actor-Critic) from scratch
- Learn about advantage estimation and variance reduction
- Master modern actor-critic algorithms (A3C, PPO, SAC)
- Apply actor-critic methods to complex RL problems


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import random

# Set style
# NOTE(review): 'seaborn-v0_8' is the style-sheet name used by recent
# matplotlib releases; older releases may not recognise it -- confirm against
# the installed version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# Seed NumPy's global RNG: the agent's parameter initialisation
# (np.random.randn) and action sampling (np.random.choice) both draw from it.
# The stdlib `random` module imported above is not seeded here.
np.random.seed(42)

print("Libraries imported successfully!")


Libraries imported successfully!


## 1. Actor-Critic Methods

### What are Actor-Critic Methods?
Actor-Critic methods combine policy gradient methods (the actor) with value function methods (the critic). The actor learns the policy, while the critic learns a value function whose estimates replace the noisy Monte Carlo returns in the policy update, which reduces the variance of the policy gradient.

### Key Components

#### 1. Actor (Policy)
- Learns the policy π(a|s)
- Updated using policy gradient with value function estimates

#### 2. Critic (Value Function)
- Learns the state-value function V(s) or action-value function Q(s,a)
- Supplies a lower-variance learning signal (the TD error) for the actor's policy updates

#### 3. Advantage Function
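A(s,a) = Q(s,a) - V(s) measures how much better taking action a in state s is than following the policy's average behaviour in that state. In practice it is estimated with the one-step TD error δ = r + γV(s') - V(s), which is the estimate the implementation below uses.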


In [2]:
class ActorCriticAgent:
    """Tabular one-step actor-critic agent with a softmax policy.

    The actor keeps one row of action preferences (logits) per state in
    ``theta``; the critic keeps one scalar value estimate per state in ``w``.
    States and actions are integer indices into those tables.

    Args:
        n_states: Number of discrete states (rows of ``theta``, length of ``w``).
        n_actions: Number of discrete actions (columns of ``theta``).
        learning_rate_actor: Step size applied to policy-parameter updates.
        learning_rate_critic: Step size applied to value-estimate updates.
        gamma: Discount factor; stored for use by the training loop.
    """

    def __init__(self, n_states, n_actions, learning_rate_actor=0.01,
                 learning_rate_critic=0.01, gamma=0.99):
        self.n_states = n_states
        self.n_actions = n_actions
        self.learning_rate_actor = learning_rate_actor
        self.learning_rate_critic = learning_rate_critic
        self.gamma = gamma

        # Actor parameters (policy logits), small random initialisation.
        # Drawn from NumPy's global RNG, actor first and critic second.
        self.theta = np.random.randn(n_states, n_actions) * 0.1

        # Critic parameters (one value estimate per state).
        self.w = np.random.randn(n_states) * 0.1

    def softmax_policy(self, state):
        """Return the action probabilities pi(.|state) as a new 1-D array."""
        logits = self.theta[state]
        # Subtract the max logit before exponentiating so large preferences
        # cannot overflow; the resulting probabilities are unchanged.
        exp_logits = np.exp(logits - np.max(logits))
        return exp_logits / np.sum(exp_logits)

    def select_action(self, state):
        """Sample an action index from the current policy at ``state``."""
        probabilities = self.softmax_policy(state)
        return np.random.choice(self.n_actions, p=probabilities)

    def value_function(self, state):
        """Return the critic's current value estimate for ``state``."""
        return self.w[state]

    def update_actor(self, state, action, advantage):
        """Take one policy-gradient step on the logits of ``state``.

        For a softmax policy the gradient of ``log pi(action|state)`` with
        respect to the logits of ``state`` is ``onehot(action) - pi(.|state)``.
        Every logit is therefore adjusted: the chosen action's preference moves
        with the sign of ``advantage`` and the others move the opposite way in
        proportion to their probability.

        Args:
            state: Index of the state in which the action was taken.
            action: Index of the action that was taken.
            advantage: Advantage estimate (e.g. the TD error) for that action.
        """
        probabilities = self.softmax_policy(state)

        # Score function: onehot(action) - pi(.|state). ``softmax_policy``
        # returns a fresh array, so building the gradient from it is safe.
        policy_gradient = -probabilities
        policy_gradient[action] += 1.0

        self.theta[state] += self.learning_rate_actor * advantage * policy_gradient

    def update_critic(self, state, target_value):
        """Move the value estimate of ``state`` towards ``target_value``.

        Args:
            state: Index of the state whose estimate is updated.
            target_value: Bootstrapped target, e.g. ``r + gamma * V(s')``.
        """
        td_error = target_value - self.value_function(state)
        self.w[state] += self.learning_rate_critic * td_error

print("Actor-Critic agent class defined successfully!")


Actor-Critic agent class defined successfully!


In [3]:
# Train Actor-Critic agent
def train_actor_critic(agent, env, episodes=1000, max_steps=100, log_interval=100):
    """Train an actor-critic agent with one-step TD updates.

    After every environment step the critic is moved towards the bootstrapped
    target ``r + gamma * V(s')`` (just ``r`` on a terminal step) and the actor
    is updated with the TD error as its advantage estimate.

    Args:
        agent: Object providing ``select_action(state)``,
            ``value_function(state)``, ``update_critic(state, target)``,
            ``update_actor(state, action, advantage)`` and a ``gamma``
            attribute.
        env: Object providing ``reset()`` returning the initial state and
            ``step(action)`` returning ``(next_state, reward, done)``.
        episodes: Number of episodes to run.
        max_steps: Maximum number of steps per episode; an episode that has
            not terminated by then is cut off.
        log_interval: Print the running average reward every this many
            episodes. A falsy value (``0`` or ``None``) disables printing.

    Returns:
        A tuple ``(episode_rewards, episode_lengths)`` of lists with one
        entry per episode: the undiscounted reward sum and the step count.
    """
    episode_rewards = []
    episode_lengths = []

    for episode in range(episodes):
        state = env.reset()
        done = False
        steps = 0
        total_reward = 0

        while not done and steps < max_steps:
            action = agent.select_action(state)
            next_state, reward, done = env.step(action)

            # Terminal transitions have no successor value to bootstrap from.
            if done:
                target_value = reward
            else:
                target_value = reward + agent.gamma * agent.value_function(next_state)

            # The advantage (TD error) must be computed before the critic
            # update, otherwise it would be measured against the already
            # corrected estimate.
            advantage = target_value - agent.value_function(state)

            agent.update_critic(state, target_value)
            agent.update_actor(state, action, advantage)

            state = next_state
            total_reward += reward
            steps += 1

        episode_rewards.append(total_reward)
        episode_lengths.append(steps)

        # Progress report over (at most) the 100 most recent episodes.
        if log_interval and episode % log_interval == 0:
            avg_reward = np.mean(episode_rewards[-100:])
            print(f"Episode {episode}, Average Reward: {avg_reward:.2f}")

    return episode_rewards, episode_lengths

# Create environment and agent
# NOTE(review): GridWorldPG is not defined anywhere in the visible part of this
# notebook, and the recorded output of this cell is
# "NameError: name 'GridWorldPG' is not defined". It presumably comes from an
# earlier policy-gradient notebook -- define or import it before running.
# This cell reads env.n_states and env.n_actions; train_actor_critic() also
# needs env.reset() to return a state and env.step(action) to return
# (next_state, reward, done).
env = GridWorldPG(size=4)
agent = ActorCriticAgent(n_states=env.n_states, n_actions=env.n_actions, 
                        learning_rate_actor=0.01, learning_rate_critic=0.01, gamma=0.99)

print("Training Actor-Critic agent...")
print("=" * 50)

# Train the agent
# Returns one total reward and one step count per episode.
episode_rewards, episode_lengths = train_actor_critic(agent, env, episodes=1000)

# Plot training progress
# Two side-by-side panels: rewards (left) and episode lengths (right).
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Plot episode rewards
axes[0].plot(episode_rewards, alpha=0.6)
axes[0].set_xlabel('Episode')
axes[0].set_ylabel('Total Reward')
axes[0].set_title('Episode Rewards During Training')
axes[0].grid(True, alpha=0.3)

# Plot moving average
# mode='valid' yields len(episode_rewards) - window + 1 points, so the curve is
# drawn starting at x = window - 1, aligning each average with the last episode
# of its window. Skipped when there are fewer episodes than the window size.
window = 50
if len(episode_rewards) >= window:
    moving_avg = np.convolve(episode_rewards, np.ones(window)/window, mode='valid')
    axes[0].plot(range(window-1, len(episode_rewards)), moving_avg, 'r-', linewidth=2, label=f'Moving Average ({window})')
    axes[0].legend()

# Plot episode lengths
axes[1].plot(episode_lengths, alpha=0.6)
axes[1].set_xlabel('Episode')
axes[1].set_ylabel('Episode Length')
axes[1].set_title('Episode Lengths During Training')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Summary statistics; both averages cover the last 100 episodes.
print(f"\nTraining completed!")
print(f"Final average reward (last 100 episodes): {np.mean(episode_rewards[-100:]):.2f}")
print(f"Final average episode length: {np.mean(episode_lengths[-100:]):.2f}")

# Compare with REINFORCE
print("\nComparing Actor-Critic with REINFORCE:")
print("=" * 50)

# Train REINFORCE agent for comparison
# NOTE(review): REINFORCEAgent and train_reinforce are not defined in the
# visible part of this notebook; presumably they come from an earlier
# policy-gradient notebook -- define or import them before running.
# train_reinforce is assumed to return (rewards, lengths) per episode like
# train_actor_critic does -- TODO confirm.
# The same `env` instance and the same learning rate, discount factor and
# episode count as the actor-critic run are used.
reinforce_agent = REINFORCEAgent(n_states=env.n_states, n_actions=env.n_actions, 
                                learning_rate=0.01, gamma=0.99)
reinforce_rewards, reinforce_lengths = train_reinforce(reinforce_agent, env, episodes=1000)

# Plot comparison
# Uses the stateful pyplot API: each plt.subplot() call selects the panel that
# the following plt.* calls draw into.
plt.figure(figsize=(15, 5))

# Left panel: per-episode total reward for both agents.
plt.subplot(1, 2, 1)
plt.plot(episode_rewards, alpha=0.6, label='Actor-Critic')
plt.plot(reinforce_rewards, alpha=0.6, label='REINFORCE')
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('Reward Comparison')
plt.legend()
plt.grid(True, alpha=0.3)

# Right panel: per-episode step count for both agents.
plt.subplot(1, 2, 2)
plt.plot(episode_lengths, alpha=0.6, label='Actor-Critic')
plt.plot(reinforce_lengths, alpha=0.6, label='REINFORCE')
plt.xlabel('Episode')
plt.ylabel('Episode Length')
plt.title('Episode Length Comparison')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Average reward over the last 100 episodes of each run.
print(f"Actor-Critic final average reward: {np.mean(episode_rewards[-100:]):.2f}")
print(f"REINFORCE final average reward: {np.mean(reinforce_rewards[-100:]):.2f}")


NameError: name 'GridWorldPG' is not defined