# In [None]:
import gym
import math
import random
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pendulum-v1 is a continuous-control task:
#   Observation: 3-dimensional vector (e.g., [cos(theta), sin(theta), theta_dot])
#   Action: 1-dimensional continuous value in the range [-2, 2]
env = gym.make("Pendulum-v1")

# Network input/output sizes are read from the environment's spaces.
action_dim = env.action_space.shape[0]        # typically 1
state_dim = env.observation_space.shape[0]    # typically 3

# ----------------------------
# Define the Actor Network
# ----------------------------
class Actor(nn.Module):
    """
    Gaussian policy network.

    Given a batch of states, produces the parameters of a Normal distribution over actions:
      - mean: tensor of shape [batch_size, action_dim], computed by a two-hidden-layer MLP
      - std: tensor of shape [action_dim], independent of the state; it is stored as a
        learnable log_std parameter and exponentiated on every forward pass
    """
    def __init__(self, state_dim, action_dim, hidden_dim=128):
        super().__init__()
        # Two hidden layers followed by a linear head for the action mean.
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.mean = nn.Linear(hidden_dim, action_dim)
        # One log-std per action dimension, initialised to 0 (i.e. std = 1).
        self.log_std = nn.Parameter(torch.zeros(action_dim))

    def forward(self, state):
        """Return (mean, std) for a state batch of shape [batch_size, state_dim]."""
        hidden = self.fc1(state).relu()     # [batch_size, hidden_dim]
        hidden = self.fc2(hidden).relu()    # [batch_size, hidden_dim]
        # exp() keeps the standard deviation strictly positive; the result has
        # shape [action_dim] and broadcasts against the batch of means.
        return self.mean(hidden), self.log_std.exp()

# ----------------------------
# Define the Critic Network
# ----------------------------
class Critic(nn.Module):
    """
    State-value network: maps each state in a batch to a single scalar estimate V(s).
    """
    def __init__(self, state_dim, hidden_dim=128):
        super().__init__()
        # Two hidden layers followed by a one-unit linear head.
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.value = nn.Linear(hidden_dim, 1)

    def forward(self, state):
        """Return V(s) with shape [batch_size, 1] for states of shape [batch_size, state_dim]."""
        features = state
        for layer in (self.fc1, self.fc2):
            features = torch.relu(layer(features))   # [batch_size, hidden_dim]
        return self.value(features)

# ----------------------------
# Build the networks and give each its own optimizer
# ----------------------------
actor = Actor(state_dim=state_dim, action_dim=action_dim).to(device)
critic = Critic(state_dim=state_dim).to(device)

# Policy and value function are optimised separately, both with Adam at lr = 1e-3.
actor_optimizer = optim.Adam(params=actor.parameters(), lr=1e-3)
critic_optimizer = optim.Adam(params=critic.parameters(), lr=1e-3)

# Training hyperparameters
gamma = 0.99               # Discount factor applied to future rewards
num_episodes = 500         # Number of episodes to train for

# Total reward of every episode, kept for the plot at the end
episode_rewards = []

# ----------------------------
# Training loop using A2C
# ----------------------------
def _reset_env(environment):
    """Reset `environment` and return only the initial observation.

    Gym >= 0.26 (and Gymnasium) return ``(observation, info)`` from ``reset()``,
    whereas older Gym releases return the observation alone. Both are accepted.
    """
    result = environment.reset()
    if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
        return result[0]
    return result


def _step_env(environment, action):
    """Advance `environment` by one step and return ``(observation, reward, done)``.

    Gym >= 0.26 (and Gymnasium) return a 5-tuple with separate ``terminated`` and
    ``truncated`` flags; older releases return a 4-tuple with a single ``done``.
    Either flag ends the episode here.
    """
    result = environment.step(action)
    if len(result) == 5:
        observation, reward, terminated, truncated, _ = result
        return observation, reward, bool(terminated or truncated)
    observation, reward, done, _ = result
    return observation, reward, bool(done)


def _to_state_tensor(observation):
    """Convert an observation of shape [state_dim] into a float32 tensor of shape [1, state_dim]."""
    # Going through a single ndarray avoids the slow (and warned-about) path of
    # building a tensor from a Python list of ndarrays.
    array = np.asarray(observation, dtype=np.float32)
    return torch.tensor(array, device=device).unsqueeze(0)


for episode in range(num_episodes):
    state = _to_state_tensor(_reset_env(env))   # shape: [1, state_dim]

    log_probs = []      # log pi(a_t | s_t) for every step, each of shape [1]
    values = []         # critic estimates V(s_t) for every step, each of shape [1, 1]
    rewards = []        # scalar reward received at every step

    done = False
    t = 0
    while not done:
        t += 1

        # Value estimate for the current state (kept in the graph for the critic loss).
        value = critic(state)
        values.append(value)

        # Sample an action from the Gaussian policy.
        mean, std = actor(state)  # mean: [1, action_dim], std: [action_dim]
        dist = torch.distributions.Normal(mean, std)
        action = dist.sample()               # action: [1, action_dim]
        # Sum over action dimensions to get one log-probability per sample.
        log_prob = dist.log_prob(action).sum(dim=-1)   # shape: [1]
        log_probs.append(log_prob)

        # The environment expects a numpy array of shape [action_dim].
        action_np = action.detach().cpu().numpy()[0]
        next_state, reward, done = _step_env(env, action_np)
        rewards.append(float(reward))

        # The observation returned with done=True is never fed to the networks,
        # so it is only converted while the episode is still running.
        if not done:
            state = _to_state_tensor(next_state)

    # Discounted returns, accumulated backward from the last step.
    # NOTE: the return after the final step is taken to be 0, i.e. an episode
    # that ends only because of a time limit is treated like a true terminal state.
    R = 0.0
    returns = []
    for r in reversed(rewards):
        R = r + gamma * R
        returns.append(R)
    returns.reverse()   # append + reverse is O(T); insert(0, ...) would be O(T^2)
    # Shape [T, 1], where T is the episode length.
    returns = torch.tensor(returns, dtype=torch.float32, device=device).unsqueeze(1)

    values = torch.cat(values, dim=0)         # shape: [T, 1]
    log_probs = torch.cat(log_probs, dim=0)   # shape: [T]

    # Advantage = return - baseline; detached so the actor loss does not
    # back-propagate into the critic.
    advantages = returns - values.detach()    # shape: [T, 1]

    # Actor loss: policy-gradient objective, averaged over the episode.
    # squeeze(1) removes only the trailing singleton so shapes line up as [T] * [T].
    actor_loss = -(log_probs * advantages.squeeze(1)).mean()
    # Critic loss: mean squared error between value estimates and returns.
    critic_loss = F.mse_loss(values, returns)

    # Update the actor.
    actor_optimizer.zero_grad()
    actor_loss.backward()
    actor_optimizer.step()

    # Update the critic.
    critic_optimizer.zero_grad()
    critic_loss.backward()
    critic_optimizer.step()

    total_reward = sum(rewards)
    episode_rewards.append(total_reward)
    if (episode + 1) % 20 == 0:
        print(f"Episode {episode+1}: Total Reward = {total_reward:.2f}, Length = {t}")
        
# Show training progress: one point per episode, y = that episode's total reward
fig, ax = plt.subplots()
ax.plot(episode_rewards)
ax.set_xlabel("Episode")
ax.set_ylabel("Total Reward")
ax.set_title("Actor-Critic (A2C) on Pendulum-v1")
plt.show()

# Training is finished; close the environment.
env.close()
