In [11]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from collections import deque
import random
import copy
import os

# Set a random seed for reproducibility
SEED = 0
torch.manual_seed(SEED)
np_random_seed = np.random.RandomState(SEED) # Use RandomState for consistent numpy randomness
random.seed(SEED)

# Define the device for training (GPU if available, else CPU)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Network Definitions (Actor and Critic)
class Actor(nn.Module):
    """Actor network to map states to actions."""
    def __init__(self, state_dim, action_dim, max_action):
        super(Actor, self).__init__()
        self.max_action = max_action
        
        # Define the layers of the actor network
        self.l1 = nn.Linear(state_dim, 256)
        self.l2 = nn.Linear(256, 256)
        self.l3 = nn.Linear(256, action_dim)

    def forward(self, state):
        """Forward pass through the actor network."""
        x = F.relu(self.l1(state)) # First layer with ReLU activation
        x = F.relu(self.l2(x)) # Second layer with ReLU activation
        # The output is scaled by max_action and passed through tanh to constrain actions
        return self.max_action * torch.tanh(self.l3(x))

class Critic(nn.Module):
    """Critic network to estimate Q-values for a given state-action pair."""
    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()
        
        # Define the layers of the critic network
        # Input includes both state and action dimensions
        self.l1 = nn.Linear(state_dim + action_dim, 256)
        self.l2 = nn.Linear(256, 256)
        self.l3 = nn.Linear(256, 1) # Output is a single Q-value

    def forward(self, state, action):
        """Forward pass through the critic network."""
        # Concatenate state and action tensors
        state_action = torch.cat([state, action], 1)
        
        x = F.relu(self.l1(state_action)) # First layer with ReLU activation
        x = F.relu(self.l2(x)) # Second layer with ReLU activation
        return self.l3(x) # Output the Q-value

# 2. Replay Buffer
class ReplayBuffer:
    """A buffer to store and sample past experiences."""
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity) # Use deque for efficient appending/popping

    def push(self, transition):
        """Adds a new transition to the buffer."""
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Samples a random batch of transitions from the buffer."""
        # If the buffer is smaller than batch_size, sample all available transitions
        batch_size = min(batch_size, len(self.buffer))
        # Ensure we don't try to sample if the buffer is empty
        if batch_size == 0:
            return []
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        """Returns the current size of the buffer."""
        return len(self.buffer)

# 3. Ornstein-Uhlenbeck Noise for Exploration
class OUNoise:
    """
    Ornstein-Uhlenbeck process.
    Used for exploration in continuous action spaces.
    It generates temporally correlated noise, helping the agent explore smoothly.
    """
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.3): # Increased sigma
        self.action_dim = action_dim
        self.mu = mu # Mean of the noise
        self.theta = theta # Rate of mean reversion
        self.sigma = sigma # Volatility of the noise
        self.reset()

    def reset(self):
        """Reset the internal state (noise) to mean (mu)."""
        self.state = copy.deepcopy(self.mu)

    def sample(self):
        """Update internal state and return it as a noise sample."""
        x = self.state
        # Calculate dx based on the OU process formula
        dx = self.theta * (self.mu - x) + self.sigma * np_random_seed.randn(self.action_dim)
        self.state = x + dx
        return self.state

# 4. DDPG Agent
class DDPG:
    """The DDPG agent that manages the actor-critic networks and the learning process."""
    def __init__(self, state_dim, action_dim, max_action):
        self.device = DEVICE

        # Main networks (Actor and Critic)
        self.actor = Actor(state_dim, action_dim, max_action).to(self.device)
        self.critic = Critic(state_dim, action_dim).to(self.device)

        # Target networks (Actor_target and Critic_target) for stability
        self.actor_target = Actor(state_dim, action_dim, max_action).to(self.device)
        self.critic_target = Critic(state_dim, action_dim).to(self.device)

        # Initialize target networks with the same weights as main networks
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

        # Optimizers for both networks
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=1e-4)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3)

        self.replay_buffer = ReplayBuffer(capacity=1_000_000) # Increased capacity
        self.gamma = 0.99  # Discount factor for future rewards
        self.tau = 0.005  # Soft update parameter for target networks

        # Ornstein-Uhlenbeck noise for exploration
        self.ou_noise = OUNoise(action_dim)


    def select_action(self, state, add_noise=True):
        """
        Selects a deterministic action from the actor network.
        Adds OU noise for exploration during training if add_noise is True.
        """
        # Convert state to a PyTorch tensor and move to the correct device
        state = torch.FloatTensor(state.reshape(1, -1)).to(self.device)
        
        # Get the action from the actor network
        self.actor.eval() # Set actor to evaluation mode
        with torch.no_grad(): # Disable gradient calculations
            action = self.actor(state).cpu().data.numpy().flatten()
        self.actor.train() # Set actor back to training mode
        
        if add_noise:
            # Add OU noise to the action
            noise_sample = self.ou_noise.sample()
            action = action + noise_sample
            # Clip the action to ensure it stays within the valid range
            action = np.clip(action, -self.actor.max_action, self.actor.max_action)
            
        return action

    def train(self, batch_size):
        """Performs a single training step to update networks."""
        # Ensure enough experiences are in the buffer for a batch
        if len(self.replay_buffer) < batch_size:
            return

        transitions = self.replay_buffer.sample(batch_size)
        
        # Handle cases where buffer might be sampled empty due to min(batch_size, len(buffer))
        if not transitions:
            return

        # Unzip the batch of transitions
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*transitions)

        # Convert numpy arrays to PyTorch tensors and move to the correct device
        state = torch.FloatTensor(np.array(state_batch)).to(self.device)
        action = torch.FloatTensor(np.array(action_batch)).to(self.device)
        reward = torch.FloatTensor(np.array(reward_batch)).unsqueeze(1).to(self.device)
        next_state = torch.FloatTensor(np.array(next_state_batch)).to(self.device)
        # Convert done flag to float and move to device
        done = torch.FloatTensor(np.array(done_batch).astype(float)).unsqueeze(1).to(self.device)

        # --- Update Critic Network ---
        with torch.no_grad(): # No gradients needed for target computations
            # Get next actions from the actor target network
            next_action = self.actor_target(next_state)
            # Calculate target Q-values using critic target network
            target_Q = self.critic_target(next_state, next_action)
            # Bellman equation for target Q-value
            target_Q = reward + ( (1 - done) * self.gamma * target_Q )

        # Get current Q-values from the main critic network
        current_Q = self.critic(state, action)
        # Calculate critic loss (Mean Squared Error between current and target Q-values)
        critic_loss = F.mse_loss(current_Q, target_Q)

        # Optimize critic network
        self.critic_optimizer.zero_grad() # Clear previous gradients
        critic_loss.backward() # Backpropagate loss
        self.critic_optimizer.step() # Update critic weights

        # --- Update Actor Network ---
        # Calculate actor loss (maximize Q-value predicted by critic for actor's actions)
        actor_loss = -self.critic(state, self.actor(state)).mean()
        
        # Optimize actor network
        self.actor_optimizer.zero_grad() # Clear previous gradients
        actor_loss.backward() # Backpropagate loss
        self.actor_optimizer.step() # Update actor weights

        # --- Soft Update Target Networks ---
        # Update critic target network
        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

        # Update actor target network
        for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def save_weights(self, filename="best_ddpg_agent.pth"):
        """Saves the state dictionaries of the actor and critic networks."""
        torch.save({
            'actor_state_dict': self.actor.state_dict(),
            'critic_state_dict': self.critic.state_dict(),
            'actor_target_state_dict': self.actor_target.state_dict(),
            'critic_target_state_dict': self.critic_target.state_dict(),
            'actor_optimizer_state_dict': self.actor_optimizer.state_dict(),
            'critic_optimizer_state_dict': self.critic_optimizer.state_dict(),
        }, filename)
        # print(f"Agent weights saved to {filename}") # Suppress frequent save messages

    def load_weights(self, filename="best_ddpg_agent.pth"):
        """Loads the state dictionaries into the actor and critic networks."""
        if not os.path.exists(filename):
            print(f"Error: {filename} not found. Cannot load weights.")
            return

        checkpoint = torch.load(filename, map_location=self.device)
        self.actor.load_state_dict(checkpoint['actor_state_dict'])
        self.critic.load_state_dict(checkpoint['critic_state_dict'])
        self.actor_target.load_state_dict(checkpoint['actor_target_state_dict'])
        self.critic_target.load_state_dict(checkpoint['critic_target_state_dict'])
        self.actor_optimizer.load_state_dict(checkpoint['actor_optimizer_state_dict'])
        self.critic_optimizer.load_state_dict(checkpoint['critic_optimizer_state_dict'])
        print(f"Agent weights loaded from {filename}")


def evaluate_agent(env, agent, num_eval_episodes=1, seed=SEED):
    """
    Evaluates the agent's performance over a specified number of episodes
    without exploration noise.
    """
    total_rewards = []
    for _ in range(num_eval_episodes):
        state, _ = env.reset(seed=seed)
        episode_reward = 0
        done = False
        while not done:
            action = agent.select_action(state, add_noise=False) # No noise for evaluation
            state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            episode_reward += reward
        total_rewards.append(episode_reward)
    return np.mean(total_rewards)


# 5. Training Loop
def main():
    env = gym.make("Pendulum-v1")
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    agent = DDPG(state_dim, action_dim, max_action)

    num_episodes = 101 # Increased number of training episodes
    batch_size = 128
    eval_interval = 50 # Evaluate agent every 50 episodes

    best_eval_reward = -np.inf # Track the best evaluation reward achieved
    best_agent_path = "best_ddpg_agent.pth" # File to save best agent weights

    print(f"Starting DDPG training on Pendulum-v1 for {num_episodes} episodes...")

    for episode in range(num_episodes):
        state, _ = env.reset(seed=SEED) # Reset environment for new episode
        episode_reward = 0
        done = False
        step = 0

        # Reset OU noise for the new episode
        agent.ou_noise.reset() 

        while not done:
            # Select action with OU noise for exploration during training
            action = agent.select_action(state, add_noise=True)

            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            agent.replay_buffer.push((state, action, reward, next_state, done))
            state = next_state
            episode_reward += reward
            step += 1

            # Train the agent (only if buffer has enough samples)
            agent.train(batch_size)

        # Evaluate agent periodically
        if (episode + 1) % eval_interval == 0:
            avg_eval_reward = evaluate_agent(env, agent, num_eval_episodes=5, seed=SEED)
            print(f"Episode: {episode + 1}/{num_episodes}, Training Reward: {episode_reward:.2f}, "
                  f"Avg. Eval Reward: {avg_eval_reward:.2f}")

            if avg_eval_reward > best_eval_reward:
                best_eval_reward = avg_eval_reward
                agent.save_weights(best_agent_path) # Save weights if this is the best evaluation so far
                print(f"--- New best evaluation reward: {best_eval_reward:.2f}. Agent weights saved. ---")
        elif episode % 50 == 0: # Print training reward for other intervals
            print(f"Episode: {episode + 1}/{num_episodes}, Training Reward: {episode_reward:.2f}")


    env.close()
    print("\nTraining complete. Generating video of best performance...")

    # --- Generate video of the best-performing agent ---
    # Create a directory for videos if it doesn't exist
    video_dir = "videos"
    os.makedirs(video_dir, exist_ok=True)

    # Create a new environment wrapped with RecordVideo
    # render_mode="rgb_array" is required for recording
    eval_env_video = gym.make("Pendulum-v1", render_mode="rgb_array")
    eval_env_video = gym.wrappers.RecordVideo(
        eval_env_video, 
        video_folder=video_dir, 
        name_prefix="best_ddpg_pendulum",
        episode_trigger=lambda x: True # Record every episode
    )
    
    # Create a new agent for evaluation
    eval_agent = DDPG(state_dim, action_dim, max_action)
    eval_agent.load_weights(best_agent_path) # Load the best weights

    # Run one episode with the best agent and record it
    state, _ = eval_env_video.reset(seed=SEED)
    done = False
    total_eval_reward_video = 0

    while not done:
        # Select action without noise for deterministic evaluation
        action = eval_agent.select_action(state, add_noise=False)
        state, reward, terminated, truncated, _ = eval_env_video.step(action)
        done = terminated or truncated
        total_eval_reward_video += reward
    
    eval_env_video.close()
    print(f"Video of best performance (reward: {total_eval_reward_video:.2f}) saved to '{video_dir}/best_ddpg_pendulum-0.mp4'")

if __name__ == '__main__':
    # Initialize numpy random state globally for OUNoise
    np_random_seed = np.random.RandomState(SEED)
    main()



Starting DDPG training on Pendulum-v1 for 101 episodes...
Episode: 1/101, Training Reward: -1184.02
Episode: 50/101, Training Reward: -127.64, Avg. Eval Reward: -128.97
--- New best evaluation reward: -128.97. Agent weights saved. ---
Episode: 51/101, Training Reward: -126.82
Episode: 100/101, Training Reward: -125.64, Avg. Eval Reward: -125.62
--- New best evaluation reward: -125.62. Agent weights saved. ---
Episode: 101/101, Training Reward: -125.25

Training complete. Generating video of best performance...
Agent weights loaded from best_ddpg_agent.pth
Video of best performance (reward: -125.62) saved to 'videos/best_ddpg_pendulum-0.mp4'


In [23]:
from IPython.display import HTML, display
import os

video_path = "videos/best_ddpg_pendulum-episode-0.mp4" 

if os.path.exists(video_path):
    display(HTML(f"""
    <video controls autoplay width="700">
        <source src="{video_path}" type="video/mp4">
        Your browser does not support the video tag.
    </video>
    """))
else:
    print(f"Error: Video file not found at {os.path.abspath(video_path)}")