In [None]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal

# Hyperparameters, grouped by the part of the script that consumes them

# Environment and training budget
ENV_NAME = 'BipedalWalker-v3'
TOTAL_TIMESTEPS = 1_000_000  # environment steps after which training stops
ROLLOUT_LENGTH = 2048  # environment steps collected before each update phase

# Network and optimizer
HIDDEN_SIZE = 256  # width of every hidden linear layer
LEARNING_RATE = 3e-4  # step size handed to Adam

# Return / advantage estimation
GAMMA = 0.99  # discount factor
LAMDA = 0.95  # GAE lambda (spelling kept: other code refers to LAMDA)

# PPO update
PPO_EPOCHS = 10  # passes over each collected rollout
MINI_BATCH_SIZE = 64  # samples per gradient step
CLIP_EPSILON = 0.2  # probability ratio is clamped to [1 - eps, 1 + eps]
VALUE_LOSS_COEF = 0.5  # weight of the squared value error in the loss
ENTROPY_COEF = 0.01  # weight of the entropy bonus in the loss
MAX_GRAD_NORM = 0.5  # gradient-norm clipping threshold

# Device configuration: train on CUDA when it is available, otherwise on CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Create the environment and read the observation / action vector lengths
env = gym.make(ENV_NAME)
obs_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]

# Per-dimension action bounds as tensors on the training device; the rollout
# code clamps sampled actions to this range before stepping the environment.
action_low = torch.tensor(env.action_space.low).to(device)
action_high = torch.tensor(env.action_space.high).to(device)

# Define the Actor-Critic Network
class ActorCritic(nn.Module):
    """Actor-critic network with a shared first layer and two heads.

    The actor head outputs the mean of a diagonal Gaussian over actions and
    the critic head outputs a single state-value estimate.  The standard
    deviation does not depend on the observation: it is one learned
    log-std parameter per action dimension.

    Args:
        obs_size: Length of the observation vector.
        action_size: Length of the action vector.
        hidden_size: Width of every hidden layer.  ``None`` (the default)
            falls back to the module-level ``HIDDEN_SIZE``, so existing
            two-argument calls behave exactly as before.
    """

    def __init__(self, obs_size, action_size, hidden_size=None):
        super().__init__()
        if hidden_size is None:
            # Looked up lazily so the class only needs the global when the
            # caller does not choose a width.
            hidden_size = HIDDEN_SIZE
        # Common network: features consumed by both heads
        self.shared = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
        )
        # Actor network: Tanh keeps the mean inside [-1, 1]
        # (assumes the action space is bounded between -1 and 1)
        self.actor_mean = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, action_size),
            nn.Tanh(),
        )
        # Actor log_std (learned); zeros means the initial std is 1
        self.actor_log_std = nn.Parameter(torch.zeros(action_size))
        # Critic network: one scalar value per observation
        self.critic = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
        )

    def forward(self, x):
        """Return ``(dist, value)`` for observation(s) *x*.

        Args:
            x: Tensor whose last dimension has length ``obs_size``; it may be
                a single observation or a batch.

        Returns:
            dist: ``Normal`` distribution with the same leading dimensions
                as *x* and a last dimension of length ``action_size``.
            value: Critic output with a trailing dimension of length 1.
        """
        shared_out = self.shared(x)
        # Actor
        mean = self.actor_mean(shared_out)
        # Broadcast the per-dimension std across any leading batch dimension
        std = self.actor_log_std.exp().expand_as(mean)
        dist = Normal(mean, std)
        # Critic
        value = self.critic(shared_out)
        return dist, value

# Build the actor-critic network, place it on the chosen device, and create
# the Adam optimizer over all of its parameters
model = ActorCritic(obs_size, action_size)
model.to(device)  # nn.Module.to moves the module's parameters in place
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

# Storage for rollouts
class RolloutBuffer:
    """Per-step storage for one rollout, kept as parallel Python lists.

    Entry ``i`` of every list describes the same environment step.  Callers
    append to the lists directly and call :meth:`clear` once the rollout has
    been consumed.
    """

    def __init__(self):
        self.clear()

    def clear(self):
        """Replace every list with a fresh empty one.

        The lists are reset here directly rather than by calling
        ``__init__`` again, so subclasses whose constructor takes arguments
        can still be cleared.
        """
        self.obs = []
        self.actions = []
        self.log_probs = []
        self.rewards = []
        self.dones = []
        self.values = []

# Single module-level buffer: the training loop appends to its lists on every
# environment step and calls clear() once the rollout is stacked into tensors.
buffer = RolloutBuffer()

# Function to compute Generalized Advantage Estimation (GAE)
def compute_gae(next_value, rewards, dones, values, gamma=None, lam=None):
    """Compute GAE-based return targets for one rollout.

    Walking backwards through the rollout, for each step ``t``::

        delta_t = r_t + gamma * V_{t+1} * (1 - done_t) - V_t
        gae_t   = delta_t + gamma * lam * (1 - done_t) * gae_{t+1}
        return_t = gae_t + V_t

    Args:
        next_value: Value estimate for the state after the last step; used
            to bootstrap the final entry.
        rewards: Per-step rewards.
        dones: Per-step episode-end flags; a true flag stops both the
            bootstrap and the accumulated advantage from crossing that step.
        values: Per-step value estimates, same length as *rewards*.  Any
            sequence is accepted; it is copied, never mutated.
        gamma: Discount factor; ``None`` uses the module-level ``GAMMA``.
        lam: GAE lambda; ``None`` uses the module-level ``LAMDA``.

    Returns:
        A list with one return per step, in time order.  Subtracting the
        matching value estimate gives the advantage.  Empty input gives an
        empty list.
    """
    if gamma is None:
        gamma = GAMMA
    if lam is None:
        lam = LAMDA
    # Copy so the caller's sequence is untouched and tuples work too
    padded_values = list(values)
    padded_values.append(next_value)
    gae = 0
    returns = []
    for step in reversed(range(len(rewards))):
        not_done = 1 - dones[step]
        delta = rewards[step] + gamma * padded_values[step + 1] * not_done - padded_values[step]
        gae = delta + gamma * lam * not_done * gae
        # Append and reverse once at the end: inserting at index 0 on every
        # iteration would make the loop quadratic in the rollout length.
        returns.append(gae + padded_values[step])
    returns.reverse()
    return returns

# Main training loop


def _reset_env(environment):
    """Reset *environment* and return only the initial observation.

    Older Gym releases return the observation by itself, while Gym >= 0.26
    and Gymnasium return an ``(observation, info)`` tuple.  Passing that
    tuple straight to ``torch.FloatTensor`` fails with "expected sequence of
    length 24 at dim 1 (got 0)", so the tuple is unpacked here.
    """
    outcome = environment.reset()
    if isinstance(outcome, tuple) and len(outcome) == 2:
        return outcome[0]
    return outcome


def _step_env(environment, action):
    """Advance *environment* one step and return ``(obs, reward, done)``.

    Accepts both the 4-tuple ``(obs, reward, done, info)`` result and the
    5-tuple ``(obs, reward, terminated, truncated, info)`` result; with the
    latter the episode counts as finished when either flag is set.
    """
    outcome = environment.step(action)
    if len(outcome) == 5:
        obs, reward, terminated, truncated, _ = outcome
        return obs, reward, bool(terminated or truncated)
    obs, reward, done, _ = outcome
    return obs, reward, bool(done)


state = torch.FloatTensor(_reset_env(env)).to(device)
episode_rewards = []
episode_reward = 0
timesteps = 0

try:
    while timesteps < TOTAL_TIMESTEPS:
        # ---- Collect one rollout with the current policy ----
        for _ in range(ROLLOUT_LENGTH):
            # Rollout statistics are constants for the update phase.  Without
            # no_grad the stored log-probs would keep autograd graphs that
            # the PPO step would back-propagate through after the parameters
            # have already been changed in place.
            with torch.no_grad():
                dist, value = model(state)
                action = dist.sample()
                log_prob = dist.log_prob(action).sum(dim=-1)
            # The environment receives the clamped action; the buffer keeps
            # the raw sample that log_prob was evaluated on.
            action_clipped = torch.clamp(action, action_low, action_high)
            next_state, reward, done = _step_env(env, action_clipped.cpu().numpy())
            # Store in buffer
            buffer.obs.append(state)
            buffer.actions.append(action)
            buffer.log_probs.append(log_prob)
            buffer.rewards.append(reward)
            buffer.dones.append(done)
            # Drop the critic's trailing length-1 dimension so values,
            # returns and advantages all share the shape of the log-probs.
            buffer.values.append(value.squeeze(-1))
            episode_reward += reward
            timesteps += 1
            if done:
                next_state = _reset_env(env)
                episode_rewards.append(episode_reward)
                episode_reward = 0
                # Print average reward every 10 episodes
                if len(episode_rewards) % 10 == 0:
                    avg_reward = np.mean(episode_rewards[-10:])
                    print(f"Average Reward: {avg_reward}")
            state = torch.FloatTensor(next_state).to(device)
            if timesteps >= TOTAL_TIMESTEPS:
                break

        # ---- Returns and advantages ----
        # Bootstrap from the value of the state the rollout stopped in
        with torch.no_grad():
            _, next_value = model(state)
        next_value = next_value.squeeze(-1)
        returns = compute_gae(next_value, buffer.rewards, buffer.dones, buffer.values)

        # Flatten the buffers; every per-step quantity has leading size N
        obs_tensor = torch.stack(buffer.obs)
        actions_tensor = torch.stack(buffer.actions)
        log_probs_tensor = torch.stack(buffer.log_probs)
        values_tensor = torch.stack(buffer.values)
        returns_tensor = torch.stack(returns)
        advantages_tensor = returns_tensor - values_tensor
        # Clear buffer
        buffer.clear()

        # ---- PPO optimization step ----
        num_samples = obs_tensor.shape[0]
        for _ in range(PPO_EPOCHS):
            # Fresh shuffle each epoch, then walk it in mini-batches
            indices = np.random.permutation(num_samples)
            for start in range(0, num_samples, MINI_BATCH_SIZE):
                mini_batch_indices = indices[start:start + MINI_BATCH_SIZE]
                mb_obs = obs_tensor[mini_batch_indices]
                mb_actions = actions_tensor[mini_batch_indices]
                mb_log_probs = log_probs_tensor[mini_batch_indices]
                mb_returns = returns_tensor[mini_batch_indices]
                mb_advantages = advantages_tensor[mini_batch_indices]
                # Forward pass
                dist, value = model(mb_obs)
                # (B, 1) -> (B,) so it lines up with mb_returns
                value = value.squeeze(-1)
                entropy = dist.entropy().mean()
                new_log_probs = dist.log_prob(mb_actions).sum(dim=-1)
                # Clipped surrogate objective.  ratio and mb_advantages are
                # both (B,); an extra trailing dimension on the advantages
                # would silently broadcast the product to (B, B).
                ratio = (new_log_probs - mb_log_probs).exp()
                surr1 = ratio * mb_advantages
                surr2 = torch.clamp(ratio, 1.0 - CLIP_EPSILON, 1.0 + CLIP_EPSILON) * mb_advantages
                actor_loss = -torch.min(surr1, surr2).mean()
                critic_loss = VALUE_LOSS_COEF * (mb_returns - value).pow(2).mean()
                loss = actor_loss + critic_loss - ENTROPY_COEF * entropy
                # Backpropagation
                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
                optimizer.step()
finally:
    # Release the environment even when training stops on an exception
    env.close()


  state = torch.FloatTensor(state).to(device)


ValueError: expected sequence of length 24 at dim 1 (got 0)

: 