In [1]:
import gymnasium as gym
import rware
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import torch.nn.functional as F
import matplotlib.pyplot as plt

class SingleAgentRobotWarehouse(gym.Env):
    def __init__(self, env_name="rware-tiny-1ag-v2", **kwargs):
        self.env = gym.make(env_name, **kwargs)
        self.action_space = self.env.action_space[0]
        self.observation_space = gym.spaces.Box(
            low=-np.inf, high=np.inf,
            shape=(np.prod(self.env.observation_space[0].shape),), dtype=np.float32
        )

    def reset(self, seed=None):
        obs, info = self.env.reset(seed=seed)
        return obs[0].flatten(), info

    def step(self, action):
        # Create dummy actions for other agents (required by multi-agent API)
        obs, rewards, terminated, truncated, info = self.env.step([action])
        done = terminated or truncated
        return obs[0].flatten(), rewards[0], done, False, info

In [2]:

class ActorCritic(nn.Module):
    def __init__(self, input_dim, hidden_dim, action_dim):
        super().__init__()
        self.shared = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU()
        )
        self.actor = nn.Linear(256, action_dim)
        self.critic = nn.Linear(256, 1)

    def forward(self, x):
        x = self.shared(x)
        return self.actor(x), self.critic(x)


In [3]:
class PPO:
    def __init__(self, env, lr=3e-4, gamma=0.99, epsilon=0.2, 
                 ent_coef=0.01, batch_size=64, n_epochs=10):
        self.env = env
        self.gamma = gamma
        self.epsilon = epsilon
        self.ent_coef = ent_coef
        self.k_epochs = n_epochs
        self.batch_size = batch_size
        self.n_epochs = n_epochs
        
        obs_dim = self.env.observation_space.shape[0]
        self.model = ActorCritic(obs_dim, 256, self.env.action_space.n)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.mse_loss = nn.MSELoss()

    def get_action(self, state):
        state = torch.FloatTensor(state)
        logits, value = self.model(state)
        probs = Categorical(logits=logits)
        action = probs.sample()
        return action.item(), probs.log_prob(action), value.squeeze()

    def compute_gae(self, rewards, values, dones):
        gae = 0
        returns = []
        advantages = []
        
        next_value = 0
        for step in reversed(range(len(rewards))):
            delta = rewards[step] + self.gamma * next_value * (1 - dones[step]) - values[step]
            gae = delta + self.gamma * 0.95 * (1 - dones[step]) * gae
            next_value = values[step]
            returns.insert(0, gae + values[step])
            advantages.insert(0, gae)
        
        advantages = torch.tensor(advantages)
        return torch.tensor(returns), (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    def update(self, states, actions, log_probs, returns, advantages):
        for _ in range(self.k_epochs):
            for batch in self.get_batches(states, actions, log_probs, returns, advantages):
                state_batch, action_batch, old_log_probs_batch, return_batch, advantage_batch = batch

                # Calculate the new log probabilities and values
                new_log_probs, values, entropy = self.model(state_batch, action_batch)

                # Calculate the ratio
                ratio = torch.exp(new_log_probs - old_log_probs_batch)

                # Calculate the surrogate losses
                surr1 = ratio * advantage_batch
                surr2 = torch.clamp(ratio, 1 - self.epsilon, 1 + self.epsilon) * advantage_batch

                # Calculate the actor and critic losses
                actor_loss = -torch.min(surr1, surr2).mean()
                critic_loss = (return_batch - values).pow(2).mean()
                entropy_loss = entropy.mean()

                # Convert to float
                actor_loss = actor_loss.float()
                critic_loss = critic_loss.float()
                entropy_loss = entropy_loss.float()

                # Calculate the total loss
                loss = actor_loss + 0.5 * critic_loss - self.ent_coef * entropy_loss

                # Perform backpropagation
                self.optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(self.model.parameters(), 0.5)
                self.optimizer.step()

    def train(self, total_timesteps):
        states, actions, log_probs, rewards, dones, values = [], [], [], [], [], []
        episode_rewards = []
        state, _ = self.env.reset()

        for _ in range(total_timesteps):
            action, log_prob, value = self.get_action(state)
            next_state, reward, done, _, _ = self.env.step(action)

            states.append(state)
            actions.append(action)
            log_probs.append(log_prob.item())
            rewards.append(reward)
            dones.append(done)
            values.append(value.item())

            state = next_state
            if done:
                state, _ = self.env.reset()
                episode_rewards.append(sum(rewards))
                states, actions, log_probs, rewards, dones, values = [], [], [], [], [], []

            if len(states) >= self.batch_size:
                returns, advantages = self.compute_gae(rewards, values, dones)
                self.update(states, actions, log_probs, returns, advantages)
                states, actions, log_probs, rewards, dones, values = [], [], [], [], [], []

        return episode_rewards


In [None]:
# Training setup
env = SingleAgentRobotWarehouse()
ppo = PPO(env, lr=2.5e-4, gamma=0.99, epsilon=0.1, ent_coef=0.01)
rewards = ppo.train(total_timesteps=1_000_000)

# Plot results
plt.plot(rewards)
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('PPO Training Progress')
plt.show()

  logger.warn(


AttributeError: 'PPO' object has no attribute 'get_batches'