Solving Package delivery using single-agent PPO with a naive feature representation learning: concatenante all the feature in to a single state vector, and multiple robot actions as a multi discrete distribution.

In [1]:
%%capture
# !git clone https://github.com/cuongtv312/marl-delivery.git
%cd marl-delivery
# !uv add -r requirements.txt

In [None]:
%%capture
!pip install stable-baselines3

In [2]:
%%capture
!pip install gymnasium

In [5]:
from env import Environment
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import torch
from collections import deque

In [3]:
env = Environment('map.txt', 10, 2, 5)
state = env.reset()
print(state["robots"])
print(state["packages"])
print(state["map"])

[(5, 4, 0), (5, 3, 0)]
[(1, 6, 5, 4, 3, 0, 26), (2, 2, 2, 2, 3, 0, 22), (3, 4, 5, 5, 6, 0, 22)]
[[1, 1, 1, 1, 1, 1, 1], [1, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 1], [1, 1, 1, 1, 1, 1, 1]]


In [4]:
def convert_state(state, max_robots=20, max_packages=1000, device='cuda'):
    """
    Convert raw state dict to input tensor dict for CNNPolicy.
    The tensors are concatenated together as a single input for the model.
    """
    # Convert map to (1, 1, H, W)
    map_tensor = torch.tensor(state["map"], dtype=torch.float32, device=device).unsqueeze(0).unsqueeze(0)

    # Process robots (each tuple has 3 values)
    robots = state["robots"]
    robot_array = np.zeros((max_robots, 3), dtype=np.float32)
    for i, robot in enumerate(robots[:max_robots]):
        robot_array[i] = np.array(robot, dtype=np.float32)
    robot_tensor = torch.tensor(robot_array.flatten(), device=device)  # shape (max_robots*3,)

    # Process packages (each tuple has 7 values)
    packages = state["packages"]
    package_array = np.zeros((max_packages, 7), dtype=np.float32)
    for i, pkg in enumerate(packages[:max_packages]):
        package_array[i] = np.array(pkg, dtype=np.float32)
    package_tensor = torch.tensor(package_array.flatten(), device=device)  # shape (max_packages*7,)

    # Time step as tensor (1,)
    time_tensor = torch.tensor([state["time_step"]], dtype=torch.float32, device=device)

    # Concatenate all tensors
    combined_input = torch.cat([
        map_tensor.flatten(),  # Flatten map (1, 1, H, W) -> (H*W,)
        robot_tensor,          # shape (max_robots*3,)
        package_tensor,        # shape (max_packages*7,)
        time_tensor            # shape (1,)
    ], dim=-1).unsqueeze(0)  # Final shape: (1, H*W + max_robots*3 + max_packages*7 + 1)

    return combined_input


In [6]:
def reward_shaping(original_reward, env: Environment, prev_state, actions):
    shaped_reward = original_reward
    shaping_factor = 0.5

    for i, robot in enumerate(env.robots):
        prev_pos = tuple(prev_state['robots'][i][:2])
        curr_pos = robot.position
        grid = env.load_map()  # Giả sử đây là 2D list (0: trống, 1: tường)

        if robot.carrying:
            pkg = env.packages[robot.carrying - 1]
            d_prev = bfs_distance(prev_pos, pkg.target, grid)
            d_curr = bfs_distance(curr_pos, pkg.target, grid)
            if d_curr < d_prev:
                shaped_reward += shaping_factor
            else:
                shaped_reward -= 0.2
        else:
            for pkg in env.packages:
                if pkg.status == 'waiting':
                    d_prev = bfs_distance(prev_pos, pkg.start, grid)
                    d_curr = bfs_distance(curr_pos, pkg.start, grid)
                    if d_curr < d_prev:
                        shaped_reward += shaping_factor * 0.5
                    break

        if curr_pos == prev_pos:
            shaped_reward -= 0.1

        for pkg in env.packages:
            if pkg.status == 'delivering' and pkg.picked_by == i:
                waiting_time = pkg.pick_time - pkg.start_time if pkg.pick_time else 0
                shaped_reward += max(0, 1.0 - 0.01 * waiting_time)

    return shaped_reward
def bfs_distance(start, goal, grid):
    rows, cols = len(grid), len(grid[0])
    visited = set()
    queue = deque([(start, 0)])

    while queue:
        (x, y), dist = queue.popleft()
        if (x, y) == goal:
            return dist

        for dx, dy in [(-1,0), (1,0), (0,-1), (0,1)]:
            nx, ny = x + dx, y + dy
            if 0 <= nx < rows and 0 <= ny < cols:
                if grid[nx][ny] == 0 and (nx, ny) not in visited:  # 0 là ô trống
                    visited.add((nx, ny))
                    queue.append(((nx, ny), dist + 1))
    
    return float('inf')  # Không tìm được đường đi


In [7]:
env = Environment('map.txt', 10, 2, 5)
state = env.reset()
print(state["robots"])
print(state["packages"])
# print(state["map"])

# print(env.load_map())
print(env.step([('L', 1),('U',0)]))

[(5, 4, 0), (5, 3, 0)]
[(1, 6, 5, 4, 3, 0, 26), (2, 2, 2, 2, 3, 0, 22), (3, 4, 5, 5, 6, 0, 22)]
({'time_step': 1, 'map': [[1, 1, 1, 1, 1, 1, 1], [1, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 1], [1, 1, 1, 1, 1, 1, 1]], 'robots': [(5, 3, 0), (4, 3, 0)], 'packages': []}, -0.02, False, {})


In [8]:
# Avoid to modify the Env class,
# If it is neccessary, you should describe those changes clearly in report and code
class Env(gym.Env):
    def __init__(self, *args, **kwargs):
        super(Env, self).__init__()
        self.env = Environment(*args, **kwargs)

        self.action_space = spaces.multi_discrete.MultiDiscrete([5, 3]*self.env.n_robots)


        self.prev_state = self.env.reset()
        first_state=convert_state(self.prev_state)
        # Define observation space as a dictionary

        self.observation_space = spaces.Box(low=0, high=100, shape=first_state.shape, dtype=np.float32)


        from sklearn.preprocessing import LabelEncoder
        self.le1, self.le2= LabelEncoder(), LabelEncoder()
        self.le1.fit(['S', 'L', 'R', 'U', 'D'])
        self.le2.fit(['0','1', '2'])

    def reset(self, *args, **kwargs):
        self.prev_state = self.env.reset()
        return convert_state(self.prev_state), {}

    def render(self, *args, **kwargs):
        return self.env.render()

    def step(self, action):
        ret = []
        ret.append(self.le1.inverse_transform(action.reshape(-1, 2).T[0]))
        ret.append(self.le2.inverse_transform(action.reshape(-1, 2).T[1]))
        action = list(zip(*ret))

        # You should not modify the infos object
        s, r, done, infos = self.env.step(action)
        new_r = reward_shaping(r, self.env, self.prev_state, action)
        self.prev_state = s
        return convert_state(s), new_r, \
            done, False, infos

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import numpy as np

# ==== Hyperparameters ====
DEVICE = torch.device("cuda")
GAMMA = 0.99
LR = 2.5e-4
CLIP_EPS = 0.2
UPDATE_EPOCHS = 4
MAX_TIMESTEPS = 1000
ENTROPY_COEF = 0.01
VALUE_COEF = 0.5

# ==== CNN Policy ====
class CNNPolicy(nn.Module):
    def __init__(self, act_dim):
        super(CNNPolicy, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))  # Tạo output luôn có size (B, C, 1, 1)

        self.fc1 = nn.Linear(64, 256)
        self.fc2 = nn.Linear(256, act_dim)
        self.value_head = nn.Linear(256, 1)

    def forward(self, state):
        x = torch.relu(self.conv1(state))
        x = torch.relu(self.conv2(x))
        x = self.global_pool(x)  # output: (B, 64, 1, 1)
        x = x.squeeze(-1).squeeze(-1)  # output: (B, 64)
        x = torch.relu(self.fc1(x))

        action_probs = torch.softmax(self.fc2(x), dim=-1)
        state_value = self.value_head(x)
        return action_probs, state_value


# ==== Buffer ====
class RolloutBuffer:
    def __init__(self):
        self.states, self.actions, self.logprobs = [], [], []
        self.rewards, self.dones, self.values = [], [], []

    def clear(self):
        self.__init__()

# ==== PPO Agent ====
class PPOAgent:
    def __init__(self,act_dim):
        self.policy = CNNPolicy(act_dim).to(DEVICE)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=LR)
        self.buffer = RolloutBuffer()

    def select_action(self, state):
        state_3d = state.unsqueeze(0).unsqueeze(0)
        # state_tensor = torch.FloatTensor(state).unsqueeze(0).unsqueeze(0).to(DEVICE)
        probs, value = self.policy(state_3d)
        dist = Categorical(probs)
        action = dist.sample()

        self.buffer.states.append(state_3d)
        self.buffer.actions.append(action)
        self.buffer.logprobs.append(dist.log_prob(action))
        self.buffer.values.append(value.squeeze())

        return action.item()

    def compute_returns_and_advantages(self, next_value):
        returns, advs = [], []
        gae = 0
        values = self.buffer.values + [next_value]
        for i in reversed(range(len(self.buffer.rewards))):
            delta = self.buffer.rewards[i] + GAMMA * values[i + 1] * (1 - self.buffer.dones[i]) - values[i]
            gae = delta + GAMMA * gae * (1 - self.buffer.dones[i])
            advs.insert(0, gae)
            returns.insert(0, gae + values[i])
        return returns, advs

    def update(self, next_value):
        returns, advs = self.compute_returns_and_advantages(next_value)

        states = torch.cat(self.buffer.states).to(DEVICE)
        actions = torch.tensor(self.buffer.actions).to(DEVICE)
        old_logprobs = torch.stack(self.buffer.logprobs).detach().to(DEVICE)
        returns = torch.tensor(returns).detach().unsqueeze(1).to(DEVICE)
        advs = torch.tensor(advs).detach().unsqueeze(1).to(DEVICE)

        for _ in range(UPDATE_EPOCHS):
            probs, values = self.policy(states)
            dist = Categorical(probs)
            logprobs = dist.log_prob(actions)
            entropy = dist.entropy().mean()

            ratio = torch.exp(logprobs - old_logprobs)
            surr1 = ratio * advs
            surr2 = torch.clamp(ratio, 1 - CLIP_EPS, 1 + CLIP_EPS) * advs
            actor_loss = -torch.min(surr1, surr2).mean()
            critic_loss = nn.MSELoss()(values, returns)

            loss = actor_loss + VALUE_COEF * critic_loss - ENTROPY_COEF * entropy

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        self.buffer.clear()

# ==== Training Loop ====
def train(env):
    obs_shape = env.observation_space.shape  # (H, W)
    act_dim = int(np.prod(env.action_space.nvec))

    agent = PPOAgent(act_dim)

    for episode in range(10):
        state, _ = env.reset()
        episode_reward = 0

        for t in range(MAX_TIMESTEPS):
            action_flat = agent.select_action(state)
            print(action_flat)
            action = np.unravel_index(action_flat, env.action_space.nvec)
            action = np.array(action)

            next_state, reward, done, _, _ = env.step(action)
            agent.buffer.rewards.append(reward)
            agent.buffer.dones.append(done)

            state = next_state
            episode_reward += reward

            if done:
                break

        with torch.no_grad():
            next_state_tensor = next_state.unsqueeze(0).unsqueeze(0)
            _, next_value = agent.policy(next_state_tensor)
        agent.update(next_value.squeeze())
        print(f"Episode {episode}, Reward: {episode_reward}")

    return agent

In [11]:
# Example usage
env = Env('map2.txt', 100, 5, 20, -0.01, 10., 1., 10)

state = env.reset()
print(len(state))
model = train(env)

2
126381
532055
457905
315133
159436
707697
203187
421280
122151
17843
100828
418593
268933
175739
573221
680143
489842
544234
495906
526937
651024
267410
396176
6758
677667
510727
572351
59510
133945
171678
48370
421637
151592
735567
348736
41701
384294
608294
194444
211570
45564
370620
641121
687926
173749
174803
289855
194869
567231
67765
200326
162303
320127
49607
743674
346534
284657
715131
14409
69605
657914
260405
285437
468340
116039
514507
246254
464569
226258
118011
326781
218568
719004
330586
403383
640816
538744
749320
16274
351407
704479
18892
363201
171536
573223
707640
552374
592835
25966
390250
55
466239
401310
288182
480327
533935
330310
390735
281130
267849
Episode 0, Reward: -1.3300000000000027
267310
115487
558763
385633
108915
640946
68783
575436
594262
412217
309029
141186
123588
346769
446309
74471
572318
168131
240803
261835
296914
180429
56082
566706
335097
420219
620846
65292
531438
721244
304328
174770
610628
70392
305310
43865
470698
523775
487933
617745
688

In [36]:
def evaluate(agent, env, episodes=10, render=False):
    total_rewards = []

    for episode in range(episodes):
        state, _ = env.reset()
        episode_reward = 0

        for _ in range(MAX_TIMESTEPS):
            # Không cần lưu logprob hay value
            state_tensor = state.unsqueeze(0).unsqueeze(0)
            with torch.no_grad():
                action_probs, _ = agent.policy(state_tensor)
            action = torch.argmax(action_probs, dim=-1).item()
            action = np.unravel_index(action, env.action_space.nvec)
            action = np.array(action)
            print(action)
            next_state, reward, done, _, _ = env.step(action)
            state = next_state
            episode_reward += reward

            if render:
                env.render()

            if done:
                break

        total_rewards.append(episode_reward)
    avg_reward = np.mean(total_rewards)
    return avg_reward


In [37]:
eval_env = Env('map1.txt', 100, 5, 100, -0.01, 10., 1., 10)
model.policy.eval()
ev = evaluate(model, eval_env, render=False)
print(ev)

[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1

In [None]:
!pip freeze | grep stable_baselines3