Solving Package delivery using single-agent PPO with a naive feature representation learning: concatenante all the feature in to a single state vector, and multiple robot actions as a multi discrete distribution.

In [2]:
# %%capture
# !git clone https://github.com/cuongtv312/marl-delivery.git
%cd marl-delivery

[Errno 2] No such file or directory: 'marl-delivery'
/home/hungmanh/home_work/RL/marl-delivery


In [3]:
from env import Environment
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import torch
from collections import deque

In [4]:
env = Environment('map.txt', 10, 2, 5)
state = env.reset()
print(state["robots"])
print(state["packages"])
print(state["map"])

[(5, 4, 0), (5, 3, 0)]
[(1, 6, 5, 4, 3, 0, 26), (2, 2, 2, 2, 3, 0, 22), (3, 4, 5, 5, 6, 0, 22)]
[[1, 1, 1, 1, 1, 1, 1], [1, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 1], [1, 1, 1, 1, 1, 1, 1]]


In [5]:
def convert_state(state):
    """
    Chuyển đổi trạng thái đầu vào thành 2 phần: 
        - map_tensor: 2D numpy array (giữ nguyên spatial shape)
        - feature_vector: vector phẳng từ robots, packages và time_step

    Không normalize, không padding.

    Args:
        state (dict): Dictionary chứa:
            "map": list 2D
            "robots": list các tuple (x, y, status)
            "packages": list các tuple (id, pickup_x, pickup_y, dropoff_x, dropoff_y, appear_time, deadline)
            "time_step": int

    Returns:
        map_tensor (np.ndarray): shape (n, n)
        feature_vector (np.ndarray): vector 1D float32
    """
    current_time = state.get("time_step", 0)

    # Dữ liệu bản đồ giữ nguyên shape (n, n)
    map_tensor = np.array(state["map"], dtype=np.float32)

    # Robot features: (x, y, status)
    robot_features = []
    for robot_x, robot_y, status in state["robots"]:
        robot_features.extend([robot_x, robot_y, float(status)])

    # Package features: (pickup_x, pickup_y, dropoff_x, dropoff_y, appear_time, deadline, is_active)
    package_features = []
    for pkg in sorted(state["packages"], key=lambda p: (p[3], p[0])):  # sort by dropoff_x then id
        _, px, py, dx, dy, t_appear, t_deadline = pkg
        is_active = 1.0 if (current_time >= t_appear and current_time < t_deadline) else 0.0
        package_features.extend([px, py, dx, dy, t_appear, t_deadline, is_active])

    # Gộp robot + package + time_step
    feature_vector = np.array(
        robot_features + package_features + [float(current_time)],
        dtype=np.float32
    )

    return map_tensor, feature_vector


In [7]:
map_tensor, feature_vector = convert_state(state)
print(map_tensor)
print(feature_vector)

[[1. 1. 1. 1. 1. 1. 1.]
 [1. 0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0. 1.]
 [1. 1. 1. 1. 1. 1. 1.]]
[ 5.  4.  0.  5.  3.  0.  2.  2.  2.  3.  0. 22.  1.  6.  5.  4.  3.  0.
 26.  1.  4.  5.  5.  6.  0. 22.  1.  0.]


In [8]:
from collections import deque

def reward_shaping(original_reward, env, prev_state, actions):
    shaped_reward = original_reward
    shaping_factor = 0.5

    map_grid = env.load_map()

    for i, robot in enumerate(env.robots):
        prev_pos = tuple(prev_state['robots'][i][:2])
        curr_pos = robot.position

        # --- 1. Thưởng/Penalty di chuyển hợp lý ---
        if robot.carrying:
            pkg = env.packages[robot.carrying - 1]
            d_prev = bfs_distance(prev_pos, pkg.target, map_grid)
            d_curr = bfs_distance(curr_pos, pkg.target, map_grid)
            if d_curr < d_prev:
                shaped_reward += shaping_factor
            else:
                shaped_reward -= 0.3  # đi xa mục tiêu
        else:
            # tìm gói hàng gần nhất đang chờ
            waiting_pkgs = [p for p in env.packages if p.status == 'waiting']
            if waiting_pkgs:
                pkg = min(waiting_pkgs, key=lambda p: bfs_distance(prev_pos, p.start, map_grid))
                d_prev = bfs_distance(prev_pos, pkg.start, map_grid)
                d_curr = bfs_distance(curr_pos, pkg.start, map_grid)
                if d_curr < d_prev:
                    shaped_reward += shaping_factor * 0.3
                else:
                    shaped_reward -= 0.1

        # --- 2. Phạt đứng yên ---
        if curr_pos == prev_pos:
            shaped_reward -= 0.2

        # --- 3. Phạt va chạm tường hoặc bước không hợp lệ ---
        if not (0 <= curr_pos[0] < len(map_grid) and 0 <= curr_pos[1] < len(map_grid[0])):
            shaped_reward -= 1.0  # đi ra ngoài bản đồ
        elif map_grid[curr_pos[0]][curr_pos[1]] == 1:
            shaped_reward -= 0.8  # đụng tường

        # --- 4. Giao hàng trễ bị phạt ---
        for pkg in env.packages:
            if pkg.status == 'delivering' and pkg.picked_by == i:
                waiting_time = pkg.pick_time - pkg.start_time if pkg.pick_time else 0
                shaped_reward += max(0.0, 1.0 - 0.01 * waiting_time)

            if pkg.status == 'delivered' and pkg.delivered_by == i:
                if env.time_step > pkg.deadline:
                    shaped_reward -= 1.0  # giao trễ
                else:
                    shaped_reward += 2.0  # giao đúng hạn

        # --- 5. Tránh va chạm giữa các robot ---
        for j, other_robot in enumerate(env.robots):
            if i != j and curr_pos == other_robot.position:
                shaped_reward -= 0.5  # phạt va chạm

        # --- 6. Hành động không hiệu quả ---
        if not robot.carrying and actions[i] == 'WAIT':
            shaped_reward -= 0.1

    return shaped_reward


def bfs_distance(start, goal, grid):
    rows, cols = len(grid), len(grid[0])
    visited = set()
    queue = deque([(start, 0)])
    while queue:
        (x, y), dist = queue.popleft()
        if (x, y) == goal:
            return dist
        for dx, dy in [(-1,0), (1,0), (0,-1), (0,1)]:
            nx, ny = x + dx, y + dy
            if 0 <= nx < rows and 0 <= ny < cols:
                if grid[nx][ny] == 0 and (nx, ny) not in visited:
                    visited.add((nx, ny))
                    queue.append(((nx, ny), dist + 1))
    return float('inf')


In [9]:
env = Environment('map.txt', 10, 2, 5)
state = env.reset()
print(state["robots"])
print(state["packages"])
# print(state["map"])
# print(env.load_map())
print(env.step([('L', 1),('U',0)]))

[(5, 4, 0), (5, 3, 0)]
[(1, 6, 5, 4, 3, 0, 26), (2, 2, 2, 2, 3, 0, 22), (3, 4, 5, 5, 6, 0, 22)]
({'time_step': 1, 'map': [[1, 1, 1, 1, 1, 1, 1], [1, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 1], [1, 1, 1, 1, 1, 1, 1]], 'robots': [(5, 3, 0), (4, 3, 0)], 'packages': [(1, 6, 5, 4, 3, 0, 26), (2, 2, 2, 2, 3, 0, 22), (3, 4, 5, 5, 6, 0, 22)]}, -0.02, False, {})


In [None]:
# Avoid to modify the Env class,
# If it is neccessary, you should describe those changes clearly in report and code
class Env(gym.Env):
    def __init__(self, *args, **kwargs):
        super(Env, self).__init__()
        self.env = Environment(*args, **kwargs)

        self.action_space = spaces.multi_discrete.MultiDiscrete([5, 3]*self.env.n_robots)


        self.prev_state = self.env.reset()
        first_state=convert_state(self.prev_state)
        map_shape = first_state[0].shape
        feature_shape = first_state[1].shape

        # Define observation space as a dictionary
        self.observation_space = spaces.Dict({
            "map": spaces.Box(low=0, high=100, shape=map_shape, dtype=np.float32),
            "feature": spaces.Box(low=0, high=100, shape=feature_shape, dtype=np.float32)
        })

        from sklearn.preprocessing import LabelEncoder
        self.le1, self.le2= LabelEncoder(), LabelEncoder()
        self.le1.fit(['S', 'L', 'R', 'U', 'D'])
        self.le2.fit(['0','1', '2'])

    def reset(self, *args, **kwargs):
        self.prev_state = self.env.reset()
        return convert_state(self.prev_state), {}

    def render(self, *args, **kwargs):
        return self.env.render()

    def step(self, action):
        ret = []
        ret.append(self.le1.inverse_transform(action.reshape(-1, 2).T[0]))
        ret.append(self.le2.inverse_transform(action.reshape(-1, 2).T[1]))
        action = list(zip(*ret))

        # You should not modify the infos object
        s, r, done, infos = self.env.step(action)
        new_r = reward_shaping(r, self.env, self.prev_state, action)
        self.prev_state = s
        return convert_state(s), new_r, \
            done, False, infos

In [19]:
example_env = Env('map.txt', 10, 2, 5)
x, y, done, infos = example_env.reset()


(array([[1., 1., 1., 1., 1., 1., 1.],
       [1., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 1.],
       [1., 1., 1., 1., 1., 1., 1.]], dtype=float32), array([ 5.,  4.,  0.,  5.,  3.,  0.,  2.,  2.,  2.,  3.,  0., 22.,  1.,
        6.,  5.,  4.,  3.,  0., 26.,  1.,  4.,  5.,  5.,  6.,  0., 22.,
        1.,  0.], dtype=float32))


ValueError: not enough values to unpack (expected 4, got 2)

In [18]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import numpy as np

DEVICE = torch.device("cpu")
LR = 1e-4
GAMMA = 0.99
CLIP_EPS = 0.2
UPDATE_EPOCHS = 10
VALUE_COEF = 0.5
ENTROPY_COEF = 0.01
MAX_TIMESTEPS = 1000
# ==== MAPPO Hyperparameters ====
MAX_ROBOTS = 20
MAX_PACKAGES = 1000
MAX_TIMESTEPS = 1000

# ==== MAPPO Policy ====
class MAPPOPolicy(nn.Module):
    def __init__(self, state_dim, n_agents, move_action_dim=5, pkg_action_dim=3, hidden_dim=256):
        super().__init__()
        self.n_agents = n_agents
        self.move_action_dim = move_action_dim
        self.pkg_action_dim = pkg_action_dim

        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)

        self.move_heads = nn.ModuleList([nn.Linear(hidden_dim, move_action_dim) for _ in range(n_agents)])
        self.pkg_heads = nn.ModuleList([nn.Linear(hidden_dim, pkg_action_dim) for _ in range(n_agents)])
        self.value_head = nn.Linear(hidden_dim, 1)

    def forward(self, state):
        # state: (batch, state_dim)
        x = torch.relu(self.fc1(state))
        x = torch.relu(self.fc2(x))
        move_logits = [head(x) for head in self.move_heads]  # [(batch, move_action_dim)] * n_agents
        pkg_logits = [head(x) for head in self.pkg_heads]    # [(batch, pkg_action_dim)] * n_agents
        value = self.value_head(x)
        move_logits = torch.stack(move_logits, dim=1)  # (batch, n_agents, move_action_dim)
        pkg_logits = torch.stack(pkg_logits, dim=1)    # (batch, n_agents, pkg_action_dim)
        return move_logits, pkg_logits, value

# ==== MAPPO Buffer ====
class MAPPOBuffer:
    def __init__(self):
        self.states, self.move_actions, self.pkg_actions = [], [], []
        self.logprobs_move, self.logprobs_pkg = [], []
        self.rewards, self.dones, self.values = [], [], []

    def clear(self):
        self.__init__()

# ==== MAPPO Agent ====
class MAPPOAgent:
    def __init__(self, state_dim, n_agents, move_action_dim=5, pkg_action_dim=3):
        self.policy = MAPPOPolicy(state_dim, n_agents, move_action_dim, pkg_action_dim).to(DEVICE)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=LR)
        self.buffer = MAPPOBuffer()
        self.n_agents = n_agents

    def select_action(self, state):
        # state: (1, state_dim)
        move_logits, pkg_logits, value = self.policy(state)
        move_dists = [Categorical(logits=move_logits[0, i]) for i in range(self.n_agents)]
        pkg_dists = [Categorical(logits=pkg_logits[0, i]) for i in range(self.n_agents)]
        move_actions = [dist.sample() for dist in move_dists]
        pkg_actions = [dist.sample() for dist in pkg_dists]
        logprobs_move = [dist.log_prob(a) for dist, a in zip(move_dists, move_actions)]
        logprobs_pkg = [dist.log_prob(a) for dist, a in zip(pkg_dists, pkg_actions)]

        self.buffer.states.append(state)
        self.buffer.move_actions.append(torch.stack(move_actions))
        self.buffer.pkg_actions.append(torch.stack(pkg_actions))
        self.buffer.logprobs_move.append(torch.stack(logprobs_move))
        self.buffer.logprobs_pkg.append(torch.stack(logprobs_pkg))
        self.buffer.values.append(value.squeeze())

        # Return as numpy for env.step
        actions = [(move.item(), pkg.item()) for move, pkg in zip(move_actions, pkg_actions)]
        return np.array(actions)

    def compute_returns_and_advantages(self, next_value):
        returns, advs = [], []
        gae = 0
        values = self.buffer.values + [next_value]
        for i in reversed(range(len(self.buffer.rewards))):
            delta = self.buffer.rewards[i] + GAMMA * values[i + 1] * (1 - self.buffer.dones[i]) - values[i]
            gae = delta + GAMMA * gae * (1 - self.buffer.dones[i])
            advs.insert(0, gae)
            returns.insert(0, gae + values[i])
        return returns, advs

    def update(self, next_value):
        returns, advs = self.compute_returns_and_advantages(next_value)

        states = torch.cat(self.buffer.states).to(DEVICE)
        move_actions = torch.stack(self.buffer.move_actions).to(DEVICE)
        pkg_actions = torch.stack(self.buffer.pkg_actions).to(DEVICE)
        old_logprobs_move = torch.stack(self.buffer.logprobs_move).detach().to(DEVICE)
        old_logprobs_pkg = torch.stack(self.buffer.logprobs_pkg).detach().to(DEVICE)
        returns = torch.tensor(returns).detach().unsqueeze(1).to(DEVICE)
        advs = torch.tensor(advs).detach().unsqueeze(1).to(DEVICE)

        for _ in range(UPDATE_EPOCHS):
            move_logits, pkg_logits, values = self.policy(states)
            loss_actor, loss_critic, entropy = 0, 0, 0

            for i in range(self.n_agents):
                move_dist = Categorical(logits=move_logits[:, i])
                pkg_dist = Categorical(logits=pkg_logits[:, i])
                logprob_move = move_dist.log_prob(move_actions[:, i])
                logprob_pkg = pkg_dist.log_prob(pkg_actions[:, i])
                entropy += move_dist.entropy().mean() + pkg_dist.entropy().mean()

                ratio_move = torch.exp(logprob_move - old_logprobs_move[:, i])
                ratio_pkg = torch.exp(logprob_pkg - old_logprobs_pkg[:, i])
                surr1 = (ratio_move + ratio_pkg) * advs.squeeze()
                surr2 = torch.clamp(ratio_move + ratio_pkg, 1 - CLIP_EPS, 1 + CLIP_EPS) * advs.squeeze()
                loss_actor += -torch.min(surr1, surr2).mean()

            loss_critic = nn.MSELoss()(values, returns)
            loss = loss_actor + VALUE_COEF * loss_critic - ENTROPY_COEF * entropy

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        self.buffer.clear()

# ==== Training Loop ====
def train_mappo(env, convert_state, n_agents, episodes=5):
    # Get state_dim from convert_state
    state, _ = env.reset()
    state_tensor = convert_state(state)
    state_dim = state_tensor.shape[1]
    agent = MAPPOAgent(state_dim, n_agents)
    for episode in range(episodes):
        state, _ = env.reset()
        state_tensor = convert_state(state)
        episode_reward = 0
        for t in range(MAX_TIMESTEPS):
            action = agent.select_action(state_tensor)
            next_state, reward, done, _, _ = env.step(action)
            agent.buffer.rewards.append(reward)
            agent.buffer.dones.append(done)
            state_tensor = convert_state(next_state)
            episode_reward += reward
            if done:
                break
        with torch.no_grad():
            _, _, next_value = agent.policy(state_tensor)
        agent.update(next_value.squeeze())
        print(f"Episode {episode}, Reward: {episode_reward}")
    return agent

In [19]:
# Training
env = Env('map2.txt', 100, 5, 20, -0.01, 10., 1., 10)
n_agents = env.env.n_robots  # or set manually
trained_agent = train_mappo(env, convert_state, n_agents, episodes=5)

# Testing in another environment
test_env = Env('map1.txt', 100, 5, 20, -0.01, 10., 1., 10)
state, _ = test_env.reset()
state_tensor = convert_state(state)
with torch.no_grad():
    move_logits, pkg_logits, _ = trained_agent.policy(state_tensor)
    move_actions = torch.argmax(move_logits, dim=-1)
    pkg_actions = torch.argmax(pkg_logits, dim=-1)
    actions = [(move.item(), pkg.item()) for move, pkg in zip(move_actions[0], pkg_actions[0])]
print(actions)

IndexError: too many indices for tensor of dimension 2

In [36]:
def evaluate(agent, env, episodes=10, render=False):
    total_rewards = []

    for episode in range(episodes):
        state, _ = env.reset()
        episode_reward = 0

        for _ in range(MAX_TIMESTEPS):
            # Không cần lưu logprob hay value
            state_tensor = state.unsqueeze(0).unsqueeze(0)
            with torch.no_grad():
                action_probs, _ = agent.policy(state_tensor)
            action = torch.argmax(action_probs, dim=-1).item()
            action = np.unravel_index(action, env.action_space.nvec)
            action = np.array(action)
            print(action)
            next_state, reward, done, _, _ = env.step(action)
            state = next_state
            episode_reward += reward

            if render:
                env.render()

            if done:
                break

        total_rewards.append(episode_reward)
    avg_reward = np.mean(total_rewards)
    return avg_reward


In [37]:
eval_env = Env('map1.txt', 100, 5, 100, -0.01, 10., 1., 10)
model.policy.eval()
ev = evaluate(model, eval_env, render=False)
print(ev)

[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1 2 2 1 4 2]
[1 0 4 2 1

In [None]:
!pip freeze | grep stable_baselines3