Solving Package delivery using single-agent PPO with a naive feature representation learning: concatenante all the feature in to a single state vector, and multiple robot actions as a multi discrete distribution.

In [2]:
# %%capture
# !git clone https://github.com/cuongtv312/marl-delivery.git
%cd /home/hungmanh/home_work/RL/marl-delivery
!uv add -r requirements.txt

/home/hungmanh/home_work/RL/marl-delivery
[2mResolved [1m92 packages[0m [2min 6ms[0m[0m
[2mAudited [1m86 packages[0m [2min 0.16ms[0m[0m


In [2]:
%%capture
!uv add stable-baselines3

In [3]:
from env import Environment
import gymnasium as gym
from gymnasium import spaces
import numpy as np

In [4]:
N_MAX_SIZE = 20
M_MAX_AGENTS = 20 
P_MAX_PACKAGES = 1000 
MAX_TIME_STEPS = 2000 

def convert_state(state):
    """
    Chuyển đổi trạng thái đầu vào thành một vector NumPy phẳng, cố định kích thước,
    phù hợp để đưa vào thuật toán MAPPO (thường là cho Critic hoặc state toàn cục).

    Args:
        state (dict): Dictionary chứa thông tin trạng thái:
            "map": list 2 chiều (nxn)
            "robots": list các tuple (vị trí robot, trạng thái)
            "packages": list các tuple (id, vị trí lấy, vị trí giao, t_xuất_hiện, deadline)
            "time_step": int, bước thời gian hiện tại

    Returns:
        np.ndarray: Vector trạng thái đã được làm phẳng và chuẩn hóa.
    """
    ret_components = {} 

    current_time = state.get("time_step", 0) 

    time_feature = np.array([current_time / MAX_TIME_STEPS], dtype=np.float32)
    ret_components["time"] = time_feature

    game_map = np.array(state["map"], dtype=np.float32)
    n_map = game_map.shape[0]

    padded_map = np.zeros((N_MAX_SIZE, N_MAX_SIZE), dtype=np.float32)
    padded_map[:n_map, :n_map] = game_map
    ret_components["map"] = padded_map.flatten()

    robot_features_dim = 3
    robots_features = np.zeros(M_MAX_AGENTS * robot_features_dim, dtype=np.float32)
    
    pos_norm_factor = max(1, n_map - 1)

    for i in range(len(state["robots"])):
        if i >= M_MAX_AGENTS:
            break
        robot_pos1, robot_pos2, robot_status = state["robots"][i]
        offset = i * robot_features_dim

        robots_features[offset] = robot_pos1 / pos_norm_factor
        robots_features[offset+1] = robot_pos2 / pos_norm_factor

        if 0 <= robot_status <= 2: 
            robots_features[offset + 2 + int(robot_status)] = 1.0
            
    ret_components["robots"] = robots_features

    package_features_dim = 7
    packages_features = np.zeros(P_MAX_PACKAGES * package_features_dim, dtype=np.float32)

    sorted_packages = sorted(state["packages"], key=lambda p: (p[3], p[0]))

    for i in range(len(sorted_packages)):
        if i >= P_MAX_PACKAGES: 
            break
        
        pkg_id, pickup_loc1, pickup_loc2, dropoff_loc1, dropoff_loc2, appear_time, deadline_time = sorted_packages[i]
        offset = i * package_features_dim


        packages_features[offset] = pickup_loc1 / pos_norm_factor
        packages_features[offset+1] = pickup_loc2 / pos_norm_factor

        packages_features[offset+2] = dropoff_loc1 / pos_norm_factor
        packages_features[offset+3] = dropoff_loc2 / pos_norm_factor

        packages_features[offset+4] = appear_time / MAX_TIME_STEPS
        packages_features[offset+5] = deadline_time / MAX_TIME_STEPS
        

        is_active = 1.0 if (current_time >= appear_time and current_time < deadline_time) else 0.0
        packages_features[offset+6] = is_active
        
    ret_components["packages"] = packages_features
    
    final_vector = np.concatenate(
        [ret_components["time"], ret_components["map"], ret_components["robots"], ret_components["packages"]]
    )
    return final_vector.astype(np.float32)

In [5]:
env = Environment('map.txt', 100, 5, 100)
state = env.reset()
print(state["robots"])
print(state["packages"])
convert_state(state)


[(6, 5, 0), (5, 4, 0), (4, 4, 0), (2, 5, 0), (3, 5, 0)]
[(1, 4, 6, 6, 2, 0, 23), (2, 4, 3, 2, 4, 0, 29), (3, 3, 3, 5, 2, 0, 25), (4, 6, 3, 4, 3, 0, 22), (5, 4, 5, 2, 5, 0, 17), (6, 2, 6, 4, 5, 0, 22)]


array([0., 1., 1., ..., 0., 0., 0.], shape=(7461,), dtype=float32)

In [10]:

from collections import deque 
def reward_shaping(original_reward, env: Environment, prev_state, actions):
    shaped_reward = original_reward
    shaping_factor = 0.5

    for i, robot in enumerate(env.robots):
        prev_pos = tuple(prev_state['robots'][i][:2])
        curr_pos = robot.position
        grid = env.load_map()  # Giả sử đây là 2D list (0: trống, 1: tường)

        if robot.carrying:
            pkg = env.packages[robot.carrying - 1]
            d_prev = bfs_distance(prev_pos, pkg.target, grid)
            d_curr = bfs_distance(curr_pos, pkg.target, grid)
            if d_curr < d_prev:
                shaped_reward += shaping_factor
            else:
                shaped_reward -= 0.2
        else:
            for pkg in env.packages:
                if pkg.status == 'waiting':
                    d_prev = bfs_distance(prev_pos, pkg.start, grid)
                    d_curr = bfs_distance(curr_pos, pkg.start, grid)
                    if d_curr < d_prev:
                        shaped_reward += shaping_factor * 0.5
                    break

        if curr_pos == prev_pos:
            shaped_reward -= 0.1

        for pkg in env.packages:
            if pkg.status == 'delivering' and pkg.picked_by == i:
                waiting_time = pkg.pick_time - pkg.start_time if pkg.pick_time else 0
                shaped_reward += max(0, 1.0 - 0.01 * waiting_time)

    return shaped_reward
def bfs_distance(start, goal, grid):
    rows, cols = len(grid), len(grid[0])
    visited = set()
    queue = deque([(start, 0)])

    while queue:
        (x, y), dist = queue.popleft()
        if (x, y) == goal:
            return dist

        for dx, dy in [(-1,0), (1,0), (0,-1), (0,1)]:
            nx, ny = x + dx, y + dy
            if 0 <= nx < rows and 0 <= ny < cols:
                if grid[nx][ny] == 0 and (nx, ny) not in visited:  # 0 là ô trống
                    visited.add((nx, ny))
                    queue.append(((nx, ny), dist + 1))
    
    return float('inf')  # Không tìm được đường đi

In [8]:
# Avoid to modify the Env class,
# If it is neccessary, you should describe those changes clearly in report and code
class Env(gym.Env):
    def __init__(self, *args, **kwargs):
        super(Env, self).__init__()
        self.env = Environment(*args, **kwargs)

        self.action_space = spaces.multi_discrete.MultiDiscrete([5, 3]*self.env.n_robots)


        self.prev_state = self.env.reset()
        first_state=convert_state(self.prev_state)

        # Define observation space as a dictionary

        self.observation_space = spaces.Box(low=0, high=100, shape=first_state.shape, dtype=np.float32)


        from sklearn.preprocessing import LabelEncoder
        self.le1, self.le2= LabelEncoder(), LabelEncoder()
        self.le1.fit(['S', 'L', 'R', 'U', 'D'])
        self.le2.fit(['0','1', '2'])

    def reset(self, *args, **kwargs):
        self.prev_state = self.env.reset()
        return convert_state(self.prev_state), {}

    def render(self, *args, **kwargs):
        return self.env.render()

    def step(self, action):
        ret = []
        ret.append(self.le1.inverse_transform(action.reshape(-1, 2).T[0]))
        ret.append(self.le2.inverse_transform(action.reshape(-1, 2).T[1]))
        action = list(zip(*ret))

        # You should not modify the infos object
        s, r, done, infos = self.env.step(action)
        new_r = reward_shaping(r, self.env, self.prev_state, action)
        self.prev_state = s
        return convert_state(s), new_r, \
            done, False, infos

In [11]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import EvalCallback

# Parallel environments

vec_env = make_vec_env(lambda: Env('map2.txt', 100, 5, 20, -0.01, 10., 1., 10), n_envs=10)
eval_env = Monitor(Env('map2.txt', 100, 5, 20, -0.01, 10., 1., 10), "ppo_delivery")

eval_callback = EvalCallback(eval_env, best_model_save_path="best_model/",
                             log_path="logs/", eval_freq=5000,
                             deterministic=True, render=False)

model = PPO("MlpPolicy", vec_env, verbose=1)
model.learn(total_timesteps=10000, callback=eval_callback)
model.save("ppo_delivery")

Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 100      |
|    ep_rew_mean     | 66.7     |
| time/              |          |
|    fps             | 789      |
|    iterations      | 1        |
|    time_elapsed    | 25       |
|    total_timesteps | 20480    |
---------------------------------


In [7]:

import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import numpy as np

GAMMA = 0.99
CLIP_EPS = 0.2
LR = 3e-4
UPDATE_EPOCHS = 4
BATCH_SIZE = 64
MAX_TIMESTEPS = 1000

# Define CNN Policy

class CNNPolicy(nn.Module):
    def __init__(self, obs_dim, act_dim):
        super(CNNPolicy, self).__init__()

        # Adjusted for 1D convolution (since height is 1)
        self.conv1 = nn.Conv1d(1, 32, kernel_size=3, stride=1, padding=1)  # 1D convolution
        self.conv2 = nn.Conv1d(32, 64, kernel_size=3, stride=1, padding=1)  # 1D convolution
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)  # Max pooling for 1D data

        # Calculate the size after convolution and pooling for 1D
        self.fc1 = nn.Linear(64 * (obs_dim[1] // 2), 256)  # Adjusted to reflect the reduced size
        self.fc2 = nn.Linear(256, act_dim)
        self.value_head = nn.Linear(256, 1)

    def forward(self, state):
        # state is assumed to be in the shape (batch_size, 1, height, width)
        # Squeeze out the height dimension (which is 1) to make it (batch_size, 1, width)
        state = state.squeeze(2)  # This will remove the height dimension (1) from shape [batch_size, 1, 1, width] to [batch_size, 1, width]

        # Now state is of shape (batch_size, 1, width)
        x = torch.relu(self.conv1(state))  # Conv1d operation
        x = torch.relu(self.conv2(x))  # Conv1d operation
        x = self.pool(x)  # Apply max pooling to reduce size
        x = x.view(x.size(0), -1)  # Flatten the tensor for the fully connected layer
        x = torch.relu(self.fc1(x))

        # Action probabilities (policy)
        action_probs = torch.softmax(self.fc2(x), dim=-1)

        # Value estimation
        state_value = self.value_head(x)

        return action_probs, state_value




# Rollout Buffer to store PPO rollouts
class RolloutBuffer:
    def __init__(self):
        self.states, self.actions, self.logprobs = [], [], []
        self.rewards, self.dones, self.values = [], [], []

    def clear(self):
        self.__init__()

# PPO Agent that uses the CNNPolicy
class PPOAgent:
    def __init__(self, obs_dim, act_dim):
        self.policy = CNNPolicy(obs_dim, act_dim)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=LR)
        self.buffer = RolloutBuffer()

    def select_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).unsqueeze(0)  # Add batch and channel dimensions (1, 1, H, W)
        probs, value = self.policy(state)
        dist = Categorical(probs)
        action = dist.sample()
        self.buffer.states.append(state)
        self.buffer.actions.append(action)
        self.buffer.logprobs.append(dist.log_prob(action))
        self.buffer.values.append(value)
        return action.item()

    def compute_returns_and_advantages(self, next_value):
        returns, advs = [], []
        gae = 0
        values = self.buffer.values + [next_value]
        for i in reversed(range(len(self.buffer.rewards))):
            delta = self.buffer.rewards[i] + GAMMA * values[i+1] * (1 - self.buffer.dones[i]) - values[i]
            gae = delta + GAMMA * gae * (1 - self.buffer.dones[i])
            advs.insert(0, gae)
            returns.insert(0, gae + values[i])
        return returns, advs

    def update(self, next_value):
        returns, advs = self.compute_returns_and_advantages(next_value)

        states = torch.cat(self.buffer.states)
        actions = torch.tensor(self.buffer.actions)
        old_logprobs = torch.stack(self.buffer.logprobs).detach()
        returns = torch.tensor(returns).detach().unsqueeze(1)
        advs = torch.tensor(advs).detach().unsqueeze(1)

        for _ in range(UPDATE_EPOCHS):
            probs, values = self.policy(states)
            dist = Categorical(probs)
            logprobs = dist.log_prob(actions)
            ratio = torch.exp(logprobs - old_logprobs)
            
            surr1 = ratio * advs
            surr2 = torch.clamp(ratio, 1 - CLIP_EPS, 1 + CLIP_EPS) * advs
            actor_loss = -torch.min(surr1, surr2).mean()
            critic_loss = nn.MSELoss()(values, returns)

            loss = actor_loss + 0.5 * critic_loss

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        self.buffer.clear()

# Training Loop
def train(env):
    obs_dim = env.observation_space.shape  # (Height, Width)
    act_dim = int(np.prod(env.action_space.nvec))  # Flatten MultiDiscrete

    agent = PPOAgent(obs_dim, act_dim)

    for episode in range(1000):
        state, _ = env.reset()
        episode_reward = 0

        for t in range(MAX_TIMESTEPS):
            action_flat = agent.select_action(state)

            # Convert flat action back to multi-discrete
            action = np.unravel_index(action_flat, env.action_space.nvec)
            action = np.array(action)

            next_state, reward, done, _, _ = env.step(action)
            agent.buffer.rewards.append(reward)
            agent.buffer.dones.append(done)

            state = next_state
            episode_reward += reward

            if done:
                break

        with torch.no_grad():
            next_state_tensor = torch.FloatTensor(next_state).unsqueeze(0).unsqueeze(0)
            _, next_value = agent.policy(next_state_tensor)
        agent.update(next_value)
        print(f"Episode {episode}, Reward: {episode_reward}")


In [11]:
# Example usage
env = Env('map2.txt', 100, 5, 20, -0.01, 10., 1., 10)
obs_dim = env.observation_space.shape
print(obs_dim[1])
train(env)

571


KeyboardInterrupt: 

In [12]:
obs,_ = eval_env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, _, info = eval_env.step(action)
    # print('='*10)
    # eval_env.unwrapped.env.render()
    if dones:
        break

print(info)

{'total_reward': -3.1199999999999988, 'total_time_steps': 100, 'episode': {'r': 42.03, 'l': 100, 't': 64.221242}}


In [7]:
!pip freeze | grep stable_baselines3

stable_baselines3==2.6.0
