Solving package delivery with a single-agent DQN and a naive feature representation: all features are concatenated into a single state vector, and the joint action of all robots is encoded as one discrete action (base-15 digits, one digit per robot).

In [None]:
%%capture
# Clone the marl-delivery repository, make it the working directory and
# install its requirements. Later cells import `env` and load 'map2.txt'
# relative to the current directory (presumably both come from this checkout).
!git clone https://github.com/cuongtv312/marl-delivery.git
%cd marl-delivery
!pip install -r requirements.txt

In [None]:
%%capture
!pip install stable-baselines3

In [None]:
from env import Environment
import gymnasium as gym
from gymnasium import spaces
import numpy as np

In [None]:
# TODO: Modify this one to add more information to the Agents
def convert_state(state, max_time=100, max_pkg=20):
    """Flatten an environment state dict into a fixed-length float32 vector.

    Layout of the returned vector:
      * 3 values per robot: row / max_row, col / max_col, and the raw
        ``carrying`` field (not normalised);
      * 5 values per package: start row/col and target row/col (normalised
        the same way) plus the fraction of the delivery window still left;
        the package section is zero-padded or truncated to ``max_pkg``
        packages;
      * 1 value: ``time_step / max_time``.

    Args:
        state: dict with keys "map" (2-D grid), "time_step", "robots"
            (sequences indexed [row, col, carrying]) and "packages"
            (sequences where indices 1-4 are start/target coordinates,
            5 is the start time and 6 is the deadline).
        max_time: scale for the time-step feature. The default of 100 is kept
            for backward compatibility; episodes longer than that yield a
            value above 1.
        max_pkg: number of package slots in the output.

    Returns:
        1-D ``np.float32`` array of length ``3 * n_robots + 5 * max_pkg + 1``.
    """
    grid = state["map"]
    time_step = state["time_step"]
    max_row = len(grid) - 1
    # Guard only against an empty grid; a single-row map still has columns.
    max_col = len(grid[0]) - 1 if len(grid) > 0 else 0

    def _scale(value, upper):
        # Degenerate maps (one row/column, or empty) would otherwise divide by zero.
        return value / upper if upper > 0 else 0

    # Robots: [row, col, carrying] -> normalised position + raw carrying field.
    robots = []
    for r in state["robots"]:
        robots.extend([_scale(r[0], max_row), _scale(r[1], max_col), r[2]])

    # Packages: [start_row, start_col, target_row, target_col, time_left].
    packages = []
    for p in state["packages"]:
        start_time, deadline = p[5], p[6]
        # The epsilon keeps the ratio finite when deadline == start_time.
        time_left = max(0, deadline - time_step) / (deadline - start_time + 1e-5)
        packages.extend([
            _scale(p[1], max_row), _scale(p[2], max_col),
            _scale(p[3], max_row), _scale(p[4], max_col),
            time_left,
        ])

    # Fix the package section at exactly max_pkg * 5 values.
    width = max_pkg * 5
    packages = packages[:width] + [0] * max(0, width - len(packages))

    # Every part is float32 so the concatenation is not upcast to float64
    # (a plain Python list here would silently promote the whole vector).
    return np.concatenate([
        np.asarray(robots, dtype=np.float32),
        np.asarray(packages, dtype=np.float32),
        np.asarray([time_step / max_time], dtype=np.float32),
    ])

In [None]:
# TODO: Modify this one to make the agent learn faster

def reward_shaping(r, env, state, action):
    """Add dense shaping terms to the environment reward.

    Shaping terms, evaluated on the current contents of ``env``:
      * a robot that is carrying gets ``0.1 * (1 - dist / env.n_rows)`` where
        ``dist`` is its Euclidean distance to the carried package's target;
      * a robot that is not carrying gets ``0.05 * (1 - dist / 10)`` for every
        waiting, already-released package whose start is closer than 5 cells;
      * a robot that is not carrying but issued a move (L/R/U/D) is charged 0.01.

    Args:
        r: reward returned by the environment for this step.
        env: object exposing ``robots`` (each with ``position`` and
            ``carrying``), ``packages`` (each with ``start``, ``target``,
            ``status``, ``start_time``), ``n_rows`` and ``t``.
        state: previous state dict; accepted for interface compatibility and
            not used.
        action: one ``(move, package_op)`` pair per robot, in robot order.

    Returns:
        ``r`` plus the sum of the shaping terms.
    """
    additional = 0
    for robot in env.robots:
        if robot.carrying:
            # `carrying` is used as a 1-based index into env.packages.
            pkg = env.packages[robot.carrying - 1]
            dist = np.linalg.norm(np.array(robot.position) - np.array(pkg.target))
            additional += 0.1 * (1 - dist / env.n_rows)
        else:
            # Encourage idle robots to approach nearby packages awaiting pickup.
            for p in env.packages:
                if p.status == 'waiting' and p.start_time <= env.t:
                    dist = np.linalg.norm(np.array(robot.position) - np.array(p.start))
                    if dist < 5:
                        additional += 0.05 * (1 - dist / 10)

    # Penalise movement by robots that carry nothing. Each action is paired
    # with its own robot; previously every action was judged against whichever
    # robot the loop above happened to visit last.
    for robot, act in zip(env.robots, action):
        if act[0] in ('L', 'R', 'U', 'D') and not robot.carrying:
            additional -= 0.01

    return r + additional

In [None]:
# Avoid modifying the Env class.
# If a change is necessary, describe it clearly in the report and in the code.
class Env(gym.Env):
    """Gymnasium wrapper exposing the delivery ``Environment`` to a single agent.

    The joint action of all robots is encoded as one integer in
    ``Discrete(15 ** n_robots)``: a base-15 number with one digit per robot
    (most significant digit = robot 0). Each digit is
    ``move_index * 3 + package_op_index``.

    Observations are the flat vectors produced by ``convert_state``; rewards
    are passed through ``reward_shaping``.
    """

    # Index -> label tables. The order is alphabetical, which is the order the
    # earlier sklearn LabelEncoder-based decoding produced (LabelEncoder sorts
    # its classes), so action indices keep the same meaning as before.
    _MOVES = ('D', 'L', 'R', 'S', 'U')
    _PKG_OPS = ('0', '1', '2')

    def __init__(self, *args, **kwargs):
        """Create the wrapped ``Environment`` (all arguments are forwarded) and
        derive the action and observation spaces from it."""
        super().__init__()
        self.env = Environment(*args, **kwargs)
        self.n_robots = self.env.n_robots  # number of robots being controlled

        # 5 moves x 3 package operations = 15 choices per robot.
        self.actions_per_robot = len(self._MOVES) * len(self._PKG_OPS)
        self.total_action = self.actions_per_robot ** self.n_robots

        # One flat Discrete space (rather than MultiDiscrete) so DQN can be used.
        self.action_space = spaces.Discrete(self.total_action)

        # Reset once to learn the length of the observation vector.
        self.prev_state = self.env.reset()
        first_state = convert_state(self.prev_state)
        self.observation_space = spaces.Box(
            low=0, high=100, shape=first_state.shape, dtype=np.float32
        )

    def reset(self, *args, **kwargs):
        """Reset the wrapped environment and return ``(observation, info)``.

        Any arguments (e.g. ``seed``/``options``) are accepted and ignored.
        """
        self.prev_state = self.env.reset()
        return convert_state(self.prev_state), {}

    def render(self, *args, **kwargs):
        """Delegate rendering to the wrapped environment."""
        return self.env.render()

    def _decode_action(self, action):
        """Turn one joint-action integer into a list of (move, package_op) pairs."""
        # Accept Python ints, NumPy scalars and size-1 arrays alike.
        remaining = int(np.asarray(action).item())
        decoded = []
        for _ in range(self.n_robots):
            remaining, robot_action = divmod(remaining, self.actions_per_robot)
            move_code, package_code = divmod(robot_action, len(self._PKG_OPS))
            decoded.append((self._MOVES[move_code], self._PKG_OPS[package_code]))
        # Digits come out least-significant first; robot 0 owns the most
        # significant digit, so reverse into robot order.
        decoded.reverse()
        return decoded

    def step(self, action):
        """Apply one joint action.

        Returns the Gymnasium 5-tuple
        ``(observation, shaped_reward, done, False, infos)``; ``truncated`` is
        always False and ``infos`` is passed through unmodified.
        """
        decoded_actions = self._decode_action(action)

        # You should not modify the infos object
        s, r, done, infos = self.env.step(decoded_actions)
        new_r = reward_shaping(r, self.env, self.prev_state, decoded_actions)
        self.prev_state = s
        return convert_state(s), new_r, done, False, infos

In [None]:
from stable_baselines3 import DQN
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import EvalCallback

def _make_env():
    """Build one delivery environment with this experiment's fixed settings.

    The positional arguments are forwarded to ``Environment`` unchanged
    (presumably map file, max time steps, robots, packages, reward terms and
    seed; confirm against env.py).
    """
    return Env('map2.txt', 1000, 5, 100, -0.01, 10., 1., 10)


# Ten vectorised copies for training, plus one monitored copy for evaluation.
vec_env = make_vec_env(_make_env, n_envs=10)
eval_env = Monitor(_make_env())

# Periodic evaluation; the best model seen so far is written to ./dqn_best/.
# NOTE(review): eval_freq appears to be counted in callback calls, not total
# timesteps, so with 10 envs this likely means every 50,000 timesteps. Confirm.
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path="./dqn_best/",
    eval_freq=5000,
    deterministic=True,
)

# DQN hyperparameters, gathered in one place for easy tuning.
dqn_settings = dict(
    learning_starts=2000,
    buffer_size=100000,
    batch_size=128,
    learning_rate=1e-5,
    gamma=0.99,
    exploration_final_eps=0.02,
    policy_kwargs=dict(net_arch=[256, 256]),
    verbose=1,
)
model = DQN("MlpPolicy", vec_env, **dqn_settings)

# Train, then persist the final weights.
model.learn(total_timesteps=100000, callback=eval_callback)
model.save("dqn_delivery_model")

Using cuda device
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1e+03    |
|    ep_rew_mean      | 174      |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 247      |
|    time_elapsed     | 40       |
|    total_timesteps  | 10000    |
| train/              |          |
|    learning_rate    | 1e-05    |
|    loss             | 0.418    |
|    n_updates        | 199      |
----------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 174      |
| time/              |          |
|    episodes        | 8        |
|    fps             | 247      |
|    time_elapsed    | 40       |
|    total_timesteps | 10000    |
---------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1e+03    |
|    ep_rew_

In [None]:
# Roll out one full episode with the trained model and print the final info
# dict. The loop stops on either end-of-episode flag: Gymnasium reports
# termination and truncation separately, and watching only the first would
# loop forever if the episode ended by truncation.
obs, _ = eval_env.reset()
done = False
while not done:
    action, _states = model.predict(obs)
    obs, reward, terminated, truncated, info = eval_env.step(action)
    #print('='*10)
    #eval_env.unwrapped.env.render()
    done = terminated or truncated

print(info)

{'total_reward': -1.2000000000000006, 'total_time_steps': 1000, 'episode': {'r': 831.698656, 'l': 1000, 't': 706.00212}}


In [None]:
# Unwrap Monitor -> Env -> underlying Environment to reach the package list.
raw_env = eval_env.env.env
pkgs = raw_env.packages

# Tally packages by outcome: delivered on time, delivered late, not delivered.
# Caveat: "on time" is decided by comparing the environment's final clock
# (raw_env.t) with the deadline, not the moment of delivery, because no
# delivery timestamp is read here. A package delivered before its deadline is
# therefore counted as late once the clock has moved past that deadline.
on_time = late = undelivered = 0
for pkg in pkgs:
    if pkg.status != 'delivered':
        undelivered += 1
    elif raw_env.t <= pkg.deadline:
        on_time += 1
    else:
        late += 1

print(f"Số gói giao đúng hạn: {on_time}")
print(f"Số gói giao trễ hạn:   {late}")
print(f"Số gói chưa được giao:  {undelivered}")

Số gói giao đúng hạn: 0
Số gói giao trễ hạn:   0
Số gói chưa được giao:  100


In [None]:
# Run the repository's main.py with seed 10, 1000 time steps, map2.txt,
# 5 agents and 100 packages (presumably a baseline to compare against; confirm).
!python main.py --seed 10 --max_time_steps 1000 --map map2.txt --num_agents 5 --n_packages 100

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1	0	0	0	0	0	1	1	1	1	1	1	1	0	0	0	0	0	0	1
1	0	0	0	0	0	1	1	1	1	1	1	1	0	0	0	0	0	0	1
1	0	0	0	0	0	1	1	1	1	1	1	1	0	0	0	0	0	0	1
1	0	0	0	0	0	1	1	1	1	1	1	1	0	0	0	0	0	0	1
1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1
State robot:  [(5, 3, 0), (5, 2, 0), (9, 17, 72), (3, 17, 0), (3, 16, 5)]
N robots =  5
Actions =  [('L', '1'), ('S', '1'), ('S', '2'), ('L', '1'), ('R', '2')]
[7, 7, 72, 5, 5]
1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1
1	0	0	0	0	0	1	1	1	1	1	1	1	0	0	0	0	0	0	1
1	0	0	0	0	0	1	1	1	1	1	1	1	0	0	0	0	0	0	1
1	0	0	0	0	0	1	1	1	1	1	1	1	0	0	0	R4	R3	0	1
1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	1
1	0	R1	R0	0	0	1	1	1	1	1	1	1	0	0	0	0	0	0	1
1	0	0	0	0	0	1	1	1	1	1	1	1	0	0	0	0	0	0	1
1	0	0	0	0	0	1	1	1	1	1	1	1	0	0	0	0	0	0	1
1	0	0	0	0	0	1	1	1	1	1	1	1	0	0	0	0	0	0	1
1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	R2	0	1
1	0	0	0	0	0	1	1	1	1	1	1	1	0	0	0	0	0	0	1
1	0	0	0	0	0	1	1	1	1	1	1	1	0	0	0	0	0	0	1
1	0	0	0	0	0	1	1	1	1	1	1	1	0	0	0	0	0	0	1
1	0	0	0	0	0	1	1	1	1	1	1	1	0	0	0	0

In [None]:
# Record the installed stable-baselines3 version for reproducibility.
!pip freeze | grep stable_baselines3

stable_baselines3==2.6.0
