Solving Package delivery using single-agent PPO with a naive feature representation learning: concatenante all the feature in to a single state vector, and multiple robot actions as a multi discrete distribution.

In [37]:
# %%capture
# !git clone https://github.com/cuongtv312/marl-delivery.git
%cd marl-delivery

[Errno 2] No such file or directory: 'marl-delivery'
/home/hungmanh/home_work/RL/marl-delivery


In [38]:
from env import Environment
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import torch
from collections import deque

In [56]:
MAX_PACKAGES = 100  # Số lượng gói hàng tối đa bạn muốn hỗ trợ

def convert_state(state):
    current_time = state.get("time_step", 0)
    map_tensor = np.array(state["map"], dtype=np.float32)

    # Robot features: (x, y, status)
    robot_features = []
    for robot_x, robot_y, status in state["robots"]:
        robot_features.extend([robot_x, robot_y, float(status)])

    # Package features: (pickup_x, pickup_y, dropoff_x, dropoff_y, appear_time, deadline, is_active)
    package_features = []
    for pkg in sorted(state["packages"], key=lambda p: (p[3], p[0])):
        _, px, py, dx, dy, t_appear, t_deadline = pkg
        is_active = 1.0 if (current_time >= t_appear and current_time < t_deadline) else 0.0
        package_features.extend([px, py, dx, dy, t_appear, t_deadline, is_active])

    # Padding cho đủ MAX_PACKAGES
    n_pkgs = len(state["packages"])
    pkg_feature_len = 7
    if n_pkgs < MAX_PACKAGES:
        package_features += [0.0] * ((MAX_PACKAGES - n_pkgs) * pkg_feature_len)

    feature_vector = np.array(
        robot_features + package_features + [float(current_time)],
        dtype=np.float32
    )

    return map_tensor, feature_vector

In [67]:
def reward_shaping(r, env, state, action):
    additional = 0
    for robot in env.robots:
        if robot.carrying:
            pkg = env.packages[robot.carrying-1]
            # Thưởng theo khoảng cách tới đích
            dist = np.linalg.norm(np.array(robot.position) - np.array(pkg.target))
            additional += 0.1 * (1 - dist/env.n_rows)
        else:
            # Khuyến khích đi gần điểm xuất phát của package
            for p in env.packages:
                if p.status == 'waiting' and p.start_time <= env.t:
                    dist = np.linalg.norm(np.array(robot.position) - np.array(p.start))
                    additional += 0.05 * (1 - dist/10) if dist < 5 else 0
    
    # Phạt hành động di chuyển không cần thiết
    for act in action:
        if act[0] in ['L','R','U','D'] and not robot.carrying:
            additional -= 0.01
    
    return r + additional

In [54]:
# Avoid to modify the Env class,
# If it is neccessary, you should describe those changes clearly in report and code
class Env(gym.Env):
    def __init__(self, *args, **kwargs):
        super(Env, self).__init__()
        self.env = Environment(*args, **kwargs)

        self.action_space = spaces.multi_discrete.MultiDiscrete([5, 3]*self.env.n_robots)
        self.n_agents = self.env.n_robots 

        self.prev_state = self.env.reset()
        map, feature=convert_state(self.prev_state)
        # Define observation space as a dictionary

        self.observation_space = spaces.Dict({
            "map": spaces.Box(low=0, high=100, shape=map.shape, dtype=np.float32),
            "feature": spaces.Box(low=0, high=100, shape=feature.shape, dtype=np.float32)
        })


        from sklearn.preprocessing import LabelEncoder
        self.le1, self.le2= LabelEncoder(), LabelEncoder()
        self.le1.fit(['S', 'L', 'R', 'U', 'D'])
        self.le2.fit(['0','1', '2'])

    def reset(self, *args, **kwargs):
        self.prev_state = self.env.reset()
        return convert_state(self.prev_state), {}

    def render(self, *args, **kwargs):
        return self.env.render()

    def step(self, action):
        ret = []
        ret.append(self.le1.inverse_transform(action.reshape(-1, 2).T[0]))
        ret.append(self.le2.inverse_transform(action.reshape(-1, 2).T[1]))
        action = list(zip(*ret))

        # You should not modify the infos object
        s, r, done, infos = self.env.step(action)
        new_r = reward_shaping(r, self.env, self.prev_state, action)
        self.prev_state = s
        return convert_state(s), new_r, \
            done, False, infos

In [22]:
env = Env('map2.txt', 100, 5, 20, -0.01, 10., 1., 10)
env.observation_space

Dict('feature': Box(0.0, 100.0, (58,), float32), 'map': Box(0.0, 100.0, (20, 20), float32))

In [None]:
# env = Env('map1.txt', 1000, 5, 100,-0.01, 10., 1., 10)
eval_env = Env('map2.txt',1000, 5, 100,-0.01, 10., 1., 10)


while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, _, info = eval_env.step(action)
    #print('='*10)
    #eval_env.unwrapped.env.render()
    if dones:
        break

print(info)

state_convert = env.reset()

print(state_convert)

((array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]], dtype=float32), array([ 8.,  8.,  0.,  6.,  8.,  0.,  2.,  7.,  0.,  9.,  4.,  0.,  4.,
        4.,  0.,  2.,  3.,  3.,  3.,  0., 34.,  1.,  9.,  2.,  3.,  3.,
        0., 17.,  1.,  9.,  7.,  5.,  4.,  0., 25.,  1.,  3.,  7.,  6.,
        3.,  0., 22.,  1.,  8.,  6.,  7.,  8.,  0., 21.,  1.,  6.,  8.,
        8.,  8.,  0., 16.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0

In [58]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
import random
from collections import deque
 
# CNN cho việc xử lý map tensor
class CNNEncoder(nn.Module):
    def __init__(self, input_channels=1, map_size=20):
        super(CNNEncoder, self).__init__()
        self.conv1 = nn.Conv2d(input_channels, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1)
        
        # Tính toán kích thước đầu ra sau các lớp conv
        output_size = map_size // 2  # Sau stride=2 ở conv3
        self.fc = nn.Linear(64 * output_size * output_size, 256)
    
    def forward(self, x):
        # x shape: [batch_size, 1, map_size, map_size]
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = x.view(x.size(0), -1)  # Flatten
        x = F.relu(self.fc(x))
        return x

# MLP cho việc xử lý feature vector
class MLPEncoder(nn.Module):
    def __init__(self, input_dim):
        super(MLPEncoder, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 256)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return x

# Actor network
class Actor(nn.Module):
    def __init__(self, map_size, feature_dim, n_move_actions, n_status_actions, n_agents):
        super(Actor, self).__init__()
        self.cnn_encoder = CNNEncoder(input_channels=1, map_size=map_size)
        self.mlp_encoder = MLPEncoder(feature_dim)
        self.fc_combine = nn.Linear(256 + 256, 256)
        self.move_heads = nn.ModuleList([nn.Linear(256, n_move_actions) for _ in range(n_agents)])
        self.status_heads = nn.ModuleList([nn.Linear(256, n_status_actions) for _ in range(n_agents)])

    def forward(self, map_tensor, feature_vector):
        batch_size = map_tensor.size(0)
        map_tensor = map_tensor.unsqueeze(1)
        map_features = self.cnn_encoder(map_tensor)
        feature_features = self.mlp_encoder(feature_vector)
        combined = torch.cat([map_features, feature_features], dim=1)
        combined = F.relu(self.fc_combine(combined))
        move_probs = [F.softmax(head(combined), dim=-1) for head in self.move_heads]
        status_probs = [F.softmax(head(combined), dim=-1) for head in self.status_heads]
        return move_probs, status_probs

# Critic network
class Critic(nn.Module):
    def __init__(self, map_size, feature_dim):
        super(Critic, self).__init__()
        self.cnn_encoder = CNNEncoder(input_channels=1, map_size=map_size)
        self.mlp_encoder = MLPEncoder(feature_dim)
        
        # Kết hợp đầu ra của CNN và MLP
        self.fc_combine = nn.Linear(256 + 256, 256)
        self.fc_value = nn.Linear(256, 1)
    
    def forward(self, map_tensor, feature_vector):
        # Xử lý map qua CNN
        map_tensor = map_tensor.unsqueeze(1)  # Thêm kênh input
        map_features = self.cnn_encoder(map_tensor)
        
        # Xử lý feature vector qua MLP
        feature_features = self.mlp_encoder(feature_vector)
        
        # Kết hợp đặc trưng
        combined = torch.cat([map_features, feature_features], dim=1)
        combined = F.relu(self.fc_combine(combined))
        
        # Tính toán giá trị
        value = self.fc_value(combined)
        
        return value

# MAPPO Agent
class MAPPOAgent:
    def __init__(self, map_size, feature_dim, n_move_actions, n_status_actions,n_agents,
                 actor_lr=3e-4, critic_lr=1e-3, gamma=0.99, 
                 gae_lambda=0.95, clip_param=0.2, value_coef=0.5, 
                 entropy_coef=0.01, max_grad_norm=0.5):
        
        self.actor = Actor(map_size, feature_dim, n_move_actions, n_status_actions,n_agents)
        self.critic = Critic(map_size, feature_dim)
        
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)
        
        self.gamma = gamma
        self.gae_lambda = gae_lambda
        self.clip_param = clip_param
        self.value_coef = value_coef
        self.entropy_coef = entropy_coef
        self.max_grad_norm = max_grad_norm
        
        self.n_agents = n_agents
        self.n_move_actions = n_move_actions
        self.n_status_actions = n_status_actions
    
    def get_action(self, map_tensor, feature_vector):
        map_tensor = torch.FloatTensor(map_tensor).unsqueeze(0)
        feature_vector = torch.FloatTensor(feature_vector).unsqueeze(0)
        with torch.no_grad():
            move_probs, status_probs = self.actor(map_tensor, feature_vector)
            value = self.critic(map_tensor, feature_vector)
        actions = []
        log_probs = []
        for move_p, status_p in zip(move_probs, status_probs):
            move_dist = Categorical(move_p)
            status_dist = Categorical(status_p)
            move_a = move_dist.sample()
            status_a = status_dist.sample()
            actions.append([move_a.item(), status_a.item()])
            log_probs.append(move_dist.log_prob(move_a).item() + status_dist.log_prob(status_a).item())
        return np.array(actions), log_probs, value.item()
    
    def evaluate_actions(self, map_tensors, feature_vectors, actions):
        move_probs, status_probs = self.actor(map_tensors, feature_vectors)
        values = self.critic(map_tensors, feature_vectors)

        action_log_probs = []
        entropy = 0

        # actions shape: [batch, n_agents, 2]
        # Tách move và status action cho từng agent
        move_actions = actions[:, :, 0]
        status_actions = actions[:, :, 1]

        for i in range(self.n_agents):
            move_dist = Categorical(move_probs[i])
            status_dist = Categorical(status_probs[i])
            move_log_prob = move_dist.log_prob(move_actions[:, i])
            status_log_prob = status_dist.log_prob(status_actions[:, i])
            action_log_probs.append(move_log_prob + status_log_prob)
            entropy += (move_dist.entropy().mean() + status_dist.entropy().mean()) / 2

        action_log_probs = torch.stack(action_log_probs, dim=1)  # [batch, n_agents]

        return values, action_log_probs, entropy / self.n_agents
    
    def update(self, memories):
        # Trích xuất dữ liệu từ bộ nhớ
        maps = []
        features = []
        actions = []
        old_log_probs = []
        rewards = []
        masks = []
        values = []
        
        for memory in memories:
            maps.append(torch.FloatTensor(np.array(memory.maps)))
            features.append(torch.FloatTensor(np.array(memory.features)))
            actions.append(torch.LongTensor(np.array(memory.actions)))
            old_log_probs.append(torch.FloatTensor(np.array(memory.log_probs)))
            rewards.append(torch.FloatTensor(np.array(memory.rewards)))
            masks.append(torch.FloatTensor(np.array(memory.masks)))
            values.append(torch.FloatTensor(np.array(memory.values)))
        
        maps = torch.cat(maps)
        features = torch.cat(features)
        actions = torch.cat(actions)
        old_log_probs = torch.cat(old_log_probs)
        rewards = torch.cat(rewards)
        masks = torch.cat(masks)
        values = torch.cat(values)
        
        # Tính toán returns và advantages sử dụng GAE
        returns = torch.zeros_like(rewards)
        advantages = torch.zeros_like(rewards)
        
        last_value = self.critic(maps[-1].unsqueeze(0), features[-1].unsqueeze(0)).detach()
        last_gae_lam = 0
        
        for t in reversed(range(len(rewards))):
            if t == len(rewards) - 1:
                next_value = last_value
            else:
                next_value = values[t + 1]
            
            next_non_terminal = masks[t]
            delta = rewards[t] + self.gamma * next_value * next_non_terminal - values[t]
            last_gae_lam = delta + self.gamma * self.gae_lambda * next_non_terminal * last_gae_lam
            advantages[t] = last_gae_lam
        
        returns = advantages + values
        
        # Chuẩn hóa lợi thế
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
        # Đảm bảo shape [batch, 1]
        if advantages.dim() == 1:
            advantages = advantages.unsqueeze(1)
        # Lặp lại cho từng agent để có shape [batch, n_agents]
        advantages = advantages.expand(-1, self.n_agents)

        # Tối ưu hóa policy and value networks
        for _ in range(10):  # K epochs
            values, action_log_probs, entropy = self.evaluate_actions(maps, features, actions)
            # action_log_probs: [batch, n_agents], old_log_probs: [batch, n_agents]
            ratios = torch.exp(action_log_probs - old_log_probs)
            # Surrogate loss
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1.0 - self.clip_param, 1.0 + self.clip_param) * advantages
            policy_loss = -torch.min(surr1, surr2).mean()
            value_loss = F.mse_loss(values, returns.unsqueeze(-1))
            entropy_loss = -entropy.mean()
            
            # Tổng loss
            loss = policy_loss + self.value_coef * value_loss + self.entropy_coef * entropy_loss
            
            # Gradient descent
            self.actor_optimizer.zero_grad()
            self.critic_optimizer.zero_grad()
            loss.backward()
            
            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm)
            torch.nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm)
            
            self.actor_optimizer.step()
            self.critic_optimizer.step()
        
        return policy_loss.item(), value_loss.item(), entropy_loss.item()

# Lớp Memory để lưu trữ kinh nghiệm
class Memory:
    def __init__(self):
        self.maps = []
        self.features = []
        self.actions = []
        self.log_probs = []
        self.rewards = []
        self.masks = []
        self.values = []
    
    def push(self, map_tensor, feature_vector, action, log_prob, reward, mask, value):
        self.maps.append(map_tensor)
        self.features.append(feature_vector)
        self.actions.append(action)
        self.log_probs.append(log_prob)
        self.rewards.append(reward)
        self.masks.append(mask)
        self.values.append(value)
    
    def clear(self):
        self.maps = []
        self.features = []
        self.actions = []
        self.log_probs = []
        self.rewards = []
        self.masks = []
        self.values = []

# Hàm huấn luyện
def train(env, agent, num_episodes=1000, max_steps=100, update_interval=2048):
    memories = [Memory() for _ in range(env.env.n_robots)]
    global_step = 0
    episode_rewards = []
    
    for episode in range(num_episodes):
        obs, _ = env.reset()
        map_tensor, feature_vector = obs
        
        episode_reward = 0
        step = 0
        
        while step < max_steps:
            # Chọn hành động
            actions, log_probs, value = agent.get_action(map_tensor, feature_vector)
            
            # Thực hiện hành động
            next_obs, reward, terminated, truncated, info = env.step(actions)
            next_map_tensor, next_feature_vector = next_obs
            
            # Lưu trữ kinh nghiệm
            mask = 1.0 - float(terminated or truncated)
            for i in range(env.n_agents):
                memories[i].push(
                    map_tensor,
                    feature_vector,
                    actions,
                    log_probs,
                    reward,  # Reward trung bình
                    mask,
                    value
                )
            
            # Cập nhật
            if global_step % update_interval == 0 and global_step > 0:
                policy_loss, value_loss, entropy_loss = agent.update(memories)
                print(f"Episode {episode}, Step {step}, Policy Loss: {policy_loss:.4f}, Value Loss: {value_loss:.4f}, Entropy Loss: {entropy_loss:.4f}")
                
                # Xóa bộ nhớ sau khi cập nhật
                for memory in memories:
                    memory.clear()
            
            # Cập nhật trạng thái và reward
            map_tensor, feature_vector = next_map_tensor, next_feature_vector
            episode_reward += reward
            global_step += 1
            step += 1
            
            if terminated or truncated:
                break
        
        episode_rewards.append(episode_reward)
        
        # In thông tin
        if episode % 10 == 0:
            avg_reward = sum(episode_rewards[-10:]) / 10
            print(f"Episode {episode}, Avg Reward: {avg_reward:.4f}")
    
    return episode_rewards

def run_trained_model(env, agent, num_episodes=10):
    for episode in range(num_episodes):
        obs, _ = env.reset()
        map_tensor, feature_vector = obs
        
        episode_reward = 0
        step = 0
        
        while True:
            # Chọn hành động
            actions, _, _ = agent.get_action(map_tensor, feature_vector)
            
            # Thực hiện hành động
            next_obs, reward, terminated, truncated, info = env.step(actions)
            next_map_tensor, next_feature_vector = next_obs
            
            # Cập nhật trạng thái và reward
            map_tensor, feature_vector = next_map_tensor, next_feature_vector
            episode_reward += reward
            step += 1
            
            print(f"Step {step}, Action: {actions}, Reward: {reward}")
            
            if terminated or truncated:
                break
        
        print(f"Episode {episode}, Total Reward: {episode_reward}, Steps: {step}")

In [68]:
# Tạo môi trường
example_env = Env('map2.txt', 1000, 5, 20, -0.01, 10., 1., 10)
eval_env = Env('map1.txt', 1000, 5, 100, -0.01, 10., 1., 10)
# Kiểm tra môi trường
(obs, _) = example_env.reset()
map_tensor, feature_vector = obs
print("Map tensor shape:", map_tensor.shape)
print("Feature vector shape:", feature_vector.shape)

action = example_env.action_space.sample()
print("Sampled action:", action)

(next_obs, reward, terminated, truncated, info) = example_env.step(action)
next_map_tensor, next_feature_vector = next_obs
print("Reward:", reward)
print("Next map tensor shape:", next_map_tensor.shape)
print("Next feature vector shape:", next_feature_vector.shape)

# Tạo agent
map_size = map_tensor.shape[0]
feature_dim = feature_vector.shape[0]
n_actions = 4  # up, right, down, left
n_status_actions = 3  # S, L, R
n_agents = 5

agent = MAPPOAgent(map_size, feature_dim, n_actions,n_status_actions, n_agents=n_agents)

# Huấn luyện
print("\nBắt đầu huấn luyện...")
rewards = train(example_env, agent, num_episodes=3, max_steps=1000, update_interval=200)

# Chạy mô hình đã huấn luyện
print("\nChạy mô hình đã huấn luyện...")
run_trained_model(example_env, agent, num_episodes=1)

Map tensor shape: (20, 20)
Feature vector shape: (716,)
Sampled action: [4 2 0 0 1 2 3 0 0 2]
Reward: 0.1464200112061815
Next map tensor shape: (20, 20)
Next feature vector shape: (716,)

Bắt đầu huấn luyện...
Episode 0, Step 200, Policy Loss: -0.0219, Value Loss: 0.2977, Entropy Loss: -1.2336
Episode 0, Step 400, Policy Loss: -0.0358, Value Loss: 0.3716, Entropy Loss: -1.1858
Episode 0, Step 600, Policy Loss: -0.0328, Value Loss: 2.7015, Entropy Loss: -1.0796
Episode 0, Step 800, Policy Loss: -0.0425, Value Loss: 2.8436, Entropy Loss: -0.8483
Episode 0, Avg Reward: 20.2325
Episode 1, Step 0, Policy Loss: -0.0327, Value Loss: 27.5846, Entropy Loss: -0.8673
Episode 1, Step 200, Policy Loss: -0.0212, Value Loss: 0.9849, Entropy Loss: -1.2281
Episode 1, Step 400, Policy Loss: -0.0355, Value Loss: 0.4199, Entropy Loss: -1.1335
Episode 1, Step 600, Policy Loss: -0.0387, Value Loss: 0.3118, Entropy Loss: -1.0190
Episode 1, Step 800, Policy Loss: -0.0385, Value Loss: 2.1167, Entropy Loss: -0.

In [70]:
run_trained_model(eval_env, agent, num_episodes=1)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x1600 and 6400x256)

In [11]:
def test(env, agent, max_steps):
    obs, _ = env.reset()
    map_tensor, feature_vector = obs

    print("Map tensor shape:", map_tensor.shape)
    print("Feature vector shape:", feature_vector.shape)

    total_test_reward = 0
    step = 0
    done = False
    truncated = False

    while not done and not truncated and step < max_steps:
        actions, _, _ = agent.get_action(map_tensor, feature_vector)
        print(f"Actions: {actions}")
        obs, reward, done, truncated, info = env.step(actions)
        total_test_reward += reward
        step += 1

    print(f" Tổng phần thưởng :{total_test_reward:.2f} sau {step} bước.")


In [12]:
env = Env('map5.txt', 100, 10, 1000, -0.01, 10.0, 1.0, 10)
test(env, agent, max_steps=100)

NameError: name 'agent' is not defined

In [None]:
!pip freeze | grep stable_baselines3