# In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random

# Actor Network
class Actor(nn.Module):
    """Deterministic policy network mapping a state to a scaled action.

    The network is a three-layer MLP (hidden width 64, ReLU activations)
    ending in ``tanh``; the squashed output is multiplied by
    ``action_limit``, so for a scalar limit every action component has
    magnitude at most ``abs(action_limit)``.

    Args:
        state_dim: Size of the last dimension of the input state.
        action_dim: Number of action components produced.
        action_limit: Factor applied to the ``tanh`` output.
    """

    def __init__(self, state_dim, action_dim, action_limit):
        super().__init__()
        self.action_limit = action_limit
        hidden = 64
        # Layers are created in the same order as they are applied, so the
        # parameter initialisation consumes the torch RNG in a fixed order.
        layers = [
            nn.Linear(state_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, action_dim),
            nn.Tanh(),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, state):
        """Return the scaled action for ``state`` (last dim ``state_dim``)."""
        squashed = self.fc(state)
        return squashed * self.action_limit

# Critic Network
class Critic(nn.Module):
    """Q-value network that scores a (state, action) pair with one scalar.

    The state and action are concatenated along their last (feature) axis
    and passed through a three-layer MLP (hidden width 64, ReLU
    activations) with a single linear output unit.

    Args:
        state_dim: Size of the last dimension of the state input.
        action_dim: Size of the last dimension of the action input.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_dim + action_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        )

    def forward(self, state, action):
        """Return the Q-value estimate, shaped ``(..., 1)``.

        ``state`` and ``action`` must agree on every dimension except the
        last; ``torch.cat`` raises otherwise.
        """
        # Join on the feature axis (-1) instead of a hard-coded dim=1: this
        # is identical for the usual (batch, features) inputs and also
        # accepts a single unbatched pair or extra leading dimensions.
        return self.fc(torch.cat([state, action], dim=-1))

# Replay Buffer
class ReplayBuffer:
    """Fixed-capacity store of transitions with uniform random sampling.

    Transitions are kept in a list used as a ring: once ``capacity``
    entries are stored, each new transition overwrites the oldest one.

    Args:
        capacity: Maximum number of transitions to keep; must be positive.

    Raises:
        ValueError: If ``capacity`` is not positive.
    """

    def __init__(self, capacity):
        # Fail fast: a non-positive capacity would otherwise only surface
        # as an IndexError on the first push.
        if capacity <= 0:
            raise ValueError(f"capacity must be positive, got {capacity!r}")
        self.buffer = []
        self.capacity = capacity
        self.position = 0  # index the next transition is written to

    def push(self, state, action, reward, next_state, done):
        """Store one transition, overwriting the oldest when full."""
        transition = (state, action, reward, next_state, done)
        if len(self.buffer) < self.capacity:
            # Still filling up: the write position is the end of the list.
            self.buffer.append(transition)
        else:
            self.buffer[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct transitions as stacked arrays.

        Returns:
            A ``(state, action, reward, next_state, done)`` tuple in which
            each element is the ``np.stack`` of that field over the batch.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        batch = random.sample(self.buffer, batch_size)
        # zip(*batch) regroups the transitions field by field.
        state, action, reward, next_state, done = map(np.stack, zip(*batch))
        return state, action, reward, next_state, done

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

# Example parameters (illustrative placeholder values)
state_dim = 10  # size of the state vector fed to the actor and critics
action_dim = 2  # number of action components produced by the actor
action_limit = 1  # factor applied to the actor's tanh output
replay_buffer_capacity = 10000  # maximum number of stored transitions
batch_size = 32  # unused in the visible code; presumably the ReplayBuffer.sample size — confirm

# Initialize networks and replay buffer.
# Construction order fixes how the global torch RNG is consumed, so
# reordering these lines changes the initial weights under a fixed seed.
actor = Actor(state_dim, action_dim, action_limit)
# Two critics with identical architecture; presumably for a clipped
# double-Q style update — confirm, no update code is present here.
critic1 = Critic(state_dim, action_dim)
critic2 = Critic(state_dim, action_dim)
replay_buffer = ReplayBuffer(replay_buffer_capacity)

# Note: This cell only defines and initializes the actor, two critics, and a replay buffer as a starting point for a multi-agent reinforcement learning setup; no update step is implemented here.
# A complete implementation still needs specifics such as the final network architecture, optimizers and learning rates, and the environment setup.


# In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random

# Actor Network with self-attention mechanism
class Actor(nn.Module):
    """Policy network with a self-attention layer between two linear layers.

    Pipeline: ``Linear(state_dim, 64)`` + ReLU, 4-head self-attention over
    the 64-dim embedding, ``Linear(64, action_dim)`` + ``tanh``, then
    scaling by ``action_limit``.

    Accepted input layouts:
        * ``(batch, state_dim)``: every row is an independent sample. Each
          row is treated as a length-1 sequence, so the action for one
          sample never depends on the other rows of the minibatch.
        * ``(seq_len, batch, state_dim)``: attention runs over the first
          axis (the default, non-``batch_first`` layout of
          ``nn.MultiheadAttention``), e.g. over several entities.

    Args:
        state_dim: Size of the last dimension of the input state.
        action_dim: Number of action components produced.
        action_limit: Factor applied to the ``tanh`` output.
    """

    def __init__(self, state_dim, action_dim, action_limit):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, 64)
        self.attention = nn.MultiheadAttention(embed_dim=64, num_heads=4)
        self.fc2 = nn.Linear(64, action_dim)
        self.action_limit = action_limit

    def forward(self, state):
        """Return the scaled action, shaped like ``state`` with last dim ``action_dim``."""
        x = torch.relu(self.fc1(state))
        # A 2-D tensor would be read by MultiheadAttention as ONE unbatched
        # sequence, making the samples of a minibatch attend to each other.
        # Insert a length-1 sequence axis so rows stay independent.
        flat_batch = x.dim() == 2
        if flat_batch:
            x = x.unsqueeze(0)  # (1, batch, 64): seq_len=1, batch=batch
        x, _ = self.attention(x, x, x)
        if flat_batch:
            x = x.squeeze(0)  # back to (batch, 64)
        action = torch.tanh(self.fc2(x))
        return action * self.action_limit

# Critic Network
class Critic(nn.Module):
    """Q-value network that scores a (state, action) pair with one scalar.

    The state and action are concatenated along their last (feature) axis
    and passed through a three-layer MLP (hidden width 64, ReLU
    activations) with a single linear output unit.

    Args:
        state_dim: Size of the last dimension of the state input.
        action_dim: Size of the last dimension of the action input.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_dim + action_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        )

    def forward(self, state, action):
        """Return the Q-value estimate, shaped ``(..., 1)``.

        ``state`` and ``action`` must agree on every dimension except the
        last; ``torch.cat`` raises otherwise.
        """
        # Join on the feature axis (-1) instead of a hard-coded dim=1: this
        # is identical for the usual (batch, features) inputs and also
        # accepts a single unbatched pair or extra leading dimensions.
        return self.fc(torch.cat([state, action], dim=-1))

# Replay Buffer
class ReplayBuffer:
    """Fixed-capacity store of transitions with uniform random sampling.

    Transitions are kept in a list used as a ring: once ``capacity``
    entries are stored, each new transition overwrites the oldest one.

    Args:
        capacity: Maximum number of transitions to keep; must be positive.

    Raises:
        ValueError: If ``capacity`` is not positive.
    """

    def __init__(self, capacity):
        # Fail fast: a non-positive capacity would otherwise only surface
        # as an IndexError on the first push.
        if capacity <= 0:
            raise ValueError(f"capacity must be positive, got {capacity!r}")
        self.buffer = []
        self.capacity = capacity
        self.position = 0  # index the next transition is written to

    def push(self, state, action, reward, next_state, done):
        """Store one transition, overwriting the oldest when full."""
        transition = (state, action, reward, next_state, done)
        if len(self.buffer) < self.capacity:
            # Still filling up: the write position is the end of the list.
            self.buffer.append(transition)
        else:
            self.buffer[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct transitions as stacked arrays.

        Returns:
            A ``(state, action, reward, next_state, done)`` tuple in which
            each element is the ``np.stack`` of that field over the batch.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        batch = random.sample(self.buffer, batch_size)
        # zip(*batch) regroups the transitions field by field.
        state, action, reward, next_state, done = map(np.stack, zip(*batch))
        return state, action, reward, next_state, done

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

# Example parameters (illustrative placeholder values)
state_dim = 10  # size of the state vector fed to the actor and critics
action_dim = 2  # number of action components produced by the actor
action_limit = 1  # factor applied to the actor's tanh output
replay_buffer_capacity = 10000  # maximum number of stored transitions
batch_size = 32  # unused in the visible code; presumably the ReplayBuffer.sample size — confirm

# Initialize networks and replay buffer.
# Construction order fixes how the global torch RNG is consumed, so
# reordering these lines changes the initial weights under a fixed seed.
actor = Actor(state_dim, action_dim, action_limit)
# Two critics with identical architecture; presumably for a clipped
# double-Q style update — confirm, no update code is present here.
critic1 = Critic(state_dim, action_dim)
critic2 = Critic(state_dim, action_dim)
replay_buffer = ReplayBuffer(replay_buffer_capacity)

# Note: This cell only sets up the building blocks (an attention-based actor, two critics, and a replay buffer) for a self-attention-based multi-agent RL algorithm.
# A complete implementation still needs additional pieces such as the training loop, environment interaction, etc.
