In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

class SwarmAgent:
    """One member of the swarm: an identifier plus a position in the environment."""

    def __init__(self, agent_id, position):
        """Store the agent's id and position exactly as given (no validation)."""
        self.agent_id, self.position = agent_id, position

class PolicyNetwork(nn.Module):
    """Two-layer perceptron that maps a flattened state to action probabilities.

    The output is a probability distribution (softmax over the last
    dimension), not raw logits. A loss that expects logits, such as
    ``nn.CrossEntropyLoss``, would apply a second softmax on top of it; use
    the log of the output with ``nn.NLLLoss`` instead.
    """

    def __init__(self, input_size, output_size):
        """Create the layers.

        Args:
            input_size: Number of features in a flattened state.
            output_size: Number of actions the distribution ranges over.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, output_size)
        # Normalise over the last (action) dimension so that both batched
        # (N, input_size) and unbatched (input_size,) inputs are supported.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return action probabilities for ``x``.

        Args:
            x: Float tensor whose last dimension has ``input_size`` entries.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of ``output_size`` that sums to 1.
        """
        # Without a non-linearity the two linear layers collapse into a single
        # linear map, so the hidden layer would add no expressive power.
        hidden = torch.relu(self.fc1(x))
        return self.softmax(self.fc2(hidden))

class DRLHP:
    """Fits a policy network to expert preferences over swarm agents.

    Every preference is converted into an occupancy grid of the listed agents
    (the state) and the id of the preferred agent (the target class). The
    policy network is then trained to predict the preferred agent id from the
    flattened grid.
    """

    def __init__(self, num_agents, environment_size, num_epochs, learning_rate):
        """Build the policy network, its optimizer and the training loss.

        Args:
            num_agents: Number of agents; also the number of output classes.
            environment_size: ``(rows, cols)`` of the environment grid.
            num_epochs: Number of full-batch gradient steps done by ``train``.
            learning_rate: Step size passed to Adam.
        """
        self.num_agents = num_agents
        self.environment_size = environment_size
        self.num_epochs = num_epochs
        self.learning_rate = learning_rate

        self.policy_network = PolicyNetwork(environment_size[0] * environment_size[1], num_agents)
        self.optimizer = optim.Adam(self.policy_network.parameters(), lr=learning_rate)
        # The policy network outputs softmax probabilities. CrossEntropyLoss
        # expects raw logits and would apply a second softmax, which keeps the
        # loss from ever approaching zero. NLLLoss on log-probabilities is the
        # matching loss for a network that already normalises its output.
        self.criterion = nn.NLLLoss()

    def train(self, expert_preferences):
        """Run ``num_epochs`` full-batch updates on the expert preferences.

        Prints the loss every 10 epochs.

        Args:
            expert_preferences: Iterable of dicts with keys ``'agents'`` and
                ``'preferred_agent'`` (see ``get_state_action_preferences``).

        Raises:
            ValueError: If no preferences are supplied.
        """
        expert_states, expert_actions = self.get_state_action_preferences(expert_preferences)
        if expert_states.shape[0] == 0:
            # A mean loss over an empty batch is NaN and would corrupt the weights.
            raise ValueError("expert_preferences must contain at least one preference")

        for epoch in range(self.num_epochs):
            self.optimizer.zero_grad()

            action_probs = self.policy_network(expert_states)
            # Clamp before the log so a probability of exactly 0 cannot yield -inf.
            log_probs = torch.log(action_probs.clamp_min(1e-12))
            loss = self.criterion(log_probs, expert_actions)
            loss.backward()
            self.optimizer.step()

            if epoch % 10 == 0:
                print(f"Epoch: {epoch}, Loss: {loss.item()}")

    def get_state_action_preferences(self, preferences):
        """Convert preferences into a batch of states and target agent ids.

        Args:
            preferences: Iterable of dicts. ``preference['agents']`` is an
                iterable of objects with a ``position`` index into the grid;
                ``preference['preferred_agent']`` has an ``agent_id``.

        Returns:
            A pair ``(states, actions)``: ``states`` is a float tensor of shape
            ``(num_preferences, rows * cols)`` holding per-cell agent counts,
            and ``actions`` is a long tensor of preferred agent ids.
        """
        n_cells = self.environment_size[0] * self.environment_size[1]
        states = []
        actions = []

        for preference in preferences:
            state = np.zeros(self.environment_size, dtype=np.float32)
            for member in preference['agents']:
                # Several agents may share a cell, so count rather than flag.
                state[tuple(member.position)] += 1
            states.append(state.ravel())
            actions.append(preference['preferred_agent'].agent_id)

        # Stack into one ndarray first: building a tensor straight from a list
        # of ndarrays is very slow and emits a warning. The reshape keeps the
        # (N, rows * cols) layout even when N == 0.
        state_batch = np.asarray(states, dtype=np.float32).reshape(-1, n_cells)
        return torch.tensor(state_batch, dtype=torch.float), torch.tensor(actions, dtype=torch.long)

    def select_action(self, agent):
        """Return the index with the highest policy probability for ``agent``.

        The state is a grid containing only this agent's position.

        Args:
            agent: Object with a ``position`` index into the grid.

        Returns:
            The chosen action index as a Python int.
        """
        state = np.zeros(self.environment_size, dtype=np.float32)
        state[tuple(agent.position)] = 1
        # Flatten to (1, rows * cols): the network was built for flattened
        # grids, and feeding the 2-D grid raises a shape RuntimeError.
        state_batch = torch.tensor(state.reshape(1, -1), dtype=torch.float)
        with torch.no_grad():  # inference only, no gradients needed
            action_probs = self.policy_network(state_batch)
        return torch.argmax(action_probs, dim=1).item()

# Usage example: build a random swarm, train on one preference, query the policy.
num_agents = 10
environment_size = (10, 10)
num_epochs = 1000
learning_rate = 0.001

# Place each agent on a random grid cell; ids follow creation order.
swarm_agents = []
for i in range(num_agents):
    row = np.random.randint(0, environment_size[0])
    col = np.random.randint(0, environment_size[1])
    swarm_agents.append(SwarmAgent(i, (row, col)))

# A single example preference: out of the whole swarm, agent 0 is preferred.
# Add more preference dicts to this list as needed.
example_preference = {'agents': swarm_agents, 'preferred_agent': swarm_agents[0]}
expert_preferences = [example_preference]

drlhp = DRLHP(num_agents, environment_size, num_epochs, learning_rate)
drlhp.train(expert_preferences)

# Test the policy by selecting an action for every agent in turn.
actions = []
for member in swarm_agents:
    actions.append(drlhp.select_action(member))
print("Actions:", actions)


  return torch.tensor(states, dtype=torch.float), torch.tensor(actions)


Epoch: 0, Loss: 2.286588430404663
Epoch: 10, Loss: 2.190488815307617
Epoch: 20, Loss: 2.0010759830474854
Epoch: 30, Loss: 1.7105687856674194
Epoch: 40, Loss: 1.5266937017440796
Epoch: 50, Loss: 1.4800370931625366
Epoch: 60, Loss: 1.4696131944656372
Epoch: 70, Loss: 1.4664384126663208
Epoch: 80, Loss: 1.465113639831543
Epoch: 90, Loss: 1.4644008874893188
Epoch: 100, Loss: 1.4639370441436768
Epoch: 110, Loss: 1.4635950326919556
Epoch: 120, Loss: 1.463323712348938
Epoch: 130, Loss: 1.4630993604660034
Epoch: 140, Loss: 1.4629099369049072
Epoch: 150, Loss: 1.4627474546432495
Epoch: 160, Loss: 1.4626070261001587
Epoch: 170, Loss: 1.46248459815979
Epoch: 180, Loss: 1.4623771905899048
Epoch: 190, Loss: 1.4622825384140015
Epoch: 200, Loss: 1.4621984958648682
Epoch: 210, Loss: 1.4621235132217407
Epoch: 220, Loss: 1.4620563983917236
Epoch: 230, Loss: 1.4619964361190796
Epoch: 240, Loss: 1.4619420766830444
Epoch: 250, Loss: 1.46189284324646
Epoch: 260, Loss: 1.4618483781814575
Epoch: 270, Loss: 1.

RuntimeError: ignored

The code starts by defining a SwarmAgent class representing an agent in the swarm. Each agent has an agent ID and a position in the environment.

The PolicyNetwork class is defined as a neural network with two fully connected layers and a softmax activation function.

The DRLHP class represents the Deep Reinforcement Learning from Human Preferences algorithm. It takes the number of agents, environment size, number of training epochs, and learning rate as input.

The DRLHP class contains a policy network, an optimizer, and a loss criterion.

The train method trains the policy network using expert preferences. It iterates for the specified number of epochs.

The policy network is trained using the cross-entropy loss between predicted actions and expert actions.

The get_state_action_preferences method extracts state-action pairs from expert preferences.

The select_action method selects an action for a given agent using the trained policy network.

In the usage example, a swarm of agents is created with random positions in the environment.

Expert preferences are defined as a list of dictionaries, where each dictionary contains a list of agents and the preferred agent.

An instance of DRLHP is created, passing the necessary parameters.

The train method is called to train the policy network using expert preferences.

The select_action method is called for each agent to select actions based on the learned policy.

The selected actions for each agent are printed.

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

class SwarmAgent:
    """Swarm member described by an id and a position in the environment."""

    def __init__(self, agent_id, position):
        """Record the given id and position on the instance, unchanged."""
        for attr_name, attr_value in (("agent_id", agent_id), ("position", position)):
            setattr(self, attr_name, attr_value)

class DeepRLHP:
    """Fits a small feed-forward network to pairwise expert preferences.

    Each preference is a pair of agents. The one-hot position grid of the
    first agent is labelled 1.0 and that of the second -1.0, and the network
    is regressed onto those labels with a mean-squared-error loss.

    NOTE(review): the network has ``num_agents`` outputs but each state has a
    single scalar label, so every output unit is regressed toward the same
    value. The argmax in ``get_action`` is therefore not shaped by the labels;
    confirm whether a single-output preference/reward model was intended.
    """

    def __init__(self, num_agents, environment_size, num_iterations, learning_rate):
        """Build the network, optimizer and loss.

        Args:
            num_agents: Number of agents; also the network's output width.
            environment_size: ``(rows, cols)`` of the environment grid.
            num_iterations: Number of full-batch gradient steps in ``train``.
            learning_rate: Step size passed to Adam.
        """
        self.num_agents = num_agents
        self.environment_size = environment_size
        self.num_iterations = num_iterations
        self.learning_rate = learning_rate

        self.agent_network = nn.Sequential(
            nn.Linear(environment_size[0] * environment_size[1], 64),
            nn.ReLU(),
            nn.Linear(64, num_agents)
        )
        self.optimizer = optim.Adam(self.agent_network.parameters(), lr=learning_rate)
        self.criterion = nn.MSELoss()

    def train(self, expert_preferences):
        """Run ``num_iterations`` full-batch updates on the expert preferences.

        Args:
            expert_preferences: Sequence of ``(agent1, agent2)`` pairs (see
                ``collect_training_data``).

        Raises:
            ValueError: If no preferences are supplied.
        """
        # The training data depends only on the preferences, so build and
        # convert it once instead of on every iteration.
        states, comparisons = self.collect_training_data(expert_preferences)
        if not states:
            raise ValueError("expert_preferences must contain at least one preference")

        # Stack into one ndarray first: building a tensor straight from a list
        # of ndarrays is very slow and emits a warning.
        state_batch = torch.tensor(np.array(states), dtype=torch.float)
        targets = torch.tensor(comparisons, dtype=torch.float).unsqueeze(1)

        for _ in range(self.num_iterations):
            self.optimizer.zero_grad()

            # Predict preferences from the agent network
            predicted_preferences = self.agent_network(state_batch)

            # The (N, 1) labels are compared against all num_agents outputs.
            # Expanding explicitly gives the same values as implicit
            # broadcasting but avoids MSELoss's size-mismatch warning.
            loss = self.criterion(predicted_preferences, targets.expand_as(predicted_preferences))
            loss.backward()

            # Update agent network
            self.optimizer.step()

    def collect_training_data(self, expert_preferences):
        """Turn preference pairs into flattened states and scalar labels.

        Args:
            expert_preferences: Iterable of ``(agent1, agent2)`` pairs whose
                members expose a ``position`` index into the grid.

        Returns:
            A pair ``(states, comparisons)`` of equal-length lists. For each
            preference, ``states`` gains the flattened grid of ``agent1`` then
            that of ``agent2``, and ``comparisons`` gains ``1.0`` then ``-1.0``.
        """
        states = []
        comparisons = []

        for agent1, agent2 in expert_preferences:
            # The first agent of the pair is labelled 1.0, the second -1.0.
            for member, label in ((agent1, 1.0), (agent2, -1.0)):
                grid = np.zeros(self.environment_size)
                grid[tuple(member.position)] += 1
                states.append(grid.flatten())
                comparisons.append(label)

        return states, comparisons

    def get_action(self, agent):
        """Return the index of the largest network output for ``agent``.

        Args:
            agent: Object with a ``position`` index into the grid.

        Returns:
            The argmax over the network outputs as a Python int.
        """
        state = np.zeros(self.environment_size)
        state[tuple(agent.position)] += 1

        state_tensor = torch.tensor(state.flatten(), dtype=torch.float)
        with torch.no_grad():  # inference only, no gradients needed
            action_tensor = self.agent_network(state_tensor)

        return torch.argmax(action_tensor).item()

# Usage example: build a random swarm, train on two preference pairs, query one agent.
num_agents = 10
environment_size = (10, 10)
num_iterations = 1000
learning_rate = 0.001

# Place each agent on a random grid cell; ids follow creation order.
swarm_agents = []
for i in range(num_agents):
    row = np.random.randint(0, environment_size[0])
    col = np.random.randint(0, environment_size[1])
    swarm_agents.append(SwarmAgent(i, (row, col)))

# List of expert preferences, each a pair of agents.
first_pair = (swarm_agents[0], swarm_agents[1])
second_pair = (swarm_agents[2], swarm_agents[3])
expert_preferences = [first_pair, second_pair]

drlhp = DeepRLHP(num_agents, environment_size, num_iterations, learning_rate)
drlhp.train(expert_preferences)

# Get action for a specific agent
agent_id = 0
action = drlhp.get_action(swarm_agents[agent_id])
print(f"Action for Agent {agent_id}: {action}")


  return F.mse_loss(input, target, reduction=self.reduction)


Action for Agent 0: 3


The code starts by defining a SwarmAgent class representing an agent in the swarm. Each agent has an agent ID and a position in the environment.

The DeepRLHP class represents the Deep Reinforcement Learning from Human Preferences algorithm. It takes the number of agents, environment size, number of iterations, and learning rate as input.

The DeepRLHP class contains an agent network, optimizer, and loss criterion. The agent network is a feedforward neural network.

The agent network is defined with two linear layers. The first layer takes as input the flattened size of the environment and outputs 64 units, followed by a ReLU activation function. The second layer takes the 64 units as input and outputs a number of units equal to the number of agents.

The train method trains the agent network using expert preferences. It iterates for the specified number of iterations.

The collect_training_data method collects comparison-based training data from expert preferences. For each preference pair it generates two states, one per agent, and labels the first agent's state 1.0 and the second agent's state -1.0. The states and labels are stored in separate lists.

The get_action method retrieves the action for a specific agent based on its position. It first constructs a state representation by creating an environment-sized array and setting the position of the agent to 1.0. The state is then converted to a tensor and passed through the agent network. The action with the highest value is extracted using torch.argmax and converted to a Python scalar using .item().

In the usage example, a swarm of agents is created with random positions in the environment.

Expert preferences are defined as a list of tuples, where each tuple contains two agents representing a preference.

An instance of DeepRLHP is created, passing the necessary parameters.

The train method is called to train the agent network using the expert preferences.

The get_action method is called to retrieve the action for a specific agent.

The action for the agent is printed.