## **Multi-Agent Reinforcement Learning (MARL)**

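Multi-Agent Reinforcement Learning (MARL) involves multiple agents interacting in a shared environment, each trying to maximize its own cumulative reward, often while accounting for the actions of the other agents. This notebook uses the simplest approach, independent Q-learning: every agent keeps its own Q-table and updates it as if the other agents were part of the environment.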


**Imports**

In [3]:
import numpy as np
import gym
from collections import deque


**Data Loading**

In [None]:
# Number of agents that share the environment (example value).
n_agents = 2

# Placeholder environment: swap in a multi-agent environment before running
# the cells below, which pass one action per agent to env.step().
env = gym.make('CartPole-v1')  # Replace with multi-agent environment

# One zero-initialised Q-table per agent. Rows come from
# observation_space.shape[0] and columns from action_space.n.
# NOTE(review): shape[0] looks like the observation vector length rather than
# a count of discrete states -- confirm that the states returned by the
# environment are valid row indices for these tables.
q_shape = (env.observation_space.shape[0], env.action_space.n)
Q = [np.zeros(q_shape) for _ in range(n_agents)]


**Model Building**

In [None]:
# Hyperparameters shared by every agent's Q-learning update
alpha, gamma, epsilon = (
    0.1,   # alpha: learning rate (step size of each Q-value update)
    0.99,  # gamma: discount factor applied to the next state's value
    0.1,   # epsilon: exploration rate (probability of a random action)
)

def q_learning(env, n_agents, n_episodes=1000):
    """Train one independent tabular Q-learner per agent.

    Every agent ``i`` acts epsilon-greedily on its own table ``Q[i]`` and
    applies the one-step Q-learning update to it after each joint step.
    The tables (``Q``) and the hyperparameters (``alpha``, ``gamma``,
    ``epsilon``) are read from module scope; the tables are updated in
    place and nothing is returned.

    Args:
        env: Multi-agent environment. ``reset()`` must return one state per
            agent, and ``step(actions)`` must return
            ``(next_states, rewards, dones, info)`` with one entry per agent
            in each of the first three. Each state must be usable as a row
            index into ``Q[i]``.
        n_agents: Number of agents, i.e. the number of tables in ``Q`` and
            of entries in the per-agent sequences returned by ``env``.
        n_episodes: Number of training episodes to run.
    """
    for _ in range(n_episodes):
        # Reset the shared environment once per episode. step() returns one
        # entry per agent and its result is fed straight back into `states`,
        # so the initial states must have that same per-agent layout.
        states = env.reset()
        done = False

        while not done:
            # Epsilon-greedy: explore with probability epsilon, otherwise
            # take the action with the highest current Q-value.
            actions = [
                env.action_space.sample()
                if np.random.rand() < epsilon
                else np.argmax(Q[i][states[i]])
                for i in range(n_agents)
            ]

            next_states, rewards, dones, _ = env.step(actions)

            # Update Q-table for each agent independently
            for i in range(n_agents):
                # A terminal transition has no successor value: bootstrapping
                # from Q[next_state] there would leak the value of a state
                # that is never acted from into the estimate.
                target = rewards[i]
                if not dones[i]:
                    target = target + gamma * np.max(Q[i][next_states[i]])
                td_error = target - Q[i][states[i], actions[i]]
                Q[i][states[i], actions[i]] += alpha * td_error

            states = next_states
            # The episode ends only once every agent reports done.
            done = all(dones)

# Train with the default number of episodes; q_learning updates the
# module-level Q tables in place.
q_learning(env, n_agents)


**Predictions**

In [None]:
# After training, each agent selects the action with the highest Q-value.
# The environment is reset once: a multi-agent reset is expected to return
# one state per agent, matching the per-agent sequences env.step() returns.
states = env.reset()
actions = [np.argmax(Q[i][states[i]]) for i in range(n_agents)]


**Performance Metrics**

In [None]:
# Evaluate the performance by running the trained agents in the environment.
# Agents act greedily (no exploration) and each agent's reward is accumulated
# across all test episodes.
n_test_episodes = 10
total_rewards = [0] * n_agents
for _ in range(n_test_episodes):
    # Reset once per episode: `states` is later replaced by the per-agent
    # sequence returned from env.step(), so the initial states must share
    # that layout (one entry per agent).
    states = env.reset()
    done = False
    while not done:
        # Select best actions
        actions = [np.argmax(Q[i][states[i]]) for i in range(n_agents)]
        next_states, rewards, dones, _ = env.step(actions)
        for i in range(n_agents):
            total_rewards[i] += rewards[i]
        states = next_states
        # An episode ends only once every agent reports done.
        done = all(dones)

# total_rewards[i] is agent i's reward summed over all test episodes, so the
# value printed here is the mean of those totals across agents.
# NOTE(review): divide by n_test_episodes if a per-episode average is intended.
print(f"Average rewards: {np.mean(total_rewards)}")
