<a href="https://colab.research.google.com/github/JSJeong-me/AI-Innovation-2024/blob/main/RL/6-4-Simple-Multi-Agent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import random

# Environment definition
class SimpleMultiAgentEnv:
    """One-dimensional world in which several agents walk toward a shared goal.

    Every agent starts at position 0. On each step an agent's action is
    added to its position (the training code uses -1 = left, 0 = stay,
    +1 = right). Positions are not bounded. The episode ends as soon as at
    least one agent stands exactly on ``goal_position``.

    Args:
        goal_position: Position that ends the episode when reached.
        n_agents: Number of agents living in the environment.
    """

    def __init__(self, goal_position=10, n_agents=2):
        self.n_agents = n_agents
        self.goal_position = goal_position
        self.agent_positions = [0] * n_agents  # every agent starts at 0

    def reset(self):
        """Move all agents back to position 0.

        Returns:
            A fresh list with the position of every agent.
        """
        self.agent_positions = [0] * self.n_agents
        # Return a snapshot, not the internal list: step() mutates the
        # internal list in place, which would otherwise silently change a
        # state the caller is still holding on to.
        return list(self.agent_positions)

    def step(self, actions):
        """Apply one action per agent and advance the environment.

        Args:
            actions: Indexable of per-agent position deltas; entry ``i`` is
                added to the position of agent ``i``.

        Returns:
            A ``(positions, rewards, done)`` tuple. ``positions`` is a fresh
            list of agent positions. ``rewards`` holds 10 for each agent that
            is on the goal after the move and -1 for every other agent
            (including agents that stayed in place). ``done`` is True when
            at least one agent is on the goal.
        """
        for i in range(self.n_agents):
            self.agent_positions[i] += actions[i]

        # One flag per agent; reused for both the rewards and the done signal.
        reached = [pos == self.goal_position for pos in self.agent_positions]
        rewards = [10 if hit else -1 for hit in reached]
        done = any(reached)

        return list(self.agent_positions), rewards, done

# Q-learning algorithm definition
class QLearningAgent:
    """Tabular Q-learning agent with epsilon-greedy action selection.

    Q-values live in ``q_table``, a dict keyed by the string produced by
    ``get_state_action_key``. Entries that were never written count as 0.

    Args:
        n_actions: Number of discrete actions; actions are ``0 .. n_actions - 1``.
        learning_rate: Step size applied to the temporal-difference error.
        discount_factor: Weight of the best next-state value in the target.
        epsilon: Probability of picking a uniformly random action.

    Raises:
        ValueError: If ``n_actions`` is smaller than 1.
    """

    def __init__(self, n_actions, learning_rate=0.1, discount_factor=0.99, epsilon=0.1):
        if n_actions < 1:
            raise ValueError("n_actions must be at least 1")
        self.q_table = {}  # maps state-action key -> Q-value
        self.n_actions = n_actions
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon

    def get_state_action_key(self, state, action):
        """Return the ``q_table`` key for ``(state, action)``.

        The key is built from ``str(state)``, so a list and a tuple holding
        the same values produce different keys.
        """
        return str(state) + "_" + str(action)

    def _action_values(self, state):
        """Return the Q-value of every action in ``state`` (0 if unseen)."""
        return [
            self.q_table.get(self.get_state_action_key(state, a), 0)
            for a in range(self.n_actions)
        ]

    def choose_action(self, state):
        """Pick an action index for ``state`` with an epsilon-greedy policy.

        Returns:
            A plain ``int`` in ``range(n_actions)``.
        """
        # Exploration: a single RNG (the stdlib ``random`` module) is used for
        # every random decision so that ``random.seed`` alone makes runs
        # reproducible.
        if random.random() < self.epsilon:
            return random.randrange(self.n_actions)

        q_values = self._action_values(state)
        best = max(q_values)
        # Break ties at random. Always taking the first maximum would make
        # every unvisited state (all Q-values 0) pick action 0.
        candidates = [a for a, q in enumerate(q_values) if q == best]
        return random.choice(candidates)

    def update(self, state, action, reward, next_state):
        """Apply one Q-learning update for the observed transition.

        Implements ``Q(s, a) += lr * (reward + gamma * max_a' Q(s', a') - Q(s, a))``.
        """
        key = self.get_state_action_key(state, action)
        current_q = self.q_table.get(key, 0)
        next_q = max(self._action_values(next_state))

        td_target = reward + self.discount_factor * next_q
        self.q_table[key] = current_q + self.learning_rate * (td_target - current_q)

# Multi-agent training run: two independent Q-learners act in one environment.
env = SimpleMultiAgentEnv()
# Action indices are shifted by -1 before they reach the environment, so
# index 0 = left (-1), index 1 = stay (0), index 2 = right (+1).
agent1 = QLearningAgent(n_actions=3)
agent2 = QLearningAgent(n_actions=3)

n_episodes = 1000
# Safety cap: positions are unbounded, so without a limit an episode in which
# no agent happens to reach the goal would never end.
max_steps_per_episode = 500

for episode in range(n_episodes):
    # Snapshot the observation. The environment can hand back the very list
    # it mutates on the next step; without a copy, `state` would change
    # underneath us and the update below would be keyed on the wrong state.
    state = list(env.reset())

    for _ in range(max_steps_per_episode):
        # Each agent picks its action from the joint state.
        action1 = agent1.choose_action(state)
        action2 = agent2.choose_action(state)
        actions = [action1 - 1, action2 - 1]  # map indices 0/1/2 to -1/0/+1

        # Advance the environment by one step.
        next_state, rewards, done = env.step(actions)
        next_state = list(next_state)

        # Each agent learns from its own reward.
        agent1.update(state, action1, rewards[0], next_state)
        agent2.update(state, action2, rewards[1], next_state)

        state = next_state
        if done:
            break

    if (episode + 1) % 100 == 0:
        print(f"Episode {episode + 1} completed")

print("Training finished!")
