<a href="https://colab.research.google.com/github/JSJeong-me/AI-Innovation-2024/blob/main/RL/6-3-DoubleDQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque

# Dueling Q-Network model definition
class DuelingQNetwork(nn.Module):
    """Dueling Q-network with a shared layer and separate value/advantage heads.

    The shared layer output feeds two streams: one producing a single
    state value and one producing one advantage per action. The two are
    combined as ``Q = V + (A - mean(A))``, where the mean is taken over the
    actions of each individual sample.

    Args:
        state_size: Size of the last dimension of the input state tensor.
        action_size: Number of actions, i.e. size of the last dimension of
            the returned Q-value tensor.
    """

    def __init__(self, state_size, action_size):
        super(DuelingQNetwork, self).__init__()
        # Shared feature layer followed by the state-value stream
        self.fc1 = nn.Linear(state_size, 128)
        self.fc_value = nn.Linear(128, 128)
        self.value = nn.Linear(128, 1)

        # Advantage stream
        self.fc_advantage = nn.Linear(128, 128)
        self.advantage = nn.Linear(128, action_size)

    def forward(self, state):
        """Return Q-values for ``state``.

        Args:
            state: Float tensor whose last dimension is ``state_size``;
                may be a single state or a batch of states.

        Returns:
            Tensor shaped like ``state`` with the last dimension replaced
            by ``action_size``.
        """
        x = torch.relu(self.fc1(state))

        # State-value stream
        value = torch.relu(self.fc_value(x))
        value = self.value(value)

        # Advantage stream
        advantage = torch.relu(self.fc_advantage(x))
        advantage = self.advantage(advantage)

        # Centre the advantages per sample, over the action dimension only.
        # A global mean() would also average across the batch, making each
        # sample's Q-values depend on whatever else is in the minibatch.
        mean_advantage = advantage.mean(dim=-1, keepdim=True)
        q_value = value + (advantage - mean_advantage)
        return q_value

# Hyperparameters
state_size, action_size = 4, 2   # network input width / number of Q outputs
batch_size = 64                  # transitions sampled per replay step
gamma = 0.99                     # discount factor applied to the next-state Q-value
learning_rate = 0.001            # Adam step size
target_update = 10               # sync the target network every N episodes

# Epsilon-greedy exploration schedule: start, floor, per-episode multiplier
epsilon, epsilon_min, epsilon_decay = 1.0, 0.01, 0.995

# Replay memory: bounded FIFO buffer of transitions
memory = deque(maxlen=2000)

# Double DQN uses two networks: the online network is trained, while the
# target network starts as an exact copy of it.
q_network = DuelingQNetwork(state_size, action_size)
target_network = DuelingQNetwork(state_size, action_size)
target_network.load_state_dict(q_network.state_dict())

# Only the online network's parameters are handed to the optimizer
optimizer = optim.Adam(q_network.parameters(), lr=learning_rate)

# Experience replay: sample a minibatch and take one optimisation step
def replay(memory, batch_size):
    """Run one Double-DQN update on a random minibatch from ``memory``.

    Does nothing until ``memory`` holds at least ``batch_size`` transitions.
    Uses the module-level ``q_network``, ``target_network``, ``optimizer``
    and ``gamma``.

    Args:
        memory: Sequence of ``(state, action, reward, next_state, done)``
            tuples.
        batch_size: Number of transitions to sample.

    Returns:
        None.
    """
    if len(memory) < batch_size:
        return
    minibatch = random.sample(memory, batch_size)
    states, actions, rewards, next_states, dones = zip(*minibatch)

    # Stack each field into one ndarray before handing it to torch: building
    # a tensor straight from a tuple of ndarrays is very slow and emits a
    # UserWarning.
    states = torch.as_tensor(np.array(states, dtype=np.float32))
    actions = torch.as_tensor(np.array(actions, dtype=np.int64))
    rewards = torch.as_tensor(np.array(rewards, dtype=np.float32))
    next_states = torch.as_tensor(np.array(next_states, dtype=np.float32))
    dones = torch.as_tensor(np.array(dones, dtype=np.float32))

    # Q-values of the actions that were actually taken
    q_values = q_network(states).gather(1, actions.unsqueeze(1)).squeeze(1)

    # Double Q-learning: the online network picks the next action, the
    # target network evaluates it. The target is a constant for the loss,
    # so no autograd graph is needed here.
    with torch.no_grad():
        next_actions = q_network(next_states).argmax(dim=1)
        next_q_values = (
            target_network(next_states)
            .gather(1, next_actions.unsqueeze(1))
            .squeeze(1)
        )
        # (1 - dones) removes the bootstrap term for terminal transitions
        expected_q_values = rewards + gamma * next_q_values * (1 - dones)

    # Loss and backpropagation
    loss = nn.MSELoss()(q_values, expected_q_values)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Action selection (epsilon-greedy)
def choose_action(state, epsilon):
    """Pick an action for ``state`` using an epsilon-greedy policy.

    A uniform random draw is compared against ``epsilon``; if the draw is
    less than or equal to it, a random action index below the module-level
    ``action_size`` is returned. Otherwise the module-level ``q_network``
    is evaluated and the index of its largest output is returned.

    Args:
        state: Single observation, convertible to a float32 tensor.
        epsilon: Exploration threshold.

    Returns:
        The chosen action index.
    """
    explore = np.random.rand() <= epsilon
    if explore:
        return random.randrange(action_size)

    # Greedy branch: add a leading batch dimension and skip autograd
    with torch.no_grad():
        batch = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        scores = q_network(batch)
    return np.argmax(scores.numpy())

# CartPole environment setup
env = gym.make('CartPole-v1')
episodes = 1000

# Training loop
for episode in range(episodes):
    state = env.reset()
    total_reward = 0
    done = False

    while not done:
        # Select an action
        action = choose_action(state, epsilon)

        # Advance the environment by one step
        next_state, reward, done, info = env.step(action)

        # An episode can end because of the time limit rather than a
        # failure. The old 4-tuple step API reports that case through
        # info['TimeLimit.truncated']. A truncated episode must not be
        # penalised or stored as terminal, otherwise the agent is punished
        # for surviving and its targets stop bootstrapping.
        truncated = info.get('TimeLimit.truncated', False)
        failed = done and not truncated

        # Reward shaping: penalise genuine failures only
        if failed:
            reward = -10

        # Store the transition; the flag marks true terminal states only
        memory.append((state, action, reward, next_state, failed))

        # Move to the next state
        state = next_state
        total_reward += reward

        # Experience replay
        replay(memory, batch_size)

    # Decay exploration, never dropping below the configured floor
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    # Periodically sync the target network with the online network
    if episode % target_update == 0:
        target_network.load_state_dict(q_network.state_dict())

    print(f"Episode: {episode}, Total reward: {total_reward}, Epsilon: {epsilon}")

# Evaluate the trained agent after training has finished
try:
    for _episode in range(10):
        state = env.reset()
        done = False
        while not done:
            env.render()
            # epsilon_min keeps a small amount of exploration during evaluation
            action = choose_action(state, epsilon_min)
            state, _, done, _ = env.step(action)
finally:
    # Release the environment even if render() or step() raises
    env.close()


  deprecation(
  deprecation(
  if not isinstance(terminated, (bool, np.bool8)):
  states = torch.tensor(states, dtype=torch.float32)


Episode: 0, Total reward: 8.0, Epsilon: 0.995
Episode: 1, Total reward: 13.0, Epsilon: 0.990025
Episode: 2, Total reward: 19.0, Epsilon: 0.985074875
Episode: 3, Total reward: 7.0, Epsilon: 0.9801495006250001
Episode: 4, Total reward: 29.0, Epsilon: 0.9752487531218751
Episode: 5, Total reward: 6.0, Epsilon: 0.9703725093562657
Episode: 6, Total reward: 64.0, Epsilon: 0.9655206468094844
Episode: 7, Total reward: 13.0, Epsilon: 0.960693043575437
Episode: 8, Total reward: 8.0, Epsilon: 0.9558895783575597
Episode: 9, Total reward: 16.0, Epsilon: 0.9511101304657719
Episode: 10, Total reward: 11.0, Epsilon: 0.946354579813443
Episode: 11, Total reward: 5.0, Epsilon: 0.9416228069143757
Episode: 12, Total reward: 9.0, Epsilon: 0.9369146928798039
Episode: 13, Total reward: 0.0, Epsilon: 0.9322301194154049
Episode: 14, Total reward: 3.0, Epsilon: 0.9275689688183278
Episode: 15, Total reward: 28.0, Epsilon: 0.9229311239742362
Episode: 16, Total reward: -1.0, Epsilon: 0.918316468354365
Episode: 17, T