<a href="https://colab.research.google.com/github/hyeminboo/25-1-SelfDriving/blob/main/02_week4_DQN/02_week4_DQN_%EB%B6%80%ED%98%9C%EB%AF%BC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

세 개의 레버가 달린 장치와 원숭이가 있다.

장치의 첫 번째 레버를 당기면 쓴 약이, 세 번째 레버를 당기면 바나나가 나온다.

한 번 당긴 레버는 자동으로 다시 올라간다고 가정할 때, 이 장치와 원숭이를 코드로 추상화시키고, DQN을 이용해 학습시켜라.

In [2]:
import random
import numpy as np
import gym
from collections import deque
import torch
import torch.nn as nn
import torch.optim as optim

In [23]:
class LeverEnv(gym.Env): # Gym의 Env를 기반으로 환경 만듦
    def __init__(self):
        super(LeverEnv, self).__init__()
        self.action_space = gym.spaces.Box(low=0, high=1, shape=(1,), dtype=np.float32)

    def reset(self):
        return np.array([0.0], dtype=np.float32)

    def step(self, action):
        if action == 0:
            reward = -100
        elif action == 1:
            reward = 10
        else:
            reward = 100
        done = True
        return np.array([0.0], dtype=np.float32), reward, done, {} # [0.0] : dummy state (원숭이가 행동을 결정할 때 어떤 상태 정보도 필요하지 않기 때문)

In [7]:
class QNetwork(nn.Module):
    def __init__(self):
        super(QNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(1, 32),
            nn.ReLU(),
            nn.Linear(32, 3) # 액션 수 만큼 출력
        )

    def forward(self, x):
        return self.fc(x)


In [21]:
class ReplayBuffer:
    def __init__(self, maxlen=1000):
        self.buffer = deque(maxlen=maxlen)

    def push(self, state, action, reward, next_state, done):
        # 경험 하나를 버퍼에 저장
        self.buffer.append((state, int(action), reward, next_state, done))

    def sample(self, batch_size):
        # 랜덤 샘플링
        batch = random.sample(self.buffer, batch_size)

        states = torch.FloatTensor([b[0] for b in batch])
        actions = torch.LongTensor([b[1] for b in batch])
        rewards = torch.FloatTensor([b[2] for b in batch])
        next_states = torch.FloatTensor([b[3] for b in batch])
        dones = torch.FloatTensor([float(b[4]) for b in batch])

        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)

In [25]:
def train():
    env = LeverEnv()
    qnet = QNetwork()
    target_net = QNetwork()
    target_net.load_state_dict(qnet.state_dict())
    target_net.eval() # target network는 학습하지 않음

    buffer = ReplayBuffer()
    optimizer = optim.Adam(qnet.parameters(), lr=0.001)
    loss_fn = nn.MSELoss()

    episodes = 300
    batch_size = 32
    gamma = 0.9
    epsilon = 0.1
    update_target = 20

    for episode in range(episodes):
        state = env.reset()
        done = False
        total_reward = 0

        while not done:
            if random.random() < epsilon:
                action = env.action_space.sample()
            else:
                with torch.no_grad():
                    q_vals = qnet(torch.FloatTensor(state)) # state의 q 값 예측
                    action = torch.argmax(q_vals).item() # 가장 큰 q 값의 액션 선택

            next_state, reward, done, _ = env.step(action)
            buffer.push(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward

            if len(buffer) >= batch_size:
                s, a, r, s_, d = buffer.sample(batch_size)

                q_values = qnet(s).gather(1, a.unsqueeze(1)).squeeze()

                with torch.no_grad():
                    next_q_values = target_net(s_).max(1)[0]
                    target = r + gamma * next_q_values * (1-d)

                loss = loss_fn(q_values, target)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        if (episode + 1) % update_target == 0:
            target_net.load_state_dict(qnet.state_dict())

        print(f"Episode {episode+1}, Total Reward: {total_reward}")

    return qnet

In [11]:
def test(qnet):
    env = LeverEnv()
    for a in range(3):
        with torch.no_grad():
            q_val = qnet(torch.FloatTensor([0]))[a].item()
            print(f"Lever {a}: Q value = {q_val:.2f}")

    state = env.reset()
    with torch.no_grad():
        action = torch.argmax(qnet(torch.FloatTensor(state))).item()
    print(f"Lever {action}을 당김")

In [22]:
qnet = train()
test(qnet)

# reward : -1, 0, 1

  self.buffer.append((state, int(action), reward, next_state, done))


Episode 20, Total Reward: 1
Episode 40, Total Reward: 1
Episode 60, Total Reward: 1
Episode 80, Total Reward: 1
Episode 100, Total Reward: 1
Episode 120, Total Reward: 1
Episode 140, Total Reward: 1
Episode 160, Total Reward: 1
Episode 180, Total Reward: 1
Episode 200, Total Reward: 1
Episode 220, Total Reward: 1
Episode 240, Total Reward: 1
Episode 260, Total Reward: 1
Episode 280, Total Reward: 1
Episode 300, Total Reward: 1
Lever 0: Q value = 1.00
Lever 1: Q value = 0.03
Lever 2: Q value = 1.00
Lever 2을 당김


In [26]:
qnet = train()
test(qnet)

Episode 1, Total Reward: -100
Episode 2, Total Reward: -100
Episode 3, Total Reward: 100
Episode 4, Total Reward: -100
Episode 5, Total Reward: -100
Episode 6, Total Reward: -100
Episode 7, Total Reward: -100
Episode 8, Total Reward: -100
Episode 9, Total Reward: 100
Episode 10, Total Reward: -100
Episode 11, Total Reward: -100
Episode 12, Total Reward: -100
Episode 13, Total Reward: -100
Episode 14, Total Reward: -100
Episode 15, Total Reward: -100
Episode 16, Total Reward: -100
Episode 17, Total Reward: -100
Episode 18, Total Reward: -100
Episode 19, Total Reward: -100
Episode 20, Total Reward: -100
Episode 21, Total Reward: 100
Episode 22, Total Reward: -100
Episode 23, Total Reward: -100
Episode 24, Total Reward: 100
Episode 25, Total Reward: -100
Episode 26, Total Reward: -100
Episode 27, Total Reward: -100
Episode 28, Total Reward: 100
Episode 29, Total Reward: -100
Episode 30, Total Reward: -100
Episode 31, Total Reward: -100
Episode 32, Total Reward: -100
Episode 33, Total Rewa

  self.buffer.append((state, int(action), reward, next_state, done))


Episode 125, Total Reward: 100
Episode 126, Total Reward: 100
Episode 127, Total Reward: 100
Episode 128, Total Reward: 100
Episode 129, Total Reward: 100
Episode 130, Total Reward: 100
Episode 131, Total Reward: 100
Episode 132, Total Reward: 100
Episode 133, Total Reward: 100
Episode 134, Total Reward: 100
Episode 135, Total Reward: 100
Episode 136, Total Reward: 100
Episode 137, Total Reward: 100
Episode 138, Total Reward: 100
Episode 139, Total Reward: 100
Episode 140, Total Reward: 100
Episode 141, Total Reward: 100
Episode 142, Total Reward: 100
Episode 143, Total Reward: 100
Episode 144, Total Reward: 100
Episode 145, Total Reward: 100
Episode 146, Total Reward: 100
Episode 147, Total Reward: 100
Episode 148, Total Reward: 100
Episode 149, Total Reward: 100
Episode 150, Total Reward: 100
Episode 151, Total Reward: 100
Episode 152, Total Reward: 100
Episode 153, Total Reward: 100
Episode 154, Total Reward: 100
Episode 155, Total Reward: 100
Episode 156, Total Reward: 100
Episode 