In [10]:
# 1. 설치
!pip install gymnasium imageio



In [11]:
# DQN 적용
import gymnasium as gym
import numpy as np
import random
from collections import deque
import torch
import torch.nn as nn
import torch.optim as optim
import imageio
import os


* gymnasium: 환경(CartPole)을 제공

* torch: DQN 신경망 구현 및 학습

* imageio: 프레임을 GIF로 저장

* deque: 경험 리플레이 버퍼

In [12]:
# 환경 초기화
env = gym.make("Acrobot-v1", render_mode="rgb_array") # 환경: "rgb_array"로 설정해 시각화 가능
obs_shape = env.observation_space.shape[0]
n_actions = env.action_space.n

In [13]:
# PyTorch DQN 모델 정의
# 모델: 24 → 24 → n_actions 구조의 2층 fully connected 신경망
class DQN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 24)
        self.fc2 = nn.Linear(24, 24)
        self.out = nn.Linear(24, output_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.out(x)

In [14]:
# 주요 하이퍼파라미터
learning_rate = 0.001
gamma = 0.99
epsilon = 1.0
epsilon_decay = 0.995
min_epsilon = 0.01
batch_size = 64
memory = deque(maxlen=2000) # 리플레이 버퍼 (최대 2000개 저장)
n_episodes = 500

# 모델 및 타겟 모델 생성
policy_net = DQN(obs_shape, n_actions)
target_net = DQN(obs_shape, n_actions) # 안정성을 위한 타겟 Q-network
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()
optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)
criterion = nn.MSELoss()

In [15]:
# GPU 사용 가능 시
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
policy_net.to(device)
target_net.to(device)

# 학습 루프 및 GIF 저장용 프레임 리스트
frames = []
out_dir = "dqn_frames"
os.makedirs(out_dir, exist_ok=True)

In [16]:
for episode in range(n_episodes):
    obs, _ = env.reset()
    total_reward = 0
    done = False
    obs = torch.tensor(obs, dtype=torch.float32).to(device)

    while not done: # 탐험과 활용을 섞어가며 액션 선택
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            with torch.no_grad():
                q_values = policy_net(obs.unsqueeze(0))
                action = q_values.argmax().item()

        next_obs, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        if episode % 10 == 0: #10번째 에피소드마다 환경 상태를 저장해 GIF로 만들기 위함
            frame = env.render()
            frames.append(frame)

        next_obs_tensor = torch.tensor(next_obs, dtype=torch.float32).to(device)
        memory.append((obs.cpu().numpy(), action, reward, next_obs_tensor.cpu().numpy(), done)) # 경험 저장

        obs = next_obs_tensor
        total_reward += reward

        # 경험 리플레이 학습 : 배치 학습 => 배치 샘플링 → 예측값과 타겟값 비교 → loss 최소화
        if len(memory) >= batch_size:
            minibatch = random.sample(memory, batch_size)
            states, actions, rewards, next_states, dones = zip(*minibatch)

            states = torch.tensor(states, dtype=torch.float32).to(device)
            actions = torch.tensor(actions).unsqueeze(1).to(device)
            rewards = torch.tensor(rewards, dtype=torch.float32).unsqueeze(1).to(device)
            next_states = torch.tensor(next_states, dtype=torch.float32).to(device)
            dones = torch.tensor(dones, dtype=torch.bool).unsqueeze(1).to(device)

            q_values = policy_net(states).gather(1, actions)
            next_q_values = target_net(next_states).max(1)[0].detach().unsqueeze(1)
            target_q = rewards + gamma * next_q_values * (~dones)

            loss = criterion(q_values, target_q)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    if episode % 10 == 0: # 타겟 네트워크 업데이트
        target_net.load_state_dict(policy_net.state_dict())

    epsilon = max(min_epsilon, epsilon * epsilon_decay) # epsilon = max(min_epsilon, epsilon * epsilon_decay)

    if episode % 50 == 0:
        print(f"Episode {episode}, Total Reward: {total_reward}, Epsilon: {epsilon:.3f}")

env.close()

  states = torch.tensor(states, dtype=torch.float32).to(device)


Episode 0, Total Reward: -500.0, Epsilon: 0.995
Episode 50, Total Reward: -500.0, Epsilon: 0.774
Episode 100, Total Reward: -500.0, Epsilon: 0.603
Episode 150, Total Reward: -500.0, Epsilon: 0.469
Episode 200, Total Reward: -485.0, Epsilon: 0.365
Episode 250, Total Reward: -166.0, Epsilon: 0.284
Episode 300, Total Reward: -500.0, Epsilon: 0.221
Episode 350, Total Reward: -243.0, Epsilon: 0.172
Episode 400, Total Reward: -472.0, Epsilon: 0.134
Episode 450, Total Reward: -500.0, Epsilon: 0.104


In [17]:
# 학습 시각화 GIF 저장
imageio.mimsave("dqn_training.gif", frames, fps=30)
print("GIF saved as dqn_training.gif")

GIF saved as dqn_training.gif
