## 시니어

- DQN 코드 작성 (이전 단계에서 이미 성능이 나왔다면 DDQN까지 구현)  
- 먼저 (5,5)와 같이 작은 사이즈의 환경에서 시작  
- 매직넘버 사용 지양  



---  


### 1. **작은 사이즈, 적은 지뢰 밀도** 환경에서 학습되는지 확인  

### 시니어용 세팅

In [123]:
! git clone "https://github.com/KanghwaSisters/DQN_minesweeper.git"

fatal: destination path 'DQN_minesweeper' already exists and is not an empty directory.


In [124]:
import os
os.chdir('/content/DQN_minesweeper/codes/Environment')

In [125]:
! python reward5.py
from reward5 import *

In [126]:
# Environment configuration: a 5x5 board with 5 mines, plus the per-outcome
# reward and episode-termination tables handed to MinesweeperEnv.
env_config = dict(
    map_size=(5, 5),
    n_mines=5,
    rewards={'win': 1.5, 'lose': -1, 'progress': 0.2, 'guess': 0.3, 'no_progress': -1},
    dones={'win': True, 'lose': True, 'progress': False, 'guess': False, 'no_progress': False},
    dim2=False,
)
env = MinesweeperEnv(**env_config)
# Author's note on dim2=False: the multi-channel state is derived from the
# single-plane state; mine (-2), opened-or-not (-1) and the neighbouring-mine
# count each get their own channel and are one-hot encoded.
# (Behaviour of MinesweeperEnv itself is defined in reward5.py - verify there.)

# env.state.shape -> input state (11, 5, 5)

### Import

In [127]:
import random
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import deque

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

### Net

In [128]:
class Net(nn.Module):
    """Convolutional Q-network: four 3x3 conv layers and one linear head.

    Args:
        input_dims: Shape of a single state; only the last two entries (the
            spatial dimensions) are used, to size the linear layer.
        n_actions: Number of outputs of the final linear layer.
        conv_units: Number of output channels of every conv layer.
        in_channels: Number of channels of the input state (default 11).
    """

    def __init__(self, input_dims, n_actions, conv_units, in_channels=11):
        super().__init__()

        def conv3x3(n_in, pad):
            # Every conv layer shares kernel size, width and the absence of a bias.
            return nn.Conv2d(n_in, conv_units, kernel_size=(3, 3), padding=pad, bias=False)

        # padding=2 with a 3x3 kernel grows each spatial dimension by 2;
        # the remaining layers (padding=1) keep the size unchanged.
        self.conv1 = conv3x3(in_channels, 2)
        self.conv2 = conv3x3(conv_units, 1)
        self.conv3 = conv3x3(conv_units, 1)
        self.conv4 = conv3x3(conv_units, 1)

        self.flatten = nn.Flatten()

        # The "+ 2" terms account for the enlargement introduced by conv1.
        height, width = input_dims[-2], input_dims[-1]
        self.fc = nn.Linear(conv_units * (width + 2) * (height + 2), n_actions)

    def forward(self, x):
        """Return one value per action for each state in the batch `x`."""
        # Convolutional stack, ReLU after every layer.
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            x = F.relu(conv(x))

        # Flatten the feature maps and project them onto the actions.
        return self.fc(self.flatten(x))

### Agent

0 ~ 8 : 주변 지뢰 개수

-1    : 열리지 않은 타일

-2    : 지뢰
***

`env.step(action_idx)`

-> return self.state, reward, done

`env.state.shape`

-> input state (11, 5, 5) : 5x5 판 11개

`env.reset()`

-> 게임 보드와 state 생성

In [129]:
# Hyperparameters

# --- Q-learning target / optimiser ---
DISCOUNT_FACTOR = 0.8          # weight of the bootstrapped next-state value in the TD target
LEARNING_RATE = 5e-3           # learning rate handed to the Adam optimiser

# --- Epsilon-greedy exploration ---
EPSILON = 0.5                  # starting exploration rate
EPSILON_DECAY = 0.998          # multiplicative decay applied once per episode
EPSILON_MIN = 0.01             # floor for the exploration rate

# --- Target network ---
TARGET_UPDATE_COUNTER = 0      # initial value of the agent's training-step counter
UPDATE_TARGET_EVERY = 10       # training steps between target-network syncs

# --- Network size ---
CONV_UNITS = 32                # output channels of every conv layer

# --- Replay memory ---
BATCH_SIZE = 32                # transitions sampled per training step
TRAIN_START = 500              # stored transitions required before training begins
MAX_LEN = 10_000               # replay memory capacity

In [130]:
# Device: use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [131]:
class Agent(nn.Module):
    def __init__(self, env):
        super(Agent, self).__init__()

        self.epsilon = EPSILON
        self.epsilon_decay = EPSILON_DECAY
        self.epsilon_min = EPSILON_MIN

        self.target_update_counter = TARGET_UPDATE_COUNTER
        self.update_target_every = UPDATE_TARGET_EVERY

        self.train_start = TRAIN_START
        self.memory = deque(maxlen=MAX_LEN)

        self.model = Net(env.state.shape, env.total_tiles, CONV_UNITS).to(device)
        self.target_model = Net(env.state.shape, env.total_tiles, CONV_UNITS).to(device)
        self.loss = nn.MSELoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=LEARNING_RATE, eps=1e-4)
        self.scheduler = optim.lr_scheduler.CyclicLR(optimizer=self.optimizer, base_lr=0.0001, max_lr=0.1, step_size_up=10000, mode='exp_range')

        self.update_target_model()

    def update_target_model(self):
        self.target_model.load_state_dict(self.model.state_dict())

    def get_action(self, state):
        state = np.array(state).reshape(1, 11, env.ncols, env.nrows)
        state = torch.FloatTensor(state).to(device)

        if np.random.rand() <= EPSILON:
            action = random.randrange(env.total_tiles)
        else:
            q_value = self.model(state)
            self.q_value = q_value.detach().cpu().numpy().flatten()
            action = torch.argmax(q_value).item()

        return action

    def append_sample(self, state, action, reward, next_state, done):
        state = state
        next_state = next_state
        self.memory.append((state, action, reward, next_state, done))

    def train_model(self):
        if len(self.memory) < BATCH_SIZE:
            return

        minibatch = random.sample(self.memory, BATCH_SIZE)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        states = np.array(states)
        actions = np.array(actions)
        rewards = np.array(rewards)
        next_states = np.array(next_states)
        dones = np.array(dones)

        states = states.reshape(BATCH_SIZE, 11, env.ncols, env.nrows)
        next_states = next_states.reshape(BATCH_SIZE, 11, env.ncols, env.nrows)

        states = torch.tensor(states, dtype=torch.float32).to(device)
        actions = torch.tensor(actions, dtype=torch.long).to(device)
        rewards = torch.tensor(rewards, dtype=torch.float32).to(device)
        next_states = torch.tensor(next_states, dtype=torch.float32).to(device)
        dones = torch.tensor(dones, dtype=torch.float32).to(device)

        pred = self.model(states)
        target_pred = self.target_model(next_states).max(1)[0].detach()

        targets = rewards + (1 - dones) * DISCOUNT_FACTOR * target_pred

        pred = pred.gather(1, actions.unsqueeze(1))
        trg = targets.unsqueeze(1)

        loss = self.loss(pred, trg)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.scheduler.step()

        self.target_update_counter += 1
        if self.target_update_counter >= self.update_target_every:
            self.target_update_counter = 0
            self.update_target_model()

### Main

In [132]:
# Training script: play EPISODES episodes, store every transition in the
# agent's replay memory, train once enough samples exist, and print rolling
# statistics every N episodes.
agent = Agent(env)

EPISODES = 3000
RENDER_PROCESS = False
RENDER_END = False

# Per-episode step cap. The loop condition used to be `time_step <= 40`, which
# admits 41 steps; that count is kept here so behaviour is unchanged.
MAX_STEPS_PER_EPISODE = 41

total_moves = []
scores = np.zeros(EPISODES)
length_memory = np.zeros(EPISODES)
wins = np.zeros(EPISODES)
episodes = np.zeros(EPISODES)
timesteps = np.zeros(EPISODES)
win_rates = {}

# Reporting interval and rolling-window length, in episodes.
N = 100

for epi in range(EPISODES):
    done = False
    score = 0
    time_step = 0
    actions = []
    rewards = []

    env.reset()
    state = env.state
    # Decay the exploration rate once per episode, never below its floor.
    agent.epsilon = max(agent.epsilon_min, agent.epsilon * agent.epsilon_decay)

    while not done and time_step < MAX_STEPS_PER_EPISODE:
        time_step += 1
        action = agent.get_action(state)

        # Advance the environment by one time step with the chosen action.
        next_state, reward, done = env.step(action)
        score += reward

        # Record the chosen action (as a board coordinate) and its reward.
        actions.append(env.get_coord(action))
        rewards.append(reward)

        # Store the transition; train once the memory is large enough.
        agent.append_sample(state, action, reward, next_state, done)
        if len(agent.memory) >= agent.train_start:
            agent.train_model()

        state = next_state

    scores[epi] = score
    timesteps[epi] = time_step

    # An episode counts as a win when it terminated on the 'win' reward.
    if done and reward == env.rewards['win']:
        wins[epi] = 1

    if epi % N == 0:
        # Statistics over (at most) the last N episodes, current one included.
        window = slice(max(0, epi - N + 1), epi + 1)
        scores_N = np.median(scores[window])  # median score
        win_rate = np.mean(wins[window]) * 100  # win rate in percent
        win_rates[epi] = win_rate
        length_memory[epi] = len(agent.memory)

        print(f"Episode {epi:3d} | Timesteps: {time_step}")
        print(f"Score: {scores_N:.2f} | Win Rate: {win_rate:.2f}% | Epsilon: {agent.epsilon:.4f}\n")
        env.render(state)  # show the final board state of this episode
        print(f"Actions taken: {actions}")
        print(f"Rewards per timestep: {rewards}")
        print("--------------------------------------------------")

Episode   0 | Timesteps: 3
Score: -0.40 | Win Rate: 0.00% | Epsilon: 0.4990



Unnamed: 0,0,1,2,3,4
0,.,.,.,.,2
1,.,.,.,M,.
2,.,.,.,.,.
3,.,.,1,.,.
4,.,.,.,.,.


 
Actions taken: [(3, 2), (0, 4), (1, 3)]
Rewards per timestep: [0.3, 0.3, -1]
--------------------------------------------------
Episode 100 | Timesteps: 4
Score: -1.40 | Win Rate: 1.00% | Epsilon: 0.4085



Unnamed: 0,0,1,2,3,4
0,0,0,0,1,.
1,0,0,1,2,.
2,0,0,2,.,.
3,0,0,3,.,.
4,0,0,2,M,.


 
Actions taken: [(4, 0), (1, 2), (2, 0), (4, 3)]
Rewards per timestep: [0.3, -1, -1, -1]
--------------------------------------------------
Episode 200 | Timesteps: 3
Score: -0.70 | Win Rate: 0.00% | Epsilon: 0.3344



Unnamed: 0,0,1,2,3,4
0,1,.,.,.,M
1,.,.,.,3,2
2,.,.,.,1,0
3,.,.,.,2,1
4,.,.,.,.,.


 
Actions taken: [(2, 4), (0, 0), (0, 4)]
Rewards per timestep: [0.3, 0.3, -1]
--------------------------------------------------
Episode 300 | Timesteps: 3
Score: -0.50 | Win Rate: 2.00% | Epsilon: 0.2737



Unnamed: 0,0,1,2,3,4
0,0,0,1,.,.
1,2,2,3,.,.
2,.,M,.,.,.
3,.,.,.,2,.
4,.,.,.,.,.


 
Actions taken: [(3, 3), (0, 0), (2, 1)]
Rewards per timestep: [0.3, 0.3, -1]
--------------------------------------------------
Episode 400 | Timesteps: 11
Score: -0.70 | Win Rate: 1.00% | Epsilon: 0.2240



Unnamed: 0,0,1,2,3,4
0,.,.,1,0,0
1,.,.,1,1,1
2,.,.,2,.,.
3,.,.,.,M,.
4,.,.,2,.,.


 
Actions taken: [(4, 2), (0, 3), (0, 3), (2, 2), (2, 2), (0, 3), (2, 2), (1, 4), (2, 2), (4, 2), (3, 3)]
Rewards per timestep: [0.3, 0.3, -1, 0.2, -1, -1, -1, -1, -1, -1, -1]
--------------------------------------------------
Episode 500 | Timesteps: 2
Score: -1.15 | Win Rate: 3.00% | Epsilon: 0.1834



Unnamed: 0,0,1,2,3,4
0,.,.,.,.,2
1,.,.,.,.,M
2,.,.,.,.,.
3,.,.,.,.,.
4,.,.,.,.,.


 
Actions taken: [(0, 4), (1, 4)]
Rewards per timestep: [0.3, -1]
--------------------------------------------------
Episode 600 | Timesteps: 12
Score: -2.35 | Win Rate: 1.00% | Epsilon: 0.1501



Unnamed: 0,0,1,2,3,4
0,0,0,0,1,.
1,0,0,0,1,M
2,1,1,1,2,.
3,.,.,2,.,2
4,1,.,.,.,.


 
Actions taken: [(0, 1), (0, 1), (0, 1), (0, 1), (0, 2), (3, 2), (3, 4), (2, 1), (2, 1), (2, 0), (4, 0), (1, 4)]
Rewards per timestep: [0.3, -1, -1, -1, -1, 0.2, 0.2, -1, -1, -1, 0.3, -1]
--------------------------------------------------
Episode 700 | Timesteps: 5
Score: -2.90 | Win Rate: 2.00% | Epsilon: 0.1229



Unnamed: 0,0,1,2,3,4
0,0,0,1,.,.
1,0,0,1,M,.
2,0,0,2,.,.
3,0,1,2,.,.
4,0,1,.,.,.


 
Actions taken: [(3, 0), (3, 0), (3, 0), (0, 0), (1, 3)]
Rewards per timestep: [0.3, -1, -1, -1, -1]
--------------------------------------------------
Episode 800 | Timesteps: 8
Score: -3.40 | Win Rate: 1.00% | Epsilon: 0.1006



Unnamed: 0,0,1,2,3,4
0,.,.,.,.,.
1,1,3,.,3,.
2,0,2,.,M,.
3,0,1,3,.,2
4,0,0,1,.,.


 
Actions taken: [(4, 0), (3, 4), (3, 4), (3, 0), (3, 4), (3, 4), (1, 3), (2, 3)]
Rewards per timestep: [0.3, 0.3, -1, -1, -1, -1, 0.3, -1]
--------------------------------------------------
Episode 900 | Timesteps: 2
Score: -2.25 | Win Rate: 3.00% | Epsilon: 0.0823



Unnamed: 0,0,1,2,3,4
0,.,.,.,.,.
1,.,.,2,.,.
2,.,.,.,.,.
3,.,M,.,.,.
4,.,.,.,.,.


 
Actions taken: [(1, 2), (3, 1)]
Rewards per timestep: [0.3, -1]
--------------------------------------------------
Episode 1000 | Timesteps: 9
Score: -2.40 | Win Rate: 0.00% | Epsilon: 0.0674



Unnamed: 0,0,1,2,3,4
0,M,.,.,1,0
1,.,3,.,1,0
2,.,.,.,2,0
3,.,.,.,2,1
4,.,.,.,.,.


 
Actions taken: [(1, 1), (0, 4), (0, 4), (0, 4), (0, 3), (0, 4), (0, 4), (0, 4), (0, 0)]
Rewards per timestep: [0.3, 0.3, -1, -1, -1, -1, -1, -1, -1]
--------------------------------------------------
Episode 1100 | Timesteps: 15
Score: -2.05 | Win Rate: 0.00% | Epsilon: 0.0552



Unnamed: 0,0,1,2,3,4
0,0,0,0,1,.
1,0,0,0,1,.
2,0,0,1,1,.
3,1,1,2,.,3
4,.,.,.,.,M


 
Actions taken: [(2, 3), (2, 3), (2, 2), (0, 2), (2, 3), (0, 1), (1, 2), (1, 2), (1, 2), (2, 2), (1, 2), (3, 4), (1, 2), (1, 2), (4, 4)]
Rewards per timestep: [0.3, -1, 0.2, 0.3, -1, -1, -1, -1, -1, -1, -1, 0.2, -1, -1, -1]
--------------------------------------------------
Episode 1200 | Timesteps: 21
Score: -1.80 | Win Rate: 0.00% | Epsilon: 0.0452



Unnamed: 0,0,1,2,3,4
0,1,.,2,1,0
1,1,.,.,1,0
2,2,.,.,2,1
3,.,2,.,.,.
4,1,.,M,.,.


 
Actions taken: [(0, 0), (2, 0), (1, 0), (0, 4), (2, 0), (2, 0), (3, 1), (3, 1), (3, 1), (1, 0), (3, 1), (4, 0), (3, 1), (3, 1), (0, 2), (3, 1), (3, 1), (3, 1), (3, 1), (2, 0), (4, 2)]
Rewards per timestep: [0.3, 0.3, 0.2, 0.3, -1, -1, 0.2, -1, -1, -1, -1, 0.2, -1, -1, 0.2, -1, -1, -1, -1, -1, -1]
--------------------------------------------------
Episode 1300 | Timesteps: 5
Score: -1.70 | Win Rate: 0.00% | Epsilon: 0.0370



Unnamed: 0,0,1,2,3,4
0,.,.,.,1,0
1,.,.,.,2,0
2,.,.,M,2,0
3,.,.,.,2,0
4,.,.,.,1,0


 
Actions taken: [(3, 4), (3, 4), (3, 4), (3, 4), (2, 2)]
Rewards per timestep: [0.3, -1, -1, -1, -1]
--------------------------------------------------
Episode 1400 | Timesteps: 6
Score: -2.10 | Win Rate: 0.00% | Epsilon: 0.0303



Unnamed: 0,0,1,2,3,4
0,.,M,.,.,.
1,2,2,2,.,.
2,0,0,2,3,.
3,0,0,1,.,.
4,0,0,1,.,.


 
Actions taken: [(2, 0), (1, 2), (1, 2), (2, 3), (1, 2), (0, 1)]
Rewards per timestep: [0.3, -1, -1, 0.2, -1, -1]
--------------------------------------------------
Episode 1500 | Timesteps: 5
Score: -2.55 | Win Rate: 0.00% | Epsilon: 0.0248



Unnamed: 0,0,1,2,3,4
0,.,M,.,2,0
1,.,.,.,3,0
2,.,.,.,2,0
3,1,2,1,1,0
4,.,1,0,0,0


 
Actions taken: [(3, 1), (1, 4), (3, 0), (2, 3), (0, 1)]
Rewards per timestep: [0.3, 0.3, 0.2, -1, -1]
--------------------------------------------------
Episode 1600 | Timesteps: 8
Score: -2.70 | Win Rate: 0.00% | Epsilon: 0.0203



Unnamed: 0,0,1,2,3,4
0,1,.,3,.,.
1,.,.,.,M,1
2,.,.,.,3,.
3,.,.,.,1,.
4,.,.,.,.,.


 
Actions taken: [(2, 3), (0, 2), (1, 4), (2, 3), (0, 0), (2, 3), (3, 3), (1, 3)]
Rewards per timestep: [0.3, 0.3, 0.2, -1, 0.3, -1, 0.2, -1]
--------------------------------------------------
Episode 1700 | Timesteps: 12
Score: -2.85 | Win Rate: 1.00% | Epsilon: 0.0166



Unnamed: 0,0,1,2,3,4
0,0,0,0,0,0
1,1,1,2,1,1
2,.,.,3,.,.
3,1,1,4,.,M
4,0,0,2,.,.


 
Actions taken: [(1, 4), (0, 1), (1, 4), (1, 4), (2, 2), (0, 2), (1, 4), (0, 1), (4, 0), (4, 0), (4, 0), (3, 4)]
Rewards per timestep: [0.3, 0.3, -1, -1, 0.2, -1, -1, -1, 0.3, -1, -1, -1]
--------------------------------------------------
Episode 1800 | Timesteps: 6
Score: -2.65 | Win Rate: 1.00% | Epsilon: 0.0136



Unnamed: 0,0,1,2,3,4
0,0,0,1,.,.
1,0,0,1,M,.
2,0,0,1,3,.
3,1,1,0,2,.
4,.,1,0,1,.


 
Actions taken: [(4, 1), (2, 2), (4, 2), (2, 2), (2, 2), (1, 3)]
Rewards per timestep: [0.3, 0.3, 0.2, -1, -1, -1]
--------------------------------------------------
Episode 1900 | Timesteps: 6
Score: -3.55 | Win Rate: 3.00% | Epsilon: 0.0111



Unnamed: 0,0,1,2,3,4
0,0,1,.,.,.
1,0,2,M,.,.
2,1,3,.,.,.
3,.,.,.,.,.
4,.,.,.,.,.


 
Actions taken: [(0, 0), (2, 1), (0, 1), (0, 1), (1, 0), (1, 2)]
Rewards per timestep: [0.3, -1, -1, -1, -1, -1]
--------------------------------------------------
Episode 2000 | Timesteps: 2
Score: -2.45 | Win Rate: 1.00% | Epsilon: 0.0100



Unnamed: 0,0,1,2,3,4
0,.,.,.,.,.
1,.,.,.,.,.
2,.,.,1,.,.
3,.,.,.,.,.
4,.,.,.,M,.


 
Actions taken: [(2, 2), (4, 3)]
Rewards per timestep: [0.3, -1]
--------------------------------------------------
Episode 2100 | Timesteps: 7
Score: -2.30 | Win Rate: 1.00% | Epsilon: 0.0100



Unnamed: 0,0,1,2,3,4
0,0,0,1,.,.
1,0,0,1,.,.
2,0,0,1,.,3
3,1,2,2,2,.
4,.,M,.,.,.


 
Actions taken: [(3, 3), (2, 4), (3, 3), (1, 0), (3, 3), (2, 1), (4, 1)]
Rewards per timestep: [0.3, 0.2, -1, 0.3, -1, -1, -1]
--------------------------------------------------
Episode 2200 | Timesteps: 3
Score: -1.90 | Win Rate: 2.00% | Epsilon: 0.0100



Unnamed: 0,0,1,2,3,4
0,.,M,.,.,.
1,.,.,.,.,.
2,.,2,3,.,.
3,.,.,.,.,.
4,.,.,.,.,.


 
Actions taken: [(2, 1), (2, 2), (0, 1)]
Rewards per timestep: [0.3, 0.2, -1]
--------------------------------------------------
Episode 2300 | Timesteps: 5
Score: -1.70 | Win Rate: 1.00% | Epsilon: 0.0100



Unnamed: 0,0,1,2,3,4
0,1,.,.,M,1
1,1,1,.,.,.
2,0,1,.,.,.
3,0,1,.,.,.
4,0,1,.,.,.


 
Actions taken: [(0, 4), (0, 0), (4, 0), (0, 4), (0, 3)]
Rewards per timestep: [0.3, 0.3, 0.3, -1, -1]
--------------------------------------------------
Episode 2400 | Timesteps: 4
Score: -2.95 | Win Rate: 1.00% | Epsilon: 0.0100



Unnamed: 0,0,1,2,3,4
0,.,.,.,2,.
1,.,.,1,2,1
2,.,.,1,0,0
3,.,.,3,2,1
4,.,.,.,M,.


 
Actions taken: [(3, 2), (0, 3), (2, 3), (4, 3)]
Rewards per timestep: [0.3, 0.3, 0.2, -1]
--------------------------------------------------
Episode 2500 | Timesteps: 17
Score: -2.40 | Win Rate: 1.00% | Epsilon: 0.0100



Unnamed: 0,0,1,2,3,4
0,0,0,0,0,0
1,0,0,1,2,2
2,1,1,2,.,.
3,2,.,.,3,3
4,2,M,.,.,.


 
Actions taken: [(3, 4), (3, 3), (0, 0), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 0), (4, 0), (1, 3), (3, 3), (0, 4), (3, 3), (3, 3), (4, 1)]
Rewards per timestep: [0.3, 0.2, 0.3, -1, -1, -1, -1, -1, -1, 0.2, 0.2, -1, -1, -1, -1, -1, -1]
--------------------------------------------------
Episode 2600 | Timesteps: 5
Score: -2.70 | Win Rate: 1.00% | Epsilon: 0.0100



Unnamed: 0,0,1,2,3,4
0,.,2,M,.,.
1,.,2,.,.,.
2,.,.,.,.,.
3,.,1,.,.,.
4,.,.,.,.,.


 
Actions taken: [(0, 1), (0, 1), (1, 1), (3, 1), (0, 2)]
Rewards per timestep: [0.3, -1, 0.2, 0.3, -1]
--------------------------------------------------
Episode 2700 | Timesteps: 2
Score: -2.35 | Win Rate: 1.00% | Epsilon: 0.0100



Unnamed: 0,0,1,2,3,4
0,.,.,.,.,.
1,.,.,.,.,.
2,M,.,.,2,.
3,.,.,.,.,.
4,.,.,.,.,.


 
Actions taken: [(2, 3), (2, 0)]
Rewards per timestep: [0.3, -1]
--------------------------------------------------
Episode 2800 | Timesteps: 13
Score: -4.45 | Win Rate: 0.00% | Epsilon: 0.0100



Unnamed: 0,0,1,2,3,4
0,.,3,.,1,0
1,.,.,M,1,0
2,.,3,2,1,0
3,.,2,0,0,0
4,.,2,0,0,0


 
Actions taken: [(3, 4), (3, 2), (3, 2), (3, 2), (4, 2), (3, 2), (4, 1), (4, 2), (3, 2), (0, 1), (4, 4), (0, 3), (1, 2)]
Rewards per timestep: [0.3, -1, -1, -1, -1, -1, -1, -1, -1, 0.3, -1, -1, -1]
--------------------------------------------------
Episode 2900 | Timesteps: 8
Score: -2.40 | Win Rate: 1.00% | Epsilon: 0.0100



Unnamed: 0,0,1,2,3,4
0,.,.,.,.,.
1,.,.,.,M,.
2,1,.,3,.,.
3,1,1,1,.,.
4,.,1,.,.,.


 
Actions taken: [(3, 2), (3, 2), (2, 2), (3, 0), (2, 0), (3, 1), (4, 1), (1, 3)]
Rewards per timestep: [0.3, -1, 0.2, 0.3, 0.2, 0.2, 0.2, -1]
--------------------------------------------------


## 과제
1. 완성해오기(설명가능한 상태로 ipynb 파일 구성)    
2. 알파제로 구조 공부해오기(전원)  
3. 자기 코드 규칙 정리해오기