In [37]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
import numpy as np
import random
from pathlib import Path
from math import sqrt
import os

# State

In [38]:
class TicTacToeState():
    def __init__(self, pieces=None, enemy_pieces=None):
        # 3x3 보드 초기화. None이면 빈 보드 생성.
        self.pieces = pieces if pieces != None else [0] * 9
        self.enemy_pieces = enemy_pieces if enemy_pieces != None else [0] * 9

    # 돌의 수 얻기
    def piece_count(self, pieces):
        count = 0
        for i in pieces:
            if i == 1:
                count += 1
        return count

    def check_line(self, pieces):
        winning_lines = [
            [0, 1, 2], # 가로
            [3, 4, 5],
            [6, 7, 8],
            [0, 3, 6], # 세로
            [1, 4, 7],
            [2, 5, 8],
            [0, 4, 8],# 대각선
            [2, 4, 6]
        ]
        for line in winning_lines:
            if all(pieces[i] == 1 for i in line):
                return True
        return False

    # 패배 여부 판정 Q. 승리 여부를 판정하지 않고 패배 여부를 판정하는 이유?
    def is_lose(self):
        return self.check_line(self.enemy_pieces)

    # 무승부 여부 확인
    def is_draw(self):
        total_pieces = [max(p, e) for p, e in zip(self.pieces, self.enemy_pieces)]
        if all(p == 1 for p in total_pieces): # 모든 간이 채워졌는지 확인
            # 승리나 패배 상태가 아닌 경우만 무승
            if not self.check_line(self.pieces) and not self.check_line(self.enemy_pieces):
                return True
        return False

    # 게임 종료 여부 확인
    def is_done(self):
        return self.is_lose() or self.is_draw()

    # 다음 상태 얻기
    def next(self, action):
        pieces = self.pieces.copy()
        pieces[action] = 1
        return TicTacToeState(self.enemy_pieces, pieces)

    # 가능한 수의 리스트 얻기
    def legal_actions(self):
        actions = []
        for i in range(9):
            if self.pieces[i] == 0 and self.enemy_pieces[i] == 0:
                actions.append(i)
        return actions

    # 선수 여부 확인
    def is_first_player(self):
        return self.piece_count(self.pieces) == self.piece_count(self.enemy_pieces)

    # 판 문자열 출력
    def __str__(self):
        ox = ('o', 'x') if self.is_first_player() else ('x', 'o')
        str = ''
        for i in range(9):
            if self.pieces[i] == 1:
                str += ox[0]
            elif self.enemy_pieces[i] == 1:
                str += ox[1]
            else:
                str += '-'
            if i % 3 == 2:
                str += '\n'
        return str

# 랜덤으로 행동 선택
def random_action(state):
    legal_actions = state.legal_actions()
    return legal_actions[random.randint(0, len(legal_actions) - 1)]

# 알파베타법을 활용한 상태 가치 계산
def alpha_beta(state, alpha, beta):
    # 패배 시 상태 가치 -1
    if state.is_lose():
        return -1

    # 무승부 시, 상태 가치 0
    if state.is_draw():
        return 0

    # 가능한 수의 상태 가치 계산
    for action in state.legal_actions():
        score = -alpha_beta(state.next(action), -beta, -alpha)
        if score > alpha:
            alpha = score

        if alpha >= beta:
            return alpha

    # 합법적인 수의 상태 가치의 최댓값을 반환
    return alpha

# 알파베타법을 활용한 행동 선택
def alpha_beta_action(state):
    # 가능한 수의 상태 가치 계산
    best_action = 0
    alpha = -float('inf')
    for action in state.legal_actions():
        score = -alpha_beta(state.next(action), -beta, -alpha)
        if score > alpha:
            best_action = action
            alpha = score

    return best_action

# 플레이아웃
def playout(state):
   # 패배 시, 상태 가치 -1
    if state.is_lose():
        return -1

    # 무승부 시, 상태 가치 0
    if state.is_draw():
        return 0

    # 다음 상태의 상태 가치
    return -playout(state.next(random_action(state)))

# 최댓값의 인덱스 반환
def argmax(collection):
    return collection.index(max(collection))

def mcts_action(state):
    root_node = Node(state)
    root_node.expand()

    # 시뮬레이션 100회 반복 -> 시행횟수가 가장 큰 행동을 다음 수로 선택
    for _ in range(100):
        root_node.evaluate()

    legal_actions = state.legal_actions()
    n_list = []
    for c in root_node.child_nodes:
        n_list.append(c.n)
    return legal_actions[argmax(n_list)]


# 동작 확인
if __name__ == '__main__':
    # 상태 생성
    state = TicTacToeState()

    # 게임 종료 시까지 반복
    while True:
        # 게임 종료 시
        if state.is_done():
            break

        # 다음 상태 얻기
        state = state.next(random_action(state))

        # 문자열 표시
        print(state)
        print()

o--
---
---


o--
-x-
---


o--
-x-
-o-


o-x
-x-
-o-


o-x
-x-
-oo


o-x
-x-
xoo




### State Test

In [39]:
# 1. 초기 상태
state = TicTacToeState()
assert state.pieces == [0] * 9
assert state.enemy_pieces == [0] * 9
assert state.is_done() == False
assert state.legal_actions() == list(range(9))
print("Initial state test completed.")

# 2. 승리 조건
state = TicTacToeState(pieces=[1, 1, 1, 0, 0, 0, 0, 0, 0])
assert state.check_line(state.pieces) == True
assert state.is_done() == False
assert state.is_lose() == False
print("Winning state test completed.")

# 3. 패배 조건
state = TicTacToeState(enemy_pieces=[1, 0, 0, 0, 1, 0, 0, 0, 1])
assert state.is_lose() == True
assert state.is_done() == True
print("Losing state test completed.")

# 4. 무승부 조건
state = TicTacToeState(pieces=[1, 0, 1, 1, 1, 0, 0, 1, 0],
                       enemy_pieces=[0, 1, 0, 0, 0, 1, 1, 0, 1]
                       )
assert state.is_draw() == True
assert state.is_done() == True
print("Draw state test completed.")

# 5. 다음 상태
state = TicTacToeState()
next_state = state.next(0)
#print(next_state.pieces)
#print(next_state.enemy_pieces)
assert next_state.pieces == [0] * 9
assert next_state.enemy_pieces == [1, 0, 0, 0, 0, 0, 0, 0, 0]
print("Next state test completed.")

# 6. 가능한 행동 리스트
state = TicTacToeState(
    pieces=[1, 0, 0, 1, 1, 0, 0, 1, 1],
    enemy_pieces=[0, 1, 0, 0, 0, 0, 1, 0, 0]
)
#print(state)
assert state.legal_actions() == [2, 5]
print("Legal actions test passed.")

Initial state test completed.
Winning state test completed.
Losing state test completed.
Draw state test completed.
Next state test completed.
Legal actions test passed.


# Network

In [40]:
DN_FILTERS = 128 # convolutional layer 수
DN_RESIDUAL_NUM = 16 # residual block 수
DN_INPUT_SHAPE = (3, 3, 2) # 3x3의 2차원 배열 2개
DN_OUTPUT_SIZE = 9 # 행동 수

In [41]:
# convolutional layer 정의
def conv(filters):
    return nn.Conv2d(
        in_channels=filters,
        out_channels=filters,
        kernel_size=3,
        stride=1,
        padding=1,
        bias=False
    )

In [42]:
class ResidualBlock(nn.Module):
    expansion = 1

    def __init__(self, filters, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        """
        - inplanes: input channel size
        - planes: output channel size
        - groups, base_width: ResNet이나 Wide ResNet의 경우 사용
        """
        super(ResidualBlock, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        if groups != 1 or base_width != 64:
            raise ValueError('BasicBlock only supports groups=1 and base_width=64')
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")

        # Basic Block의 구조
        self.conv1 = conv(filters) # conv1에서 downsample
        self.bn1 = norm_layer(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv(filters)
        self.bn2 = norm_layer(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        # short connection (다운샘플링이 필요한 경우 수행)
        if self.downsample is not None:
            identity = self.downsample(x)


        # identity mapping 시 identity mapping 후 ReLU를 적용
        # ReLU를 통과하면 양의 값만 남기 때문에 residual의 의미가 제대로 유지되지 않기 때문
        out += identity
        out = self.relu(out)

        return out

In [43]:
class DualNetwork(nn.Module):
    def __init__(self):
        super(DualNetwork, self).__init__()

        self.conv1 = nn.Conv2d(
            in_channels=DN_INPUT_SHAPE[0],
            out_channels=DN_FILTERS,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=False)
        self.bn1 = nn.BatchNorm2d(DN_FILTERS)
        self.relu = nn.ReLU(inplace=True)

        self.residual_blocks = nn.Sequential(*[ResidualBlock(DN_FILTERS, DN_FILTERS, DN_FILTERS) for _ in range(DN_RESIDUAL_NUM)])

        self.global_pool = nn.AdaptiveAvgPool2d(1)

        self.policy_head = nn.Sequential(
            nn.Flatten(),
            nn.Linear(DN_FILTERS, DN_OUTPUT_SIZE),
            nn.Softmax(dim=1)
        )

        self.value_head = nn.Sequential(
            nn.Flatten(),
            nn.Linear(DN_FILTERS, 1),
            nn.Tanh()
        ) # Sequential?

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)

        x = self.residual_blocks(x)

        x = self.global_pool(x)

        p = self.policy_head(x)
        v = self.value_head(x)

        return p, v

def save_model(model, path='./model/best.pth'):
    os.makedirs('./model/', exist_ok=True)
    torch.save(model.state_dict(), path)

def load_model(path='./model/best.pth'):
    model = DualNetwork()
    if os.path.exists(path):
        model.load_state_dict(torch.load(path))
    model.eval() # 추론 모드로 설정
    return model

# 동작 확인
if __name__ == '__main__':
    model = DualNetwork()
    print(model)

    # 모델 저장 테스트
    save_model(model)

    # 모델 로드 테스트
    loaded_model = load_model()
    print("Model loaded successfully.")

DualNetwork(
  (conv1): Conv2d(3, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (residual_blocks): Sequential(
    (0): ResidualBlock(
      (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): ResidualBlock(
      (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=

  model.load_state_dict(torch.load(path))


### Network Test

In [44]:
test_input = torch.randn(1, DN_INPUT_SHAPE[0], DN_INPUT_SHAPE[1], DN_INPUT_SHAPE[2])
if __name__ == '__main__':
    model = DualNetwork()
    p, v = model(test_input)
    print(p.shape)
    print(v.shape)
    print(p)
    print(v)

torch.Size([1, 9])
torch.Size([1, 1])
tensor([[0.0109, 0.0716, 0.4173, 0.1787, 0.0223, 0.1263, 0.0317, 0.1056, 0.0356]],
       grad_fn=<SoftmaxBackward0>)
tensor([[-0.5485]], grad_fn=<TanhBackward0>)


# MCTS Node

In [45]:
class Node:
    def __init__(self, state, p):
        self.state = state
        self.p = p
        self.w = 0
        self.n = 0
        self.child_nodes = None

    # 평가
    def evaluate(self):
        # 게임 종료 시
        if self.state.is_done():
            value = -1 if self.state.is_lose() else 0

            # 갱신
            self.w += value
            self.n += 1
            return value

        # 리프노드에 도달 시
        if not self.child_nodes:
            # neural network 추론을 통해 정책과 가치 얻기
            policies, value = predict(model, self.state)

            # 갱신
            self.w += value
            self.n += 1

            # 전개
            self.child_nodes = []
            for action, policy in zip(self.state. legal_actions(), policies):
                self.child_nodes.append(Node(self.state.next(action), policy))
            return value

        else: # 자녀 노드 있는 경우
            # 아크 평가값이 가장 큰 자녀 노드를 평가해 가치 얻기
            value = -self.next_child_node().evaluate()

            self.w += value
            self.n += 1
            return value

    # 아크 평가값이 가장 큰 자녀 노드 얻기
    def next_child_node(self):
        # 시행 횟수 n이 0인 자녀 노드 반환
        for child_node in self.child_nodes:
            if child_node.n == 0:
                return child_node

        # 아크 평가 계산
        C_PUCT = 1.0
        t = sum(nodes_to_scores(self.child_nodes))
        pucb_values = []
        for child_node in self.child_nodes:
            pucb_values.append((-child_node.w / child_node.n if child_node.n else 0.0) + C_PUCT * child_node.p * sqrt(t) / (1 + child_node.n))

        # 아크 평가값이 가장 큰 자녀 노드 반환
        return self.child_nodes[argmax(pucb_values)]

### Node Test

In [46]:
state = TicTacToeState()
state.done = False
state.lose = False
state.legal_actions_list = state.legal_actions if state.legal_actions is not None else [0, 1, 2]

if __name__ == '__main__':
    root_node = Node(state, p=1.0)

    value = root_node.evaluate()
    print(value)
    print(root_node.w)
    print(root_node.n)

    if root_node.child_nodes:
        print("자녀 노드 있음")
        print(len(root_node.child_nodes))

    best_child = root_node.next_child_node()
    print(best_child.p)
    print(best_child.w)
    print(best_child.n)

-0.07290931791067123
-0.07290931791067123
1
자녀 노드 있음
9
0.11248944
0
0


# MCTS

In [47]:
PV_EVALUATE_COUNT = 50 # predict 1회당 시뮬레이션 횟수

In [48]:
# 추론
def predict(model, state):
    # 추론을 위한 입력 데이터 shape 변환
    a, b, c = DN_INPUT_SHAPE
    x = np.array([state.pieces, state.enemy_pieces]) # (2, H, W)
    x = x.reshape(c, a, b).transpose(1, 2, 0).reshape(1, a, b, c) # pytorch: 채널 우선 형식 (N, C, H, W)
    # reshape은 데이터 순서를 바꾸지 않고, 단순히 numpy 배열의 재구성만 수행하여 차원을 재조정함
    # transpose은 실제 데이터 순서를 변경
    x_tensor = torch.tensor(x, dtype=torch.float32)

    # 추론
    with torch.no_grad():
        y_policy, y_value = model(x_tensor)

    # 정책
    policies = y_policy[0][list(state.legal_actions())].numpy() # 가능한 수만 추출
    policies /= sum(policies) if sum(policies) else 1 # 확률 분포로 변환

    # 가치
    value = y_value[0].item()
    return policies, value

def nodes_to_scores(nodes):
    scores = []
    for c in nodes:
        scores.append(c.n)
    return scores

def get_policy(model, state, temperature): # scores -> policy
    # 현재 상태의 노드 생성
    root_node = Node(state, 0)

    for _ in range(PV_EVALUATE_COUNT):
        root_node.evaluate()

    scores = nodes_to_scores(root_node.child_nodes)
    # temperature == 0 : 가장 높은 확률을 가진 행동을 선택
    # temperature > 0 : 확률 분포에 따라 무작위 선택, 값이 클수록 무작위성이 커짐
    if temperature == 0:
        action = np.argmax(scores)
        scores = np.zeros(len(scores))
        scores[action] = 1
    else: # 볼츠만 함수로 variation 추가
        scores = boltzman(scores, temperature)
    return scores

def pv_mcts_action(model, temperature=0):
    def pv_mcts_action(state):
        scores = get_policy(model, state, temperature)
        return np.random.choice(state.legal_actions(), p=scores) # p=scores: 행동 선택이 점수확률에 비례하도

    return pv_mcts_action

# 볼츠반 분포
def boltzman(xs, temperature):
    xs = [x ** (1 / temperature) for x in xs]
    return [x / sum(xs) for x in xs]


### MCTS Test

In [49]:
if __name__ == '__main__':
    # 모델 로드
    model_path = './model/best.pth'
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model file not found at {model_path}. Please save the model first.")

    model = load_model(model_path)

    # 상태 생성
    state = TicTacToeState()

    next_action = pv_mcts_action(model, 1.0)

    # 게임 종료 시까지 반복
    while True:
        # 게임 종료 시
        if state.is_done():
            break

        # 행동 얻기
        action = next_action(state)

        # 다음 상태 얻기
        state = state.next(action)

        # 문장열 출력
        print(state)

  model.load_state_dict(torch.load(path))


---
o--
---

---
o--
x--

--o
o--
x--

--o
o--
xx-

o-o
o--
xx-

o-o
o--
xxx



# Self-play

In [50]:
SP_GAME_COUNT = 50 # self-play 수행할 게임 수
SP_TEMPERATURE = 1.0

In [51]:
def first_player_value(ended_state):
    # 1: 선 수 플레이어 승리, -1: 선 수 플레이어 패배, 0: 무승부
    if ended_state.is_lose():
        return -1 if ended_state.is_first_player() else 1
    return 0

# self-play를 통해 수집한 학습 데이터 저장
def save_data(data, path='./data/train_data.npy'):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    if os.path.exists(path):
        existing_data = np.load(path, allow_pickle=True)
        data = np.concatenate([existing_data, data], axis=0)
    np.save(path, data)
    print(f"Data saved to {path}, total size: {len(data)}")

def self_play():
    # 학습 데이터
    train_data = []

    # 베스트 플레이어 모델로 self-play
    model = load_model('./model/best.pth')

    for i in range(SP_GAME_COUNT):
        state = TicTacToeState()

        # 각 상태마다 [state.pieces, state.enemy_pieces], polices, result 저장
        game_data = []

        # 게임 진행
        while True:
            if state.is_done():
                break

            scores = get_policy(model, state, SP_TEMPERATURE)

            policies = [0] * DN_OUTPUT_SIZE
            for action, policy in zip(state.legal_actions(), scores):
                policies[action] = policy
            game_data.append([[state.pieces, state.enemy_pieces], policies, None]) # result는 아직 할당 안됨

            action = np.random.choice(state.legal_actions(), p=scores)
            state = state.next(action)

        value = first_player_value(state)

        # 역방향으로 각 상태에 게임 result 저장
        for j in range(len(game_data)):
            game_data[j][2] = value
            value = -value

        # 현재 게임 데이터를 train_data에 추가
        train_data.extend(game_data)

        # 출력
        print(f"Game {i + 1}/{SP_GAME_COUNT} completed.")

        #del model
        #torch.cuda.empty_cache()

    # 학습 데이터 저장
    save_data(np.array(train_data, dtype=object))

# 동작 확인
if __name__ == '__main__':
    self_play()


  model.load_state_dict(torch.load(path))


Game 1/50 completed.
Game 2/50 completed.
Game 3/50 completed.
Game 4/50 completed.
Game 5/50 completed.
Game 6/50 completed.
Game 7/50 completed.
Game 8/50 completed.
Game 9/50 completed.
Game 10/50 completed.
Game 11/50 completed.
Game 12/50 completed.
Game 13/50 completed.
Game 14/50 completed.
Game 15/50 completed.
Game 16/50 completed.
Game 17/50 completed.
Game 18/50 completed.
Game 19/50 completed.
Game 20/50 completed.
Game 21/50 completed.
Game 22/50 completed.
Game 23/50 completed.
Game 24/50 completed.
Game 25/50 completed.
Game 26/50 completed.
Game 27/50 completed.
Game 28/50 completed.
Game 29/50 completed.
Game 30/50 completed.
Game 31/50 completed.
Game 32/50 completed.
Game 33/50 completed.
Game 34/50 completed.
Game 35/50 completed.
Game 36/50 completed.
Game 37/50 completed.
Game 38/50 completed.
Game 39/50 completed.
Game 40/50 completed.
Game 41/50 completed.
Game 42/50 completed.
Game 43/50 completed.
Game 44/50 completed.
Game 45/50 completed.
Game 46/50 complete

# Train

In [52]:
RN_EPOCHS = 100 # 학습 횟수

In [53]:
# 손실 함수 및 옵티마이저 정의
criterion_for_policy = nn.CrossEntropyLoss()
criterion_for_value = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)

In [54]:
# 데이터 준비
def load_data(path='./data/train_data.npy'):
    # save_data에서 저장된 경로를 바로 사용하도록 수정...
    try:
        data = np.load(path, allow_pickle=True)
        print(f"Data loaded from {path}, total size: {len(data)}")
        return data
    except FileNotFoundError:
        print(f"File not found at {path}. Make sure the data exists.")
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

def prepare_data(data):
    xs, y_policies, y_values = zip(*data)

    a, b, c = DN_INPUT_SHAPE
    xs = np.array(xs)
    xs = xs.reshape(len(xs), c, a, b).transpose(0, 2, 3, 1) # pytorch: 채널 우선 형식 (N, C, H, W)
    y_policies = np.array(y_policies)
    y_values = np.array(y_values)

    # 데이터를 텐서로 변환
    xs = torch.tensor(xs, dtype=torch.float32)
    y_policies = torch.tensor(y_policies, dtype=torch.float32)
    y_values = torch.tensor(y_values, dtype=torch.float32)

    # 데이터셋과 데이터로더 준비
    dataset = TensorDataset(xs, y_policies, y_values)
    train_loader = DataLoader(dataset, batch_size=128, shuffle=True)

    return train_loader

In [55]:
def train_network():
    train_data = load_data()
    train_loader = prepare_data(train_data)

    model = load_model('./model/best.pth')

    #device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    #model.to(device)

    # 학습 스케쥴러 설정
    def step_decay(epoch):
        if epoch >= 80:
            return 0.0005
        elif epoch >= 100:
            return 0.00025
        return 0.001

    epoch_losses = []

    # 학습 실행
    for epoch in range(RN_EPOCHS):
        model.train()
        running_loss = 0.0

        for inputs, policies, values in train_loader:
            #inputs, policies, values = inputs.to(device), policies.to(device), values.to(device)

            optimizer.zero_grad()

            policy_pred, value_pred = model(inputs)

            loss_policy = criterion_for_policy(policy_pred, policies)
            loss_value = criterion_for_value(value_pred, values.view(-1, 1)) # 타겟 크기 수
            policy_loss_weight = 1.0
            value_loss_weight = 0.01
            total_loss = policy_loss_weight * loss_policy + value_loss_weight * loss_value

            total_loss.backward()
            optimizer.step()

            # 손실 누적
            running_loss += total_loss.item()

        # 학습 스케쥴링
        for param_group in optimizer.param_groups:
            param_group['lr'] = step_decay(epoch)

        epoch_losses.append(running_loss / len(train_loader))
        print(f'Epoch [{epoch + 1}/{RN_EPOCHS}], Loss: {running_loss / len(train_loader):.4f}')


    # 최신 플레이어 모델 저장
    torch.save(model.state_dict(), '/model/latest.pth')

    # 손실 그래프 시각화
    plt.figure(figsize=(10, 6))
    plt.plot(range(1, RN_EPOCHS + 1), epoch_losses, label="Training Loss")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.title("Training Loss over Epochs")
    plt.legend()
    plt.grid(True)
    plt.show()

    # del model
    # torch.cuda.empty_cache()

# 동작 확인
if __name__ == '__main__':
    train_network()

Data loaded from ./data/train_data.npy, total size: 704


  model.load_state_dict(torch.load(path))


Epoch [1/100], Loss: 2.2222
Epoch [2/100], Loss: 2.2226
Epoch [3/100], Loss: 2.2235
Epoch [4/100], Loss: 2.2245
Epoch [5/100], Loss: 2.2230
Epoch [6/100], Loss: 2.2224
Epoch [7/100], Loss: 2.2232
Epoch [8/100], Loss: 2.2237
Epoch [9/100], Loss: 2.2233
Epoch [10/100], Loss: 2.2234
Epoch [11/100], Loss: 2.2230
Epoch [12/100], Loss: 2.2229
Epoch [13/100], Loss: 2.2226
Epoch [14/100], Loss: 2.2227
Epoch [15/100], Loss: 2.2236
Epoch [16/100], Loss: 2.2223
Epoch [17/100], Loss: 2.2230
Epoch [18/100], Loss: 2.2227
Epoch [19/100], Loss: 2.2227
Epoch [20/100], Loss: 2.2236
Epoch [21/100], Loss: 2.2226
Epoch [22/100], Loss: 2.2232
Epoch [23/100], Loss: 2.2237
Epoch [24/100], Loss: 2.2237
Epoch [25/100], Loss: 2.2233
Epoch [26/100], Loss: 2.2233
Epoch [27/100], Loss: 2.2228
Epoch [28/100], Loss: 2.2219
Epoch [29/100], Loss: 2.2232
Epoch [30/100], Loss: 2.2222
Epoch [31/100], Loss: 2.2231
Epoch [32/100], Loss: 2.2234
Epoch [33/100], Loss: 2.2224
Epoch [34/100], Loss: 2.2231
Epoch [35/100], Loss: 2

RuntimeError: Parent directory /model does not exist.

# Best Player

In [None]:
EP_GAME_COUNT = 10  # 평가 1회당 게임 수

In [None]:
# 선 수를 둔 플레이어의 포인트
def first_player_point(ended_state):
    # 1: 선 수 플레이어 승리, 0: 선 수 플레이어 패배, 0.5: 무승부
    if ended_state.is_lose():
        return 0 if ended_state.is_first_player() else 1
    return 0.5


# 1 게임 실행
def play(next_actions):
    # 상태 생성
    state = TicTacToeState()

    # 게임 종료 시까지 반복
    while True:
        # 게임 종료 시
        if state.is_done():
            break

        # 행동 얻기
        next_action = next_actions[0] if state.is_first_player() else next_actions[1]
        action = next_action(state)

        # 다음 상태의 획득
        state = state.next(action)

    # 선 수 플레이어의 포인트 반환
    return first_player_point(state)


# 임의의 알고리즘 평가
def evaluate_algorithm_of(label, next_actions):
    # 여러 차례 대전을 반복
    total_point = 0
    for i in range(EP_GAME_COUNT):
        # 1 게임 실행
        if i % 2 == 0:
            total_point += play(next_actions)
        else:
            total_point += 1 - play(list(reversed(next_actions)))

        # 출력
        print('\rEvaluate {}/{}'.format(i + 1, EP_GAME_COUNT), end='')
    print('')

    # 평균 포인트 계산
    average_point = total_point / EP_GAME_COUNT
    print(label, average_point)


# 베스트 플레이어 평가
def evaluate_best_player():
    # 베스트 플레이어 모델 로드 (PyTorch로 변경)
    model = torch.load('./model/best.pth')
    model.eval()  # 평가 모드로 설정

    # PV MCTS로 행동 선택을 수행하는 함수 생성
    next_pv_mcts_action = pv_mcts_action(model, 0.0)

    # VS 랜덤
    next_actions = (next_pv_mcts_action, random_action)
    evaluate_algorithm_of('VS_Random', next_actions)

    # VS 알파베타법
    next_actions = (next_pv_mcts_action, alpha_beta_action)
    evaluate_algorithm_of('VS_AlphaBeta', next_actions)

    # VS 몬테카를로 트리 탐색
    next_actions = (next_pv_mcts_action, mcts_action)
    evaluate_algorithm_of('VS_MCTS', next_actions)


# 동작 확인
if __name__ == '__main__':
    evaluate_best_player()