In [1]:
import numpy as np
from collections import deque
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

# Игра

In [3]:
# Tic-tac-toe game engine
class TicTacToe:
    """Tic-tac-toe on a square board, played by two player objects.

    Cells hold -1 or 1 for the two sides and 0 when empty.  A game state is
    the pair ``(board, turn)`` with ``turn`` in {-1, 1}; a ``next_turn`` of 0
    means the episode has ended.  Player objects must expose ``name`` and
    ``get_action(state)``.
    """

    def __init__(self, player_1, player_2, board_size=3, win_size=3):
        """Register the players (player_1 moves as -1) and build the win kernel."""
        self.board_size = board_size
        self.win_size = win_size

        self.players = {-1: player_1, 1: player_2}
        self.wins = {player_1.name: 0, player_2.name: 0}

        self._kernel = self._create_kernel()


    # Builds the stack of line masks used to detect wins
    def _create_kernel(self):
        """Return an array of shape (2*win_size + 2, win_size, win_size).

        The first ``win_size`` masks are the rows of a window, the next
        ``win_size`` masks are its columns, and the last two are the main
        diagonal and the anti-diagonal.
        """
        k = self.win_size
        kernel = np.zeros((2 * k + 2, k, k))
        for line in range(k):
            kernel[line, line, :] = 1.0          # horizontal line number `line`
            kernel[k + line, :, line] = 1.0      # vertical line number `line`
        kernel[2 * k] = np.eye(k)
        kernel[2 * k + 1] = np.fliplr(np.eye(k))
        return kernel


    # Checks whether the side `turn` (-1 or 1) has a complete line on the board
    def _test_win(self, state, turn):
        """Return ``-turn`` if `turn` owns a full line somewhere in `state`, else 0.

        Only the truthiness of the result is used by ``play_turn``.
        """
        k = self.win_size
        n_rows, n_cols = state.shape
        # Zero-copy view of every k x k window of the board.
        windows = np.lib.stride_tricks.as_strided(
            state,
            shape=(n_rows - k + 1, n_cols - k + 1, k, k),
            strides=state.strides + state.strides,
            writeable=False,
        )
        # line_sums[s, x, y] = sum of the board cells under mask s in window (x, y).
        line_sums = np.einsum('xyij,sij->sxy', windows, self._kernel)
        has_line = (line_sums == turn * k).any()
        return -turn * has_line.astype(int)


    # Plays several complete episodes
    def play(self, num_games=1, visualize=False):
        """Play `num_games` episodes and return every transition that occurred.

        Each transition is ``(state, action, reward, next_state, done)``.  The
        state is multiplied by the mover's sign and the next state by the
        opposite sign.  After every episode the two players swap sides and
        ``self.wins`` is updated for decisive results.
        """
        transitions = []
        for _ in range(num_games):
            turn = -1
            state = (np.zeros((self.board_size, self.board_size)), turn)  # initial state = (board, turn)
            if visualize:
                self.visualize_state(state, turn)

            game_over = False
            while not game_over:
                board, turn = state
                mover = self.players[turn]
                action = mover.get_action(state)
                next_board, next_turn, reward = self.play_turn(state, action)
                game_over = next_turn == 0
                # (state, action, reward, new_state, done)
                transitions.append((turn * board, action, reward, -turn * next_board, game_over))

                if visualize:
                    self.visualize_state((next_board, next_turn), turn)

                if game_over:
                    if visualize:
                        if reward == 0:
                            print('Ничья!\n')
                        else:
                            print(f'Победа ({self.players[reward * turn].name})!\n')
                    if reward != 0:
                        # reward 1: the mover won; reward -1: the mover lost, so the other side wins.
                        self.wins[self.players[reward * turn].name] += 1
                    # Swap sides for the next episode.
                    self.players = {-1: self.players[1], 1: self.players[-1]}

                state = next_board, next_turn
        return transitions


    # Applies one move and classifies the outcome: illegal move (loss) / win / draw / game goes on
    def play_turn(self, state, action):
        """Return ``(next_board, next_turn, reward)`` for playing `action` in `state`.

        ``next_turn`` is 0 when the game is over.  The reward is -1 for a move
        onto an occupied cell, 1 for a winning move and 0 otherwise.  The input
        board is never modified.
        """
        board, turn = state
        next_board = board.copy()

        # Illegal move: the mover loses immediately.
        if board[action] != 0:
            return next_board, 0, -1

        next_board[action] = turn

        # The mover completed a line.
        if self._test_win(next_board, turn):
            return next_board, 0, 1

        # No empty cell left: draw.
        if not (next_board == 0).any():
            return next_board, 0, 0

        # Otherwise the other side moves next.
        return next_board, -turn, 0


    # Prints the board as it looks after a player's move
    @staticmethod
    def visualize_state(next_state, turn):
        """Print a header for `turn` followed by a text rendering of the board."""
        next_state2d, next_turn = next_state
        print(f"player {turn}'s turn:")
        if (next_state2d == 0).all() and turn == 0:
            print("[invalid state]\n\n")
            return

        # The substitutions are order-sensitive: brackets and decimal points are
        # stripped first, then zeros become dots, -1 becomes X and 1 becomes O.
        substitutions = (
            (".", ""),
            ("[[", ""),
            (" [", ""),
            ("]]", ""),
            ("]", ""),
            ("-0", " ."),
            ("0", "."),
            ("-1", " X"),
            ("1", "O"),
        )
        rendered = str(next_state2d)
        for old, new in substitutions:
            rendered = rendered.replace(old, new)
        print(rendered)


    @staticmethod
    def print_transitions(transitions):
        """Print every transition: next board, 1-based action, reward and done flag."""
        states, actions, rewards, next_states, dones = zip(*transitions)
        records = zip(actions, rewards, next_states, dones)
        for number, (action, reward, next_state, done) in enumerate(records, start=1):
            print("\033[31m{}.".format(number), '\033[30m')
            TicTacToe.visualize_state((next_state, -1), 1)
            print('\naction = ', action + np.array([1, 1]))
            print('reward = ', reward)
            if done:
                print('Игра окончена', end='\n\n')
            else:
                print('Игра продолжается', end='\n\n')

# Игроки

In [4]:
class Human:
    """Console player: asks for moves on stdin as 1-based "row column" pairs."""

    def __init__(self, name='Human'):
        self.name = name

    def get_action(self, state):
        """Prompt until the user enters a legal move and return it 0-based.

        Args:
            state: pair ``(state2d, turn)``; only the 2-D board ``state2d`` is
                used, where 0 marks an empty cell.

        Returns:
            ``(row, col)`` tuple of 0-based indices of an empty cell.

        Malformed input (not exactly two integers) and coordinates outside the
        board are rejected with a message and a new prompt instead of crashing
        or silently wrapping around through negative indexing.
        """
        state2d, turn = state
        n_rows, n_cols = state2d.shape
        while True:
            print('Введите ваш ход (Строка, столбец)')
            try:
                row, col = map(int, input().split())
            except ValueError:
                # Wrong number of tokens or non-integer tokens.
                print('Некорректный ввод!')
                continue
            if not (1 <= row <= n_rows and 1 <= col <= n_cols):
                # Row 0 / column 0 would otherwise index the last row / column.
                print('Некорректный ввод!')
                continue
            if state2d[row - 1, col - 1] != 0:
                print('Клетка занята!')
                continue
            return row - 1, col - 1

In [5]:
# Random player with optional extras:
# 1. If it can win with a single move, it plays that move (win = True)
# 2. If the opponent could win on the next move, it blocks that move (defense = True)
# 3. Otherwise it picks a uniformly random empty cell
class RandomPlus:
    """Random-move player that can optionally finish or block an almost complete line."""

    def __init__(self, board_size=3, win_size=3, name='RandomPlus', win=False, defense=False):
        self.name = name
        self.board_size = board_size
        self.win_size = win_size
        self.win = win
        self.defense = defense

        self._kernel = self._create_kernel()


    # Builds the stack of line masks used to detect potential wins
    def _create_kernel(self):
        """Return masks of shape (2*win_size + 2, win_size, win_size): rows, columns, two diagonals."""
        k = self.win_size
        kernel = np.zeros((2 * k + 2, k, k))
        for line in range(k):
            kernel[line, line, :] = 1.0          # horizontal line number `line`
            kernel[k + line, :, line] = 1.0      # vertical line number `line`
        kernel[2 * k] = np.eye(k)
        kernel[2 * k + 1] = np.fliplr(np.eye(k))
        return kernel


    def _completing_cell(self, state2d, line_sums, side):
        """Return the empty cell that completes a line for `side`, or None.

        A line sum of ``win_size - 1`` (after multiplying by `side`) means the
        line holds ``win_size - 1`` stones of `side` and exactly one empty cell.
        The first such line in (mask, row, column) order is used.
        """
        k = self.win_size
        hits = np.array(np.where(side * line_sums == k - 1))
        if hits.shape[1] == 0:
            return None
        line, top, left = hits[:, 0]
        window = state2d[top: top + k, left: left + k]
        empties = np.where(np.logical_and(self._kernel[line] == 1, window == 0))
        # Convert window-relative coordinates back to board coordinates.
        return tuple(np.array(empties)[:, 0] + [top, left])


    def get_action(self, state):
        """Pick a move for ``state = (state2d, turn)``: win, else block, else random empty cell."""
        state2d, turn = state
        n_rows, n_cols = state2d.shape
        k = self.win_size

        if self.win or self.defense:
            # Zero-copy view of every k x k window of the board.
            windows = np.lib.stride_tricks.as_strided(
                state2d,
                shape=(n_rows - k + 1, n_cols - k + 1, k, k),
                strides=state2d.strides + state2d.strides,
                writeable=False,
            )
            line_sums = np.einsum('xyij,sij->sxy', windows, self._kernel)

            # Own winning move has priority over blocking the opponent.
            sides = []
            if self.win:
                sides.append(turn)
            if self.defense:
                sides.append(-turn)
            for side in sides:
                move = self._completing_cell(state2d, line_sums, side)
                if move is not None:
                    return move

        empty_cells = np.argwhere(state2d == 0)
        return tuple(empty_cells[np.random.randint(len(empty_cells))])

In [6]:
class DQNAgent(nn.Module):
    """Fully convolutional Q-network with an epsilon-greedy policy on top.

    The network maps a board to one Q-value per cell, so the action space is
    the set of board cells and the same weights work for any board size.
    """

    def __init__(self, epsilon=0, name='DQNAgent', masking=False):
        """
        Args:
            epsilon: probability of playing a random empty cell in ``get_action``.
            name: player name used by the game for bookkeeping.
            masking: when True, occupied cells are excluded from the greedy
                choice (intended to be enabled at inference only).
        """
        super().__init__()

        self.name = name
        self.epsilon = epsilon
        self.n_channels = 3
        self.masking = masking    # action masking (ENABLE AT INFERENCE ONLY)

        self.network = nn.Sequential(
            nn.Conv2d(self.n_channels, 256, kernel_size=(3, 3), padding='same'),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=(3, 3), padding='same'),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=(3, 3), padding='same'),
            nn.ReLU(),
            nn.Conv2d(256, 1, kernel_size=(3, 3), padding='same')
        )

    def forward(self, x):
        """Return Q-values of shape [batch, H, W] for boards `x` of shape [batch, H, W].

        The board is one-hot encoded into three planes: cells equal to 1,
        cells equal to -1 and empty cells.
        """
        x = torch.stack([x == 1, x == -1, x == 0], dim=1).float()
        return self.network(x).squeeze(1)

    def greedy_action(self, state, device=device):
        """Return the cell ``(row, col)`` with the highest Q-value.

        The board is multiplied by ``turn`` so that the mover's stones are
        always encoded as 1.  Inference runs under ``torch.no_grad()`` so no
        autograd graph is built for action selection.
        """
        state2d, turn = state
        state_t = torch.FloatTensor(turn * state2d).unsqueeze(0).to(device)
        with torch.no_grad():
            q_values = self.forward(state_t).squeeze(0).cpu().numpy()
        if self.masking:
            # Occupied cells can never be selected.
            q_values[state2d != 0] = -float("Inf")
        return np.unravel_index(q_values.argmax(), q_values.shape)

    def random_action(self, state):
        """Return a uniformly random empty cell of the board."""
        state2d, turn = state
        zero_idxs = np.argwhere(state2d == 0)
        return tuple(zero_idxs[np.random.randint(len(zero_idxs))])

    def get_action(self, state):
        """Epsilon-greedy choice: random empty cell with probability ``epsilon``, else greedy."""
        if random.random() < self.epsilon:
            return self.random_action(state)
        return self.greedy_action(state)

# Буферы

In [7]:
# Plain (uniform) replay buffer
class ReplayBuffer(object):
    """Fixed-size FIFO store of transitions with uniform sampling."""

    def __init__(self, size):
        # A bounded deque silently drops the oldest transition once full.
        self._storage = deque(maxlen=size)

    def __len__(self):
        return len(self._storage)

    def add(self, transition):
        """Append one ``(state, action, reward, next_state, done)`` tuple."""
        self._storage.append(transition)

    @staticmethod
    def _rotate_batch(states, actions, next_states, batch_size):
        """Rotate every sample by its own random multiple of 90 degrees, in place.

        Boards are rotated with ``np.rot90`` over axes (1, 2) and the action
        coordinates are remapped so that they keep pointing at the same cell.
        """
        last = states.shape[-1] - 1
        quarter_turns = np.random.randint(0, 4, size=batch_size)

        # Snapshot of the original coordinates; every rotation maps from these.
        rows, cols = actions[:, 0].copy(), actions[:, 1].copy()
        rotated_cell = {
            1: (last - cols, rows),
            2: (last - rows, last - cols),
            3: (cols, last - rows),
        }
        for k in (1, 2, 3):
            chosen = quarter_turns == k
            states[chosen] = np.rot90(states[chosen], k, axes=(1, 2))
            next_states[chosen] = np.rot90(next_states[chosen], k, axes=(1, 2))
            new_rows, new_cols = rotated_cell[k]
            actions[chosen] = np.column_stack((new_rows[chosen], new_cols[chosen]))

    def sample(self, batch_size, augmentation=False):
        """Draw `batch_size` distinct transitions and return them as five arrays.

        With ``augmentation=True`` each sample is independently rotated by a
        random multiple of 90 degrees.  Two alternative schemes were tried in
        earlier versions of this cell and are not active: one shared rotation
        for the whole batch, and a fourfold enlargement of the batch with all
        four rotations of every sample.
        """
        picked = random.sample(self._storage, batch_size)
        states, actions, rewards, next_states, dones = (np.array(column) for column in zip(*picked))

        if augmentation:
            self._rotate_batch(states, actions, next_states, batch_size)

        return states, actions, rewards, next_states, dones

In [8]:
# =========== Prioritized Replay Buffer With Augmentation ===========
class PrioritizedBuffer(object):
    """Ring buffer of transitions sampled proportionally to ``priority ** prob_alpha``."""

    def __init__(self, capacity, prob_alpha=0.6):
        self.capacity = capacity
        self.prob_alpha = prob_alpha
        self.buffer = []
        self.pos = 0                                               # next slot to write
        self.priorities = np.zeros((capacity,), dtype=np.float32)

    def __len__(self):
        return len(self.buffer)

    def add(self, state, action, reward, next_state, done):
        """Store a transition with the current maximum priority (1.0 for the first one)."""
        max_prio = self.priorities.max() if self.buffer else 1.0

        transition = (state, action, reward, next_state, done)
        if len(self.buffer) < self.capacity:
            self.buffer.append(transition)
        else:
            # Buffer is full: overwrite the oldest slot.
            self.buffer[self.pos] = transition

        self.priorities[self.pos] = max_prio
        self.pos = (self.pos + 1) % self.capacity

    @staticmethod
    def _rotate_batch(states, actions, next_states, batch_size):
        """Rotate every sample by its own random multiple of 90 degrees, in place.

        Boards are rotated with ``np.rot90`` over axes (1, 2) and the action
        coordinates are remapped so that they keep pointing at the same cell.
        """
        last = states.shape[-1] - 1
        quarter_turns = np.random.randint(0, 4, size=batch_size)

        # Snapshot of the original coordinates; every rotation maps from these.
        rows, cols = actions[:, 0].copy(), actions[:, 1].copy()
        rotated_cell = {
            1: (last - cols, rows),
            2: (last - rows, last - cols),
            3: (cols, last - rows),
        }
        for k in (1, 2, 3):
            chosen = quarter_turns == k
            states[chosen] = np.rot90(states[chosen], k, axes=(1, 2))
            next_states[chosen] = np.rot90(next_states[chosen], k, axes=(1, 2))
            new_rows, new_cols = rotated_cell[k]
            actions[chosen] = np.column_stack((new_rows[chosen], new_cols[chosen]))

    def sample(self, batch_size, beta=0.4, augmentation=False):
        """Sample `batch_size` transitions (with replacement) by priority.

        Returns:
            ``(states, actions, rewards, next_states, dones, indices, weights)``
            where ``indices`` are the buffer slots that were drawn and
            ``weights`` are float32 importance weights
            ``(N * p_i) ** (-beta)`` scaled so that the largest equals 1.
        """
        stored = len(self.buffer)
        # Until the ring is full only the first `pos` slots hold real priorities.
        prios = self.priorities if stored == self.capacity else self.priorities[:self.pos]

        probs = prios ** self.prob_alpha
        probs /= probs.sum()

        indices = np.random.choice(stored, batch_size, p=probs)
        samples = [self.buffer[idx] for idx in indices]

        weights = (stored * probs[indices]) ** (-beta)
        weights /= weights.max()
        weights = np.array(weights, dtype=np.float32)

        states, actions, rewards, next_states, dones = (np.array(column) for column in zip(*samples))

        if augmentation:
            self._rotate_batch(states, actions, next_states, batch_size)

        return states, actions, rewards, next_states, dones, indices, weights

    def update_priorities(self, batch_indices, batch_priorities):
        """Overwrite the priority of every listed slot; later duplicates win."""
        for slot, priority in zip(batch_indices, batch_priorities):
            self.priorities[slot] = priority

# Функции и гиперпараметры для обучения

In [9]:
# Seed every random number generator the notebook relies on (Python, NumPy, PyTorch).
seed = 42
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(seed)

In [10]:
# Board side length and number of stones in a row needed to win.
board_size, win_size = 7, 5

In [24]:
# DQN hyperparameters

# --- optimisation ---
batch_size = 128        # 512 was too large
total_steps = 120_000
max_grad_norm = 50
gamma = 0.9

# --- epsilon-greedy exploration schedule ---
init_epsilon = 1
final_epsilon = 0.05     # 0.02 - too small; 0.1 - too small
decay_steps = 110_000

# --- target network synchronisation ---
refresh_target_network_freq = 100    # 1000 - too large, 50 - too small

# --- logging and evaluation ---
loss_freq = 100
eval_freq = 500
n_eval_games = 100

In [12]:
# Online network: the one that is trained and that selects actions.
agent = DQNAgent(init_epsilon).to(device)

# Target network: same architecture, initialised as an exact copy of the online network.
target_network = DQNAgent(init_epsilon).to(device)
target_network.load_state_dict(agent.state_dict())

# Only the online network's parameters are optimised.
optimizer = torch.optim.Adam(agent.parameters(), lr=1e-4)
exp_replay = PrioritizedBuffer(16_000) # alternative: uniform ReplayBuffer(16_000)

In [13]:
# Total number of parameters in the Q-network.
sum(p.numel() for p in agent.parameters())

1189633

In [14]:
# Returns the temporal-difference loss
def compute_td_loss(states, actions, rewards, next_states, dones,
                    agent, target_network, weights=None, indices=None,
                    gamma=0.9, device=device, prioritized=False, replay_buffer=None):
    """Compute the one-step TD loss for a batch of transitions.

    The bootstrap term is subtracted rather than added:
    ``target = reward - (1 - done) * gamma * max_a Q_target(next_state, a)``.

    Args:
        states, next_states: boards, array-like of shape [batch, H, W].
        actions: cell coordinates, array-like of shape [batch, 2].
        rewards: array-like of shape [batch].
        dones: array-like of shape [batch]; truthy where the episode ended.
        agent: online Q-network, maps [batch, H, W] to [batch, H, W].
        target_network: network used for the bootstrap value.
        weights: per-sample importance weights (used when ``prioritized``).
        indices: buffer slots of the samples (used when ``prioritized``).
        gamma: discount factor.
        device: device on which the tensors are created.
        prioritized: when True the loss is importance-weighted and the
            priorities of the sampled slots are updated.
        replay_buffer: buffer whose ``update_priorities`` is called; when
            None the module-level ``exp_replay`` is used, as before.

    Returns:
        Scalar tensor: mean (optionally weighted) squared TD error.
    """
    states = torch.tensor(states, device=device, dtype=torch.float32)                # shape: [batch_size, H, W]
    actions = torch.tensor(actions, device=device, dtype=torch.int64)                # shape: [batch_size, 2]
    rewards = torch.tensor(rewards, device=device, dtype=torch.float32)              # shape: [batch_size]
    next_states = torch.tensor(next_states, device=device, dtype=torch.float32)      # shape: [batch_size, H, W]
    dones = torch.tensor(dones, device=device, dtype=torch.int64)                    # shape: [batch_size]

    predicted_qvalues = agent(states)                                                # shape: [batch_size, H, W]
    batch_index = torch.arange(actions.shape[0], device=device)
    predicted_qvalues_for_actions = predicted_qvalues[batch_index, actions[:, 0], actions[:, 1]]  # shape: [batch_size]

    # The target is a constant with respect to the optimisation, so no
    # autograd graph is needed for the target network's forward pass.
    with torch.no_grad():
        predicted_next_qvalues = target_network(next_states)                         # shape: [batch_size, H, W]
        next_state_values = predicted_next_qvalues.reshape(dones.shape[0], -1).max(dim=1).values
        target_qvalues_for_actions = rewards - (1 - dones) * gamma * next_state_values

    squared_td_error = (predicted_qvalues_for_actions - target_qvalues_for_actions) ** 2

    if prioritized:   # [Prioritized DQN]
        weights = torch.tensor(weights, device=device, dtype=torch.float32)
        weighted_error = weights * squared_td_error
        prios = (weighted_error + 1e-5).detach().cpu().numpy()  # new priorities of the sampled slots
        loss = torch.mean(weighted_error)
        buffer = exp_replay if replay_buffer is None else replay_buffer
        buffer.update_priorities(indices, prios)
        return loss

    return torch.mean(squared_td_error)

# Computes epsilon for the current step
def linear_decay(init_epsilon, final_epsilon, step, decay_steps):
    """Anneal epsilon linearly from `init_epsilon` at step 0, never going below `final_epsilon`."""
    drop = step * (init_epsilon - final_epsilon) / decay_steps
    annealed = init_epsilon - drop
    return max(annealed, final_epsilon)

# Обучение

In [15]:
# Google Colab only: mount Google Drive under /content/drive.
# The checkpoint directory PATH used by the training cells lives on this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [16]:
# Training opponent (takes an immediate win when one exists) and the game that pairs it with the agent.
main_random = RandomPlus(board_size=board_size, win_size=win_size, win=True)
game = TicTacToe(player_1=agent, player_2=main_random, board_size=board_size, win_size=win_size)

In [17]:
# Directory (on the mounted Drive) that receives model and optimizer checkpoints.
PATH = '/content/drive/MyDrive/TicTacToe_8/'

# Training log: most recent loss and the histories of losses and evaluation rewards.
loss = None
loss_values, reward_values = [], []

In [None]:
# Training loop, steps 0 .. total_steps - 1.  Each step t plays one full
# episode; every move and every game result is written to out.txt.
with open('out.txt', 'w') as f:
    for t in range(total_steps):
        print(f't = {t}. Ход {game.players[-1].name}', file=f)

        # Refresh the exploration rate once per episode.  Evaluation below sets
        # epsilon to 0, so it has to be restored before the next episode starts.
        agent.epsilon = linear_decay(init_epsilon, final_epsilon, t, decay_steps)

        state = (np.zeros((board_size, board_size)), -1)  # initial game state: (state_2d, turn)
        turn = next_turn = -1

        while next_turn != 0:
            # NOTE(review): every move, for both sides, is chosen by `agent`; the
            # RandomPlus opponent registered in `game` is never asked for a move.
            # Confirm that this self-play set-up is intended.
            action = agent.get_action(state)
            print(action, file=f)
            next_state_2d, next_turn, reward = game.play_turn(state, action)
            state_2d, turn = state

            if next_turn == 0:
                if reward == 0:
                    print('Ничья!\n', file=f)
                else:
                    print(f'Победа ({game.players[reward * turn].name})!\n', file=f)
                # Swap sides for the next episode.
                game.players = {-1: game.players[1], 1: game.players[-1]}

            # (state, action, reward, new_state, done), each board seen from its mover's side
            exp_replay.add(turn * state_2d, action, reward, next_turn * next_state_2d, next_turn == 0)

            # Train on one mini-batch
            if len(exp_replay) >= batch_size:
                states, actions, rewards, next_states, dones, indices, weights = exp_replay.sample(batch_size, augmentation=True)
                td_loss = compute_td_loss(states, actions, rewards, next_states, dones,
                                          agent, target_network, weights, indices, gamma, prioritized=True)
                td_loss.backward()
                grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
                optimizer.step()
                optimizer.zero_grad()
                # Keep a plain float for logging so no device tensor is retained.
                loss = td_loss.item()

            state = next_state_2d, next_turn

        # Copy the online weights into the target network every refresh_target_network_freq steps
        if t % refresh_target_network_freq == 0:
            target_network.load_state_dict(agent.state_dict())

        # Report the loss at the configured frequency
        if t % loss_freq == 0 and t > 0:
            loss_values.append(loss)
            print(f"t = {str(t):5}\t loss = {loss}\t eps = {round(agent.epsilon, 4)}")

        # Evaluate (greedy agent vs. RandomPlus with win and defence) and checkpoint at the configured frequency
        if t % eval_freq == 0:
            agent.epsilon = 0
            eval_random = RandomPlus(board_size, win_size, win=True, defense=True)
            eval_game = TicTacToe(agent, eval_random, board_size=board_size, win_size=win_size)
            eval_game.play(n_eval_games)
            mean_reward = (eval_game.wins['DQNAgent'] - eval_game.wins['RandomPlus']) / n_eval_games
            reward_values.append(mean_reward)
            print(f"t = {str(t):5}\t reward = {round(mean_reward, 4)}\t{eval_game.wins}\n")

            torch.save(agent.state_dict(), PATH + f'model_{t}')
            torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

# Final checkpoint after the last step
torch.save(agent.state_dict(), PATH + f'model_{t}')
torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

t = 0    	 reward = -1.0	{'DQNAgent': 0, 'RandomPlus': 100}

t = 100  	 loss = 0.0014503456186503172	 eps = 0.9992
t = 200  	 loss = 0.0025754813104867935	 eps = 0.9984
t = 300  	 loss = 0.005845450796186924	 eps = 0.9976
t = 400  	 loss = 0.00960560329258442	 eps = 0.9968
t = 500  	 loss = 0.006231207400560379	 eps = 0.996
t = 500  	 reward = -0.98	{'DQNAgent': 1, 'RandomPlus': 99}

t = 600  	 loss = 0.007771224714815617	 eps = 0.9952
t = 700  	 loss = 0.004004349932074547	 eps = 0.9944
t = 800  	 loss = 0.006096098106354475	 eps = 0.9936
t = 900  	 loss = 0.0032341827172785997	 eps = 0.9928
t = 1000 	 loss = 0.004183352924883366	 eps = 0.992
t = 1000 	 reward = -0.84	{'DQNAgent': 8, 'RandomPlus': 92}

t = 1100 	 loss = 0.0025476980954408646	 eps = 0.9912
t = 1200 	 loss = 0.007859527133405209	 eps = 0.9904
t = 1300 	 loss = 0.0032840995118021965	 eps = 0.9896
t = 1400 	 loss = 0.002489143516868353	 eps = 0.9888
t = 1500 	 loss = 0.0023776970338076353	 eps = 0.988
t = 1500 	 reward = 

KeyboardInterrupt: 

In [None]:
# Resume from the checkpoint written at step 10000.
# map_location makes the checkpoint loadable regardless of the device it was saved from.
agent.load_state_dict(torch.load(f'{PATH}model_10000', map_location=device))
optimizer.load_state_dict(torch.load(f'{PATH}opt_10000', map_location=device))
# Bring the target network in line with the restored online network.
target_network.load_state_dict(agent.state_dict())

main_random = RandomPlus(board_size, win_size, win=True)
game = TicTacToe(agent, main_random, board_size=board_size, win_size=win_size)

In [None]:
# Training loop resumed at step 10100.  Each step t plays one full episode;
# every move and every game result is written to out.txt.
with open('out.txt', 'w') as f:
    for t in range(10_100, total_steps):
        print(f't = {t}. Ход {game.players[-1].name}', file=f)

        # Refresh the exploration rate once per episode.  Evaluation below sets
        # epsilon to 0, so it has to be restored before the next episode starts.
        agent.epsilon = linear_decay(init_epsilon, final_epsilon, t, decay_steps)

        state = (np.zeros((board_size, board_size)), -1)  # initial game state: (state_2d, turn)
        turn = next_turn = -1

        while next_turn != 0:
            # NOTE(review): every move, for both sides, is chosen by `agent`; the
            # RandomPlus opponent registered in `game` is never asked for a move.
            # Confirm that this self-play set-up is intended.
            action = agent.get_action(state)
            print(action, file=f)
            next_state_2d, next_turn, reward = game.play_turn(state, action)
            state_2d, turn = state

            if next_turn == 0:
                if reward == 0:
                    print('Ничья!\n', file=f)
                else:
                    print(f'Победа ({game.players[reward * turn].name})!\n', file=f)
                # Swap sides for the next episode.
                game.players = {-1: game.players[1], 1: game.players[-1]}

            # (state, action, reward, new_state, done), each board seen from its mover's side
            exp_replay.add(turn * state_2d, action, reward, next_turn * next_state_2d, next_turn == 0)

            # Train on one mini-batch
            if len(exp_replay) >= batch_size:
                states, actions, rewards, next_states, dones, indices, weights = exp_replay.sample(batch_size, augmentation=True)
                td_loss = compute_td_loss(states, actions, rewards, next_states, dones,
                                          agent, target_network, weights, indices, gamma, prioritized=True)
                td_loss.backward()
                grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
                optimizer.step()
                optimizer.zero_grad()
                # Keep a plain float for logging so no device tensor is retained.
                loss = td_loss.item()

            state = next_state_2d, next_turn

        # Copy the online weights into the target network every refresh_target_network_freq steps
        if t % refresh_target_network_freq == 0:
            target_network.load_state_dict(agent.state_dict())

        # Report the loss at the configured frequency
        if t % loss_freq == 0 and t > 0:
            loss_values.append(loss)
            print(f"t = {str(t):5}\t loss = {loss}\t eps = {round(agent.epsilon, 4)}")

        # Evaluate (greedy agent vs. RandomPlus with win and defence) and checkpoint at the configured frequency
        if t % eval_freq == 0:
            agent.epsilon = 0
            eval_random = RandomPlus(board_size, win_size, win=True, defense=True)
            eval_game = TicTacToe(agent, eval_random, board_size=board_size, win_size=win_size)
            eval_game.play(n_eval_games)
            mean_reward = (eval_game.wins['DQNAgent'] - eval_game.wins['RandomPlus']) / n_eval_games
            reward_values.append(mean_reward)
            print(f"t = {str(t):5}\t reward = {round(mean_reward, 4)}\t{eval_game.wins}\n")

            torch.save(agent.state_dict(), PATH + f'model_{t}')
            torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

# Final checkpoint after the last step
torch.save(agent.state_dict(), PATH + f'model_{t}')
torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

t = 10100	 loss = 0.015143396332859993	 eps = 0.8653
t = 10200	 loss = 0.012356224469840527	 eps = 0.864
t = 10300	 loss = 0.00405209930613637	 eps = 0.8627
t = 10400	 loss = 0.0061295730993151665	 eps = 0.8613
t = 10500	 loss = 0.00525152450427413	 eps = 0.86
t = 10500	 reward = -0.19	{'DQNAgent': 40, 'RandomPlus': 59}

t = 10600	 loss = 0.004989832639694214	 eps = 0.8587
t = 10700	 loss = 0.00580214150249958	 eps = 0.8573
t = 10800	 loss = 0.005738983396440744	 eps = 0.856
t = 10900	 loss = 0.005302464589476585	 eps = 0.8547
t = 11000	 loss = 0.005145261995494366	 eps = 0.8533
t = 11000	 reward = -0.49	{'DQNAgent': 25, 'RandomPlus': 74}

t = 11100	 loss = 0.004748477600514889	 eps = 0.852
t = 11200	 loss = 0.003772490657866001	 eps = 0.8507
t = 11300	 loss = 0.0035700856242328882	 eps = 0.8493
t = 11400	 loss = 0.005855051800608635	 eps = 0.848
t = 11500	 loss = 0.0056157358922064304	 eps = 0.8467
t = 11500	 reward = -0.24	{'DQNAgent': 38, 'RandomPlus': 62}

t = 11600	 loss = 0.00658

In [None]:
# Resume from the checkpoint written at step 17000.
# map_location makes the checkpoint loadable regardless of the device it was saved from.
agent.load_state_dict(torch.load(f'{PATH}model_17000', map_location=device))
optimizer.load_state_dict(torch.load(f'{PATH}opt_17000', map_location=device))
# Bring the target network in line with the restored online network.
target_network.load_state_dict(agent.state_dict())

main_random = RandomPlus(board_size, win_size, win=True)
game = TicTacToe(agent, main_random, board_size=board_size, win_size=win_size)

In [None]:
# Training loop resumed at step 17100.  Each step t plays one full episode;
# every move and every game result is written to out.txt.
with open('out.txt', 'w') as f:
    for t in range(17_100, total_steps):
        print(f't = {t}. Ход {game.players[-1].name}', file=f)

        # Refresh the exploration rate once per episode.  Evaluation below sets
        # epsilon to 0, so it has to be restored before the next episode starts.
        agent.epsilon = linear_decay(init_epsilon, final_epsilon, t, decay_steps)

        state = (np.zeros((board_size, board_size)), -1)  # initial game state: (state_2d, turn)
        turn = next_turn = -1

        while next_turn != 0:
            # NOTE(review): every move, for both sides, is chosen by `agent`; the
            # RandomPlus opponent registered in `game` is never asked for a move.
            # Confirm that this self-play set-up is intended.
            action = agent.get_action(state)
            print(action, file=f)
            next_state_2d, next_turn, reward = game.play_turn(state, action)
            state_2d, turn = state

            if next_turn == 0:
                if reward == 0:
                    print('Ничья!\n', file=f)
                else:
                    print(f'Победа ({game.players[reward * turn].name})!\n', file=f)
                # Swap sides for the next episode.
                game.players = {-1: game.players[1], 1: game.players[-1]}

            # (state, action, reward, new_state, done), each board seen from its mover's side
            exp_replay.add(turn * state_2d, action, reward, next_turn * next_state_2d, next_turn == 0)

            # Train on one mini-batch
            if len(exp_replay) >= batch_size:
                states, actions, rewards, next_states, dones, indices, weights = exp_replay.sample(batch_size, augmentation=True)
                td_loss = compute_td_loss(states, actions, rewards, next_states, dones,
                                          agent, target_network, weights, indices, gamma, prioritized=True)
                td_loss.backward()
                grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
                optimizer.step()
                optimizer.zero_grad()
                # Keep a plain float for logging so no device tensor is retained.
                loss = td_loss.item()

            state = next_state_2d, next_turn

        # Copy the online weights into the target network every refresh_target_network_freq steps
        if t % refresh_target_network_freq == 0:
            target_network.load_state_dict(agent.state_dict())

        # Report the loss at the configured frequency
        if t % loss_freq == 0 and t > 0:
            loss_values.append(loss)
            print(f"t = {str(t):5}\t loss = {loss}\t eps = {round(agent.epsilon, 4)}")

        # Evaluate (greedy agent vs. RandomPlus with win and defence) and checkpoint at the configured frequency
        if t % eval_freq == 0:
            agent.epsilon = 0
            eval_random = RandomPlus(board_size, win_size, win=True, defense=True)
            eval_game = TicTacToe(agent, eval_random, board_size=board_size, win_size=win_size)
            eval_game.play(n_eval_games)
            mean_reward = (eval_game.wins['DQNAgent'] - eval_game.wins['RandomPlus']) / n_eval_games
            reward_values.append(mean_reward)
            print(f"t = {str(t):5}\t reward = {round(mean_reward, 4)}\t{eval_game.wins}\n")

            torch.save(agent.state_dict(), PATH + f'model_{t}')
            torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

# Final checkpoint after the last step
torch.save(agent.state_dict(), PATH + f'model_{t}')
torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

t = 17100	 loss = None	 eps = 0.772
t = 17200	 loss = 0.013053225353360176	 eps = 0.7707
t = 17300	 loss = 0.0032226815819740295	 eps = 0.7693
t = 17400	 loss = 0.0030869494657963514	 eps = 0.768
t = 17500	 loss = 0.0056057944893836975	 eps = 0.7667
t = 17500	 reward = -0.07	{'DQNAgent': 46, 'RandomPlus': 53}

t = 17600	 loss = 0.003163502085953951	 eps = 0.7653
t = 17700	 loss = 0.005290397442877293	 eps = 0.764
t = 17800	 loss = 0.003680925816297531	 eps = 0.7627
t = 17900	 loss = 0.006656264886260033	 eps = 0.7613
t = 18000	 loss = 0.004775339737534523	 eps = 0.76
t = 18000	 reward = -0.08	{'DQNAgent': 46, 'RandomPlus': 54}

t = 18100	 loss = 0.005879539996385574	 eps = 0.7587
t = 18200	 loss = 0.008892146870493889	 eps = 0.7573
t = 18300	 loss = 0.004732859320938587	 eps = 0.756
t = 18400	 loss = 0.005579778924584389	 eps = 0.7547
t = 18500	 loss = 0.004977567121386528	 eps = 0.7533
t = 18500	 reward = -0.32	{'DQNAgent': 34, 'RandomPlus': 66}

t = 18600	 loss = 0.004779628477990627

In [None]:
# Resume from the checkpoint written at step 39000.
# map_location makes the checkpoint loadable regardless of the device it was saved from.
agent.load_state_dict(torch.load(f'{PATH}model_39000', map_location=device))
optimizer.load_state_dict(torch.load(f'{PATH}opt_39000', map_location=device))
# Bring the target network in line with the restored online network.
target_network.load_state_dict(agent.state_dict())

main_random = RandomPlus(board_size, win_size, win=True)
game = TicTacToe(agent, main_random, board_size=board_size, win_size=win_size)

In [None]:
# Training loop resumed at step 39100.  Each step t plays one full episode;
# every move and every game result is written to out.txt.
with open('out.txt', 'w') as f:
    for t in range(39_100, total_steps):
        print(f't = {t}. Ход {game.players[-1].name}', file=f)

        # Refresh the exploration rate once per episode.  Evaluation below sets
        # epsilon to 0, so it has to be restored before the next episode starts.
        agent.epsilon = linear_decay(init_epsilon, final_epsilon, t, decay_steps)

        state = (np.zeros((board_size, board_size)), -1)  # initial game state: (state_2d, turn)
        turn = next_turn = -1

        while next_turn != 0:
            # NOTE(review): every move, for both sides, is chosen by `agent`; the
            # RandomPlus opponent registered in `game` is never asked for a move.
            # Confirm that this self-play set-up is intended.
            action = agent.get_action(state)
            print(action, file=f)
            next_state_2d, next_turn, reward = game.play_turn(state, action)
            state_2d, turn = state

            if next_turn == 0:
                if reward == 0:
                    print('Ничья!\n', file=f)
                else:
                    print(f'Победа ({game.players[reward * turn].name})!\n', file=f)
                # Swap sides for the next episode.
                game.players = {-1: game.players[1], 1: game.players[-1]}

            # (state, action, reward, new_state, done), each board seen from its mover's side
            exp_replay.add(turn * state_2d, action, reward, next_turn * next_state_2d, next_turn == 0)

            # Train on one mini-batch
            if len(exp_replay) >= batch_size:
                states, actions, rewards, next_states, dones, indices, weights = exp_replay.sample(batch_size, augmentation=True)
                td_loss = compute_td_loss(states, actions, rewards, next_states, dones,
                                          agent, target_network, weights, indices, gamma, prioritized=True)
                td_loss.backward()
                grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
                optimizer.step()
                optimizer.zero_grad()
                # Keep a plain float for logging so no device tensor is retained.
                loss = td_loss.item()

            state = next_state_2d, next_turn

        # Copy the online weights into the target network every refresh_target_network_freq steps
        if t % refresh_target_network_freq == 0:
            target_network.load_state_dict(agent.state_dict())

        # Report the loss at the configured frequency
        if t % loss_freq == 0 and t > 0:
            loss_values.append(loss)
            print(f"t = {str(t):5}\t loss = {loss}\t eps = {round(agent.epsilon, 4)}")

        # Evaluate (greedy agent vs. RandomPlus with win and defence) and checkpoint at the configured frequency
        if t % eval_freq == 0:
            agent.epsilon = 0
            eval_random = RandomPlus(board_size, win_size, win=True, defense=True)
            eval_game = TicTacToe(agent, eval_random, board_size=board_size, win_size=win_size)
            eval_game.play(n_eval_games)
            mean_reward = (eval_game.wins['DQNAgent'] - eval_game.wins['RandomPlus']) / n_eval_games
            reward_values.append(mean_reward)
            print(f"t = {str(t):5}\t reward = {round(mean_reward, 4)}\t{eval_game.wins}\n")

            torch.save(agent.state_dict(), PATH + f'model_{t}')
            torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

# Final checkpoint after the last step
torch.save(agent.state_dict(), PATH + f'model_{t}')
torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

t = 39100	 loss = None	 eps = 0.4787
t = 39200	 loss = 0.0037698177620768547	 eps = 0.4773
t = 39300	 loss = 0.0030763777904212475	 eps = 0.476
t = 39400	 loss = 0.001814821269363165	 eps = 0.4747
t = 39500	 loss = 0.003513412084430456	 eps = 0.4733
t = 39500	 reward = 0.16	{'DQNAgent': 58, 'RandomPlus': 42}

t = 39600	 loss = 0.0028121585492044687	 eps = 0.472
t = 39700	 loss = 0.008978449739515781	 eps = 0.4707
t = 39800	 loss = 0.0027770744636654854	 eps = 0.4693
t = 39900	 loss = 0.004383034072816372	 eps = 0.468
t = 40000	 loss = 0.003285860875621438	 eps = 0.4667
t = 40000	 reward = 0.22	{'DQNAgent': 61, 'RandomPlus': 39}

t = 40100	 loss = 0.003897226881235838	 eps = 0.4653
t = 40200	 loss = 0.004303330089896917	 eps = 0.464
t = 40300	 loss = 0.0052237012423574924	 eps = 0.4627
t = 40400	 loss = 0.004812898114323616	 eps = 0.4613
t = 40500	 loss = 0.005868859589099884	 eps = 0.46
t = 40500	 reward = 0.44	{'DQNAgent': 72, 'RandomPlus': 28}

t = 40600	 loss = 0.005282066762447357	

In [None]:
# Restore the agent and the optimizer from the checkpoint written at step 54000.
# map_location=device lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
agent.load_state_dict(torch.load(f'{PATH}model_54000', map_location=device))
optimizer.load_state_dict(torch.load(f'{PATH}opt_54000', map_location=device))
# Only the agent and the optimizer are checkpointed. Sync the target network with the
# restored weights so TD targets are not computed from unrelated weights until the
# next scheduled refresh.
target_network.load_state_dict(agent.state_dict())

# Training opponent and the game object that seats the agent against it.
main_random = RandomPlus(board_size, win_size, win=True)
game = TicTacToe(agent, main_random, board_size=board_size, win_size=win_size)

In [None]:
# Continue training from game 54_100. Every chosen action and every game result is
# written to out.txt; UTF-8 is requested explicitly because the log text is Cyrillic
# and the platform default encoding may not be able to represent it.
with open('out.txt', 'w', encoding='utf-8') as f:
    for t in range(54_100, total_steps):
        # Each value of t is one complete game; players[-1] is logged as the opening side.
        print(f't = {t}. Ход {game.players[-1].name}', file=f)

        # Initial game state. state = (state_2d, turn)
        state = (np.zeros((board_size, board_size)), -1)
        next_turn = -1

        while next_turn != 0:
            # Fix: read the side to move from the current state *before* looking up
            # the player. Previously `turn` was refreshed only after play_turn(), so
            # from the second move on this lookup used the previous mover's seat.
            state_2d, turn = state
            current_player = game.players[turn]
            if current_player.name == 'DQNAgent':
                agent.epsilon = linear_decay(init_epsilon, final_epsilon, t, decay_steps)

            # NOTE(review): actions for both seats come from `agent`; `current_player`
            # only gates the epsilon update above. Confirm this is intended.
            action = agent.get_action(state)
            print(action, file=f)
            next_state_2d, next_turn, reward = game.play_turn(state, action)

            if next_turn == 0:  # the game is over
                if reward == 0:
                    print('Ничья!\n', file=f)
                else:
                    print(f'Победа ({game.players[reward * turn].name})!\n', file=f)
                # Swap the seats for the next game.
                game.players = {-1: game.players[1], 1: game.players[-1]}

            # state, action, reward, new_state, done
            exp_replay.add(turn * state_2d, action, reward, next_turn * next_state_2d, next_turn == 0)

            # Train on a minibatch once the buffer holds at least batch_size transitions.
            if len(exp_replay) >= batch_size:
                states, actions, rewards, next_states, dones, indices, weights = exp_replay.sample(batch_size, augmentation=True)
                loss = compute_td_loss(states, actions, rewards, next_states, dones,
                                       agent, target_network, weights, indices, gamma, prioritized=True)
                loss.backward()
                grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
                optimizer.step()
                optimizer.zero_grad()

            state = next_state_2d, next_turn

        # Copy the agent's weights into the target network every refresh_target_network_freq games.
        if t % refresh_target_network_freq == 0:
            target_network.load_state_dict(agent.state_dict())

        # Log the loss at the configured frequency.
        if t % loss_freq == 0 and t > 0:
            loss_values.append(loss)
            print(f"t = {str(t):5}\t loss = {loss}\t eps = {round(agent.epsilon, 4)}")

        # Evaluate with epsilon = 0 at the configured frequency, then save a checkpoint.
        if t % eval_freq == 0:
            agent.epsilon = 0
            eval_random = RandomPlus(board_size, win_size, win=True, defense=True)
            eval_game = TicTacToe(agent, eval_random, board_size=board_size, win_size=win_size)
            eval_game.play(n_eval_games)
            mean_reward = (eval_game.wins['DQNAgent'] - eval_game.wins['RandomPlus']) / n_eval_games
            reward_values.append(mean_reward)
            print(f"t = {str(t):5}\t reward = {round(mean_reward, 4)}\t{eval_game.wins}\n")

            torch.save(agent.state_dict(), PATH + f'model_{t}')
            torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

# Save the final agent and optimizer state once the loop has finished.
torch.save(agent.state_dict(), PATH + f'model_{t}')
torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

t = 54100	 loss = None	 eps = 0.2787
t = 54200	 loss = 0.0031566277612000704	 eps = 0.2773
t = 54300	 loss = 0.0027538668364286423	 eps = 0.276
t = 54400	 loss = 0.005372095387428999	 eps = 0.2747
t = 54500	 loss = 0.0034153624437749386	 eps = 0.2733
t = 54500	 reward = 0.5	{'DQNAgent': 75, 'RandomPlus': 25}

t = 54600	 loss = 0.0024666134268045425	 eps = 0.272
t = 54700	 loss = 0.00479389913380146	 eps = 0.2707
t = 54800	 loss = 0.0024015833623707294	 eps = 0.2693
t = 54900	 loss = 0.003967026248574257	 eps = 0.268
t = 55000	 loss = 0.0034500639885663986	 eps = 0.2667
t = 55000	 reward = 0.74	{'DQNAgent': 87, 'RandomPlus': 13}

t = 55100	 loss = 0.003942839335650206	 eps = 0.2653
t = 55200	 loss = 0.004457756876945496	 eps = 0.264
t = 55300	 loss = 0.0038009081035852432	 eps = 0.2627
t = 55400	 loss = 0.00373813696205616	 eps = 0.2613
t = 55500	 loss = 0.0040886541828513145	 eps = 0.26
t = 55500	 reward = 0.28	{'DQNAgent': 64, 'RandomPlus': 36}

t = 55600	 loss = 0.008327452465891838	

KeyboardInterrupt: 

In [18]:
# Restore the agent and the optimizer from the checkpoint written at step 82500.
# map_location=device lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
agent.load_state_dict(torch.load(f'{PATH}model_82500', map_location=device))
optimizer.load_state_dict(torch.load(f'{PATH}opt_82500', map_location=device))
# Only the agent and the optimizer are checkpointed. Sync the target network with the
# restored weights so TD targets are not computed from unrelated weights until the
# next scheduled refresh.
target_network.load_state_dict(agent.state_dict())

# Training opponent and the game object that seats the agent against it.
main_random = RandomPlus(board_size, win_size, win=True)
game = TicTacToe(agent, main_random, board_size=board_size, win_size=win_size)

In [19]:
# Continue training from game 82_600. Every chosen action and every game result is
# written to out.txt; UTF-8 is requested explicitly because the log text is Cyrillic
# and the platform default encoding may not be able to represent it.
with open('out.txt', 'w', encoding='utf-8') as f:
    for t in range(82_600, total_steps):
        # Each value of t is one complete game; players[-1] is logged as the opening side.
        print(f't = {t}. Ход {game.players[-1].name}', file=f)

        # Initial game state. state = (state_2d, turn)
        state = (np.zeros((board_size, board_size)), -1)
        next_turn = -1

        while next_turn != 0:
            # Fix: read the side to move from the current state *before* looking up
            # the player. Previously `turn` was refreshed only after play_turn(), so
            # from the second move on this lookup used the previous mover's seat.
            state_2d, turn = state
            current_player = game.players[turn]
            if current_player.name == 'DQNAgent':
                agent.epsilon = linear_decay(init_epsilon, final_epsilon, t, decay_steps)

            # NOTE(review): actions for both seats come from `agent`; `current_player`
            # only gates the epsilon update above. Confirm this is intended.
            action = agent.get_action(state)
            print(action, file=f)
            next_state_2d, next_turn, reward = game.play_turn(state, action)

            if next_turn == 0:  # the game is over
                if reward == 0:
                    print('Ничья!\n', file=f)
                else:
                    print(f'Победа ({game.players[reward * turn].name})!\n', file=f)
                # Swap the seats for the next game.
                game.players = {-1: game.players[1], 1: game.players[-1]}

            # state, action, reward, new_state, done
            exp_replay.add(turn * state_2d, action, reward, next_turn * next_state_2d, next_turn == 0)

            # Train on a minibatch once the buffer holds at least batch_size transitions.
            if len(exp_replay) >= batch_size:
                states, actions, rewards, next_states, dones, indices, weights = exp_replay.sample(batch_size, augmentation=True)
                loss = compute_td_loss(states, actions, rewards, next_states, dones,
                                       agent, target_network, weights, indices, gamma, prioritized=True)
                loss.backward()
                grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
                optimizer.step()
                optimizer.zero_grad()

            state = next_state_2d, next_turn

        # Copy the agent's weights into the target network every refresh_target_network_freq games.
        if t % refresh_target_network_freq == 0:
            target_network.load_state_dict(agent.state_dict())

        # Log the loss at the configured frequency.
        if t % loss_freq == 0 and t > 0:
            loss_values.append(loss)
            print(f"t = {str(t):5}\t loss = {loss}\t eps = {round(agent.epsilon, 4)}")

        # Evaluate with epsilon = 0 at the configured frequency, then save a checkpoint.
        if t % eval_freq == 0:
            agent.epsilon = 0
            eval_random = RandomPlus(board_size, win_size, win=True, defense=True)
            eval_game = TicTacToe(agent, eval_random, board_size=board_size, win_size=win_size)
            eval_game.play(n_eval_games)
            mean_reward = (eval_game.wins['DQNAgent'] - eval_game.wins['RandomPlus']) / n_eval_games
            reward_values.append(mean_reward)
            print(f"t = {str(t):5}\t reward = {round(mean_reward, 4)}\t{eval_game.wins}\n")

            torch.save(agent.state_dict(), PATH + f'model_{t}')
            torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

# Save the final agent and optimizer state once the loop has finished.
torch.save(agent.state_dict(), PATH + f'model_{t}')
torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

t = 82600	 loss = None	 eps = 0.25
t = 82700	 loss = 0.004096668679267168	 eps = 0.25
t = 82800	 loss = 0.007480150554329157	 eps = 0.25
t = 82900	 loss = 0.002658799523487687	 eps = 0.25
t = 83000	 loss = 0.005371814593672752	 eps = 0.25
t = 83000	 reward = 0.56	{'DQNAgent': 78, 'RandomPlus': 22}

t = 83100	 loss = 0.004077929072082043	 eps = 0.25
t = 83200	 loss = 0.004114334471523762	 eps = 0.25
t = 83300	 loss = 0.005193930119276047	 eps = 0.25
t = 83400	 loss = 0.0041450802236795425	 eps = 0.25
t = 83500	 loss = 0.0034748208709061146	 eps = 0.25
t = 83500	 reward = 0.51	{'DQNAgent': 75, 'RandomPlus': 24}

t = 83600	 loss = 0.003645701799541712	 eps = 0.25
t = 83700	 loss = 0.004883396904915571	 eps = 0.25
t = 83800	 loss = 0.004776002839207649	 eps = 0.25
t = 83900	 loss = 0.00487343966960907	 eps = 0.25
t = 84000	 loss = 0.005662119016051292	 eps = 0.25
t = 84000	 reward = 0.58	{'DQNAgent': 79, 'RandomPlus': 21}

t = 84100	 loss = 0.005373808555305004	 eps = 0.25
t = 84200	 loss 

In [None]:
# Load the best model. map_location=device lets a checkpoint saved on a GPU be
# loaded on a CPU-only machine as well.
agent.load_state_dict(torch.load(f'{PATH}model_95500', map_location=device))

<All keys matched successfully>

In [25]:
# Continue training from game 100_100. Every chosen action and every game result is
# written to out.txt; UTF-8 is requested explicitly because the log text is Cyrillic
# and the platform default encoding may not be able to represent it.
with open('out.txt', 'w', encoding='utf-8') as f:
    for t in range(100_100, total_steps):
        # Each value of t is one complete game; players[-1] is logged as the opening side.
        print(f't = {t}. Ход {game.players[-1].name}', file=f)

        # Initial game state. state = (state_2d, turn)
        state = (np.zeros((board_size, board_size)), -1)
        next_turn = -1

        while next_turn != 0:
            # Fix: read the side to move from the current state *before* looking up
            # the player. Previously `turn` was refreshed only after play_turn(), so
            # from the second move on this lookup used the previous mover's seat.
            state_2d, turn = state
            current_player = game.players[turn]
            if current_player.name == 'DQNAgent':
                agent.epsilon = linear_decay(init_epsilon, final_epsilon, t, decay_steps)

            # NOTE(review): actions for both seats come from `agent`; `current_player`
            # only gates the epsilon update above. Confirm this is intended.
            action = agent.get_action(state)
            print(action, file=f)
            next_state_2d, next_turn, reward = game.play_turn(state, action)

            if next_turn == 0:  # the game is over
                if reward == 0:
                    print('Ничья!\n', file=f)
                else:
                    print(f'Победа ({game.players[reward * turn].name})!\n', file=f)
                # Swap the seats for the next game.
                game.players = {-1: game.players[1], 1: game.players[-1]}

            # state, action, reward, new_state, done
            exp_replay.add(turn * state_2d, action, reward, next_turn * next_state_2d, next_turn == 0)

            # Train on a minibatch once the buffer holds at least batch_size transitions.
            if len(exp_replay) >= batch_size:
                states, actions, rewards, next_states, dones, indices, weights = exp_replay.sample(batch_size, augmentation=True)
                loss = compute_td_loss(states, actions, rewards, next_states, dones,
                                       agent, target_network, weights, indices, gamma, prioritized=True)
                loss.backward()
                grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
                optimizer.step()
                optimizer.zero_grad()

            state = next_state_2d, next_turn

        # Copy the agent's weights into the target network every refresh_target_network_freq games.
        if t % refresh_target_network_freq == 0:
            target_network.load_state_dict(agent.state_dict())

        # Log the loss at the configured frequency.
        if t % loss_freq == 0 and t > 0:
            loss_values.append(loss)
            print(f"t = {str(t):5}\t loss = {loss}\t eps = {round(agent.epsilon, 4)}")

        # Evaluate with epsilon = 0 at the configured frequency, then save a checkpoint.
        if t % eval_freq == 0:
            agent.epsilon = 0
            eval_random = RandomPlus(board_size, win_size, win=True, defense=True)
            eval_game = TicTacToe(agent, eval_random, board_size=board_size, win_size=win_size)
            eval_game.play(n_eval_games)
            mean_reward = (eval_game.wins['DQNAgent'] - eval_game.wins['RandomPlus']) / n_eval_games
            reward_values.append(mean_reward)
            print(f"t = {str(t):5}\t reward = {round(mean_reward, 4)}\t{eval_game.wins}\n")

            torch.save(agent.state_dict(), PATH + f'model_{t}')
            torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

# Save the final agent and optimizer state once the loop has finished.
torch.save(agent.state_dict(), PATH + f'model_{t}')
torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

t = 100100	 loss = 0.019809503108263016	 eps = 0.1355
t = 100200	 loss = 0.01588462106883526	 eps = 0.1346
t = 100300	 loss = 0.010876724496483803	 eps = 0.1338
t = 100400	 loss = 0.011499712243676186	 eps = 0.1329
t = 100500	 loss = 0.02178739570081234	 eps = 0.132
t = 100500	 reward = 0.56	{'DQNAgent': 78, 'RandomPlus': 22}

t = 100600	 loss = 0.0064584193751215935	 eps = 0.1312
t = 100700	 loss = 0.00675962632521987	 eps = 0.1303
t = 100800	 loss = 0.008149629458785057	 eps = 0.1295
t = 100900	 loss = 0.0074499137699604034	 eps = 0.1286
t = 101000	 loss = 0.008473038673400879	 eps = 0.1277
t = 101000	 reward = 0.42	{'DQNAgent': 71, 'RandomPlus': 29}

t = 101100	 loss = 0.006410880945622921	 eps = 0.1269
t = 101200	 loss = 0.004786661826074123	 eps = 0.126
t = 101300	 loss = 0.0055897957645356655	 eps = 0.1251
t = 101400	 loss = 0.005414832383394241	 eps = 0.1243
t = 101500	 loss = 0.005742618348449469	 eps = 0.1234
t = 101500	 reward = 0.6	{'DQNAgent': 80, 'RandomPlus': 20}

t = 101

KeyboardInterrupt: 

In [26]:
# Evaluate the agent with epsilon = 0 over n_eval_games games against a RandomPlus
# opponent created with both win and defense enabled, then show the win counters.
eval_random = RandomPlus(board_size, win_size, defense=True, win=True)
eval_game = TicTacToe(agent, eval_random, win_size=win_size, board_size=board_size)
agent.epsilon = 0
eval_game.play(num_games=n_eval_games)
eval_game.wins

{'DQNAgent': 65, 'RandomPlus': 35}

# Тестирование обученных моделей (инференс с маскированием)

In [None]:
# Directory holding the saved checkpoints. Checkpoint file names are appended to this
# string directly, so the trailing slash is required.
PATH = '/content/drive/MyDrive/TicTacToe_8/'

In [None]:
# Compare the trained models: every checkpoint from step 0 to 70000 (stride 500)
# plays 1000 games against the same RandomPlus opponent, with epsilon = 0 and
# masking switched on.
eval_random = RandomPlus(board_size, win_size, defense=True, win=True)
agent.epsilon = 0
agent.masking = True

for checkpoint_step in range(0, 70_001, 500):
    checkpoint_weights = torch.load(PATH + f'model_{checkpoint_step}', map_location=torch.device('cpu'))
    agent.load_state_dict(checkpoint_weights)
    eval_game = TicTacToe(agent, eval_random, win_size=win_size, board_size=board_size)
    eval_game.play(num_games=1000)
    print(f'{checkpoint_step:5}', eval_game.wins)

In [None]:
# Compare the best models (the ones without losses): each listed checkpoint plays
# 10_000 games against eval_random with epsilon = 0 and masking switched on.
# Checkpoint steps, one row per ten-thousand range.
models = [
    26500, 27000, 28000, 29000,
    30000, 30500, 34500, 36500, 38500, 39500,
    40000, 40500, 41500, 42000, 42500, 43000, 43500, 44500, 45000, 45500, 47500, 48000, 49000,
    50000, 51500, 52000, 52500, 53000, 53500, 54000, 54500, 55000, 55500, 56000, 57000, 59500,
    60000, 60500, 61000, 61500, 62000, 63500, 64000, 64500, 65000, 66000, 66500, 67500, 69000,
]

agent.epsilon = 0
agent.masking = True

for checkpoint_step in models:
    checkpoint_weights = torch.load(PATH + f'model_{checkpoint_step}', map_location=torch.device('cpu'))
    agent.load_state_dict(checkpoint_weights)
    eval_game = TicTacToe(agent, eval_random, win_size=win_size, board_size=board_size)
    eval_game.play(num_games=10_000)
    print(checkpoint_step, eval_game.wins)

In [22]:
# Load the best model; the checkpoint tensors are mapped to the CPU while reading.
agent.load_state_dict(
    torch.load(PATH + 'model_95500', map_location=torch.device('cpu'))
)

<All keys matched successfully>

In [23]:
# Play four visualised games between the agent (epsilon = 0) and a human opponent,
# then show the win counters.
test_game = TicTacToe(agent, Human(), win_size=win_size, board_size=board_size)
agent.epsilon = 0
test_game.play(num_games=4, visualize=True)
test_game.wins

player -1's turn:
. . . . . . .
. . . . . . .
. . . . . . .
. . . . . . .
. . . . . . .
. . . . . . .
. . . . . . .
player -1's turn:
 .  .  .  .  .  .  .
 .  .  .  .  .  .  .
 .  .  .  .  .  .  .
 .  .  .  X  .  .  .
 .  .  .  .  .  .  .
 .  .  .  .  .  .  .
 .  .  .  .  .  .  .
Введите ваш ход (Строка, столбец)
3 3
player 1's turn:
 .  .  .  .  .  .  .
 .  .  .  .  .  .  .
 .  .  O  .  .  .  .
 .  .  .  X  .  .  .
 .  .  .  .  .  .  .
 .  .  .  .  .  .  .
 .  .  .  .  .  .  .
player -1's turn:
 .  .  .  .  .  .  .
 .  .  .  .  .  .  .
 .  .  O  X  .  .  .
 .  .  .  X  .  .  .
 .  .  .  .  .  .  .
 .  .  .  .  .  .  .
 .  .  .  .  .  .  .
Введите ваш ход (Строка, столбец)
2 4
player 1's turn:
 .  .  .  .  .  .  .
 .  .  .  O  .  .  .
 .  .  O  X  .  .  .
 .  .  .  X  .  .  .
 .  .  .  .  .  .  .
 .  .  .  .  .  .  .
 .  .  .  .  .  .  .
player -1's turn:
 .  .  .  .  .  .  .
 .  .  .  O  .  .  .
 .  .  O  X  .  .  .
 .  .  .  X  .  .  .
 .  .  .  X  .  .  .
 .  .  .  .  .  .  .
 .  . 

KeyboardInterrupt: Interrupted by user

# Первый ход за крестики и значения $Q$-функции в начальном состоянии

In [None]:
# Empty board as a batch of one. The shape is built from board_size instead of a
# hard-coded 7 so the cell keeps working if the configured board size changes.
state2d = torch.tensor(np.zeros((1, board_size, board_size))).to(device)

# Agent output for the empty board with the batch dimension removed, as a NumPy array.
q_values = agent(state2d).squeeze(0).detach().cpu().numpy()
# (row, column) of the largest Q-value.
np.unravel_index(q_values.argmax(), q_values.shape)

(3, 3)

In [None]:
# Show the Q-values of the empty board rounded to four decimal places.
np.round(q_values, 4)

array([[-0.0684, -0.064 , -0.0358, -0.0494, -0.0263, -0.0711, -0.0195],
       [-0.0457, -0.0207, -0.0132, -0.0035, -0.0091, -0.029 , -0.0438],
       [-0.0156, -0.0113,  0.0434,  0.0295,  0.0367, -0.0051, -0.0104],
       [-0.0425, -0.0255,  0.0354,  0.0539,  0.0183, -0.0157, -0.0361],
       [-0.0721, -0.0236,  0.0205,  0.0216,  0.0134, -0.0366, -0.0267],
       [-0.0979, -0.0705, -0.0431, -0.0291, -0.044 , -0.0472, -0.0386],
       [-0.0893, -0.0818, -0.0544, -0.0399, -0.0268, -0.0384, -0.0467]],
      dtype=float32)