In [1]:
import numpy as np
from collections import deque
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Pick the compute device once: CUDA when it is available, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cpu')

# Игра

In [3]:
# Tic-tac-toe game
class TicTacToe:
    """Tic-tac-toe environment on a square board with a configurable winning line length.

    Board cells hold -1, 1 or 0 (empty); ``visualize_state`` draws -1 as X and
    1 as O. A state is the pair ``(board, turn)`` where ``turn`` (-1 or 1) is
    the mark of the side to move, and -1 always opens a game.
    """

    def __init__(self, player_1, player_2, board_size=3, win_size=3):
        """Seat two players and precompute the win-detection kernel.

        Args:
            player_1: Object with ``name`` and ``get_action(state)``; initially plays -1.
            player_2: Same interface; initially plays 1.
            board_size: Side length of the square board.
            win_size: Number of equal marks in a line required to win.
        """
        self.players = {-1: player_1, 1: player_2}
        # Win counters are keyed by player name, so they survive the seat swaps done in play().
        self.wins = {player_1.name: 0, player_2.name: 0}
        self.board_size = board_size
        self.win_size = win_size
        self._kernel = self._create_kernel()


    # Builds the convolution kernel used to detect wins
    def _create_kernel(self):
        """Return a (2 * win_size + 2, win_size, win_size) stack of line masks.

        The stack contains one mask per row and per column of a
        win_size x win_size window, followed by the two diagonals.
        """
        size = self.win_size
        kernel = np.zeros((2 * size + 2, size, size))
        for line in range(size):
            kernel[line, line, :] = 1           # row `line`
            kernel[size + line, :, line] = 1    # column `line`
        kernel[2 * size] = np.eye(size)                  # main diagonal
        kernel[2 * size + 1] = np.fliplr(np.eye(size))   # anti-diagonal
        return kernel


    # Checks whether the player `turn` (-1 or 1) has a complete line on the board `state`
    def _test_win(self, state, turn):
        """Return ``-turn`` if `turn` owns a full line somewhere on `state`, else 0.

        Callers in this class only test the result for truthiness.
        """
        size = self.win_size
        n_rows, n_cols = state.shape
        # Zero-copy view of every size x size window of the board.
        windows = np.lib.stride_tricks.as_strided(
            state,
            shape=(n_rows - size + 1, n_cols - size + 1, size, size),
            strides=state.strides * 2,
            writeable=False,
        )
        # line_sums[s, x, y] is the sum of the marks under mask s in window (x, y).
        line_sums = np.einsum('xyij,sij->sxy', windows, self._kernel)
        has_full_line = (line_sums == turn * size).any()
        return -turn * has_full_line.astype(int)


    # Plays several complete episodes
    def play(self, num_games=1, visualize=False):
        """Play `num_games` games between the seated players and return all transitions.

        Each transition is ``(state, action, reward, next_state, done)``; the
        stored state is multiplied by the mover's mark and the stored next state
        by the opposite mark. Win counters in ``self.wins`` are updated, and the
        two seats are swapped after every game.
        """
        transitions = []
        for _ in range(num_games):
            turn = -1
            state = (np.zeros((self.board_size, self.board_size)), turn)  # initial state: (state2d, turn)
            if visualize:
                self.visualize_state(state, turn)

            game_over = False
            while not game_over:
                state_2d, turn = state
                action = self.players[turn].get_action(state)
                next_state_2d, next_turn, reward = self.play_turn(state, action)
                game_over = next_turn == 0
                # state, action, reward, new_state, done
                transitions.append((turn * state_2d, action, reward, -turn * next_state_2d, game_over))
                if visualize:
                    self.visualize_state((next_state_2d, next_turn), turn)

                if game_over:
                    if reward == 0:
                        if visualize:
                            print('Ничья!\n')
                    else:
                        # reward is 1 when the mover won and -1 when the mover lost by an illegal move.
                        winner_name = self.players[reward * turn].name
                        if visualize:
                            print(f'Победа ({winner_name})!\n')
                        self.wins[winner_name] += 1
                    # Swap seats so that the other player opens the next game.
                    self.players = {-1: self.players[1], 1: self.players[-1]}

                state = next_state_2d, next_turn
        return transitions


    # Applies a move and checks for an illegal move (loss) / win / draw
    def play_turn(self, state, action):
        """Apply `action` for the side to move.

        Returns:
            ``(next_state2d, next_turn, reward)``. ``next_turn == 0`` means the
            game is over. Reward is -1 for a move onto an occupied cell (the
            board is returned unchanged), 1 for a winning move and 0 otherwise
            (draw on a full board, or the game continues with ``next_turn == -turn``).
        """
        board, turn = state
        new_board = board.copy()

        # Illegal move: the cell is already taken, so the mover loses.
        if board[action] != 0:
            return new_board, 0, -1

        new_board[action] = turn

        # The mover completed a line.
        if self._test_win(new_board, turn):
            return new_board, 0, 1

        # No empty cells left: draw.
        if not (new_board == 0).any():
            return new_board, 0, 0

        # Otherwise the other side moves next.
        return new_board, -turn, 0


    # Prints the game state after a player's move
    @staticmethod
    def visualize_state(next_state, turn):
        """Print a "player <turn>'s turn:" header followed by the board of `next_state`.

        The board text is produced from NumPy's string form of the array by a
        fixed sequence of substring replacements (0 -> '.', -1 -> 'X', 1 -> 'O').
        """
        board, _ = next_state
        print(f"player {turn}'s turn:")
        if (board == 0).all() and turn == 0:
            print("[invalid state]\n\n")
            return

        # The replacement order matters: e.g. "-0" and "-1" must be handled before "0" and "1".
        replacements = (
            (".", ""),
            ("[[", ""),
            (" [", ""),
            ("]]", ""),
            ("]", ""),
            ("-0", " ."),
            ("0", "."),
            ("-1", " X"),
            ("1", "O"),
        )
        text = str(board)
        for old, new in replacements:
            text = text.replace(old, new)
        print(text)


    @staticmethod
    def print_transitions(transitions):
        """Print every transition: its next-state board, 1-based action, reward and done flag."""
        states, actions, rewards, next_states, dones = zip(*transitions)
        numbered = enumerate(zip(actions, rewards, next_states, dones), start=1)
        for number, (action, reward, next_state, done) in numbered:
            print("\033[31m{}.".format(number), '\033[30m')
            TicTacToe.visualize_state((next_state, -1), 1)
            # Actions are stored 0-based; show them 1-based.
            print('\naction = ', action + np.array([1, 1]), end='\n')
            print('reward = ', reward, end='\n')
            print('Игра окончена' if done else 'Игра продолжается', end='\n\n')

# Игроки

In [4]:
class Human:
    """Console player: reads 1-based "row column" moves from standard input."""

    def __init__(self, name='Human'):
        self.name = name

    def get_action(self, state):
        """Prompt until a legal move is entered and return it as a 0-based ``(row, col)``.

        Input that cannot be parsed as two integers, or that lies outside the
        board, is rejected and the prompt is repeated. Without the range check,
        entering 0 would become index -1 and silently address the last row or
        column. An occupied cell is rejected as well.
        """
        state2d, turn = state
        n_rows, n_cols = state2d.shape
        while True:
            print('Введите ваш ход (Строка, столбец)')
            try:
                row, col = map(int, input().split())
            except ValueError:
                # Not exactly two integers.
                print('Некорректный ввод!')
                continue
            if not (1 <= row <= n_rows and 1 <= col <= n_cols):
                print('Некорректный ввод!')
                continue
            if state2d[row - 1, col - 1] != 0:
                print('Клетка занята!')
                continue
            return row - 1, col - 1

In [5]:
# Random player with optional advantages:
# 1. If it can win with a single move, it does so (win = True)
# 2. If the opponent could win with their next move, it blocks that move (defense = True)
# 3. Otherwise it picks a random move among the empty cells
class RandomPlus:
    """Random opponent with optional one-move "win" and "defense" heuristics."""

    def __init__(self, board_size=3, win_size=3, name='RandomPlus', win=False, defense=False):
        self.name = name
        self.board_size = board_size
        self.win_size = win_size
        self.win = win
        self.defense = defense

        self._kernel = self._create_kernel()


    # Builds the convolution kernel used to find potential wins
    def _create_kernel(self):
        """Return a (2 * win_size + 2, win_size, win_size) stack of row, column and diagonal masks."""
        size = self.win_size
        kernel = np.zeros((2 * size + 2, size, size))
        for line in range(size):
            kernel[line, line, :] = 1           # row `line`
            kernel[size + line, :, line] = 1    # column `line`
        kernel[2 * size] = np.eye(size)
        kernel[2 * size + 1] = np.fliplr(np.eye(size))
        return kernel


    def _completing_move(self, line_sums, board, side):
        """Return the empty cell that completes a line for `side`, or None if there is none.

        A masked line whose marks sum to ``side * (win_size - 1)`` is one move
        short of complete. The first such line (in mask, row, column order) is used.
        """
        size = self.win_size
        hits = np.argwhere(side * line_sums == size - 1)
        if len(hits) == 0:
            return None
        mask_idx, top, left = hits[0]
        window = board[top: top + size, left: left + size]
        gaps = np.argwhere((self._kernel[mask_idx] == 1) & (window == 0))
        # Convert window-relative coordinates to board coordinates.
        return tuple(gaps[0] + [top, left])


    def get_action(self, state):
        """Choose a move for ``state = (board, turn)`` and return it as a ``(row, col)`` tuple."""
        board, turn = state
        size = self.win_size
        n_rows, n_cols = board.shape

        if self.win or self.defense:
            # Zero-copy view of every size x size window of the board.
            windows = np.lib.stride_tricks.as_strided(
                board,
                shape=(n_rows - size + 1, n_cols - size + 1, size, size),
                strides=board.strides * 2,
                writeable=False,
            )
            line_sums = np.einsum('xyij,sij->sxy', windows, self._kernel)

            # Winning takes priority over blocking.
            for enabled, side in ((self.win, turn), (self.defense, -turn)):
                if not enabled:
                    continue
                move = self._completing_move(line_sums, board, side)
                if move is not None:
                    return move

        empty_cells = np.argwhere(board == 0)
        return tuple(empty_cells[np.random.randint(len(empty_cells))])

In [6]:
class DQNAgent(nn.Module):
    """Fully convolutional Q-network plus an epsilon-greedy policy on top of it.

    The network maps a board to one Q-value per cell, so the output has the
    same height and width as the input board.

    Args:
        epsilon: Probability of taking a random legal move in ``get_action``.
        name: Player name (used by the game to key win counters).
        masking: If True, ``greedy_action`` never selects an occupied cell
            (ENABLE ONLY AT INFERENCE).
    """

    def __init__(self, epsilon=0, name='DQNAgent', masking=False):
        super().__init__()

        self.name = name
        self.epsilon = epsilon
        self.n_channels = 3
        self.masking = masking    # Masking (ENABLE ONLY AT INFERENCE)

        self.network = nn.Sequential(
            nn.Conv2d(self.n_channels, 64, kernel_size=(3, 3), padding='same'),
            nn.ReLU(),
            nn.Conv2d(64, 256, kernel_size=(3, 3), padding='same'),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=(3, 3), padding='same'),
            nn.ReLU(),
            nn.Conv2d(256, 1, kernel_size=(3, 3), padding='same')
        )

    def forward(self, x):
        """Return Q-values of shape [batch, H, W] for boards `x` of shape [batch, H, W].

        The board is one-hot encoded into three planes: cells equal to 1,
        cells equal to -1 and empty cells.
        """
        x = torch.stack([x == 1, x == -1, x == 0], dim=1).float()
        return self.network(x).squeeze(1)

    def greedy_action(self, state, device=None):
        """Return the ``(row, col)`` with the highest Q-value for ``state = (board, turn)``.

        The board is multiplied by `turn` before being fed to the network.

        Args:
            state: ``(state2d, turn)`` pair.
            device: Device for the input tensor. Defaults to the device the
                network's parameters currently live on, so the method keeps
                working after the module is moved with ``.to(...)``.
        """
        state2d, turn = state
        if device is None:
            device = next(self.parameters()).device
        state_t = torch.tensor(turn * state2d, dtype=torch.float32, device=device).unsqueeze(0)
        # Action selection needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            q_values = self(state_t).squeeze(0).cpu().numpy()
        if self.masking:
            q_values[state2d != 0] = -float("Inf")
        return np.unravel_index(q_values.argmax(), q_values.shape)

    def random_action(self, state):
        """Return a uniformly random empty cell of the board as a ``(row, col)`` tuple."""
        state2d, turn = state
        zero_idxs = np.argwhere(state2d == 0)
        return tuple(zero_idxs[np.random.randint(len(zero_idxs))])

    def get_action(self, state):
        """Epsilon-greedy policy: random legal move with probability ``epsilon``, else greedy."""
        if random.random() < self.epsilon:
            action = self.random_action(state)
        else:
            action = self.greedy_action(state)
        return action

# Функции и гиперпараметры для обучения

In [7]:
class ReplayBuffer(object):
    """Fixed-capacity FIFO store of transitions with uniform random sampling."""

    def __init__(self, size):
        # A bounded deque drops the oldest transition once `size` is exceeded.
        self._storage = deque(maxlen=size)

    def __len__(self):
        return len(self._storage)

    def add(self, transition):
        """Append one ``(state, action, reward, next_state, done)`` transition."""
        self._storage.append(transition)

    def sample(self, batch_size, augmentation=False):
        """Draw `batch_size` distinct transitions and return them as five NumPy arrays.

        With ``augmentation=True`` every sampled transition is independently
        rotated by a random multiple of 90 degrees: both boards are rotated
        with ``np.rot90`` and the action coordinates are remapped to match.
        The stored transitions themselves are not modified.

        Other augmentation schemes that were tried here (one shared rotation
        for the whole batch; enlarging the batch x4 with all rotations) are
        not used.
        """
        batch = random.sample(self._storage, batch_size)
        states, actions, rewards, next_states, dones = (
            np.array(column) for column in zip(*batch)
        )

        if augmentation:
            last = states.shape[-1] - 1   # largest valid row/column index
            quarter_turns = np.random.randint(0, 4, size=batch_size)

            rows, cols = actions[:, 0].copy(), actions[:, 1].copy()
            # Where cell (row, col) lands after k counter-clockwise quarter turns.
            remapped = {
                1: (last - cols, rows),
                2: (last - rows, last - cols),
                3: (cols, last - rows),
            }
            for k, (new_rows, new_cols) in remapped.items():
                chosen = quarter_turns == k
                states[chosen] = np.rot90(states[chosen], k, axes=(1, 2))
                next_states[chosen] = np.rot90(next_states[chosen], k, axes=(1, 2))
                actions[chosen] = np.column_stack((new_rows[chosen], new_cols[chosen]))

        return states, actions, rewards, next_states, dones

In [8]:
# Seed every random number generator used in this notebook
# (Python's `random`, NumPy and PyTorch) with the same value.
seed = 42
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(seed)

In [9]:
# Board geometry: side length of the square board and the line length needed to win.
board_size, win_size = 5, 4

In [10]:
# Hyperparameters of the DQN method

# --- optimisation ---
batch_size = 128                     # minibatch size; 512 was too many
total_steps = 60_000                 # number of training games
max_grad_norm = 50                   # gradient-norm clipping threshold
gamma = 0.9                          # discount factor

# --- exploration (linear epsilon decay) ---
init_epsilon = 1
final_epsilon = 0.2                  # 0.02 was too small; 0.1 was too small
decay_steps = 40_000

# --- target network / logging / evaluation cadence (in games) ---
refresh_target_network_freq = 100    # 1000 was too many, 50 was too few
loss_freq = 100
eval_freq = 500
n_eval_games = 100

In [11]:
# Online network: the one that acts in the game and is optimised.
agent = DQNAgent(epsilon=init_epsilon).to(device)

# Target network: starts as an exact copy of the online network's weights.
target_network = DQNAgent(epsilon=init_epsilon).to(device)
target_network.load_state_dict(agent.state_dict())

# Replay memory and the optimiser (only the online network's parameters are trained).
exp_replay = ReplayBuffer(size=16_000)
optimizer = torch.optim.Adam(agent.parameters(), lr=1e-4)

In [12]:
# Total number of parameters of the online network.
sum(param.numel() for param in agent.parameters())

741889

In [13]:
# Returns the temporal-difference loss
def compute_td_loss(states, actions, rewards, next_states, dones,
                    agent, target_network, gamma=0.9, device=None):
    """Mean squared TD error of `agent` on one batch of transitions.

    The target is ``reward - (1 - done) * gamma * max_a Q_target(next_state, a)``:
    the best next-state value is subtracted rather than added.

    Args:
        states: Boards before the move, shape [batch, H, W].
        actions: Chosen cells as (row, col) pairs, shape [batch, 2].
        rewards: Immediate rewards, shape [batch].
        next_states: Boards after the move, shape [batch, H, W].
        dones: Episode-finished flags, shape [batch].
        agent: Network being trained; maps [batch, H, W] boards to [batch, H, W] Q-values.
        target_network: Network used to evaluate the next states (receives no gradient).
        gamma: Discount factor.
        device: Device for the batch tensors. Defaults to the device of
            `agent`'s parameters (CPU if it has none).

    Returns:
        Scalar tensor with the loss, differentiable w.r.t. `agent`'s parameters.
    """
    if device is None:
        first_param = next(agent.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    states = torch.tensor(states, device=device, dtype=torch.float32)                # shape: [batch_size, H, W]
    actions = torch.tensor(actions, device=device, dtype=torch.int64)                # shape: [batch_size, 2]
    rewards = torch.tensor(rewards, device=device, dtype=torch.float32)              # shape: [batch_size]
    next_states = torch.tensor(next_states, device=device, dtype=torch.float32)      # shape: [batch_size, H, W]
    dones = torch.tensor(dones, device=device, dtype=torch.int64)                    # shape: [batch_size]

    predicted_qvalues = agent(states)                                                # shape: [batch_size, H, W]
    predicted_qvalues_for_actions = predicted_qvalues[range(len(actions)), actions[:, 0], actions[:, 1]]  # shape: [batch_size]

    # The target is a constant for the optimiser, so it is computed without autograd.
    with torch.no_grad():
        predicted_next_qvalues = target_network(next_states)                         # shape: [batch_size, H, W]
        next_state_values = predicted_next_qvalues.reshape(dones.shape[0], -1).max(dim=1).values
        target_qvalues_for_actions = rewards - (1 - dones) * gamma * next_state_values

    return torch.mean((predicted_qvalues_for_actions - target_qvalues_for_actions) ** 2)  # loss

# Computes epsilon at the current step `step`
def linear_decay(init_epsilon, final_epsilon, step, decay_steps):
    """Linearly anneal from `init_epsilon` to `final_epsilon` over `decay_steps`, then hold.

    The result is never lower than `final_epsilon`.
    """
    annealed = init_epsilon - step * (init_epsilon - final_epsilon) / decay_steps
    return final_epsilon if final_epsilon > annealed else annealed

# Обучение

In [14]:
# Mount Google Drive at /content/drive; the checkpoint directory used below lives under it.
# NOTE(review): presumably `google.colab` is only importable inside the Colab runtime - confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
# Training game: the agent and a RandomPlus player (win heuristic only) are seated.
# RandomPlus is built from the same board_size / win_size as the game, so the
# two cannot drift apart if the configuration cell is changed.
main_random = RandomPlus(board_size, win_size, win=True)
game = TicTacToe(agent, main_random, board_size=board_size, win_size=win_size)

In [16]:
# Directory (on the mounted Google Drive) where checkpoints are written.
PATH = '/content/drive/MyDrive/TicTacToe_6/'

# Training diagnostics: the latest minibatch loss and the logged histories.
loss = None
loss_values, reward_values = [], []

In [None]:
# DQN training loop. Every step `t` plays one complete game, and after each
# move the network is trained on one minibatch from the replay buffer.
# Moves and results are logged to out.txt.
with open('out.txt', 'w', encoding='utf-8') as f:
    for t in range(total_steps):
        print(f't = {t}. Ход {game.players[-1].name}', file=f)

        # Epsilon depends only on `t`, so set it once per game. This also restores
        # exploration after the evaluation block below has set epsilon to 0.
        agent.epsilon = linear_decay(init_epsilon, final_epsilon, t, decay_steps)

        state = (np.zeros((board_size, board_size)), -1)  # Initial game state: state = (state_2d, turn)
        next_turn = -1

        while next_turn != 0:
            # Unpack first, so that `turn` is the mover of the CURRENT state
            # everywhere below (the winner's name in the log depends on it).
            state_2d, turn = state

            # NOTE(review): the agent chooses the move for both seats (self-play);
            # `main_random` is seated in `game` but never asked for a move - confirm this is intended.
            action = agent.get_action(state)
            print(action, file=f)
            next_state_2d, next_turn, reward = game.play_turn(state, action)

            if next_turn == 0:
                if reward == 0:
                    print('Ничья!\n', file=f)
                else:
                    print(f'Победа ({game.players[reward * turn].name})!\n', file=f)
                # Swap seats so that the other player opens the next game.
                game.players = {-1: game.players[1], 1: game.players[-1]}

            # state, action, reward, new_state, done.
            # On a terminal move next_turn is 0, so the stored next state is all zeros.
            exp_replay.add((turn * state_2d, action, reward, next_turn * next_state_2d, next_turn == 0))

            # Train on a minibatch
            if len(exp_replay) >= batch_size:
                states, actions, rewards, next_states, dones = exp_replay.sample(batch_size, augmentation=True)
                loss = compute_td_loss(states, actions, rewards, next_states, dones, agent, target_network, gamma)
                loss.backward()
                grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
                optimizer.step()
                optimizer.zero_grad()

            state = next_state_2d, next_turn

        # Every refresh_target_network_freq games the target network's weights are refreshed
        if t % refresh_target_network_freq == 0:
            target_network.load_state_dict(agent.state_dict())

        # Report the loss at the configured frequency
        if t % loss_freq == 0 and t > 0:
            # NOTE(review): this stores the 0-dim loss tensor itself (or None before the first update).
            loss_values.append(loss)
            print(f"t = {str(t):5}\t loss = {loss}\t eps = {round(agent.epsilon, 4)}")

        # Report the evaluation reward at the configured frequency
        if t % eval_freq == 0:
            agent.epsilon = 0
            eval_random = RandomPlus(board_size, win_size, win=True, defense=True)
            eval_game = TicTacToe(agent, eval_random, board_size=board_size, win_size=win_size)
            eval_game.play(n_eval_games)
            mean_reward = (eval_game.wins['DQNAgent'] - eval_game.wins['RandomPlus']) / n_eval_games
            reward_values.append(mean_reward)
            print(f"t = {str(t):5}\t reward = {round(mean_reward, 4)}\t{eval_game.wins}\n")

            torch.save(agent.state_dict(), PATH + f'model_{t}')
            torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

# Final checkpoint; it is named after the last loop index `t`.
torch.save(agent.state_dict(), PATH + f'model_{t}')
torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

t = 0    	 reward = -1.0	{'DQNAgent': 0, 'RandomPlus': 100}

t = 100  	 loss = 0.0020428646821528673	 eps = 0.9978
t = 200  	 loss = 0.010184163227677345	 eps = 0.9955
t = 300  	 loss = 0.015375171788036823	 eps = 0.9932
t = 400  	 loss = 0.02215087227523327	 eps = 0.991
t = 500  	 loss = 0.034211672842502594	 eps = 0.9888
t = 500  	 reward = -0.46	{'DQNAgent': 27, 'RandomPlus': 73}

t = 600  	 loss = 0.034019533544778824	 eps = 0.9865
t = 700  	 loss = 0.03570473566651344	 eps = 0.9842
t = 800  	 loss = 0.02089115045964718	 eps = 0.982
t = 900  	 loss = 0.03694465011358261	 eps = 0.9798
t = 1000 	 loss = 0.030592478811740875	 eps = 0.9775
t = 1000 	 reward = 0.26	{'DQNAgent': 63, 'RandomPlus': 37}

t = 1100 	 loss = 0.023649821057915688	 eps = 0.9752
t = 1200 	 loss = 0.026075009256601334	 eps = 0.973
t = 1300 	 loss = 0.022903960198163986	 eps = 0.9708
t = 1400 	 loss = 0.020473351702094078	 eps = 0.9685
t = 1500 	 loss = 0.01808428391814232	 eps = 0.9663
t = 1500 	 reward = 0.72	{'D

In [None]:
# Resume training from the checkpoint saved at step 47000.
# map_location makes loading work even when the checkpoint was written on a
# different device than the one used now.
agent.load_state_dict(torch.load(f'{PATH}model_47000', map_location=device))
optimizer.load_state_dict(torch.load(f'{PATH}opt_47000', map_location=device))
# Keep the target network in sync with the restored weights, so the first
# updates after resuming are not computed against stale targets.
target_network.load_state_dict(agent.state_dict())

main_random = RandomPlus(board_size, win_size, win=True)
game = TicTacToe(agent, main_random, board_size=board_size, win_size=win_size)

In [None]:
# Continuation of the DQN training loop for steps 47000..69999. Every step `t`
# plays one complete game, and after each move the network is trained on one
# minibatch from the replay buffer. Moves and results are logged to out.txt.
with open('out.txt', 'w', encoding='utf-8') as f:
    for t in range(47_000, 70_000):
        print(f't = {t}. Ход {game.players[-1].name}', file=f)

        # Epsilon depends only on `t`, so set it once per game. This also restores
        # exploration after the evaluation block below has set epsilon to 0.
        agent.epsilon = linear_decay(init_epsilon, final_epsilon, t, decay_steps)

        state = (np.zeros((board_size, board_size)), -1)  # Initial game state: state = (state_2d, turn)
        next_turn = -1

        while next_turn != 0:
            # Unpack first, so that `turn` is the mover of the CURRENT state
            # everywhere below (the winner's name in the log depends on it).
            state_2d, turn = state

            # NOTE(review): the agent chooses the move for both seats (self-play);
            # `main_random` is seated in `game` but never asked for a move - confirm this is intended.
            action = agent.get_action(state)
            print(action, file=f)
            next_state_2d, next_turn, reward = game.play_turn(state, action)

            if next_turn == 0:
                if reward == 0:
                    print('Ничья!\n', file=f)
                else:
                    print(f'Победа ({game.players[reward * turn].name})!\n', file=f)
                # Swap seats so that the other player opens the next game.
                game.players = {-1: game.players[1], 1: game.players[-1]}

            # state, action, reward, new_state, done.
            # On a terminal move next_turn is 0, so the stored next state is all zeros.
            exp_replay.add((turn * state_2d, action, reward, next_turn * next_state_2d, next_turn == 0))

            # Train on a minibatch (gamma is passed explicitly, so the configured value is used)
            if len(exp_replay) >= batch_size:
                states, actions, rewards, next_states, dones = exp_replay.sample(batch_size, augmentation=True)
                loss = compute_td_loss(states, actions, rewards, next_states, dones, agent, target_network, gamma)
                loss.backward()
                grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
                optimizer.step()
                optimizer.zero_grad()

            state = next_state_2d, next_turn

        # Every refresh_target_network_freq games the target network's weights are refreshed
        if t % refresh_target_network_freq == 0:
            target_network.load_state_dict(agent.state_dict())

        # Report the loss at the configured frequency
        if t % loss_freq == 0 and t > 0:
            # NOTE(review): this stores the 0-dim loss tensor itself (or None before the first update).
            loss_values.append(loss)
            print(f"t = {str(t):5}\t loss = {loss}\t eps = {round(agent.epsilon, 4)}")

        # Report the evaluation reward at the configured frequency
        if t % eval_freq == 0:
            agent.epsilon = 0
            eval_random = RandomPlus(board_size, win_size, win=True, defense=True)
            eval_game = TicTacToe(agent, eval_random, board_size=board_size, win_size=win_size)
            eval_game.play(n_eval_games)
            mean_reward = (eval_game.wins['DQNAgent'] - eval_game.wins['RandomPlus']) / n_eval_games
            reward_values.append(mean_reward)
            print(f"t = {str(t):5}\t reward = {round(mean_reward, 4)}\t{eval_game.wins}\n")

            torch.save(agent.state_dict(), PATH + f'model_{t}')
            torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

# Final checkpoint; it is named after the last loop index `t`.
torch.save(agent.state_dict(), PATH + f'model_{t}')
torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

t = 47000	 loss = None	 eps = 0.2
t = 47000	 reward = 1.0	{'DQNAgent': 100, 'RandomPlus': 0}

t = 47100	 loss = 0.00048288117977790534	 eps = 0.2
t = 47200	 loss = 0.0004975859774276614	 eps = 0.2
t = 47300	 loss = 0.0003183933440595865	 eps = 0.2
t = 47400	 loss = 0.0025374810211360455	 eps = 0.2
t = 47500	 loss = 0.000501373375300318	 eps = 0.2
t = 47500	 reward = 0.98	{'DQNAgent': 98, 'RandomPlus': 0}

t = 47600	 loss = 0.0004328169743530452	 eps = 0.2
t = 47700	 loss = 0.001629895530641079	 eps = 0.2
t = 47800	 loss = 0.000490679289214313	 eps = 0.2
t = 47900	 loss = 0.001265194732695818	 eps = 0.2
t = 48000	 loss = 0.0022540870122611523	 eps = 0.2
t = 48000	 reward = 1.0	{'DQNAgent': 100, 'RandomPlus': 0}

t = 48100	 loss = 0.0005319251213222742	 eps = 0.2
t = 48200	 loss = 0.0009426262113265693	 eps = 0.2
t = 48300	 loss = 0.0005073613720014691	 eps = 0.2
t = 48400	 loss = 0.0010195233626291156	 eps = 0.2
t = 48500	 loss = 0.0007352617103606462	 eps = 0.2
t = 48500	 reward = 1.0	

# Тестирование обученных моделей (инференс с маскированием)

In [None]:
# Directory with the saved checkpoints (model_<step> / opt_<step> files written during training).
PATH = '/content/drive/MyDrive/TicTacToe_6/'

In [None]:
# Comparison of the trained models: every saved checkpoint plays 1000 games
# against RandomPlus (win + defense heuristics), greedily and with move masking.
# The opponent is built from board_size / win_size so it always matches the game.
eval_random = RandomPlus(board_size, win_size, win=True, defense=True)
agent.epsilon = 0
agent.masking = True

for i in range(0, 70_001, 500):
    agent.load_state_dict(torch.load(f'{PATH}model_{i}', map_location=torch.device('cpu')))
    eval_game = TicTacToe(agent, eval_random, board_size=board_size, win_size=win_size)
    eval_game.play(1000)
    print(f'{i:5}', eval_game.wins)

    0 {'DQNAgent': 95, 'RandomPlus': 795}
  500 {'DQNAgent': 543, 'RandomPlus': 398}
 1000 {'DQNAgent': 811, 'RandomPlus': 153}
 1500 {'DQNAgent': 865, 'RandomPlus': 94}
 2000 {'DQNAgent': 863, 'RandomPlus': 100}
 2500 {'DQNAgent': 841, 'RandomPlus': 99}
 3000 {'DQNAgent': 858, 'RandomPlus': 101}
 3500 {'DQNAgent': 823, 'RandomPlus': 127}
 4000 {'DQNAgent': 846, 'RandomPlus': 99}
 4500 {'DQNAgent': 862, 'RandomPlus': 74}
 5000 {'DQNAgent': 919, 'RandomPlus': 55}
 5500 {'DQNAgent': 867, 'RandomPlus': 83}
 6000 {'DQNAgent': 886, 'RandomPlus': 67}
 6500 {'DQNAgent': 895, 'RandomPlus': 63}
 7000 {'DQNAgent': 912, 'RandomPlus': 49}
 7500 {'DQNAgent': 914, 'RandomPlus': 52}
 8000 {'DQNAgent': 928, 'RandomPlus': 37}
 8500 {'DQNAgent': 938, 'RandomPlus': 28}
 9000 {'DQNAgent': 946, 'RandomPlus': 20}
 9500 {'DQNAgent': 934, 'RandomPlus': 29}
10000 {'DQNAgent': 935, 'RandomPlus': 28}
10500 {'DQNAgent': 957, 'RandomPlus': 14}
11000 {'DQNAgent': 949, 'RandomPlus': 18}
11500 {'DQNAgent': 948, 'Rand

In [21]:
# Comparison of the best models (no losses): 10,000 games each against `eval_random`.
# NOTE(review): 32000 is listed twice, so that checkpoint is evaluated twice.
models = [21500, 25000, 25500, 27500, 30500, 31500, 32000, 32000, 34500, 36000,
          37500, 41500, 42000, 43500, 44000, 44500, 47000, 48000, 49500, 51000,
          51500, 52000, 52500, 54000, 55000, 55500, 56500, 58000, 59500, 61000,
          61500, 64500, 65000, 65500, 66000, 66500, 67500, 68000, 69500, 70000]

# Greedy play with occupied cells masked out.
agent.masking = True
agent.epsilon = 0

for checkpoint_step in models:
    weights = torch.load(f'{PATH}model_{checkpoint_step}', map_location=torch.device('cpu'))
    agent.load_state_dict(weights)
    eval_game = TicTacToe(agent, eval_random, board_size=board_size, win_size=win_size)
    eval_game.play(10_000)
    print(checkpoint_step, eval_game.wins)

21500 {'DQNAgent': 9825, 'RandomPlus': 19}
25000 {'DQNAgent': 9887, 'RandomPlus': 18}
25500 {'DQNAgent': 9858, 'RandomPlus': 25}
27500 {'DQNAgent': 9943, 'RandomPlus': 7}
30500 {'DQNAgent': 9944, 'RandomPlus': 13}
31500 {'DQNAgent': 9951, 'RandomPlus': 11}
32000 {'DQNAgent': 9933, 'RandomPlus': 14}
32000 {'DQNAgent': 9940, 'RandomPlus': 9}
34500 {'DQNAgent': 9943, 'RandomPlus': 16}
36000 {'DQNAgent': 9948, 'RandomPlus': 11}
37500 {'DQNAgent': 9913, 'RandomPlus': 17}
41500 {'DQNAgent': 9971, 'RandomPlus': 5}
42000 {'DQNAgent': 9960, 'RandomPlus': 6}
43500 {'DQNAgent': 9867, 'RandomPlus': 30}
44000 {'DQNAgent': 9947, 'RandomPlus': 19}
44500 {'DQNAgent': 9961, 'RandomPlus': 10}
47000 {'DQNAgent': 9961, 'RandomPlus': 6}
48000 {'DQNAgent': 9931, 'RandomPlus': 11}
49500 {'DQNAgent': 9949, 'RandomPlus': 8}
51000 {'DQNAgent': 9940, 'RandomPlus': 9}
51500 {'DQNAgent': 9961, 'RandomPlus': 5}
52000 {'DQNAgent': 9964, 'RandomPlus': 3}
52500 {'DQNAgent': 9921, 'RandomPlus': 8}
54000 {'DQNAgent': 99

In [24]:
# Load the best model (the checkpoint saved at step 68000)
agent.load_state_dict(torch.load(f'{PATH}model_68000', map_location=torch.device('cpu')))

<All keys matched successfully>

In [26]:
# Four visualised games between the greedy agent and a human at the console;
# the cell's output is the final win tally.
agent.epsilon = 0
test_game = TicTacToe(player_1=agent, player_2=Human(), board_size=board_size, win_size=win_size)
test_game.play(num_games=4, visualize=True)
test_game.wins

player -1's turn:
. . . . .
. . . . .
. . . . .
. . . . .
. . . . .
player -1's turn:
 .  .  .  .  .
 .  .  .  .  .
 .  .  X  .  .
 .  .  .  .  .
 .  .  .  .  .
Введите ваш ход (Строка, столбец)
2 4
player 1's turn:
 .  .  .  .  .
 .  .  .  O  .
 .  .  X  .  .
 .  .  .  .  .
 .  .  .  .  .
player -1's turn:
 .  .  .  .  .
 .  .  X  O  .
 .  .  X  .  .
 .  .  .  .  .
 .  .  .  .  .
Введите ваш ход (Строка, столбец)
4 3
player 1's turn:
 .  .  .  .  .
 .  .  X  O  .
 .  .  X  .  .
 .  .  O  .  .
 .  .  .  .  .
player -1's turn:
 .  .  .  .  .
 .  .  X  O  .
 .  .  X  .  .
 .  .  O  .  X
 .  .  .  .  .
Введите ваш ход (Строка, столбец)
3 4
player 1's turn:
 .  .  .  .  .
 .  .  X  O  .
 .  .  X  O  .
 .  .  O  .  X
 .  .  .  .  .
player -1's turn:
 .  .  .  .  .
 .  .  X  O  .
 .  .  X  O  .
 .  .  O  X  X
 .  .  .  .  .
Введите ваш ход (Строка, столбец)
2 5
player 1's turn:
 .  .  .  .  .
 .  .  X  O  O
 .  .  X  O  .
 .  .  O  X  X
 .  .  .  .  .
player -1's turn:
 .  .  .  .  .
 .  .  

{'DQNAgent': 2, 'Human': 0}

# Первый ход за крестики и значения $Q$-функции в начальном состоянии

In [27]:
# Empty 5x5 board with a leading batch dimension of 1.
state2d = torch.tensor(np.zeros((1, 5, 5), dtype=int)).to(device)

# Q-value of every cell for the empty board; the cell's output is the argmax cell.
with torch.no_grad():
    q_values = agent(state2d).squeeze(0).cpu().numpy()
np.unravel_index(q_values.argmax(), q_values.shape)

(2, 2)

In [28]:
# Display the full Q-value map computed for the empty board in the previous cell.
q_values

array([[-0.03003811,  0.02076708, -0.02016265,  0.0177759 , -0.02678831],
       [ 0.01136995,  0.014121  ,  0.00581902,  0.02358159,  0.02361256],
       [-0.02887448,  0.01612367,  0.04173366,  0.02498521, -0.0246248 ],
       [ 0.02923507,  0.01261448,  0.03011223,  0.02426082,  0.01718375],
       [-0.04016344,  0.0072506 , -0.02590851,  0.01938777, -0.02558664]],
      dtype=float32)