In [None]:
import numpy as np
from collections import deque
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cpu')

# Игра

In [None]:
class TicTacToe:
    """Tic-tac-toe on a square board where `win_size` stones in a line win.

    A game state is the pair ``(board, turn)``: ``board`` is a 2-D float array
    holding -1 / 0 / 1 and ``turn`` is the side to move (-1 or 1).
    A ``next_turn`` of 0 marks a finished game.
    Players are any objects with a ``name`` attribute and a ``get_action(state)`` method.
    """

    def __init__(self, player_1, player_2, board_size=3, win_size=3):
        # player_1 plays the -1 stones and therefore opens the first game.
        self.players = {-1: player_1, 1: player_2}
        self.wins = {player_1.name: 0, player_2.name: 0}

        self.board_size = board_size
        self.win_size = win_size
        self._kernel = self._create_kernel()

    def _create_kernel(self):
        """Build the stack of line masks used to detect wins.

        Returns an array of shape (2 * win_size + 2, win_size, win_size):
        one mask per row, one per column, then the two diagonals.
        """
        size = self.win_size
        kernel = np.zeros((2 * size + 2, size, size))
        for line in range(size):
            kernel[line, line, :] = 1            # horizontal line number `line`
            kernel[size + line, :, line] = 1     # vertical line number `line`
        kernel[2 * size] = np.eye(size)                     # main diagonal
        kernel[2 * size + 1] = np.fliplr(np.eye(size))      # anti-diagonal
        return kernel

    def _test_win(self, state, turn):
        """Check whether the side `turn` (-1 or 1) has a complete line on board `state`.

        Returns ``-turn`` (truthy) when a winning line exists and 0 otherwise.
        """
        n_rows, n_cols = state.shape
        size = self.win_size
        # Read-only view holding every size x size window of the board.
        windows = np.lib.stride_tricks.as_strided(
            state,
            shape=(n_rows - size + 1, n_cols - size + 1, size, size),
            strides=state.strides * 2,
            writeable=False,
        )
        # line_sums[s, x, y] = sum of the board cells under mask s in window (x, y).
        line_sums = np.einsum('xyij,sij->sxy', windows, self._kernel)
        has_line = (line_sums == turn * size).any()
        return -turn * has_line.astype(int)

    def play(self, num_games=1, visualize=False):
        """Play `num_games` complete games and return all recorded transitions.

        Each transition is ``(state, action, reward, next_state, done)``; the boards
        are multiplied by ``turn`` / ``-turn`` before being stored. After every game
        the two players swap sides, and `self.wins` is updated for decisive games.
        """
        transitions = []
        for _ in range(num_games):
            # Initial state of a game: empty board, the -1 side moves first.
            state = (np.zeros((self.board_size, self.board_size)), -1)
            if visualize:
                self.visualize_state(state, -1)

            game_over = False
            while not game_over:
                board, turn = state
                action = self.players[turn].get_action(state)
                next_board, next_turn, reward = self.play_turn(state, action)
                game_over = next_turn == 0
                # state, action, reward, new_state, done
                transitions.append((turn * board, action, reward, -turn * next_board, game_over))

                if visualize:
                    self.visualize_state((next_board, next_turn), turn)

                if game_over:
                    if reward == 0:
                        if visualize:
                            print('Ничья!\n')
                    else:
                        # reward == 1: the mover won; reward == -1: the mover lost by an illegal move.
                        winner = self.players[reward * turn].name
                        if visualize:
                            print(f'Победа ({winner})!\n')
                        self.wins[winner] += 1
                    # Swap sides for the next game.
                    self.players = {-1: self.players[1], 1: self.players[-1]}

                state = (next_board, next_turn)
        return transitions

    def play_turn(self, state, action):
        """Apply `action` to `state` and classify the outcome.

        Returns ``(next_board, next_turn, reward)`` where ``next_turn == 0`` means the
        game is over. The reward is from the mover's point of view: -1 for a move onto
        an occupied cell, 1 for a win, 0 for a draw or an unfinished game.
        """
        board, turn = state
        next_board = board.copy()

        # Illegal move: the cell is already taken, the mover loses immediately.
        if next_board[action] != 0:
            return next_board, 0, -1

        next_board[action] = turn

        # The mover completed a line.
        if self._test_win(next_board, turn):
            return next_board, 0, 1

        # No empty cell left: draw.
        if not (next_board == 0).any():
            return next_board, 0, 0

        # Otherwise the other side moves next.
        return next_board, -turn, 0

    @staticmethod
    def visualize_state(next_state, turn):
        """Print the board of `next_state` reached after a move of side `turn`."""
        next_board, _ = next_state
        print(f"player {turn}'s turn:")
        if (next_board == 0).all() and turn == 0:
            print("[invalid state]\n\n")
            return

        # Applied in order: strip numpy's brackets and decimal points, then map
        # 0 -> '.', -1 -> 'X', 1 -> 'O'.
        substitutions = (
            (".", ""),
            ("[[", ""),
            (" [", ""),
            ("]]", ""),
            ("]", ""),
            ("-0", " ."),
            ("0", "."),
            ("-1", " X"),
            ("1", "O"),
        )
        text = str(next_board)
        for old, new in substitutions:
            text = text.replace(old, new)
        print(text)

    @staticmethod
    def print_transitions(transitions):
        """Pretty-print transitions: resulting board, 1-based action, reward and done flag."""
        states, actions, rewards, next_states, dones = zip(*transitions)
        records = zip(actions, rewards, next_states, dones)
        for number, (action, reward, next_board, done) in enumerate(records, start=1):
            print("\033[31m{}.".format(number), '\033[30m')
            TicTacToe.visualize_state((next_board, -1), 1)
            print('\naction = ', action + np.array([1, 1]))
            print('reward = ', reward)
            if done:
                print('Игра окончена', end='\n\n')
            else:
                print('Игра продолжается', end='\n\n')

# Игроки

In [None]:
class Human:
    """Interactive player that reads its moves from standard input."""

    def __init__(self, name='Human'):
        self.name = name

    def get_action(self, state):
        """Ask for a move until a legal one is entered.

        Parameters
        ----------
        state : tuple
            ``(state2d, turn)``; only the 2-D board ``state2d`` is used.

        Returns
        -------
        tuple of int
            Zero-based ``(row, col)`` of an empty cell. The user types the
            coordinates one-based, separated by whitespace.
        """
        state2d, turn = state
        n_rows, n_cols = state2d.shape
        while True:
            print('Введите ваш ход (Строка, столбец)')
            try:
                row, col = map(int, input().split())
            except ValueError:
                # Not exactly two integers: ask again instead of crashing.
                print('Некорректный ввод!')
                continue
            # Without this check 0 would silently wrap around to the last row/column
            # (index -1) and values above the board size would raise IndexError.
            if not (1 <= row <= n_rows and 1 <= col <= n_cols):
                print('Некорректный ввод!')
                continue
            if state2d[row - 1, col - 1] != 0:
                print('Клетка занята!')
                continue
            return row - 1, col - 1

In [None]:
class RandomPlus:
    """Random player with optional heuristics, tried in this order:

    1. ``win``       - complete an own line that has win_size - 1 stones and one empty cell;
    2. ``defense``   - fill the empty cell of such a line belonging to the opponent;
    3. ``win_2``     - same as 1, but for lines of length win_size - 1 (win_size - 2 stones);
    4. ``defense_2`` - same as 2, but for lines of length win_size - 1;
    5. otherwise pick a uniformly random empty cell.
    """

    def __init__(self, board_size=3, win_size=3, name='RandomPlus',
                 win=False, defense=False, win_2=False, defense_2=False):
        self.name = name
        self.board_size = board_size
        self.win_size = win_size

        self.win, self.defense = win, defense
        self.win_2, self.defense_2 = win_2, defense_2

        # Line masks are only built for the heuristics that are switched on.
        if win or defense:
            self._kernel = self._create_kernel(win_size)
        if win_2 or defense_2:
            self._kernel_2 = self._create_kernel(win_size - 1)

    def _create_kernel(self, win_size):
        """Return the (2 * win_size + 2, win_size, win_size) stack of row, column and diagonal masks."""
        kernel = np.zeros((2 * win_size + 2, win_size, win_size))
        for line in range(win_size):
            kernel[line, line, :] = 1                # horizontal line
            kernel[win_size + line, :, line] = 1     # vertical line
        kernel[2 * win_size] = np.eye(win_size)
        kernel[2 * win_size + 1] = np.fliplr(np.eye(win_size))
        return kernel

    @staticmethod
    def _line_sums(board, kernel, size, rows, cols):
        """Sum the board under every mask of `kernel` for every size x size window."""
        windows = np.lib.stride_tricks.as_strided(
            board,
            shape=(rows - size + 1, cols - size + 1, size, size),
            strides=board.strides * 2,
            writeable=False,
        )
        return np.einsum('xyij,sij->sxy', windows, kernel)

    @staticmethod
    def _pick_completion(board, kernel, signed_sums, size):
        """Pick a random line whose signed sum is size - 1 and return its empty cell, or None."""
        hits = np.array(np.where(signed_sums == size - 1))
        n_hits = hits.shape[1]
        if n_hits == 0:
            return None
        mask_id, top, left = hits[:, np.random.randint(0, n_hits)]
        window = board[top: top + size, left: left + size]
        free = np.where(np.logical_and(kernel[mask_id] == 1, window == 0))
        # Convert window-local coordinates back to board coordinates.
        return tuple(np.array(free)[:, 0] + [top, left])

    def get_action(self, state):
        """Return the (row, col) move for `state` = (board, turn)."""
        board, turn = state
        rows, cols = board.shape

        stages = (
            (self.win, self.defense, self.win_size, '_kernel'),
            (self.win_2, self.defense_2, self.win_size - 1, '_kernel_2'),
        )
        for attack, defend, size, kernel_name in stages:
            if not (attack or defend):
                continue
            kernel = getattr(self, kernel_name)
            sums = self._line_sums(board, kernel, size, rows, cols)
            # Own lines are examined first (sign = turn), then the opponent's (sign = -turn).
            for enabled, sign in ((attack, turn), (defend, -turn)):
                if not enabled:
                    continue
                move = self._pick_completion(board, kernel, sign * sums, size)
                if move is not None:
                    return move

        # No heuristic fired: uniformly random empty cell.
        empty_cells = np.argwhere(board == 0)
        return tuple(empty_cells[np.random.randint(len(empty_cells))])

In [None]:
class DQNAgent(nn.Module):
    """Fully convolutional Q-network that doubles as an epsilon-greedy player.

    The network maps a batch of boards to one Q-value per cell, so the output
    has the same spatial size as the input board.

    Parameters
    ----------
    epsilon : float
        Probability of playing a random empty cell instead of the greedy move.
    name : str
        Player name used by the game for win statistics.
    masking : bool
        When True, occupied cells are excluded from the greedy choice
        (ENABLE ONLY AT INFERENCE).
    """

    def __init__(self, epsilon=0, name='DQNAgent', masking=False):
        super().__init__()

        self.name = name
        self.epsilon = epsilon
        self.n_channels = 3
        self.masking = masking    # Masking of occupied cells (ENABLE ONLY AT INFERENCE)

        self.network = nn.Sequential(
            nn.Conv2d(self.n_channels, 128, kernel_size=(3, 3), padding='same'),
            nn.ReLU(),
            nn.Conv2d(128, 256, kernel_size=(3, 3), padding='same'),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=(3, 3), padding='same'),
            nn.ReLU(),
            nn.Conv2d(256, 128, kernel_size=(3, 3), padding='same'),
            nn.ReLU(),
            nn.Conv2d(128, 1, kernel_size=(3, 3), padding='same')
        )

    def forward(self, x):
        """Return per-cell Q-values for a batch of boards `x` of shape [batch, H, W].

        The board is one-hot encoded into three planes (cells equal to 1, -1 and 0)
        before it is fed to the convolutions; the output has shape [batch, H, W].
        """
        x = torch.stack([x == 1, x == -1, x == 0], dim=1).float()
        return self.network(x).squeeze(1)

    def greedy_action(self, state, device=device):
        """Return the (row, col) index of the cell with the largest Q-value.

        `state` is ``(state2d, turn)``; the board is multiplied by `turn` before it
        is evaluated, so the network always sees the mover's stones as +1.
        """
        state2d, turn = state
        state_t = torch.FloatTensor(turn * state2d).unsqueeze(0).to(device)
        # Action selection never needs gradients: skip building the autograd graph.
        with torch.no_grad():
            q_values = self.forward(state_t).squeeze(0).cpu().numpy()
        if self.masking:
            q_values[state2d != 0] = -float("Inf")
        return np.unravel_index(q_values.argmax(), q_values.shape)

    def random_action(self, state):
        """Return a uniformly random empty cell of the board as a (row, col) tuple."""
        state2d, turn = state
        zero_idxs = np.argwhere(state2d == 0)
        return tuple(zero_idxs[np.random.randint(len(zero_idxs))])

    def get_action(self, state):
        """Epsilon-greedy move: random with probability `self.epsilon`, greedy otherwise."""
        if random.random() < self.epsilon:
            return self.random_action(state)
        return self.greedy_action(state)

# Буферы

In [None]:
class ReplayBuffer(object):
    """Plain FIFO replay buffer with uniform sampling.

    Holds at most `size` transitions; once full, adding a transition drops the oldest one.
    """

    def __init__(self, size):
        self._storage = deque(maxlen=size)

    def __len__(self):
        return len(self._storage)

    def add(self, transition):
        """Store one transition ``(state, action, reward, next_state, done)``."""
        self._storage.append(transition)

    def sample(self, batch_size, augmentation=False):
        """Draw `batch_size` distinct transitions uniformly at random.

        Returns the arrays ``states, actions, rewards, next_states, dones``.
        With `augmentation` every sampled transition is independently rotated by a
        random multiple of 90 degrees; states, next states and the (row, col)
        action are rotated together.
        """
        drawn = random.sample(self._storage, batch_size)
        states, actions, rewards, next_states, dones = (np.array(column) for column in zip(*drawn))

        if augmentation:
            # Per-element augmentation. Alternatives that were tried here before:
            # one common rotation for the whole batch, and a 4x enlargement of the
            # batch with all four rotations of every element.
            last = states.shape[-1] - 1
            quarter_turns = np.random.randint(0, 4, size=batch_size)

            # Where cell (r, c) lands after np.rot90 with k = 1, 2, 3.
            moved_to = {
                1: lambda r, c: (last - c, r),
                2: lambda r, c: (last - r, last - c),
                3: lambda r, c: (c, last - r),
            }
            for k in (1, 2, 3):
                picked = quarter_turns == k
                states[picked] = np.rot90(states[picked], k, axes=(1, 2))
                next_states[picked] = np.rot90(next_states[picked], k, axes=(1, 2))
                r, c = actions[picked, 0], actions[picked, 1]
                actions[picked] = np.column_stack(moved_to[k](r, c))

        return states, actions, rewards, next_states, dones

In [None]:
# =========== Prioritized Replay Buffer With Augmentation ===========
class PrioritizedBuffer(object):
    """Ring buffer that samples transitions proportionally to priority ** prob_alpha."""

    def __init__(self, capacity, prob_alpha=0.6):
        self.prob_alpha = prob_alpha
        self.capacity = capacity
        self.buffer = []
        self.pos = 0          # slot that the next add() writes to
        self.priorities = np.zeros((capacity,), dtype=np.float32)

    def add(self, state, action, reward, next_state, done):
        """Store a transition with the current maximum priority (1.0 for an empty buffer)."""
        if self.buffer:
            top_priority = self.priorities.max()
        else:
            top_priority = 1.0

        entry = (state, action, reward, next_state, done)
        if len(self.buffer) < self.capacity:
            self.buffer.append(entry)
        else:
            # Buffer is full: overwrite the slot at the write position.
            self.buffer[self.pos] = entry

        self.priorities[self.pos] = top_priority
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size, beta=0.4, augmentation=False):
        """Sample `batch_size` transitions (with replacement) by priority.

        Returns ``states, actions, rewards, next_states, dones, indices, weights``.
        ``indices`` are buffer positions to pass to `update_priorities`; ``weights``
        are float32 importance-sampling weights ``(N * p) ** -beta`` scaled so that
        the largest equals 1. With `augmentation` each sample is independently
        rotated by a random multiple of 90 degrees together with its action.
        """
        stored = len(self.buffer)
        is_full = stored == self.capacity
        active_priorities = self.priorities if is_full else self.priorities[:self.pos]

        scaled = active_priorities ** self.prob_alpha
        probs = scaled / scaled.sum()

        indices = np.random.choice(stored, batch_size, p=probs)
        drawn = [self.buffer[position] for position in indices]

        raw_weights = (stored * probs[indices]) ** (-beta)
        weights = (raw_weights / raw_weights.max()).astype(np.float32)

        states, actions, rewards, next_states, dones = (np.array(column) for column in zip(*drawn))

        if augmentation:
            last = states.shape[-1] - 1
            quarter_turns = np.random.randint(0, 4, size=batch_size)

            # Where cell (r, c) lands after np.rot90 with k = 1, 2, 3.
            moved_to = {
                1: lambda r, c: (last - c, r),
                2: lambda r, c: (last - r, last - c),
                3: lambda r, c: (c, last - r),
            }
            for k in (1, 2, 3):
                picked = quarter_turns == k
                states[picked] = np.rot90(states[picked], k, axes=(1, 2))
                next_states[picked] = np.rot90(next_states[picked], k, axes=(1, 2))
                r, c = actions[picked, 0], actions[picked, 1]
                actions[picked] = np.column_stack(moved_to[k](r, c))

        return states, actions, rewards, next_states, dones, indices, weights

    def update_priorities(self, batch_indices, batch_priorities):
        """Overwrite the priorities stored at `batch_indices` with `batch_priorities`."""
        for position, priority in zip(batch_indices, batch_priorities):
            self.priorities[position] = priority

    def __len__(self):
        return len(self.buffer)

# Функции и гиперпараметры для обучения

In [None]:
# Seed every random number generator the notebook relies on (torch, NumPy, stdlib).
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

In [None]:
# Board side length and the number of stones in a line needed to win.
board_size, win_size = 5, 4

In [None]:
# DQN hyperparameters

# --- optimisation ---
batch_size = 128                     # 512 was too large
gamma = 0.9
max_grad_norm = 50

# --- schedule (both counts are in games) ---
total_steps = 60_000
decay_steps = 40_000

# --- exploration: epsilon decays linearly from init_epsilon to final_epsilon ---
init_epsilon = 1
final_epsilon = 0.2                  # 0.02 and 0.1 were too small

# --- periodic actions (every N games) ---
refresh_target_network_freq = 100    # 1000 was too large, 50 too small
loss_freq = 100
eval_freq = 500
n_eval_games = 100

In [None]:
# Online Q-network; it starts with epsilon = init_epsilon.
agent = DQNAgent(init_epsilon).to(device)

# Target network: a second DQNAgent whose freshly initialised weights are
# immediately overwritten with a copy of the online network's weights.
target_network = DQNAgent(init_epsilon).to(device)
target_network.load_state_dict(agent.state_dict())

# Only the online network's parameters are handed to the optimizer.
optimizer = torch.optim.Adam(agent.parameters(), lr=1e-4)
exp_replay = PrioritizedBuffer(16_000) # alternative: ReplayBuffer(16_000)

In [None]:
# Total number of parameters of the online network.
sum(p.numel() for p in agent.parameters())

1185025

In [None]:
def compute_td_loss(states, actions, rewards, next_states, dones,
                    agent, target_network, weights=None, indices=None,
                    gamma=0.9, device=device, prioritized=True):
    """Return the (optionally importance-weighted) mean squared TD error of a batch.

    Parameters
    ----------
    states, next_states : array-like, shape [batch, H, W]
        Boards before and after the move.
    actions : array-like, shape [batch, 2]
        (row, col) of the move that was played.
    rewards, dones : array-like, shape [batch]
    agent, target_network : callables mapping [batch, H, W] boards to [batch, H, W] Q-values.
    weights : array-like of shape [batch], optional
        Importance-sampling weights; when omitted every sample has weight 1.
    indices : sequence, optional
        Replay-buffer positions of the samples. When given (and `prioritized` is
        True) the new priorities are written to the module-level `exp_replay`.
    gamma : float
        Discount factor.
    prioritized : bool
        When False, `weights` and `indices` are ignored.

    Returns
    -------
    torch.Tensor
        Scalar loss, differentiable with respect to the parameters of `agent`.
    """
    states = torch.tensor(states, device=device, dtype=torch.float32)                # shape: [batch_size, H, W]
    actions = torch.tensor(actions, device=device, dtype=torch.int64)                # shape: [batch_size, 2]
    rewards = torch.tensor(rewards, device=device, dtype=torch.float32)              # shape: [batch_size]
    next_states = torch.tensor(next_states, device=device, dtype=torch.float32)      # shape: [batch_size, H, W]
    dones = torch.tensor(dones, device=device, dtype=torch.int64)                    # shape: [batch_size]

    predicted_qvalues = agent(states)                                                # shape: [batch_size, H, W]
    predicted_qvalues_for_actions = predicted_qvalues[range(len(actions)), actions[:, 0], actions[:, 1]]  # shape: [batch_size]

    # The target is a constant with respect to the loss, so no graph is needed for it.
    with torch.no_grad():
        predicted_next_qvalues = target_network(next_states)                         # shape: [batch_size, H, W]
        next_state_values = predicted_next_qvalues.view(dones.shape[0], -1).max(dim=1).values
        # Minus sign: the next-state value is subtracted, not added, because the training
        # loop stores next_state from the point of view of the side that moves next.
        target_qvalues_for_actions = rewards - (1 - dones) * gamma * next_state_values

    squared_error = (predicted_qvalues_for_actions - target_qvalues_for_actions) ** 2

    if not prioritized:
        return torch.mean(squared_error)

    # Previously `weights` was converted unconditionally, so the default weights=None
    # crashed in torch.tensor(None) even when prioritized=False.
    if weights is not None:
        weights = torch.tensor(weights, device=device, dtype=torch.float32)
        squared_error = weights * squared_error

    if indices is not None:
        # Priority update [Prioritized DQN]: weighted squared error plus a small constant.
        prios = (squared_error + 1e-5).detach().cpu().numpy()
        exp_replay.update_priorities(indices, prios)

    return torch.mean(squared_error)

def linear_decay(init_epsilon, final_epsilon, step, decay_steps):
    """Return epsilon at `step`: linear from `init_epsilon` down to `final_epsilon`.

    The value reaches `final_epsilon` after `decay_steps` steps and never drops below it.
    """
    decayed = init_epsilon - step * (init_epsilon - final_epsilon) / decay_steps
    return final_epsilon if final_epsilon > decayed else decayed

# Обучение

In [None]:
# Colab-only: mount Google Drive at /content/drive; the checkpoint PATH used
# further down points inside this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Game object used by the training loop: the agent is registered as the first
# player, a RandomPlus with only the "take an immediate win" heuristic as the second.
main_random = RandomPlus(board_size=board_size, win_size=win_size, win=True)
game = TicTacToe(player_1=agent, player_2=main_random,
                 board_size=board_size, win_size=win_size)

In [None]:
# Directory (on the mounted Drive) that receives model and optimizer checkpoints.
PATH = '/content/drive/MyDrive/TicTacToe_11/'

# Training history; `loss` stays None until the first gradient step has been made.
loss_values, reward_values = [], []
loss = None

In [None]:
# Training loop: one iteration of `t` is one complete game. Every move and the
# result of every game are written to out.txt.
with open('out.txt', 'w') as f:
    for t in range(total_steps):
        print(f't = {t}. Ход {game.players[-1].name}', file=f)

        # Initial game state: state = (state_2d, turn); the -1 side moves first.
        state = (np.zeros((board_size, board_size)), -1)
        next_turn = -1

        while next_turn != 0:
            # Unpack first, so that `turn` is the side moving NOW. Previously `turn` was
            # refreshed only after the move, which left the player lookup below and the
            # winner written to the log one move behind.
            state_2d, turn = state
            current_player = game.players[turn]
            if current_player.name == 'DQNAgent':
                agent.epsilon = linear_decay(init_epsilon, final_epsilon, t, decay_steps)

            # NOTE(review): the move always comes from `agent`, also on the opponent's
            # turn - this looks like deliberate self-play; confirm.
            action = agent.get_action(state)
            print(action, file=f)
            next_state_2d, next_turn, reward = game.play_turn(state, action)

            if next_turn == 0:
                if reward == 0:
                    print('Ничья!\n', file=f)
                else:
                    print(f'Победа ({game.players[reward * turn].name})!\n', file=f)
                # Swap sides for the next game.
                game.players = {-1: game.players[1], 1: game.players[-1]}

            # state, action, reward, new_state, done
            exp_replay.add(turn * state_2d, action, reward, next_turn * next_state_2d, next_turn == 0)

            # One gradient step on a minibatch
            if len(exp_replay) >= batch_size:
                states, actions, rewards, next_states, dones, indices, weights = exp_replay.sample(batch_size, augmentation=True)
                loss = compute_td_loss(states, actions, rewards, next_states, dones,
                                       agent, target_network, weights, indices, gamma)
                loss.backward()
                grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
                optimizer.step()
                optimizer.zero_grad()

            state = next_state_2d, next_turn

        # Copy the online weights into the target network every refresh_target_network_freq games
        if t % refresh_target_network_freq == 0:
            target_network.load_state_dict(agent.state_dict())

        # Report the loss with the configured frequency
        if t % loss_freq == 0 and t > 0:
            # Keep a plain float in the history instead of the tensor itself.
            loss_value = None if loss is None else float(loss)
            loss_values.append(loss_value)
            print(f"t = {str(t):5}\t loss = {loss_value}\t eps = {round(agent.epsilon, 4)}")

        # Evaluate against the strongest RandomPlus and checkpoint with the configured frequency
        if t % eval_freq == 0:
            agent.epsilon = 0
            eval_random = RandomPlus(board_size, win_size, win=True, defense=True, win_2=True, defense_2=True)
            eval_game = TicTacToe(agent, eval_random, board_size=board_size, win_size=win_size)
            eval_game.play(n_eval_games)
            mean_reward = (eval_game.wins['DQNAgent'] - eval_game.wins['RandomPlus']) / n_eval_games
            reward_values.append(mean_reward)
            print(f"t = {str(t):5}\t reward = {round(mean_reward, 4)}\t{eval_game.wins}\n")

            torch.save(agent.state_dict(), PATH + f'model_{t}')
            torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

# Final checkpoint, named after the last game played.
torch.save(agent.state_dict(), PATH + f'model_{t}')
torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

t = 0    	 reward = -1.0	{'DQNAgent': 0, 'RandomPlus': 100}

t = 100  	 loss = 0.0013881123159080744	 eps = 0.998
t = 200  	 loss = 0.003986238501966	 eps = 0.996
t = 300  	 loss = 0.003425873816013336	 eps = 0.994
t = 400  	 loss = 0.007009033113718033	 eps = 0.992
t = 500  	 loss = 0.0031843779142946005	 eps = 0.99
t = 500  	 reward = -0.72	{'DQNAgent': 14, 'RandomPlus': 86}

t = 600  	 loss = 0.005009326618164778	 eps = 0.988
t = 700  	 loss = 0.005027949810028076	 eps = 0.986
t = 800  	 loss = 0.006607561372220516	 eps = 0.984
t = 900  	 loss = 0.0037566833198070526	 eps = 0.982
t = 1000 	 loss = 0.005147852934896946	 eps = 0.98
t = 1000 	 reward = -0.82	{'DQNAgent': 9, 'RandomPlus': 91}

t = 1100 	 loss = 0.004279031418263912	 eps = 0.978
t = 1200 	 loss = 0.006132327951490879	 eps = 0.976
t = 1300 	 loss = 0.003267318941652775	 eps = 0.974
t = 1400 	 loss = 0.004802246578037739	 eps = 0.972
t = 1500 	 loss = 0.003764278255403042	 eps = 0.97
t = 1500 	 reward = -0.8	{'DQNAgent': 1

In [None]:
# Resume from the checkpoint written after game 36500. map_location keeps the load
# working when the checkpoint was saved on a GPU and this runtime has none.
agent.load_state_dict(torch.load(f'{PATH}model_36500', map_location=device))
optimizer.load_state_dict(torch.load(f'{PATH}opt_36500', map_location=device))

In [None]:
# Resumed training loop (games 36600 .. total_steps - 1): one iteration of `t` is one
# complete game. NOTE: out.txt is opened with 'w', so the log of the earlier run is replaced.
with open('out.txt', 'w') as f:
    for t in range(36_600, total_steps):
        print(f't = {t}. Ход {game.players[-1].name}', file=f)

        # Initial game state: state = (state_2d, turn); the -1 side moves first.
        state = (np.zeros((board_size, board_size)), -1)
        next_turn = -1

        while next_turn != 0:
            # Unpack first, so that `turn` is the side moving NOW. Previously `turn` was
            # refreshed only after the move, which left the player lookup below and the
            # winner written to the log one move behind.
            state_2d, turn = state
            current_player = game.players[turn]
            if current_player.name == 'DQNAgent':
                agent.epsilon = linear_decay(init_epsilon, final_epsilon, t, decay_steps)

            # NOTE(review): the move always comes from `agent`, also on the opponent's
            # turn - this looks like deliberate self-play; confirm.
            action = agent.get_action(state)
            print(action, file=f)
            next_state_2d, next_turn, reward = game.play_turn(state, action)

            if next_turn == 0:
                if reward == 0:
                    print('Ничья!\n', file=f)
                else:
                    print(f'Победа ({game.players[reward * turn].name})!\n', file=f)
                # Swap sides for the next game.
                game.players = {-1: game.players[1], 1: game.players[-1]}

            # state, action, reward, new_state, done
            exp_replay.add(turn * state_2d, action, reward, next_turn * next_state_2d, next_turn == 0)

            # One gradient step on a minibatch
            if len(exp_replay) >= batch_size:
                states, actions, rewards, next_states, dones, indices, weights = exp_replay.sample(batch_size, augmentation=True)
                loss = compute_td_loss(states, actions, rewards, next_states, dones,
                                       agent, target_network, weights, indices, gamma)
                loss.backward()
                grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
                optimizer.step()
                optimizer.zero_grad()

            state = next_state_2d, next_turn

        # Copy the online weights into the target network every refresh_target_network_freq games
        if t % refresh_target_network_freq == 0:
            target_network.load_state_dict(agent.state_dict())

        # Report the loss with the configured frequency
        if t % loss_freq == 0 and t > 0:
            # Keep a plain float in the history instead of the tensor itself.
            loss_value = None if loss is None else float(loss)
            loss_values.append(loss_value)
            print(f"t = {str(t):5}\t loss = {loss_value}\t eps = {round(agent.epsilon, 4)}")

        # Evaluate against the strongest RandomPlus and checkpoint with the configured frequency
        if t % eval_freq == 0:
            agent.epsilon = 0
            eval_random = RandomPlus(board_size, win_size, win=True, defense=True, win_2=True, defense_2=True)
            eval_game = TicTacToe(agent, eval_random, board_size=board_size, win_size=win_size)
            eval_game.play(n_eval_games)
            mean_reward = (eval_game.wins['DQNAgent'] - eval_game.wins['RandomPlus']) / n_eval_games
            reward_values.append(mean_reward)
            print(f"t = {str(t):5}\t reward = {round(mean_reward, 4)}\t{eval_game.wins}\n")

            torch.save(agent.state_dict(), PATH + f'model_{t}')
            torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

# Final checkpoint, named after the last game played.
torch.save(agent.state_dict(), PATH + f'model_{t}')
torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

t = 36600	 loss = None	 eps = 0.268
t = 36700	 loss = 0.00014099874533712864	 eps = 0.266
t = 36800	 loss = 0.00020060810493305326	 eps = 0.264
t = 36900	 loss = 0.00020682072499766946	 eps = 0.262
t = 37000	 loss = 0.00027246488025411963	 eps = 0.26
t = 37000	 reward = 0.96	{'DQNAgent': 96, 'RandomPlus': 0}

t = 37100	 loss = 0.00031232365290634334	 eps = 0.258
t = 37200	 loss = 0.0001755008997861296	 eps = 0.256
t = 37300	 loss = 0.00019002807675860822	 eps = 0.254
t = 37400	 loss = 0.0001777427241904661	 eps = 0.252
t = 37500	 loss = 0.0006124767241999507	 eps = 0.25
t = 37500	 reward = 1.0	{'DQNAgent': 100, 'RandomPlus': 0}

t = 37600	 loss = 0.00021436184761114419	 eps = 0.248
t = 37700	 loss = 0.00015682837693020701	 eps = 0.246
t = 37800	 loss = 0.00020844966638833284	 eps = 0.244
t = 37900	 loss = 0.00013681976997759193	 eps = 0.242
t = 38000	 loss = 0.00015053477545734495	 eps = 0.24
t = 38000	 reward = 0.99	{'DQNAgent': 99, 'RandomPlus': 0}

t = 38100	 loss = 0.00020895790657

In [None]:
# Resume from the checkpoint written after game 59500. map_location keeps the load
# working when the checkpoint was saved on a GPU and this runtime has none.
agent.load_state_dict(torch.load(f'{PATH}model_59500', map_location=device))
optimizer.load_state_dict(torch.load(f'{PATH}opt_59500', map_location=device))

In [None]:
# Extended training loop (games 59600 .. 79999): one iteration of `t` is one complete
# game. NOTE: out.txt is opened with 'w', so the log of the earlier run is replaced.
with open('out.txt', 'w') as f:
    for t in range(59_600, 80_000):
        print(f't = {t}. Ход {game.players[-1].name}', file=f)

        # Initial game state: state = (state_2d, turn); the -1 side moves first.
        state = (np.zeros((board_size, board_size)), -1)
        next_turn = -1

        while next_turn != 0:
            # Unpack first, so that `turn` is the side moving NOW. Previously `turn` was
            # refreshed only after the move, which left the player lookup below and the
            # winner written to the log one move behind.
            state_2d, turn = state
            current_player = game.players[turn]
            if current_player.name == 'DQNAgent':
                agent.epsilon = linear_decay(init_epsilon, final_epsilon, t, decay_steps)

            # NOTE(review): the move always comes from `agent`, also on the opponent's
            # turn - this looks like deliberate self-play; confirm.
            action = agent.get_action(state)
            print(action, file=f)
            next_state_2d, next_turn, reward = game.play_turn(state, action)

            if next_turn == 0:
                if reward == 0:
                    print('Ничья!\n', file=f)
                else:
                    print(f'Победа ({game.players[reward * turn].name})!\n', file=f)
                # Swap sides for the next game.
                game.players = {-1: game.players[1], 1: game.players[-1]}

            # state, action, reward, new_state, done
            exp_replay.add(turn * state_2d, action, reward, next_turn * next_state_2d, next_turn == 0)

            # One gradient step on a minibatch
            if len(exp_replay) >= batch_size:
                states, actions, rewards, next_states, dones, indices, weights = exp_replay.sample(batch_size, augmentation=True)
                loss = compute_td_loss(states, actions, rewards, next_states, dones,
                                       agent, target_network, weights, indices, gamma)
                loss.backward()
                grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
                optimizer.step()
                optimizer.zero_grad()

            state = next_state_2d, next_turn

        # Copy the online weights into the target network every refresh_target_network_freq games
        if t % refresh_target_network_freq == 0:
            target_network.load_state_dict(agent.state_dict())

        # Report the loss with the configured frequency
        if t % loss_freq == 0 and t > 0:
            # Keep a plain float in the history instead of the tensor itself.
            loss_value = None if loss is None else float(loss)
            loss_values.append(loss_value)
            print(f"t = {str(t):5}\t loss = {loss_value}\t eps = {round(agent.epsilon, 4)}")

        # Evaluate against the strongest RandomPlus and checkpoint with the configured frequency
        if t % eval_freq == 0:
            agent.epsilon = 0
            eval_random = RandomPlus(board_size, win_size, win=True, defense=True, win_2=True, defense_2=True)
            eval_game = TicTacToe(agent, eval_random, board_size=board_size, win_size=win_size)
            eval_game.play(n_eval_games)
            mean_reward = (eval_game.wins['DQNAgent'] - eval_game.wins['RandomPlus']) / n_eval_games
            reward_values.append(mean_reward)
            print(f"t = {str(t):5}\t reward = {round(mean_reward, 4)}\t{eval_game.wins}\n")

            torch.save(agent.state_dict(), PATH + f'model_{t}')
            torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

# Final checkpoint, named after the last game played.
torch.save(agent.state_dict(), PATH + f'model_{t}')
torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

t = 59600	 loss = 0.003960108384490013	 eps = 0.2
t = 59700	 loss = 0.0003764395078178495	 eps = 0.2
t = 59800	 loss = 0.0003657957713585347	 eps = 0.2
t = 59900	 loss = 0.00025083834771066904	 eps = 0.2
t = 60000	 loss = 0.0002798822824843228	 eps = 0.2
t = 60000	 reward = 0.99	{'DQNAgent': 99, 'RandomPlus': 0}

t = 60100	 loss = 0.00019810502999462187	 eps = 0.2
t = 60200	 loss = 0.0002592834644019604	 eps = 0.2
t = 60300	 loss = 0.00017909894813783467	 eps = 0.2
t = 60400	 loss = 0.00028372794622555375	 eps = 0.2
t = 60500	 loss = 0.0005165296024642885	 eps = 0.2
t = 60500	 reward = 0.99	{'DQNAgent': 99, 'RandomPlus': 0}

t = 60600	 loss = 0.00026640715077519417	 eps = 0.2
t = 60700	 loss = 0.00020307810336817056	 eps = 0.2
t = 60800	 loss = 0.00018918971181847155	 eps = 0.2
t = 60900	 loss = 0.00012081737077096477	 eps = 0.2
t = 61000	 loss = 0.0001024806551868096	 eps = 0.2
t = 61000	 reward = 0.94	{'DQNAgent': 96, 'RandomPlus': 2}

t = 61100	 loss = 0.00020484747074078768	 eps = 

# Тестирование обученных моделей (инференс с маскированием)

In [None]:
# Directory that holds the saved checkpoints. The cells below append 'model_<step>'
# directly to this string, so the trailing slash is required.
PATH = '/content/drive/MyDrive/TicTacToe_11/'

In [None]:
# Compare trained checkpoints: each one plays 1000 games against a RandomPlus
# opponent created with all four flags (win, defense, win_2, defense_2) enabled.
eval_random = RandomPlus(board_size, win_size, win=True, defense=True, win_2=True, defense_2=True)

# epsilon is set to 0 (presumably turns off random exploration) and masking is enabled.
agent.epsilon = 0
agent.masking = True

cpu = torch.device('cpu')
for i in range(16_000, 37_000, 500):
    # Deserialize the weights saved at step i onto the CPU, then copy them into the agent.
    checkpoint = torch.load(f'{PATH}model_{i}', map_location=cpu)
    agent.load_state_dict(checkpoint)

    # A new game object per checkpoint, so its win counters start from zero.
    eval_game = TicTacToe(agent, eval_random, board_size=board_size, win_size=win_size)
    eval_game.play(1000)
    print(f'{i:5}', eval_game.wins)

16000 {'DQNAgent': 955, 'RandomPlus': 11}
16500 {'DQNAgent': 955, 'RandomPlus': 5}
17000 {'DQNAgent': 976, 'RandomPlus': 5}
17500 {'DQNAgent': 975, 'RandomPlus': 6}
18000 {'DQNAgent': 962, 'RandomPlus': 6}
18500 {'DQNAgent': 954, 'RandomPlus': 5}
19000 {'DQNAgent': 952, 'RandomPlus': 2}
19500 {'DQNAgent': 968, 'RandomPlus': 4}
20000 {'DQNAgent': 970, 'RandomPlus': 4}
20500 {'DQNAgent': 972, 'RandomPlus': 5}
21000 {'DQNAgent': 960, 'RandomPlus': 4}
21500 {'DQNAgent': 952, 'RandomPlus': 6}
22000 {'DQNAgent': 953, 'RandomPlus': 5}
22500 {'DQNAgent': 960, 'RandomPlus': 6}
23000 {'DQNAgent': 983, 'RandomPlus': 1}
23500 {'DQNAgent': 964, 'RandomPlus': 7}
24000 {'DQNAgent': 971, 'RandomPlus': 5}
24500 {'DQNAgent': 964, 'RandomPlus': 5}
25000 {'DQNAgent': 972, 'RandomPlus': 10}
25500 {'DQNAgent': 975, 'RandomPlus': 5}
26000 {'DQNAgent': 971, 'RandomPlus': 5}
26500 {'DQNAgent': 971, 'RandomPlus': 3}
27000 {'DQNAgent': 969, 'RandomPlus': 6}
27500 {'DQNAgent': 956, 'RandomPlus': 3}
28000 {'DQNAge

In [None]:
# Compare trained checkpoints: each one plays 1000 games against a RandomPlus
# opponent created with all four flags (win, defense, win_2, defense_2) enabled.
eval_random = RandomPlus(board_size, win_size, win=True, defense=True, win_2=True, defense_2=True)

# epsilon is set to 0 (presumably turns off random exploration) and masking is enabled.
agent.epsilon = 0
agent.masking = True

for i in range(37_000, 70500, 500):
    try:
        # map_location matches the other evaluation cells and lets checkpoints
        # that were saved on a GPU be loaded on a CPU-only machine.
        state_dict = torch.load(f'{PATH}model_{i}', map_location=torch.device('cpu'))
    except FileNotFoundError:
        # The range may extend past the checkpoints that exist on disk; report the
        # gap and keep evaluating instead of aborting the whole cell.
        print(f'{i:5}', 'checkpoint not found, skipped')
        continue

    agent.load_state_dict(state_dict)

    # A new game object per checkpoint, so its win counters start from zero.
    eval_game = TicTacToe(agent, eval_random, board_size=board_size, win_size=win_size)
    eval_game.play(1000)
    print(f'{i:5}', eval_game.wins)

37000 {'DQNAgent': 970, 'RandomPlus': 3}
37500 {'DQNAgent': 976, 'RandomPlus': 7}
38000 {'DQNAgent': 968, 'RandomPlus': 8}
38500 {'DQNAgent': 974, 'RandomPlus': 6}
39000 {'DQNAgent': 966, 'RandomPlus': 4}
39500 {'DQNAgent': 963, 'RandomPlus': 6}
40000 {'DQNAgent': 979, 'RandomPlus': 3}
40500 {'DQNAgent': 978, 'RandomPlus': 3}
41000 {'DQNAgent': 980, 'RandomPlus': 0}
41500 {'DQNAgent': 967, 'RandomPlus': 13}
42000 {'DQNAgent': 978, 'RandomPlus': 3}
42500 {'DQNAgent': 978, 'RandomPlus': 1}
43000 {'DQNAgent': 984, 'RandomPlus': 1}
43500 {'DQNAgent': 979, 'RandomPlus': 8}
44000 {'DQNAgent': 980, 'RandomPlus': 2}
44500 {'DQNAgent': 972, 'RandomPlus': 2}
45000 {'DQNAgent': 955, 'RandomPlus': 7}
45500 {'DQNAgent': 980, 'RandomPlus': 0}
46000 {'DQNAgent': 977, 'RandomPlus': 2}
46500 {'DQNAgent': 975, 'RandomPlus': 4}
47000 {'DQNAgent': 980, 'RandomPlus': 2}
47500 {'DQNAgent': 976, 'RandomPlus': 3}
48000 {'DQNAgent': 981, 'RandomPlus': 4}
48500 {'DQNAgent': 966, 'RandomPlus': 8}
49000 {'DQNAgen

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/TicTacToe_11/model_60000'

In [None]:
# Compare trained checkpoints (steps 60_000 .. 63_500): each one plays 1000 games against
# a RandomPlus opponent created with all four flags (win, defense, win_2, defense_2) enabled.
eval_random = RandomPlus(board_size, win_size, win=True, defense=True, win_2=True, defense_2=True)

# epsilon is set to 0 (presumably turns off random exploration) and masking is enabled.
agent.epsilon = 0
agent.masking = True

cpu = torch.device('cpu')
for i in range(60_000, 64_000, 500):
    # Deserialize the weights saved at step i onto the CPU, then copy them into the agent.
    checkpoint = torch.load(f'{PATH}model_{i}', map_location=cpu)
    agent.load_state_dict(checkpoint)

    # A new game object per checkpoint, so its win counters start from zero.
    eval_game = TicTacToe(agent, eval_random, board_size=board_size, win_size=win_size)
    eval_game.play(1000)
    print(f'{i:5}', eval_game.wins)

60000 {'DQNAgent': 976, 'RandomPlus': 3}
60500 {'DQNAgent': 970, 'RandomPlus': 3}
61000 {'DQNAgent': 947, 'RandomPlus': 10}
61500 {'DQNAgent': 955, 'RandomPlus': 13}
62000 {'DQNAgent': 943, 'RandomPlus': 4}
62500 {'DQNAgent': 873, 'RandomPlus': 67}
63000 {'DQNAgent': 952, 'RandomPlus': 9}
63500 {'DQNAgent': 978, 'RandomPlus': 3}


In [None]:
# Compare the best models (the ones marked as having no losses) over 10_000 games each.
# NOTE(review): eval_random is not created in this cell; it must already exist
# from an earlier cell.
models = [41000, 45500, 57000, 58500]

agent.epsilon = 0
agent.masking = True

cpu = torch.device('cpu')
for model in models:
    # `model` is the training step used in the checkpoint file name.
    weights = torch.load(f'{PATH}model_{model}', map_location=cpu)
    agent.load_state_dict(weights)

    # A new game object per checkpoint, so its win counters start from zero.
    eval_game = TicTacToe(agent, eval_random, board_size=board_size, win_size=win_size)
    eval_game.play(10_000)
    print(model, eval_game.wins)

41000 {'DQNAgent': 9738, 'RandomPlus': 37}
45500 {'DQNAgent': 9803, 'RandomPlus': 13}
57000 {'DQNAgent': 9801, 'RandomPlus': 13}
58500 {'DQNAgent': 9786, 'RandomPlus': 11}


In [None]:
# Compare the best models (the ones marked as having no losses) over 10_000 games each.
# NOTE(review): eval_random is not created in this cell; it must already exist
# from an earlier cell.
models = [23000, 28500, 33500, 34500]

agent.epsilon = 0
agent.masking = True

cpu = torch.device('cpu')
for model in models:
    # `model` is the training step used in the checkpoint file name.
    weights = torch.load(f'{PATH}model_{model}', map_location=cpu)
    agent.load_state_dict(weights)

    # A new game object per checkpoint, so its win counters start from zero.
    eval_game = TicTacToe(agent, eval_random, board_size=board_size, win_size=win_size)
    eval_game.play(10_000)
    print(model, eval_game.wins)

23000 {'DQNAgent': 9697, 'RandomPlus': 50}
28500 {'DQNAgent': 9745, 'RandomPlus': 24}
33500 {'DQNAgent': 9799, 'RandomPlus': 14}
34500 {'DQNAgent': 9801, 'RandomPlus': 17}


In [None]:
# Evaluate one checkpoint over 10_000 games against a RandomPlus opponent created
# with only the win and defense flags (win_2 / defense_2 are not passed here).
eval_random = RandomPlus(board_size, win_size, win=True, defense=True)

# epsilon is set to 0 (presumably turns off random exploration) and masking is enabled.
agent.epsilon = 0
agent.masking = True

# Training step of the checkpoint to evaluate.
model = 58500

# map_location matches the other evaluation cells and lets a checkpoint that was
# saved on a GPU be loaded on a CPU-only machine.
agent.load_state_dict(torch.load(f'{PATH}model_{model}', map_location=torch.device('cpu')))
eval_game = TicTacToe(agent, eval_random, board_size=board_size, win_size=win_size)
eval_game.play(10_000)
print(model, eval_game.wins)

58500 {'DQNAgent': 9969, 'RandomPlus': 1}


In [None]:
# Load the checkpoint chosen as the best model (step 58500).
# map_location is passed explicitly (as in the evaluation cells) so that a checkpoint
# saved on a GPU can also be loaded on a CPU-only machine.
agent.load_state_dict(torch.load(f'{PATH}model_58500', map_location=torch.device('cpu')))

<All keys matched successfully>

In [None]:
# Play the agent against a Human player with epsilon set to 0.
agent.epsilon = 0

human = Human()
test_game = TicTacToe(agent, human, board_size=board_size, win_size=win_size)

# Positional arguments: 4 games, second argument True (the visualize flag of play()).
test_game.play(4, True)

# Final win counters, shown as the cell output.
test_game.wins

player -1's turn:
. . . . .
. . . . .
. . . . .
. . . . .
. . . . .
player -1's turn:
 .  .  .  .  .
 .  .  .  .  .
 .  .  X  .  .
 .  .  .  .  .
 .  .  .  .  .
Введите ваш ход (Строка, столбец)
2 2
player 1's turn:
 .  .  .  .  .
 .  O  .  .  .
 .  .  X  .  .
 .  .  .  .  .
 .  .  .  .  .
player -1's turn:
 .  .  .  .  .
 .  O  .  .  .
 .  X  X  .  .
 .  .  .  .  .
 .  .  .  .  .
Введите ваш ход (Строка, столбец)
3 4
player 1's turn:
 .  .  .  .  .
 .  O  .  .  .
 .  X  X  O  .
 .  .  .  .  .
 .  .  .  .  .
player -1's turn:
 .  .  .  .  .
 .  O  X  .  .
 .  X  X  O  .
 .  .  .  .  .
 .  .  .  .  .
Введите ваш ход (Строка, столбец)
4 3
player 1's turn:
 .  .  .  .  .
 .  O  X  .  .
 .  X  X  O  .
 .  .  O  .  .
 .  .  .  .  .
player -1's turn:
 .  .  .  .  .
 .  O  X  .  .
 .  X  X  O  .
 X  .  O  .  .
 .  .  .  .  .
Введите ваш ход (Строка, столбец)
1 4
player 1's turn:
 .  .  .  O  .
 .  O  X  .  .
 .  X  X  O  .
 X  .  O  .  .
 .  .  .  .  .
player -1's turn:
 .  .  .  O  .
 .  O  

{'DQNAgent': 1, 'Human': 0}

# Первый ход за крестики и значения $Q$-функции в начальном состоянии

In [None]:
# All-zero 5x5 board wrapped in a leading axis of size 1. The literal grid is kept
# so individual cells can be edited by hand to probe other positions.
empty_board = np.array(
    [[[0, 0, 0, 0, 0],
      [0, 0, 0, 0, 0],
      [0, 0, 0, 0, 0],
      [0, 0, 0, 0, 0],
      [0, 0, 0, 0, 0]]]
)
state2d = torch.tensor(empty_board, device=device)

# Run the agent on the board, drop the leading axis and convert the result to NumPy.
agent_output = agent(state2d)
q_values = agent_output.squeeze(0).detach().cpu().numpy()

# Index of the largest value in q_values, shown as the cell output.
np.unravel_index(np.argmax(q_values), q_values.shape)

(2, 2)

In [None]:
# Show the values computed above, rounded to 4 decimal places.
np.round(q_values, 4)

array([[-0.0404,  0.0055, -0.0263,  0.0159, -0.0434],
       [ 0.0161,  0.    ,  0.0052,  0.0096,  0.0094],
       [-0.0336,  0.0246,  0.0311,  0.0127, -0.0406],
       [ 0.0197,  0.0064,  0.0235,  0.0066,  0.0127],
       [-0.0295, -0.0043, -0.0313,  0.0035, -0.0543]], dtype=float32)