In [None]:
import numpy as np
from collections import deque
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cpu')

# Игра

In [None]:
# Tic-tac-toe generalised to an arbitrary square board and winning line length
class TicTacToe:
    """Two-player tic-tac-toe environment.

    Players are stored under the keys -1 and 1; the player under -1 moves
    first and the mapping is swapped after every finished game. A game state
    is the pair ``(state2d, turn)`` where ``state2d`` holds -1 / 0 / 1 per
    cell and ``turn`` is the side to move.
    """

    def __init__(self, player_1, player_2, board_size=3, win_size=3):
        self.players = {-1: player_1, 1: player_2}
        # Win counters are keyed by player name.
        self.wins = {player_1.name: 0, player_2.name: 0}

        self.board_size = board_size
        self.win_size = win_size
        self._kernel = self._create_kernel()


    # Builds the bank of line masks used to detect wins
    def _create_kernel(self):
        """Return an array of ``2 * win_size + 2`` masks of shape (win_size, win_size).

        The first ``win_size`` masks are rows, the next ``win_size`` are
        columns, followed by the main diagonal and the anti-diagonal.
        """
        size = self.win_size
        kernel = np.zeros((2 * size + 2, size, size))
        for line in range(size):
            kernel[line, line, :] = 1.0           # horizontal line number `line`
            kernel[size + line, :, line] = 1.0    # vertical line number `line`
        kernel[2 * size] = np.eye(size)
        kernel[2 * size + 1] = np.fliplr(np.eye(size))
        return kernel


    # Checks whether the side `turn` (-1 or 1) has a complete line on the board `state`
    def _test_win(self, state, turn):
        """Return ``-turn`` when `turn` owns a full line of ``win_size`` cells, else 0."""
        size = self.win_size
        n_rows, n_cols = state.shape
        # Zero-copy view of every size x size window of the board.
        windows = np.lib.stride_tricks.as_strided(
            state,
            shape=(n_rows - size + 1, n_cols - size + 1, size, size),
            strides=state.strides * 2,
            writeable=False,
        )
        # line_sums[s, x, y] is the sum of the board cells under mask s in window (x, y).
        line_sums = np.einsum('xyij,sij->sxy', windows, self._kernel)
        has_line = (line_sums == turn * size).any()
        return -turn * has_line.astype(int)


    # Plays several complete episodes
    def play(self, num_games=1, visualize=False):
        """Play `num_games` games and return every transition that occurred.

        Each transition is ``(state, action, reward, new_state, done)``; the
        stored state is multiplied by the mover's sign and the stored new
        state by the opposite sign.
        """
        transitions = []
        for _ in range(num_games):
            turn = -1
            next_turn = turn
            # Initial game state: (empty board, side to move)
            state = (np.zeros((self.board_size, self.board_size)), turn)
            if visualize:
                self.visualize_state(state, turn)
            while next_turn != 0:
                state_2d, turn = state
                action = self.players[turn].get_action(state)
                next_state_2d, next_turn, reward = self.play_turn(state, action)
                game_over = next_turn == 0
                transitions.append((turn * state_2d, action, reward, -turn * next_state_2d, game_over))
                if visualize:
                    self.visualize_state((next_state_2d, next_turn), turn)
                if game_over:
                    if visualize:
                        if reward == 0:
                            print('Ничья!\n')
                        else:
                            print(f'Победа ({self.players[reward * turn].name})!\n')
                    if reward != 0:
                        # reward * turn is the winner's sign: the mover on a win,
                        # the opponent after an illegal move.
                        self.wins[self.players[reward * turn].name] += 1
                    # Swap sides so the other player opens the next game.
                    self.players = {-1: self.players[1], 1: self.players[-1]}
                state = next_state_2d, next_turn
        return transitions


    # Applies a move and detects an illegal move (loss) / a win / a draw
    def play_turn(self, state, action):
        """Return ``(next_state2d, next_turn, reward)`` for `action` taken in `state`.

        ``next_turn == 0`` means the game is over. The reward is -1 for a move
        onto an occupied cell, 1 for a winning move and 0 otherwise.
        """
        state2d, turn = state
        next_state2d = state2d.copy()

        # Illegal move: the mover loses immediately and the board is unchanged.
        if state2d[action] != 0:
            return next_state2d, 0, -1

        next_state2d[action] = turn

        # Winning move for the current player.
        if self._test_win(next_state2d, turn):
            return next_state2d, 0, 1

        # No empty cells left: draw.
        board_full = (next_state2d != 0).all()
        if board_full:
            return next_state2d, 0, 0

        # Otherwise the turn passes to the other side.
        return next_state2d, -turn, 0


    # Prints the game state after a player's move
    @staticmethod
    def visualize_state(next_state, turn):
        """Print the board of `next_state`, showing -1 as X, 1 as O and 0 as a dot."""
        next_state2d, _ = next_state
        print(f"player {turn}'s turn:")
        if (next_state2d == 0).all() and turn == 0:
            print("[invalid state]\n\n")
            return
        text = str(next_state2d)
        # The order matters: brackets and decimal points go first, then symbols.
        substitutions = (
            (".", ""),
            ("[[", ""),
            (" [", ""),
            ("]]", ""),
            ("]", ""),
            ("-0", " ."),
            ("0", "."),
            ("-1", " X"),
            ("1", "O"),
        )
        for old, new in substitutions:
            text = text.replace(old, new)
        print(text)


    @staticmethod
    def print_transitions(transitions):
        """Print every transition: resulting board, 1-based action, reward and game status."""
        states, actions, rewards, next_states, dones = zip(*transitions)
        records = zip(actions, rewards, next_states, dones)
        for number, (action, reward, next_state, done) in enumerate(records, start=1):
            print("\033[31m{}.".format(number), '\033[30m')
            TicTacToe.visualize_state((next_state, -1), 1)
            print('\naction = ', action + np.array([1, 1]), end='\n')
            print('reward = ', reward, end='\n')
            print('Игра окончена' if done else 'Игра продолжается', end='\n\n')

# Игроки

In [None]:
class Human:
    """Interactive console player that reads 1-based ``row col`` moves from stdin."""

    def __init__(self, name='Human'):
        self.name = name

    def get_action(self, state):
        """Ask the user for a move until a legal one is entered.

        Args:
            state: ``(state2d, turn)``; ``state2d`` is the 2-D board array in
                which 0 marks an empty cell.

        Returns:
            The chosen cell as a 0-based ``(row, col)`` tuple.

        Input that is not two integers, or that lies outside the board, is
        rejected and the prompt is repeated. Previously such input either
        raised, or (for 0 and negative numbers) silently wrapped around to
        the opposite edge through negative indexing.
        """
        state2d, _ = state
        n_rows, n_cols = state2d.shape
        while True:
            print('Введите ваш ход (Строка, столбец)')
            try:
                row, col = map(int, input().split())
            except ValueError:
                # Wrong number of tokens or a non-integer token.
                print('Некорректный ввод!')
                continue
            if not (1 <= row <= n_rows and 1 <= col <= n_cols):
                print('Ход за пределами поля!')
                continue
            if state2d[row - 1, col - 1] != 0:
                print('Клетка занята!')
                continue
            return row - 1, col - 1

In [None]:
# Random player with optional heuristics, tried in this order:
# 1. If it can win in one move, it does so (win = True)
# 2. If the opponent could win on the next move, it blocks that move (defense = True)
# 3. If it can extend a line to win_size - 1 of its own marks, it does so (win_2 = True)
# 4. If the opponent could build such a line on the next move, it blocks it (defense_2 = True)
# 5. Otherwise it picks a uniformly random empty cell
class RandomPlus:
    """Random tic-tac-toe player with optional one-move-lookahead heuristics."""

    def __init__(self, board_size=3, win_size=3, name='RandomPlus',
                 win=False, defense=False, win_2=False, defense_2=False):
        self.name = name
        self.board_size = board_size
        self.win_size = win_size

        self.win, self.defense = win, defense
        self.win_2, self.defense_2 = win_2, defense_2

        # The line masks are only built for the heuristics enabled at construction.
        if win or defense:
            self._kernel = self._create_kernel(win_size)
        if win_2 or defense_2:
            self._kernel_2 = self._create_kernel(win_size - 1)


    # Builds the bank of line masks used to find potential wins
    def _create_kernel(self, win_size):
        """Return ``2 * win_size + 2`` masks: rows, columns, diagonal, anti-diagonal."""
        kernel = np.zeros((2 * win_size + 2, win_size, win_size))
        for line in range(win_size):
            kernel[line, line, :] = 1.0              # horizontal line
            kernel[win_size + line, :, line] = 1.0   # vertical line
        kernel[2 * win_size] = np.eye(win_size)
        kernel[2 * win_size + 1] = np.fliplr(np.eye(win_size))
        return kernel


    # Sums the board under every line mask in every span x span window
    def _line_sums(self, state2d, kernel, span, rows, cols):
        windows = np.lib.stride_tricks.as_strided(
            state2d,
            shape=(rows - span + 1, cols - span + 1, span, span),
            strides=state2d.strides * 2,
            writeable=False,
        )
        return np.einsum('xyij,sij->sxy', windows, kernel)


    # Picks a random line that is one mark short of full and returns its empty cell
    def _complete_line(self, state2d, scores, kernel, span):
        """Return the empty cell of a random line scoring ``span - 1``, or None.

        A line of ``span`` cells can only sum to ``span - 1`` when it holds
        ``span - 1`` marks of value 1 and one empty cell.
        """
        hits = np.array(np.where(scores == span - 1))
        if hits.shape[1] == 0:
            return None
        pick = np.random.randint(0, hits.shape[1])
        K, I, J = hits[:, pick]
        free = np.where(np.logical_and(kernel[K] == 1,
                                       state2d[I: I + span, J: J + span] == 0))
        # Window-local coordinates -> board coordinates.
        return tuple(np.array(free)[:, 0] + [I, J])


    def get_action(self, state):
        """Return the ``(row, col)`` move for the side to move in `state`."""
        state2d, turn = state
        rows, cols = state2d.shape

        # (attack flag, defence flag, mask attribute, line length), in priority order.
        tiers = (
            (self.win, self.defense, '_kernel', self.win_size),
            (self.win_2, self.defense_2, '_kernel_2', self.win_size - 1),
        )
        for attack, defend, kernel_attr, span in tiers:
            if not (attack or defend):
                continue
            kernel = getattr(self, kernel_attr)
            sums = self._line_sums(state2d, kernel, span, rows, cols)
            # Multiplying by the sign makes the side of interest count as +1.
            for enabled, sign in ((attack, turn), (defend, -turn)):
                if not enabled:
                    continue
                move = self._complete_line(state2d, sign * sums, kernel, span)
                if move is not None:
                    return move

        empty_cells = np.argwhere(state2d == 0)
        return tuple(empty_cells[np.random.randint(len(empty_cells))])

In [None]:
class DQNAgent(nn.Module):
    """Fully convolutional Q-network producing one Q-value per board cell.

    Args:
        epsilon: probability of taking a random legal move in `get_action`.
        name: player name (used as the key in win statistics).
        masking: when True, occupied cells are excluded from the greedy
            choice. The original note says to enable this at inference only.
    """

    def __init__(self, epsilon=0, name='DQNAgent', masking=False):
        super().__init__()

        self.name = name
        self.epsilon = epsilon
        self.n_channels = 3
        self.masking = masking    # Masking of occupied cells (ENABLE AT INFERENCE ONLY)

        # padding='same' keeps the spatial size, so the output has one value per cell.
        self.network = nn.Sequential(
            nn.Conv2d(self.n_channels, 128, kernel_size=(3, 3), padding='same'),
            nn.ReLU(),
            nn.Conv2d(128, 256, kernel_size=(3, 3), padding='same'),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=(3, 3), padding='same'),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=(3, 3), padding='same'),
            nn.ReLU(),
            nn.Conv2d(256, 128, kernel_size=(3, 3), padding='same'),
            nn.ReLU(),
            nn.Conv2d(128, 1, kernel_size=(3, 3), padding='same')
        )

    def forward(self, x):
        """Map boards of shape (batch, H, W) to Q-values of shape (batch, H, W).

        The board is one-hot encoded into three planes: cells equal to 1,
        cells equal to -1 and empty cells.
        """
        planes = torch.stack([x == 1, x == -1, x == 0], dim=1).float()
        return self.network(planes).squeeze(1)

    def greedy_action(self, state, device=None):
        """Return the cell with the highest Q-value as a ``(row, col)`` tuple.

        Args:
            state: ``(state2d, turn)``; the board is multiplied by `turn`
                before it is passed to the network.
            device: device for the input tensor. Defaults to the device of
                the network's own parameters, so the call no longer depends
                on a module-level variable captured at definition time.
        """
        state2d, turn = state
        if device is None:
            device = next(self.parameters()).device
        board = torch.tensor(turn * state2d, dtype=torch.float32, device=device).unsqueeze(0)
        # Action selection needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            q_values = self.forward(board).squeeze(0).cpu().numpy()
        if self.masking:
            q_values[state2d != 0] = -float("Inf")
        return np.unravel_index(q_values.argmax(), q_values.shape)

    def random_action(self, state):
        """Return a uniformly random empty cell as a ``(row, col)`` tuple."""
        state2d, _ = state
        empty_cells = np.argwhere(state2d == 0)
        return tuple(empty_cells[np.random.randint(len(empty_cells))])

    def get_action(self, state):
        """Epsilon-greedy move: random with probability `epsilon`, greedy otherwise."""
        if random.random() < self.epsilon:
            return self.random_action(state)
        return self.greedy_action(state)

# Буферы

In [None]:
# Plain (uniform) replay buffer
class ReplayBuffer(object):
    """Fixed-size FIFO store of transitions with uniform sampling."""

    def __init__(self, size):
        # A bounded deque drops the oldest transition once `size` is reached.
        self._storage = deque(maxlen=size)

    def __len__(self):
        return len(self._storage)

    def add(self, transition):
        """Append one ``(state, action, reward, next_state, done)`` tuple."""
        self._storage.append(transition)

    def sample(self, batch_size, augmentation=False):
        """Draw `batch_size` distinct transitions and return them as five arrays.

        With ``augmentation=True`` every sampled transition is independently
        rotated by a random multiple of 90 degrees: the state and next-state
        boards are rotated and the action coordinates are remapped to match.
        (Earlier experiments in this notebook also tried one shared rotation
        for the whole batch and a 4x enlargement with all four rotations.)
        """
        picked = random.sample(self._storage, batch_size)
        states, actions, rewards, next_states, dones = (
            np.array(column) for column in zip(*picked)
        )

        if augmentation:
            last = states.shape[-1] - 1
            quarter_turns = np.random.randint(0, 4, size=batch_size)

            # Snapshot of the original coordinates, taken before anything is rewritten.
            rows = actions[:, 0].copy()
            cols = actions[:, 1].copy()
            # Where cell (row, col) lands after k counter-clockwise quarter turns.
            rotated = {
                1: (last - cols, rows),
                2: (last - rows, last - cols),
                3: (cols, last - rows),
            }
            for quarter in (1, 2, 3):
                chosen = quarter_turns == quarter
                states[chosen] = np.rot90(states[chosen], quarter, axes=(1, 2))
                next_states[chosen] = np.rot90(next_states[chosen], quarter, axes=(1, 2))
                new_rows, new_cols = rotated[quarter]
                actions[chosen] = np.column_stack((new_rows[chosen], new_cols[chosen]))

        return states, actions, rewards, next_states, dones

In [None]:
# =========== Prioritized Replay Buffer With Augmentation ===========
class PrioritizedBuffer(object):
    """Ring buffer of transitions sampled proportionally to ``priority ** prob_alpha``."""

    def __init__(self, capacity, prob_alpha=0.6):
        self.prob_alpha = prob_alpha
        self.capacity = capacity
        self.buffer = []
        self.pos = 0    # slot that the next `add` writes to
        self.priorities = np.zeros((capacity,), dtype=np.float32)

    def add(self, state, action, reward, next_state, done):
        """Store a transition with the highest priority seen so far (1.0 when empty)."""
        max_prio = self.priorities.max() if self.buffer else 1.0
        transition = (state, action, reward, next_state, done)

        if len(self.buffer) >= self.capacity:
            # Buffer is full: overwrite the oldest slot.
            self.buffer[self.pos] = transition
        else:
            self.buffer.append(transition)

        self.priorities[self.pos] = max_prio
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size, beta=0.4, augmentation=False):
        """Sample `batch_size` transitions (with replacement) by priority.

        Returns:
            ``(states, actions, rewards, next_states, dones, indices, weights)``
            where `indices` are the buffer slots that were drawn and `weights`
            are float32 importance weights ``(N * p) ** -beta`` scaled so the
            largest weight is 1.

        With ``augmentation=True`` each sampled transition is independently
        rotated by a random multiple of 90 degrees (boards and action alike).
        """
        filled = len(self.buffer)
        # Until the buffer wraps around, only the first `pos` slots are in use.
        prios = self.priorities if filled == self.capacity else self.priorities[:self.pos]

        probs = prios ** self.prob_alpha
        probs /= probs.sum()

        indices = np.random.choice(filled, batch_size, p=probs)
        samples = [self.buffer[slot] for slot in indices]

        weights = (filled * probs[indices]) ** (-beta)
        weights /= weights.max()
        weights = np.array(weights, dtype=np.float32)

        states, actions, rewards, next_states, dones = (
            np.array(column) for column in zip(*samples)
        )

        if augmentation:
            last = states.shape[-1] - 1
            quarter_turns = np.random.randint(0, 4, size=batch_size)

            # Snapshot of the original coordinates, taken before anything is rewritten.
            rows = actions[:, 0].copy()
            cols = actions[:, 1].copy()
            # Where cell (row, col) lands after k counter-clockwise quarter turns.
            rotated = {
                1: (last - cols, rows),
                2: (last - rows, last - cols),
                3: (cols, last - rows),
            }
            for quarter in (1, 2, 3):
                chosen = quarter_turns == quarter
                states[chosen] = np.rot90(states[chosen], quarter, axes=(1, 2))
                next_states[chosen] = np.rot90(next_states[chosen], quarter, axes=(1, 2))
                new_rows, new_cols = rotated[quarter]
                actions[chosen] = np.column_stack((new_rows[chosen], new_cols[chosen]))

        return states, actions, rewards, next_states, dones, indices, weights

    def update_priorities(self, batch_indices, batch_priorities):
        """Overwrite the priority of every slot in `batch_indices`."""
        for slot, priority in zip(batch_indices, batch_priorities):
            self.priorities[slot] = priority

    def __len__(self):
        return len(self.buffer)

# Функции и гиперпараметры для обучения

In [None]:
# Seed every random number generator used in the notebook (random, NumPy, PyTorch).
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed);  # trailing ';' suppresses the cell output

In [None]:
# Game configuration: side length of the square board and the line length needed to win.
board_size = 7
win_size = 5

In [None]:
# Hyperparameters of the DQN method

batch_size = 128        # 512 was too large
total_steps = 90_000    # upper bound of the training loops' `t` counter

# Linear epsilon schedule (see linear_decay)
decay_steps = 60_000
init_epsilon = 1
final_epsilon = 0.25     # 0.02 was too small; 0.1 was too small

loss_freq = 100                      # log the loss every `loss_freq` steps
refresh_target_network_freq = 100    # 1000 was too large, 50 was too small

eval_freq = 500         # evaluate and checkpoint every `eval_freq` steps
n_eval_games = 100      # games played per evaluation

max_grad_norm = 50      # gradient-norm clipping threshold

gamma = 0.9             # discount factor

In [None]:
# Online network that is being trained.
agent = DQNAgent(init_epsilon).to(device)

# Target network: starts as an exact copy of the online network.
target_network = DQNAgent(init_epsilon).to(device)
target_network.load_state_dict(agent.state_dict())

optimizer = torch.optim.Adam(agent.parameters(), lr=1e-4)
exp_replay = PrioritizedBuffer(16_000)  # alternative: ReplayBuffer(16_000)

In [None]:
# Total number of parameters in the online network.
sum(p.numel() for p in agent.parameters())

1775105

In [None]:
# Returns the temporal-difference loss
def compute_td_loss(states, actions, rewards, next_states, dones,
                    agent, target_network, weights=None, indices=None,
                    gamma=0.9, device=None, prioritized=False, replay_buffer=None):
    """Compute the mean squared TD error for one batch of transitions.

    The target is ``reward - (1 - done) * gamma * max Q_target(next_state)``.
    The sign in front of the bootstrap term is negative, as in the original
    code; the training loops store `next_state` multiplied by the next
    player's sign.

    Args:
        states, next_states: boards, shape (batch, H, W).
        actions: chosen cells as (row, col) pairs, shape (batch, 2).
        rewards, dones: per-transition reward and terminal flag, shape (batch,).
        agent: online network; gradients flow through it.
        target_network: network used for the bootstrap value (no gradients).
        weights: importance weights, required when ``prioritized=True``.
        indices: replay-buffer slots of the batch, required when
            ``prioritized=True``.
        gamma: discount factor.
        device: device for the batch tensors; defaults to the device of
            `agent`'s parameters.
        prioritized: when True the per-sample loss is weighted and the new
            priorities are written back to the replay buffer.
        replay_buffer: buffer whose ``update_priorities`` is called; when
            omitted the module-level ``exp_replay`` is used, as before.

    Returns:
        Scalar loss tensor.

    Raises:
        ValueError: if ``prioritized=True`` but `weights` or `indices` is None.
    """
    if device is None:
        device = next(agent.parameters()).device

    states = torch.tensor(states, device=device, dtype=torch.float32)                # shape: [batch_size, H, W]
    actions = torch.tensor(actions, device=device, dtype=torch.int64)                # shape: [batch_size, 2]
    rewards = torch.tensor(rewards, device=device, dtype=torch.float32)              # shape: [batch_size]
    next_states = torch.tensor(next_states, device=device, dtype=torch.float32)      # shape: [batch_size, H, W]
    dones = torch.tensor(dones, device=device, dtype=torch.int64)                    # shape: [batch_size]

    predicted_qvalues = agent(states)                                                # shape: [batch_size, H, W]
    predicted_qvalues_for_actions = predicted_qvalues[range(len(actions)), actions[:, 0], actions[:, 1]]  # shape: [batch_size]

    # The target is a constant for the optimiser, so the target network is
    # evaluated without building an autograd graph.
    with torch.no_grad():
        predicted_next_qvalues = target_network(next_states)                         # shape: [batch_size, H, W]
        next_state_values = predicted_next_qvalues.reshape(dones.shape[0], -1).max(dim=1).values
        target_qvalues_for_actions = rewards - (1 - dones) * gamma * next_state_values

    squared_td_error = (predicted_qvalues_for_actions - target_qvalues_for_actions) ** 2

    if not prioritized:
        return torch.mean(squared_td_error)

    # [Prioritized DQN]
    if weights is None or indices is None:
        raise ValueError("prioritized=True requires both `weights` and `indices`")
    weights = torch.tensor(weights, device=device, dtype=torch.float32)
    per_sample_loss = weights * squared_td_error
    # New priorities: weighted squared TD error plus a small constant so that
    # no transition ends up with zero sampling probability.
    prios = (per_sample_loss.detach() + 1e-5).cpu().numpy()
    buffer = replay_buffer if replay_buffer is not None else exp_replay
    buffer.update_priorities(indices, prios)
    return torch.mean(per_sample_loss)

# Computes epsilon for the current step `step`
def linear_decay(init_epsilon, final_epsilon, step, decay_steps):
    """Linearly anneal epsilon from `init_epsilon` to `final_epsilon`.

    The value falls by ``(init_epsilon - final_epsilon) / decay_steps`` per
    step and never drops below `final_epsilon`.
    """
    drop = step * (init_epsilon - final_epsilon) / decay_steps
    epsilon = init_epsilon - drop
    return final_epsilon if final_epsilon > epsilon else epsilon

# Обучение

In [None]:
# Mount Google Drive (Colab only); checkpoints are written under /content/drive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Training environment: the agent against a random player that takes an immediate win when one exists.
main_random = RandomPlus(board_size, win_size, win=True)
game = TicTacToe(agent, main_random, board_size=board_size, win_size=win_size)

In [None]:
# Directory on the mounted Drive where model and optimizer checkpoints are saved.
PATH = f'/content/drive/MyDrive/TicTacToe_10/'

loss = None            # most recent training loss (None until the first optimisation step)
loss_values = []       # loss logged every `loss_freq` steps
reward_values = []     # mean evaluation reward logged every `eval_freq` steps

In [None]:
# Main training loop. One iteration of the outer loop plays one complete game;
# every move is stored in the replay buffer and followed by one optimisation
# step once the buffer holds at least `batch_size` transitions.
# The move-by-move log goes to out.txt.
with open('out.txt', 'w') as f:
    for t in range(total_steps):
        print(f't = {t}. Ход {game.players[-1].name}', file=f)

        # Initial game state: state = (state_2d, turn)
        state = (np.zeros((board_size, board_size)), -1)
        turn = next_turn = -1

        while next_turn != 0:
            # NOTE(review): `turn` is only refreshed from `state` after the move
            # below, so from the second move of a game `current_player` refers
            # to the previous mover. The move itself is always taken from
            # `agent`, never from `current_player`. Left unchanged because it
            # is unclear whether self-play is intended - confirm.
            current_player = game.players[turn]
            if current_player.name == 'DQNAgent':
                agent.epsilon = linear_decay(init_epsilon, final_epsilon, t, decay_steps)

            action = agent.get_action(state)
            print(action, file=f)
            next_state_2d, next_turn, reward = game.play_turn(state, action)
            state_2d, turn = state

            if next_turn == 0:
                if reward == 0:
                    print('Ничья!\n', file=f)
                else:
                    print(f'Победа ({game.players[reward * turn].name})!\n', file=f)
                # Swap sides for the next game.
                game.players = {-1: game.players[1], 1: game.players[-1]}

            # state, action, reward, new_state, done - both boards are stored
            # multiplied by the sign of the side to move in them.
            exp_replay.add(turn * state_2d, action, reward, next_turn * next_state_2d, next_turn == 0)

            # Train on one minibatch
            if len(exp_replay) >= batch_size:
                states, actions, rewards, next_states, dones, indices, weights = exp_replay.sample(batch_size, augmentation=True)
                loss = compute_td_loss(states, actions, rewards, next_states, dones,
                                       agent, target_network, weights, indices, gamma, prioritized=True)
                loss.backward()
                grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
                optimizer.step()
                optimizer.zero_grad()

            state = next_state_2d, next_turn

        # Copy the online weights into the target network every refresh_target_network_freq steps
        if t % refresh_target_network_freq == 0:
            target_network.load_state_dict(agent.state_dict())

        # Log the loss at the configured frequency
        if t % loss_freq == 0 and t > 0:
            # Store a plain float rather than the live autograd tensor.
            loss_value = None if loss is None else loss.item()
            loss_values.append(loss_value)
            print(f"t = {str(t):5}\t loss = {loss_value}\t eps = {round(agent.epsilon, 4)}")

        # Evaluate and checkpoint at the configured frequency
        if t % eval_freq == 0:
            agent.epsilon = 0
            eval_random = RandomPlus(board_size, win_size, win=True, defense=True, win_2=True, defense_2=True)
            eval_game = TicTacToe(agent, eval_random, board_size=board_size, win_size=win_size)
            # Evaluation only selects actions; no gradients are needed.
            with torch.no_grad():
                eval_game.play(n_eval_games)
            mean_reward = (eval_game.wins['DQNAgent'] - eval_game.wins['RandomPlus']) / n_eval_games
            reward_values.append(mean_reward)
            print(f"t = {str(t):5}\t reward = {round(mean_reward, 4)}\t{eval_game.wins}\n")

            torch.save(agent.state_dict(), PATH + f'model_{t}')
            torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

# Final checkpoint after the last step.
torch.save(agent.state_dict(), PATH + f'model_{t}')
torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

t = 0    	 reward = -1.0	{'DQNAgent': 0, 'RandomPlus': 100}

t = 100  	 loss = 0.0001400406181346625	 eps = 0.9988
t = 200  	 loss = 0.0011216223938390613	 eps = 0.9975
t = 300  	 loss = 0.002111040288582444	 eps = 0.9962
t = 400  	 loss = 0.0016430187970399857	 eps = 0.995
t = 500  	 loss = 0.002185105113312602	 eps = 0.9938
t = 500  	 reward = -1.0	{'DQNAgent': 0, 'RandomPlus': 100}

t = 600  	 loss = 0.002207183977589011	 eps = 0.9925
t = 700  	 loss = 0.001879225135780871	 eps = 0.9912
t = 800  	 loss = 0.0012359556276351213	 eps = 0.99
t = 900  	 loss = 0.0023581241257488728	 eps = 0.9888
t = 1000 	 loss = 0.0024229567497968674	 eps = 0.9875
t = 1000 	 reward = -1.0	{'DQNAgent': 0, 'RandomPlus': 100}

t = 1100 	 loss = 0.0014549789484590292	 eps = 0.9862
t = 1200 	 loss = 0.001642267918214202	 eps = 0.985
t = 1300 	 loss = 0.0032500899396836758	 eps = 0.9838
t = 1400 	 loss = 0.0017823234666138887	 eps = 0.9825
t = 1500 	 loss = 0.002298464998602867	 eps = 0.9812
t = 1500 	 reward

In [None]:
# Resume from the checkpoint written at step 12500.
# map_location lets a checkpoint saved on a GPU runtime load on a CPU-only one.
agent.load_state_dict(torch.load(f'{PATH}model_12500', map_location=device))
optimizer.load_state_dict(torch.load(f'{PATH}opt_12500', map_location=device))
# Bring the target network in line with the restored online weights.
target_network.load_state_dict(agent.state_dict())

In [None]:
# Training loop resumed at step 12600. One iteration of the outer loop plays
# one complete game; every move is stored in the replay buffer and followed by
# one optimisation step once the buffer holds at least `batch_size`
# transitions. The move-by-move log goes to out.txt (the file is overwritten).
with open('out.txt', 'w') as f:
    for t in range(12_600, total_steps):
        print(f't = {t}. Ход {game.players[-1].name}', file=f)

        # Initial game state: state = (state_2d, turn)
        state = (np.zeros((board_size, board_size)), -1)
        turn = next_turn = -1

        while next_turn != 0:
            # NOTE(review): `turn` is only refreshed from `state` after the move
            # below, so from the second move of a game `current_player` refers
            # to the previous mover. The move itself is always taken from
            # `agent`, never from `current_player`. Left unchanged because it
            # is unclear whether self-play is intended - confirm.
            current_player = game.players[turn]
            if current_player.name == 'DQNAgent':
                agent.epsilon = linear_decay(init_epsilon, final_epsilon, t, decay_steps)

            action = agent.get_action(state)
            print(action, file=f)
            next_state_2d, next_turn, reward = game.play_turn(state, action)
            state_2d, turn = state

            if next_turn == 0:
                if reward == 0:
                    print('Ничья!\n', file=f)
                else:
                    print(f'Победа ({game.players[reward * turn].name})!\n', file=f)
                # Swap sides for the next game.
                game.players = {-1: game.players[1], 1: game.players[-1]}

            # state, action, reward, new_state, done - both boards are stored
            # multiplied by the sign of the side to move in them.
            exp_replay.add(turn * state_2d, action, reward, next_turn * next_state_2d, next_turn == 0)

            # Train on one minibatch
            if len(exp_replay) >= batch_size:
                states, actions, rewards, next_states, dones, indices, weights = exp_replay.sample(batch_size, augmentation=True)
                loss = compute_td_loss(states, actions, rewards, next_states, dones,
                                       agent, target_network, weights, indices, gamma, prioritized=True)
                loss.backward()
                grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
                optimizer.step()
                optimizer.zero_grad()

            state = next_state_2d, next_turn

        # Copy the online weights into the target network every refresh_target_network_freq steps
        if t % refresh_target_network_freq == 0:
            target_network.load_state_dict(agent.state_dict())

        # Log the loss at the configured frequency
        if t % loss_freq == 0 and t > 0:
            # Store a plain float rather than the live autograd tensor.
            loss_value = None if loss is None else loss.item()
            loss_values.append(loss_value)
            print(f"t = {str(t):5}\t loss = {loss_value}\t eps = {round(agent.epsilon, 4)}")

        # Evaluate and checkpoint at the configured frequency
        if t % eval_freq == 0:
            agent.epsilon = 0
            eval_random = RandomPlus(board_size, win_size, win=True, defense=True, win_2=True, defense_2=True)
            eval_game = TicTacToe(agent, eval_random, board_size=board_size, win_size=win_size)
            # Evaluation only selects actions; no gradients are needed.
            with torch.no_grad():
                eval_game.play(n_eval_games)
            mean_reward = (eval_game.wins['DQNAgent'] - eval_game.wins['RandomPlus']) / n_eval_games
            reward_values.append(mean_reward)
            print(f"t = {str(t):5}\t reward = {round(mean_reward, 4)}\t{eval_game.wins}\n")

            torch.save(agent.state_dict(), PATH + f'model_{t}')
            torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

# Final checkpoint after the last step.
torch.save(agent.state_dict(), PATH + f'model_{t}')
torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

t = 12600	 loss = None	 eps = 0.8425
t = 12700	 loss = 0.0008929910836741328	 eps = 0.8413
t = 12800	 loss = 0.00042384787229821086	 eps = 0.84
t = 12900	 loss = 0.00042001449037343264	 eps = 0.8387
t = 13000	 loss = 0.0003424016758799553	 eps = 0.8375
t = 13000	 reward = 0.48	{'DQNAgent': 74, 'RandomPlus': 26}

t = 13100	 loss = 0.0005153411184437573	 eps = 0.8362
t = 13200	 loss = 0.0009429357014596462	 eps = 0.835
t = 13300	 loss = 0.0008155627292580903	 eps = 0.8337
t = 13400	 loss = 0.0007702329894527793	 eps = 0.8325
t = 13500	 loss = 0.00035658638807944953	 eps = 0.8313
t = 13500	 reward = 0.46	{'DQNAgent': 73, 'RandomPlus': 27}

t = 13600	 loss = 0.0006528746453113854	 eps = 0.83
t = 13700	 loss = 0.0004594996280502528	 eps = 0.8287
t = 13800	 loss = 0.0010459225159138441	 eps = 0.8275
t = 13900	 loss = 0.00038195308297872543	 eps = 0.8263
t = 14000	 loss = 0.0004274129751138389	 eps = 0.825
t = 14000	 reward = 0.66	{'DQNAgent': 83, 'RandomPlus': 17}

t = 14100	 loss = 0.000274

In [None]:
# Resume from the checkpoint written at step 27000.
# map_location lets a checkpoint saved on a GPU runtime load on a CPU-only one.
agent.load_state_dict(torch.load(f'{PATH}model_27000', map_location=device))
optimizer.load_state_dict(torch.load(f'{PATH}opt_27000', map_location=device))
# Bring the target network in line with the restored online weights.
target_network.load_state_dict(agent.state_dict())

In [None]:
# Training loop resumed at step 27100. One iteration of the outer loop plays
# one complete game; every move is stored in the replay buffer and followed by
# one optimisation step once the buffer holds at least `batch_size`
# transitions. The move-by-move log goes to out.txt (the file is overwritten).
with open('out.txt', 'w') as f:
    for t in range(27_100, total_steps):
        print(f't = {t}. Ход {game.players[-1].name}', file=f)

        # Initial game state: state = (state_2d, turn)
        state = (np.zeros((board_size, board_size)), -1)
        turn = next_turn = -1

        while next_turn != 0:
            # NOTE(review): `turn` is only refreshed from `state` after the move
            # below, so from the second move of a game `current_player` refers
            # to the previous mover. The move itself is always taken from
            # `agent`, never from `current_player`. Left unchanged because it
            # is unclear whether self-play is intended - confirm.
            current_player = game.players[turn]
            if current_player.name == 'DQNAgent':
                agent.epsilon = linear_decay(init_epsilon, final_epsilon, t, decay_steps)

            action = agent.get_action(state)
            print(action, file=f)
            next_state_2d, next_turn, reward = game.play_turn(state, action)
            state_2d, turn = state

            if next_turn == 0:
                if reward == 0:
                    print('Ничья!\n', file=f)
                else:
                    print(f'Победа ({game.players[reward * turn].name})!\n', file=f)
                # Swap sides for the next game.
                game.players = {-1: game.players[1], 1: game.players[-1]}

            # state, action, reward, new_state, done - both boards are stored
            # multiplied by the sign of the side to move in them.
            exp_replay.add(turn * state_2d, action, reward, next_turn * next_state_2d, next_turn == 0)

            # Train on one minibatch
            if len(exp_replay) >= batch_size:
                states, actions, rewards, next_states, dones, indices, weights = exp_replay.sample(batch_size, augmentation=True)
                loss = compute_td_loss(states, actions, rewards, next_states, dones,
                                       agent, target_network, weights, indices, gamma, prioritized=True)
                loss.backward()
                grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
                optimizer.step()
                optimizer.zero_grad()

            state = next_state_2d, next_turn

        # Copy the online weights into the target network every refresh_target_network_freq steps
        if t % refresh_target_network_freq == 0:
            target_network.load_state_dict(agent.state_dict())

        # Log the loss at the configured frequency
        if t % loss_freq == 0 and t > 0:
            # Store a plain float rather than the live autograd tensor.
            loss_value = None if loss is None else loss.item()
            loss_values.append(loss_value)
            print(f"t = {str(t):5}\t loss = {loss_value}\t eps = {round(agent.epsilon, 4)}")

        # Evaluate and checkpoint at the configured frequency
        if t % eval_freq == 0:
            agent.epsilon = 0
            eval_random = RandomPlus(board_size, win_size, win=True, defense=True, win_2=True, defense_2=True)
            eval_game = TicTacToe(agent, eval_random, board_size=board_size, win_size=win_size)
            # Evaluation only selects actions; no gradients are needed.
            with torch.no_grad():
                eval_game.play(n_eval_games)
            mean_reward = (eval_game.wins['DQNAgent'] - eval_game.wins['RandomPlus']) / n_eval_games
            reward_values.append(mean_reward)
            print(f"t = {str(t):5}\t reward = {round(mean_reward, 4)}\t{eval_game.wins}\n")

            torch.save(agent.state_dict(), PATH + f'model_{t}')
            torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

# Final checkpoint after the last step.
torch.save(agent.state_dict(), PATH + f'model_{t}')
torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

t = 27100	 loss = None	 eps = 0.6613
t = 27200	 loss = 0.0059985509142279625	 eps = 0.66
t = 27300	 loss = 0.00033890249324031174	 eps = 0.6587
t = 27400	 loss = 0.0007647744496352971	 eps = 0.6575
t = 27500	 loss = 0.0003159498446621001	 eps = 0.6562
t = 27500	 reward = 0.73	{'DQNAgent': 84, 'RandomPlus': 11}

t = 27600	 loss = 0.00028065693913958967	 eps = 0.655
t = 27700	 loss = 0.00038898485945537686	 eps = 0.6538
t = 27800	 loss = 0.0002986114704981446	 eps = 0.6525
t = 27900	 loss = 0.00038932362804189324	 eps = 0.6512
t = 28000	 loss = 0.00032276188721880317	 eps = 0.65
t = 28000	 reward = 0.62	{'DQNAgent': 81, 'RandomPlus': 19}

t = 28100	 loss = 0.00022563812672160566	 eps = 0.6487
t = 28200	 loss = 0.00037838303251191974	 eps = 0.6475
t = 28300	 loss = 0.0004960792139172554	 eps = 0.6462
t = 28400	 loss = 0.00029072415782138705	 eps = 0.645
t = 28500	 loss = 0.0003081182949244976	 eps = 0.6438
t = 28500	 reward = 0.68	{'DQNAgent': 84, 'RandomPlus': 16}

t = 28600	 loss = 0.00

In [None]:
# Restore the network and optimizer state saved at step 32000.
# map_location=device remaps the stored tensors onto the device selected at the
# top of the notebook, so a checkpoint written on a GPU runtime still loads on a
# CPU-only runtime.
agent.load_state_dict(torch.load(f'{PATH}model_32000', map_location=device))
optimizer.load_state_dict(torch.load(f'{PATH}opt_32000', map_location=device))

In [None]:
# Resume training from step 32_100 up to total_steps.
# One loop iteration `t` plays one full game; every move is pushed to the replay
# buffer and followed by one optimisation step once the buffer holds a full batch.
start_step = 32_100

# Detached loss of the most recent optimisation step; stays None until one happens,
# so logging never reads an undefined or stale `loss`.
last_loss = None

# The move log contains Cyrillic text, so the encoding is fixed explicitly instead of
# relying on the platform default.
with open('out.txt', 'w', encoding='utf-8') as f:
    for t in range(start_step, total_steps):
        print(f't = {t}. Ход {game.players[-1].name}', file=f)

        # Initial game state. state = (state_2d, turn)
        state = (np.zeros((board_size, board_size)), -1)
        turn = next_turn = -1

        # next_turn == 0 marks the end of the game
        while next_turn != 0:
            # NOTE(review): after the first move `turn` still holds the side of the
            # previous state, not of `state` — confirm this lookup is intended.
            current_player = game.players[turn]
            if current_player.name == 'DQNAgent':
                agent.epsilon = linear_decay(init_epsilon, final_epsilon, t, decay_steps)

            action = agent.get_action(state)
            print(action, file=f)
            next_state_2d, next_turn, reward = game.play_turn(state, action)
            state_2d, turn = state

            if next_turn == 0:
                if reward == 0:
                    print('Ничья!\n', file=f)
                else:
                    print(f'Победа ({game.players[reward * turn].name})!\n', file=f)
                # Swap sides for the next game
                game.players = {-1: game.players[1], 1: game.players[-1]}

            # Stored transition: state, action, reward, new_state, done
            # (boards are multiplied by the side to move before being stored)
            exp_replay.add(turn * state_2d, action, reward, next_turn * next_state_2d, next_turn == 0)

            # Train on a minibatch
            if len(exp_replay) >= batch_size:
                states, actions, rewards, next_states, dones, indices, weights = exp_replay.sample(
                    batch_size, augmentation=True)
                loss = compute_td_loss(states, actions, rewards, next_states, dones,
                                       agent, target_network, weights, indices, gamma, prioritized=True)
                loss.backward()
                grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
                optimizer.step()
                optimizer.zero_grad()
                # Keep only the value for logging, detached from the autograd graph
                last_loss = loss.detach()

            state = next_state_2d, next_turn

        # Every refresh_target_network_freq steps the target network weights are refreshed
        if t % refresh_target_network_freq == 0:
            target_network.load_state_dict(agent.state_dict())

        # Log the loss at the configured frequency
        if t % loss_freq == 0 and t > 0:
            loss_values.append(last_loss)
            print(f"t = {str(t):5}\t loss = {last_loss}\t eps = {round(agent.epsilon, 4)}")

        # Log the evaluation reward at the configured frequency and checkpoint the model
        if t % eval_freq == 0:
            agent.epsilon = 0
            eval_random = RandomPlus(board_size, win_size, win=True, defense=True, win_2=True, defense_2=True)
            eval_game = TicTacToe(agent, eval_random, board_size=board_size, win_size=win_size)
            eval_game.play(n_eval_games)
            mean_reward = (eval_game.wins['DQNAgent'] - eval_game.wins['RandomPlus']) / n_eval_games
            reward_values.append(mean_reward)
            print(f"t = {str(t):5}\t reward = {round(mean_reward, 4)}\t{eval_game.wins}\n")

            torch.save(agent.state_dict(), PATH + f'model_{t}')
            torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

# Final checkpoint. Skipped when the loop never ran: `t` would then be a leftover from
# an earlier cell and the save would overwrite that older checkpoint.
if start_step < total_steps:
    torch.save(agent.state_dict(), PATH + f'model_{t}')
    torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

t = 32100	 loss = None	 eps = 0.5988
t = 32200	 loss = 0.0003604223020374775	 eps = 0.5975
t = 32300	 loss = 0.0002640708698891103	 eps = 0.5962
t = 32400	 loss = 0.00029565137811005116	 eps = 0.595
t = 32500	 loss = 0.0005859998636879027	 eps = 0.5938
t = 32500	 reward = 0.79	{'DQNAgent': 89, 'RandomPlus': 10}

t = 32600	 loss = 0.0003901832096744329	 eps = 0.5925
t = 32700	 loss = 0.00025767215993255377	 eps = 0.5913
t = 32800	 loss = 0.0002867078292183578	 eps = 0.59
t = 32900	 loss = 0.0003160769119858742	 eps = 0.5887
t = 33000	 loss = 0.0007393527776002884	 eps = 0.5875
t = 33000	 reward = 0.77	{'DQNAgent': 88, 'RandomPlus': 11}

t = 33100	 loss = 0.0002735173038672656	 eps = 0.5862
t = 33200	 loss = 0.0005184731562621891	 eps = 0.585
t = 33300	 loss = 0.00022128145792521536	 eps = 0.5837
t = 33400	 loss = 0.0003853509551845491	 eps = 0.5825
t = 33500	 loss = 0.000253845879342407	 eps = 0.5813
t = 33500	 reward = 0.86	{'DQNAgent': 91, 'RandomPlus': 5}

t = 33600	 loss = 0.0002703

In [None]:
# Restore the network and optimizer state saved at step 50000.
# map_location=device remaps the stored tensors onto the device selected at the
# top of the notebook, so a checkpoint written on a GPU runtime still loads on a
# CPU-only runtime.
agent.load_state_dict(torch.load(f'{PATH}model_50000', map_location=device))
optimizer.load_state_dict(torch.load(f'{PATH}opt_50000', map_location=device))

In [None]:
# Resume training from step 50_100 up to total_steps.
# One loop iteration `t` plays one full game; every move is pushed to the replay
# buffer and followed by one optimisation step once the buffer holds a full batch.
start_step = 50_100

# Detached loss of the most recent optimisation step; stays None until one happens,
# so logging never reads an undefined or stale `loss`.
last_loss = None

# The move log contains Cyrillic text, so the encoding is fixed explicitly instead of
# relying on the platform default.
with open('out.txt', 'w', encoding='utf-8') as f:
    for t in range(start_step, total_steps):
        print(f't = {t}. Ход {game.players[-1].name}', file=f)

        # Initial game state. state = (state_2d, turn)
        state = (np.zeros((board_size, board_size)), -1)
        turn = next_turn = -1

        # next_turn == 0 marks the end of the game
        while next_turn != 0:
            # NOTE(review): after the first move `turn` still holds the side of the
            # previous state, not of `state` — confirm this lookup is intended.
            current_player = game.players[turn]
            if current_player.name == 'DQNAgent':
                agent.epsilon = linear_decay(init_epsilon, final_epsilon, t, decay_steps)

            action = agent.get_action(state)
            print(action, file=f)
            next_state_2d, next_turn, reward = game.play_turn(state, action)
            state_2d, turn = state

            if next_turn == 0:
                if reward == 0:
                    print('Ничья!\n', file=f)
                else:
                    print(f'Победа ({game.players[reward * turn].name})!\n', file=f)
                # Swap sides for the next game
                game.players = {-1: game.players[1], 1: game.players[-1]}

            # Stored transition: state, action, reward, new_state, done
            # (boards are multiplied by the side to move before being stored)
            exp_replay.add(turn * state_2d, action, reward, next_turn * next_state_2d, next_turn == 0)

            # Train on a minibatch
            if len(exp_replay) >= batch_size:
                states, actions, rewards, next_states, dones, indices, weights = exp_replay.sample(
                    batch_size, augmentation=True)
                loss = compute_td_loss(states, actions, rewards, next_states, dones,
                                       agent, target_network, weights, indices, gamma, prioritized=True)
                loss.backward()
                grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
                optimizer.step()
                optimizer.zero_grad()
                # Keep only the value for logging, detached from the autograd graph
                last_loss = loss.detach()

            state = next_state_2d, next_turn

        # Every refresh_target_network_freq steps the target network weights are refreshed
        if t % refresh_target_network_freq == 0:
            target_network.load_state_dict(agent.state_dict())

        # Log the loss at the configured frequency
        if t % loss_freq == 0 and t > 0:
            loss_values.append(last_loss)
            print(f"t = {str(t):5}\t loss = {last_loss}\t eps = {round(agent.epsilon, 4)}")

        # Log the evaluation reward at the configured frequency and checkpoint the model
        if t % eval_freq == 0:
            agent.epsilon = 0
            eval_random = RandomPlus(board_size, win_size, win=True, defense=True, win_2=True, defense_2=True)
            eval_game = TicTacToe(agent, eval_random, board_size=board_size, win_size=win_size)
            eval_game.play(n_eval_games)
            mean_reward = (eval_game.wins['DQNAgent'] - eval_game.wins['RandomPlus']) / n_eval_games
            reward_values.append(mean_reward)
            print(f"t = {str(t):5}\t reward = {round(mean_reward, 4)}\t{eval_game.wins}\n")

            torch.save(agent.state_dict(), PATH + f'model_{t}')
            torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

# Final checkpoint. Skipped when the loop never ran: `t` would then be a leftover from
# an earlier cell and the save would overwrite that older checkpoint.
if start_step < total_steps:
    torch.save(agent.state_dict(), PATH + f'model_{t}')
    torch.save(optimizer.state_dict(), PATH + f'opt_{t}')

t = 50100	 loss = None	 eps = 0.3738
t = 50200	 loss = 0.0002211250684922561	 eps = 0.3725
t = 50300	 loss = 0.0002295014710398391	 eps = 0.3712
t = 50400	 loss = 0.00014761908096261322	 eps = 0.37
t = 50500	 loss = 0.00020375415624585003	 eps = 0.3688
t = 50500	 reward = 0.92	{'DQNAgent': 96, 'RandomPlus': 4}

t = 50600	 loss = 0.0003613890730775893	 eps = 0.3675
t = 50700	 loss = 0.00017614458920434117	 eps = 0.3662
t = 50800	 loss = 0.0006521661998704076	 eps = 0.365
t = 50900	 loss = 0.00022069099941290915	 eps = 0.3638
t = 51000	 loss = 0.000389240711228922	 eps = 0.3625
t = 51000	 reward = 0.91	{'DQNAgent': 95, 'RandomPlus': 4}

t = 51100	 loss = 0.0002868498268071562	 eps = 0.3612
t = 51200	 loss = 0.00017824700626078993	 eps = 0.36
t = 51300	 loss = 0.00012416072422638535	 eps = 0.3588
t = 51400	 loss = 0.0002470770850777626	 eps = 0.3575
t = 51500	 loss = 0.00033206160878762603	 eps = 0.3562
t = 51500	 reward = 0.85	{'DQNAgent': 92, 'RandomPlus': 7}

t = 51600	 loss = 0.000646

# Тестирование обученных моделей (инференс с маскированием)

In [None]:
# Directory prefix for the model/optimizer checkpoints read and written by this notebook.
# The trailing slash is required: checkpoint paths are built by plain string
# concatenation (e.g. PATH + f'model_{t}').
PATH = '/content/drive/MyDrive/TicTacToe_10/'

In [None]:
# Compare the trained checkpoints: with epsilon set to 0 and masking enabled, every
# checkpoint saved at a multiple of 500 steps plays 1000 games against the same
# RandomPlus opponent, and the resulting win counters are printed.
eval_random = RandomPlus(board_size, win_size, win=True, defense=True, win_2=True, defense_2=True)
agent.masking = True
agent.epsilon = 0

for i in range(0, 70_000, 500):
    checkpoint = torch.load(f'{PATH}model_{i}', map_location=torch.device('cpu'))
    agent.load_state_dict(checkpoint)
    eval_game = TicTacToe(player_1=agent, player_2=eval_random,
                          board_size=board_size, win_size=win_size)
    eval_game.play(num_games=1000)
    print(f'{i:5}', eval_game.wins)

    0 {'DQNAgent': 2, 'RandomPlus': 996}
  500 {'DQNAgent': 5, 'RandomPlus': 985}
 1000 {'DQNAgent': 22, 'RandomPlus': 947}
 1500 {'DQNAgent': 73, 'RandomPlus': 879}
 2000 {'DQNAgent': 89, 'RandomPlus': 809}
 2500 {'DQNAgent': 214, 'RandomPlus': 654}
 3000 {'DQNAgent': 182, 'RandomPlus': 666}
 3500 {'DQNAgent': 347, 'RandomPlus': 521}
 4000 {'DQNAgent': 405, 'RandomPlus': 444}
 4500 {'DQNAgent': 557, 'RandomPlus': 343}
 5000 {'DQNAgent': 617, 'RandomPlus': 254}
 5500 {'DQNAgent': 551, 'RandomPlus': 239}
 6000 {'DQNAgent': 652, 'RandomPlus': 195}
 6500 {'DQNAgent': 651, 'RandomPlus': 134}
 7000 {'DQNAgent': 686, 'RandomPlus': 149}
 7500 {'DQNAgent': 706, 'RandomPlus': 100}
 8000 {'DQNAgent': 769, 'RandomPlus': 70}
 8500 {'DQNAgent': 789, 'RandomPlus': 60}
 9000 {'DQNAgent': 776, 'RandomPlus': 59}
 9500 {'DQNAgent': 831, 'RandomPlus': 43}
10000 {'DQNAgent': 816, 'RandomPlus': 28}
10500 {'DQNAgent': 784, 'RandomPlus': 54}
11000 {'DQNAgent': 830, 'RandomPlus': 32}
11500 {'DQNAgent': 766, '

In [None]:
# Comparison of the best models (no losses): each listed checkpoint plays
# 10 000 games against the RandomPlus opponent created in an earlier cell.
models = [50_000]

agent.masking = True
agent.epsilon = 0

for model in models:
    checkpoint = torch.load(f'{PATH}model_{model}', map_location=torch.device('cpu'))
    agent.load_state_dict(checkpoint)
    eval_game = TicTacToe(player_1=agent, player_2=eval_random,
                          board_size=board_size, win_size=win_size)
    eval_game.play(num_games=10_000)
    print(model, eval_game.wins)

50000 {'DQNAgent': 9495, 'RandomPlus': 61}


In [None]:
# Load the very best model (checkpoint from step 61000), mapping its tensors to the CPU
best_checkpoint = torch.load(f'{PATH}model_61000', map_location=torch.device('cpu'))
agent.load_state_dict(best_checkpoint)

<All keys matched successfully>

In [None]:
# Play 4 visualised games of the agent (epsilon set to 0) against a Human player,
# then display the win counters as the cell output.
agent.epsilon = 0
test_game = TicTacToe(player_1=agent, player_2=Human(),
                      board_size=board_size, win_size=win_size)
test_game.play(num_games=4, visualize=True)
test_game.wins

player -1's turn:
. . . . . . .
. . . . . . .
. . . . . . .
. . . . . . .
. . . . . . .
. . . . . . .
. . . . . . .
player -1's turn:
 .  .  .  .  .  .  .
 .  .  .  .  .  .  .
 .  .  .  .  .  .  .
 .  .  .  X  .  .  .
 .  .  .  .  .  .  .
 .  .  .  .  .  .  .
 .  .  .  .  .  .  .
Введите ваш ход (Строка, столбец)
3 3
player 1's turn:
 .  .  .  .  .  .  .
 .  .  .  .  .  .  .
 .  .  O  .  .  .  .
 .  .  .  X  .  .  .
 .  .  .  .  .  .  .
 .  .  .  .  .  .  .
 .  .  .  .  .  .  .
player -1's turn:
 .  .  .  .  .  .  .
 .  .  .  .  .  .  .
 .  .  O  .  .  .  .
 .  .  X  X  .  .  .
 .  .  .  .  .  .  .
 .  .  .  .  .  .  .
 .  .  .  .  .  .  .
Введите ваш ход (Строка, столбец)
4 5
player 1's turn:
 .  .  .  .  .  .  .
 .  .  .  .  .  .  .
 .  .  O  .  .  .  .
 .  .  X  X  O  .  .
 .  .  .  .  .  .  .
 .  .  .  .  .  .  .
 .  .  .  .  .  .  .
player -1's turn:
 .  .  .  .  .  .  .
 .  .  .  .  .  .  .
 .  .  O  X  .  .  .
 .  .  X  X  O  .  .
 .  .  .  .  .  .  .
 .  .  .  .  .  .  .
 .  . 

KeyboardInterrupt: Interrupted by user

In [None]:
# Mount Google Drive at /content/drive (Colab only); the checkpoint directory PATH
# lives under this mount point. force_remount=True is passed so an existing mount
# is replaced.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Первый ход за крестики и значения $Q$-функции в начальном состоянии

In [None]:
# Q-values of the empty board. The input is a batch containing one all-zero
# board_size x board_size position (board_size replaces the hard-coded 7 so the
# cell follows the notebook's board configuration).
state2d = torch.tensor(np.zeros((1, board_size, board_size))).to(device)

# Inference only: no autograd graph is needed for reading the network output
with torch.no_grad():
    q_values = agent(state2d).squeeze(0).cpu().numpy()

# Index of the largest Q-value in the grid, shown as the cell output
np.unravel_index(q_values.argmax(), q_values.shape)

(3, 3)

In [None]:
# Show the Q-value grid of the empty board rounded to 4 decimal places
np.round(q_values, decimals=4)

array([[-0.0684, -0.064 , -0.0358, -0.0494, -0.0263, -0.0711, -0.0195],
       [-0.0457, -0.0207, -0.0132, -0.0035, -0.0091, -0.029 , -0.0438],
       [-0.0156, -0.0113,  0.0434,  0.0295,  0.0367, -0.0051, -0.0104],
       [-0.0425, -0.0255,  0.0354,  0.0539,  0.0183, -0.0157, -0.0361],
       [-0.0721, -0.0236,  0.0205,  0.0216,  0.0134, -0.0366, -0.0267],
       [-0.0979, -0.0705, -0.0431, -0.0291, -0.044 , -0.0472, -0.0386],
       [-0.0893, -0.0818, -0.0544, -0.0399, -0.0268, -0.0384, -0.0467]],
      dtype=float32)