In [1]:
import tensorflow as tf
import os
import matplotlib.pyplot as plt
from keras import layers
from keras import models
from tensorflow.keras.optimizers import Adam
import numpy as np
from numpy.lib.stride_tricks import as_strided
from tensorflow import keras
from random import randint
from tensorflow.keras.mixed_precision import experimental as mixed_precision
from keras.layers import Conv2D, Conv3D, Dense
import random

In [3]:
tf.keras.backend.clear_session()
tf.config.optimizer.set_jit(True) # Enable XLA
tf.config.list_logical_devices()

[LogicalDevice(name='/device:CPU:0', device_type='CPU'),
 LogicalDevice(name='/device:GPU:0', device_type='GPU')]

In [4]:
!nvidia-smi -L

#policy = mixed_precision.Policy('mixed_float16')
policy = mixed_precision.Policy('float32')
mixed_precision.set_policy(policy)

GPU 0: Tesla T4 (UUID: GPU-a7791ce8-efc4-997b-a26c-89d7b3832b68)


In [5]:
def preprocess_state(states):
    States = np.zeros((states.shape[0], 3, states.shape[1], states.shape[2]))
    States[:, 0] = states == 0
    States[:, 1] = states == -1
    States[:, 2] = states == 1
    return States

In [6]:
class QModel:
    # Конструктор
    def __init__(self, discount_factor=0.9, size=5, lr=0.001):
        self.discount_factor = discount_factor
        self.size = size
        self.lr = lr
        self.model = self.create_model(size=size)
        self.target_model = self.create_model(size=size)

    #Обучение на текущем состоянии и принятых действиях
    def learn(self, transitions):
        states, actions, next_states, dones, rewards = transitions # done - закончилась игра (0) или продолжается (1)

        invalid_next_actions = np.where(next_states != 0)
        invalid_actions = np.where(states != 0)

        next_qvalues = self.target_model.predict(preprocess_state(next_states))
        next_qvalues[invalid_next_actions[:][0], invalid_next_actions[:][1], invalid_next_actions[:][2]] = -1
        expected_qvalues_for_actions = rewards + dones * (self.discount_factor * next_qvalues.max(axis=(1, 2)))

        qvalues = self.model.predict(preprocess_state(states))
        qvalues[np.arange(qvalues.shape[0]), actions[:, 0], actions[:, 1]] = expected_qvalues_for_actions
        qvalues[invalid_actions[:][0], invalid_actions[:][1], invalid_actions[:][2]] = -1

        self.model.fit(preprocess_state(states), qvalues, epochs=1, verbose=0)

    #Создает нейросеть
    def create_model(self, size = 3):
        inputs = keras.Input(shape=(3, size, size, 1))
        x = Conv2D(256, kernel_size=4, activation='relu')(inputs)
        x = Conv3D(256, kernel_size=(3, 2, 2), activation='relu')(x)
        x = Dense(512, activation='relu')(x)
        x = Dense(size ** 2, activation='linear')(x)
        outputs = layers.Reshape((size, size), dtype='float32')(x)
        model = keras.Model(inputs=inputs, outputs=outputs)
        model.compile(loss='mse', optimizer=Adam(learning_rate=0.001))
        return model

    # Перекомпилирует модели, уменьшая скорость обучения
    def recompile(self, coeffcient):
        self.lr *= coeffcient
        self.target_model.compile(loss='mse', optimizer=Adam(learning_rate=self.lr))
        self.model.compile(loss='mse', optimizer=Adam(learning_rate=self.lr))

    #Обновляет веса target модели
    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())

In [7]:
# Агент играет в игру, делая ход, соответствующий оптимальному Q-значению
class Agent: 
    def __init__(self, qmodel:QModel, epsilon=0.8, size=5):
        self.qmodel = qmodel
        self.epsilon = epsilon  # Вероятность выбора агентом случайного хода
        self.size = size        # Размер поля, на котором играет агент
    
    # Возвращает случайное действие
    def random_action(self, state):
        return randint(0, self.size - 1), randint(0, self.size - 1)

    # Возвращает лучшие значения в соответствии с текущей Q-таблицей
    def best_action(self, state):
            state2d, turn = state
            qvalues = self.qmodel.model.predict(preprocess_state((state2d).reshape((1, self.size, self.size))))
            return np.unravel_index(qvalues.argmax(), qvalues[0].shape)

    # Выбор действия, согласно eps-greedy стратегии
    def get_action(self, state):
        if np.random.rand() < self.epsilon:
            return self.random_action(state)
        else:
            return self.best_action(state)

In [13]:
# Игра крестики-нолики
class TicTacToe:
    def __init__(self, player_1, player_2, board_size=3, win_size=3):
        # Игра - столкновение двоих игроков
        self.players = {-1: player_1, 1: player_2}

        # Вознаграждение за каждый исход игры (ничья, победа игрока 1, победа игрока 2, занятая клетка)
        self._reward = {0: 0, -1: 0.5, 1: -0.5, 2: 1}

        # Число ничей, побед первого и второго игроков
        self.wins = {0: 0, -1: 0, 1: 0}

        # Размер поля и выигрышной последовательности
        self.board_size=board_size
        self.win_size=win_size
        self.kernel = self.create_kernel()

    # Создает ядро свертки для расчета побед
    def create_kernel(self):
        kernel = np.zeros((2 * self.win_size + 2, self.win_size, self.win_size))
        for i in range(self.win_size):
            kernel[i, i, :] = np.ones(self.win_size)
        for i in range(self.win_size, 2 * self.win_size):
            kernel[i, :, i - self.win_size] = np.ones(self.win_size).T
        kernel[2 * self.win_size] = np.eye(self.win_size)
        kernel[2 * self.win_size + 1] = np.fliplr(np.eye(self.win_size))
        return kernel

    # Проверяет победы для состояний states, в кот. ходы были совершены игроками turns, turn={-1, 1}
    def test_wins(self, states):
        expanded_states = as_strided(
            states,
            shape=(states.shape[0],
                   states.shape[1] - self.win_size + 1,  # The feature map is a few pixels smaller than the input
                   states.shape[2] - self.win_size + 1,
                   self.win_size,
                   self.win_size,
            ),
            strides=(
                states.strides[0],
                states.strides[1],
                states.strides[2],  # When we move one step in the 3rd dimension, we should move one step in the original data too
                states.strides[1],
                states.strides[2]
            ),
            writeable=False,  # totally use this to avoid writing to memory in weird places
        )
        feature_map = np.einsum('nxyij,sij->nsxy', expanded_states, self.kernel)  # n - число states
        rewards = (feature_map == -self.win_size).any(axis=(1, 2, 3)).astype(int) - (feature_map == self.win_size).any(axis=(1, 2, 3)).astype(int)
        return rewards   

    # Проигрывание нескольких полных эпизодов
    def play(self, num_games=1, visualize=False):
        for _ in range(num_games):
            turn = -1
            state2d = np.zeros((self.board_size, self.board_size))
            state = (state2d, turn) # Полное состояние игры
            if visualize: 
                self.visualize_state(state2d, turn)
            while(True):
                state_2d, turn = state
                current_player = self.players[turn]
                action = current_player.get_action(state)
                next_state_2d, next_turn, reward = self.play_turn(state, action)
                if visualize: 
                    self.visualize_state(next_state_2d, turn)
                if next_turn == 0:
                    if visualize:
                        if (reward == self._reward[0]): print('Ничья!\n')
                        elif (reward > 0): print('Победили крестики!\n')
                        elif (reward < 0): print('Победили нолики!\n')
                    self.wins[-np.sign(reward)] += 1
                    break
                state = next_state_2d, next_turn

    # Проигрывание нескольких полных эпизодов
    def play_random_first_move(self, num_games=1, visualize=False):
        for _ in range(num_games):
            state2d = np.zeros((self.board_size, self.board_size))
            state2d[randint(0, self.board_size - 1), randint(0, self.board_size - 1)] = -1
            turn = 1
            state = (state2d, turn) # Полное состояние игры
            if visualize: 
                self.visualize_state(state2d, turn)
            while(True):
                state_2d, turn = state
                current_player = self.players[turn]
                action = current_player.get_action(state)
                next_state_2d, next_turn, reward = self.play_turn(state, action)
                if visualize: 
                    self.visualize_state(next_state_2d, turn)
                if next_turn == 0:
                    if visualize:
                        if (reward == self._reward[0]): print('Ничья!\n')
                        elif (reward > 0): print('Победили крестики!\n')
                        elif (reward < 0): print('Победили нолики!\n')
                    self.wins[-np.sign(reward)] += 1
                    break
                state = next_state_2d, next_turn

    # Выполняет конкретный ход, выбранный текущим игроком, и проверяет поле на некорректный ход / выигрыш / ничью
    def play_turn(self, state, action):
        state2d, turn = state
        next_state2d = state2d.copy()

        # Проверяет наличие хода в занятую клетку
        if (state2d[(action)] != 0):  # Неправильный ход
            return next_state2d, 0, turn * self._reward[2]  # Следующий игрок выиграл (# next_turn == 0 => Игра окончена)

        # Совершить ход
        next_state2d[action] = turn

        # Проверка победы
        if self.test_wins(next_state2d.reshape((1, self.board_size, self.board_size)) * turn)[0]:
            return next_state2d, 0, self._reward[turn]  # Текущий игрок побеждает (next_turn == 0 => Игра окончена)

        # Ничья
        if (next_state2d != 0).all():    
            return next_state2d, 0, self._reward[0]  # Ничья (next_turn == 0 => Игра окончена)

        # Если ни один ход не привел к победе - ход следующего игрока
        return next_state2d, -turn, self._reward[0] # next_turn == -turn => Смена хода

    # Выполняет конкретные actions на множестве states и проверяет поля на некорректные ходы / выигрыши / ничьи. Возвращает transitions
    def make_actions(self, states, actions, turn, dones = 0):
        n = actions.shape[0]
        busy = states[np.arange(n), actions[:, 0], actions[:, 1]] != 0                      # Проверяет клетки на возможность хода в них (True = занята, False = свободна)
        next_states = states.copy()
        if (turn == -1):
            next_states[np.arange(n), actions[:, 0], actions[:, 1]] -= busy == 0    # Совершение хода
        if (turn == 1):
            next_states[np.arange(n), actions[:, 0], actions[:, 1]] += (busy == 0) * dones
        wins = self.test_wins(next_states)                                                  # Проверка побед
        draws = (next_states != 0).all(axis=(1, 2))                                         # Проверка ничей
        dones = (busy + (wins != 0) + draws == 0).astype(float)                             # (0 - terminal, 1 - non-terminal)
        next_states *= dones[:, None, None]
        return busy, wins, draws, dones, next_states

    # Выводит на экран состояние игры после хода игрока
    @staticmethod
    def visualize_state(next_state, turn):
        print(f"player {turn}'s turn:")
        print(str(next_state)
              .replace(".", "")
              .replace("[[", "")
              .replace(" [", "")
              .replace("]]", "")
              .replace("]", "")
              .replace("-0", " .")
              .replace("0", ".")
              .replace("-1", " X")
              .replace("1", "O")
              )

In [9]:
# Агент - человек
class Human:
    # Выбор хода человеком
    def get_action(self, state):
        state2d, turn = state
        print('Введите ваш ход (Строка, столбец)')
        row, col = map(int, input().split())
        while (state2d[row - 1, col - 1] != 0):
            print('Клетка занята!')
            print('Введите ваш ход (Строка, столбец)')
            row, col = map(int, input().split())
        return row - 1, col - 1

In [10]:
board_size = 5    # Размер поля

x_model = QModel(size=5)
o_model = QModel(size=5)

#x_model.model.load_weights('tictactoe_x_900.h5')
#o_model.model.load_weights('tictactoe_o_900.h5')

#qmodel.model.load_weights('tictactoe_5_5_x_2.h5')
#agent = Agent(qmodel, epsilon=0, size=5)
human = Human()
game = TicTacToe(human, None, board_size=5, win_size=4)

In [None]:
n = 1000             # Количество партий, проигрываемых параллельно

eps = 0.99           # Доля выбора хода случайным образом
N = int(n * (1 - eps))


states_1 = np.zeros((n, board_size, board_size))
actions_1 = np.random.randint(board_size, size=(n, 2))
rand = np.arange(n)
np.random.shuffle(rand)
rand = rand[0: N]
qvalues = x_model.model.predict(preprocess_state(states_1[rand]))
actions_1[rand] = np.array([np.unravel_index(qvalues[i].argmax(), qvalues[i].shape) for i in np.arange(N)])[0: N]
busy_1, wins_1, draws_1, dones_1, states_2 = game.make_actions(states_1, actions_1, -1)



where = np.where(dones_1 == 1)[0]
actions_2 = np.random.randint(board_size, size=(n, 2))
rand = np.arange(n)
np.random.shuffle(rand)
rand = rand[0: N]
qvalues = o_model.model.predict(preprocess_state(states_2[rand]))
actions_2[rand] = np.array([np.unravel_index(qvalues[i].argmax(), qvalues[i].shape) for i in np.arange(N)])[0: N]
busy_2, wins_2, draws_2, dones_2, next_states_1 = game.make_actions(states_2, actions_2, 1, dones_1)
rewards_1 = -1*busy_1 + wins_1 + busy_2 + wins_2 - 0.5 * (draws_1 + draws_2)

transitions = states_1, actions_1, next_states_1, dones_1, rewards_1
x_model.learn(transitions)

In [None]:
for t in np.arange(4000):
    print('t = ', t, end=' ')
    N = int(n * (1 - eps))

    # Получение actions ~ eps-greedy стратегии
    states_1 = next_states_1
    free_coords = [np.array(np.where(states_1[k] == 0)).transpose() for k in np.arange(states_1.shape[0])]
    actions_1 = np.array([free_coords[k][random.randint(0, free_coords[k].shape[0] - 1)] for k in np.arange(states_1.shape[0])])
    #actions_1 = np.random.randint(board_size, size=(n, 2))
    rand = np.arange(n)
    np.random.shuffle(rand)
    rand = rand[0: N]
    qvalues = x_model.model.predict(preprocess_state(states_1[rand]))
    qvalues += 0.4 * np.random.rand(N, board_size, board_size) - 0.2 #Добавление шума
    actions_1[rand] = np.array([np.unravel_index(qvalues[i].argmax(), qvalues[i].shape) for i in np.arange(N)])
    busy_1, wins_1, draws_1, dones_1, next_states_2 = game.make_actions(states_1, actions_1, -1)
    rewards_1 = busy_1 - wins_1 + -1 * busy_2 - wins_2
    transitions_1 = states_2[where], actions_2[where], next_states_2[where], dones_1[where] * dones_2[where], rewards_1[where]
    o_model.learn(transitions_1)
    
    

    states_2 = next_states_2
    where = np.where(dones_1 == 1)[0]
    free_coords = [np.array(np.where(states_2[k] == 0)).transpose() for k in np.arange(states_2.shape[0])]
    actions_2 = np.array([free_coords[k][random.randint(0, free_coords[k].shape[0] - 1)] for k in np.arange(states_2.shape[0])])
    #actions_2 = np.random.randint(board_size, size=(n, 2))
    rand = np.arange(n)
    np.random.shuffle(rand)
    rand = rand[0: N]
    qvalues = o_model.model.predict(preprocess_state(states_2[rand]))
    qvalues += 0.4 * np.random.rand(N, board_size, board_size) - 0.2 #Добавление шума
    actions_2[rand] = np.array([np.unravel_index(qvalues[i].argmax(), qvalues[i].shape) for i in np.arange(N)])
    busy_2, wins_2, draws_2, dones_2, next_states_1 = game.make_actions(states_2, actions_2, 1, dones_1)
    rewards_2 = -1 * busy_1 + wins_1 + busy_2 + wins_2
    transitions_2 = states_1, actions_1, next_states_1, dones_1 * dones_2, rewards_2
    x_model.learn(transitions_2)


    print('wins_x = ', np.sum(wins_1 == 1), end=' ')
    print('wins_o = ', np.sum(wins_2 == -1), end=' ')
    print('draws = ', np.sum(draws_1 + draws_2), end=' ')
    print('busy_1 = ', np.sum(busy_1), end=' ')
    print('busy_2 = ', np.sum(busy_2), end=' ')
    print('eps = ', eps)

    if (t + 1) % 20 == 0:
        x_model.update_target_model()
        o_model.update_target_model()

    if (t + 1) % 100 == 0:
        x_model.model.save('tictactoe__x_' + str(t + 1) + '.h5')
        #x_model.recompile(0.9)
        o_model.model.save('tictactoe__o_' + str(t + 1) + '.h5')
        #o_model.recompile(0.9)
        eps *= 0.9

t =  0 wins_x =  0 wins_o =  0 draws =  0 busy_1 =  0 busy_2 =  0 eps =  0.99
t =  1 wins_x =  0 wins_o =  0 draws =  0 busy_1 =  0 busy_2 =  0 eps =  0.99
t =  2 wins_x =  2 wins_o =  0 draws =  0 busy_1 =  0 busy_2 =  0 eps =  0.99
t =  3 wins_x =  9 wins_o =  5 draws =  0 busy_1 =  0 busy_2 =  0 eps =  0.99
t =  4 wins_x =  25 wins_o =  24 draws =  0 busy_1 =  0 busy_2 =  0 eps =  0.99
t =  5 wins_x =  31 wins_o =  32 draws =  0 busy_1 =  0 busy_2 =  0 eps =  0.99
t =  6 wins_x =  60 wins_o =  71 draws =  0 busy_1 =  1 busy_2 =  0 eps =  0.99
t =  7 wins_x =  72 wins_o =  78 draws =  0 busy_1 =  0 busy_2 =  0 eps =  0.99
t =  8 wins_x =  114 wins_o =  74 draws =  0 busy_1 =  0 busy_2 =  0 eps =  0.99
t =  9 wins_x =  84 wins_o =  71 draws =  0 busy_1 =  0 busy_2 =  0 eps =  0.99
t =  10 wins_x =  83 wins_o =  62 draws =  0 busy_1 =  0 busy_2 =  0 eps =  0.99
t =  11 wins_x =  40 wins_o =  3 draws =  100 busy_1 =  0 busy_2 =  0 eps =  0.99
t =  12 wins_x =  14 wins_o =  12 draws =  7

KeyboardInterrupt: ignored

In [None]:
for t in np.arange(769, 4000):
    print('t = ', t, end=' ')
    N = int(n * (1 - eps))

    # Получение actions ~ eps-greedy стратегии
    states_1 = next_states_1
    free_coords = [np.array(np.where(states_1[k] == 0)).transpose() for k in np.arange(states_1.shape[0])]
    actions_1 = np.array([free_coords[k][random.randint(0, free_coords[k].shape[0] - 1)] for k in np.arange(states_1.shape[0])])
    #actions_1 = np.random.randint(board_size, size=(n, 2))
    rand = np.arange(n)
    np.random.shuffle(rand)
    rand = rand[0: N]
    qvalues = x_model.model.predict(preprocess_state(states_1[rand]))
    qvalues += 0.4 * np.random.rand(N, board_size, board_size) - 0.2 #Добавление шума
    actions_1[rand] = np.array([np.unravel_index(qvalues[i].argmax(), qvalues[i].shape) for i in np.arange(N)])
    busy_1, wins_1, draws_1, dones_1, next_states_2 = game.make_actions(states_1, actions_1, -1)
    rewards_1 = busy_1 - wins_1 + -1 * busy_2 - wins_2
    transitions_1 = states_2[where], actions_2[where], next_states_2[where], dones_1[where] * dones_2[where], rewards_1[where]
    o_model.learn(transitions_1)
    
    

    states_2 = next_states_2
    where = np.where(dones_1 == 1)[0]
    free_coords = [np.array(np.where(states_2[k] == 0)).transpose() for k in np.arange(states_2.shape[0])]
    actions_2 = np.array([free_coords[k][random.randint(0, free_coords[k].shape[0] - 1)] for k in np.arange(states_2.shape[0])])
    #actions_2 = np.random.randint(board_size, size=(n, 2))
    rand = np.arange(n)
    np.random.shuffle(rand)
    rand = rand[0: N]
    qvalues = o_model.model.predict(preprocess_state(states_2[rand]))
    qvalues += 0.4 * np.random.rand(N, board_size, board_size) - 0.2 #Добавление шума
    actions_2[rand] = np.array([np.unravel_index(qvalues[i].argmax(), qvalues[i].shape) for i in np.arange(N)])
    busy_2, wins_2, draws_2, dones_2, next_states_1 = game.make_actions(states_2, actions_2, 1, dones_1)
    rewards_2 = -1 * busy_1 + wins_1 + busy_2 + wins_2
    transitions_2 = states_1, actions_1, next_states_1, dones_1 * dones_2, rewards_2
    x_model.learn(transitions_2)


    print('wins_x = ', np.sum(wins_1 == 1), end=' ')
    print('wins_o = ', np.sum(wins_2 == -1), end=' ')
    print('draws = ', np.sum(draws_1 + draws_2), end=' ')
    print('busy_1 = ', np.sum(busy_1), end=' ')
    print('busy_2 = ', np.sum(busy_2), end=' ')
    print('eps = ', eps)

    if (t + 1) % 20 == 0:
        x_model.update_target_model()
        o_model.update_target_model()

    if (t + 1) % 100 == 0:
        x_model.model.save('tictactoe__x_' + str(t + 1) + '.h5')
        #x_model.recompile(0.9)
        o_model.model.save('tictactoe__o_' + str(t + 1) + '.h5')
        #o_model.recompile(0.9)
        eps *= 0.9

t =  769 wins_x =  0 wins_o =  0 draws =  0 busy_1 =  0 busy_2 =  0 eps =  0.47351393100000005
t =  770 wins_x =  0 wins_o =  0 draws =  0 busy_1 =  0 busy_2 =  0 eps =  0.47351393100000005
t =  771 wins_x =  67 wins_o =  57 draws =  0 busy_1 =  0 busy_2 =  0 eps =  0.47351393100000005
t =  772 wins_x =  103 wins_o =  93 draws =  0 busy_1 =  0 busy_2 =  0 eps =  0.47351393100000005
t =  773 wins_x =  86 wins_o =  97 draws =  0 busy_1 =  0 busy_2 =  0 eps =  0.47351393100000005
t =  774 wins_x =  91 wins_o =  99 draws =  0 busy_1 =  0 busy_2 =  0 eps =  0.47351393100000005
t =  775 wins_x =  71 wins_o =  79 draws =  0 busy_1 =  0 busy_2 =  0 eps =  0.47351393100000005
t =  776 wins_x =  73 wins_o =  65 draws =  0 busy_1 =  0 busy_2 =  0 eps =  0.47351393100000005
t =  777 wins_x =  61 wins_o =  46 draws =  0 busy_1 =  0 busy_2 =  0 eps =  0.47351393100000005
t =  778 wins_x =  45 wins_o =  64 draws =  0 busy_1 =  0 busy_2 =  0 eps =  0.47351393100000005
t =  779 wins_x =  74 wins_o =  6

In [None]:
for t in np.arange(4000, 6000):
    print('t = ', t, end=' ')
    N = int(n * (1 - eps))

    # Получение actions ~ eps-greedy стратегии
    states_1 = next_states_1
    free_coords = [np.array(np.where(states_1[k] == 0)).transpose() for k in np.arange(states_1.shape[0])]
    actions_1 = np.array([free_coords[k][random.randint(0, free_coords[k].shape[0] - 1)] for k in np.arange(states_1.shape[0])])
    #actions_1 = np.random.randint(board_size, size=(n, 2))
    rand = np.arange(n)
    np.random.shuffle(rand)
    rand = rand[0: N]
    qvalues = x_model.model.predict(preprocess_state(states_1[rand]))
    qvalues += 0.4 * np.random.rand(N, board_size, board_size) - 0.2 #Добавление шума
    actions_1[rand] = np.array([np.unravel_index(qvalues[i].argmax(), qvalues[i].shape) for i in np.arange(N)])
    busy_1, wins_1, draws_1, dones_1, next_states_2 = game.make_actions(states_1, actions_1, -1)
    rewards_1 = busy_1 - wins_1 + -1 * busy_2 - wins_2
    transitions_1 = states_2[where], actions_2[where], next_states_2[where], dones_1[where] * dones_2[where], rewards_1[where]
    o_model.learn(transitions_1)
    
    

    states_2 = next_states_2
    where = np.where(dones_1 == 1)[0]
    free_coords = [np.array(np.where(states_2[k] == 0)).transpose() for k in np.arange(states_2.shape[0])]
    actions_2 = np.array([free_coords[k][random.randint(0, free_coords[k].shape[0] - 1)] for k in np.arange(states_2.shape[0])])
    #actions_2 = np.random.randint(board_size, size=(n, 2))
    rand = np.arange(n)
    np.random.shuffle(rand)
    rand = rand[0: N]
    qvalues = o_model.model.predict(preprocess_state(states_2[rand]))
    qvalues += 0.4 * np.random.rand(N, board_size, board_size) - 0.2 #Добавление шума
    actions_2[rand] = np.array([np.unravel_index(qvalues[i].argmax(), qvalues[i].shape) for i in np.arange(N)])
    busy_2, wins_2, draws_2, dones_2, next_states_1 = game.make_actions(states_2, actions_2, 1, dones_1)
    rewards_2 = -1 * busy_1 + wins_1 + busy_2 + wins_2
    transitions_2 = states_1, actions_1, next_states_1, dones_1 * dones_2, rewards_2
    x_model.learn(transitions_2)


    print('wins_x = ', np.sum(wins_1 == 1), end=' ')
    print('wins_o = ', np.sum(wins_2 == -1), end=' ')
    print('draws = ', np.sum(draws_1 + draws_2), end=' ')
    print('busy_1 = ', np.sum(busy_1), end=' ')
    print('busy_2 = ', np.sum(busy_2), end=' ')
    print('eps = ', eps)

    if (t + 1) % 20 == 0:
        x_model.update_target_model()
        o_model.update_target_model()

    if (t + 1) % 100 == 0:
        x_model.model.save('drive/MyDrive/My/5*5 new/tictactoe__x_' + str(t + 1) + '.h5')
        #x_model.recompile(0.9)
        o_model.model.save('drive/MyDrive/My/5*5 new/tictactoe__o_' + str(t + 1) + '.h5')
        #o_model.recompile(0.9)
        eps *= 0.9

t =  4000 wins_x =  21 wins_o =  14 draws =  51 busy_1 =  1 busy_2 =  4 eps =  0.014633074112020262
t =  4001 wins_x =  18 wins_o =  16 draws =  58 busy_1 =  3 busy_2 =  4 eps =  0.014633074112020262
t =  4002 wins_x =  25 wins_o =  12 draws =  59 busy_1 =  1 busy_2 =  9 eps =  0.014633074112020262
t =  4003 wins_x =  27 wins_o =  10 draws =  58 busy_1 =  0 busy_2 =  3 eps =  0.014633074112020262
t =  4004 wins_x =  30 wins_o =  17 draws =  62 busy_1 =  1 busy_2 =  3 eps =  0.014633074112020262
t =  4005 wins_x =  21 wins_o =  14 draws =  50 busy_1 =  1 busy_2 =  6 eps =  0.014633074112020262
t =  4006 wins_x =  20 wins_o =  19 draws =  56 busy_1 =  1 busy_2 =  5 eps =  0.014633074112020262
t =  4007 wins_x =  31 wins_o =  22 draws =  44 busy_1 =  0 busy_2 =  4 eps =  0.014633074112020262
t =  4008 wins_x =  28 wins_o =  10 draws =  45 busy_1 =  0 busy_2 =  9 eps =  0.014633074112020262
t =  4009 wins_x =  30 wins_o =  18 draws =  48 busy_1 =  2 busy_2 =  9 eps =  0.014633074112020262


In [None]:
states, actions, next_states, dones, rewards = transitions_1
where = np.where(rewards != 0)
states, actions, next_states, dones, rewards = states[where], actions[where], next_states[where], dones[where], rewards[where]
for i in np.arange(200):
    print(states[i])
    print(actions[i])
    print(next_states[i])
    print(dones[i])
    print(rewards[i])
    print()

In [18]:
states_2 = np.array([[[ 0,  0,  0,  0,  0],
                      [ 0,  0,  0,  0,  0],
                      [ 0,  0,  0,  0,  0],
                      [ 0,  0,  0,  0,  0],
                      [ 0,  0,  0,  0,  0]]])

In [20]:
predict = x_model.model.predict(preprocess_state(states_2))
for i in np.arange(100):
    print(predict[i])

[[-0.10421449  0.08581173 -0.26947507 -0.3355854  -0.4162956 ]
 [-0.30207926  0.23115492  0.19333053  0.14375764 -0.6796062 ]
 [-0.56253135  0.17832899  0.18069893  0.17263126 -0.4022004 ]
 [-0.23172453 -0.39776212  0.12213039 -0.6399231  -0.31406766]
 [-0.36811897 -0.32107675 -0.24771449 -0.24325877 -0.18334967]]


IndexError: ignored

In [58]:
o_model.model.load_weights('drive/MyDrive/My/5*5 DQN Решение/5*5 new/tictactoe__o_5000.h5')
x_model.model.load_weights('drive/MyDrive/My/5*5 DQN Решение/5*5 new/tictactoe__x_5000.h5')

In [59]:
#Игра с человеком (проверка "качества" игры)
agent_x = Agent(x_model, epsilon=0, size=5)
agent_o = Agent(o_model, epsilon=0, size=5)
game = TicTacToe(agent_x, human, board_size=5, win_size=4)
transit = game.play(num_games=1, visualize=True)

player -1's turn:
. . . . .
. . . . .
. . . . .
. . . . .
. . . . .
player -1's turn:
 .  .  .  .  .
 .  .  .  .  .
 .  X  .  .  .
 .  .  .  .  .
 .  .  .  .  .
Введите ваш ход (Строка, столбец)
3 3
player 1's turn:
 .  .  .  .  .
 .  .  .  .  .
 .  X  O  .  .
 .  .  .  .  .
 .  .  .  .  .
player -1's turn:
 .  .  .  .  .
 .  .  X  .  .
 .  X  O  .  .
 .  .  .  .  .
 .  .  .  .  .
Введите ваш ход (Строка, столбец)
2 2
player 1's turn:
 .  .  .  .  .
 .  O  X  .  .
 .  X  O  .  .
 .  .  .  .  .
 .  .  .  .  .
player -1's turn:
 .  .  .  .  .
 .  O  X  .  .
 .  X  O  .  .
 .  .  .  X  .
 .  .  .  .  .
Введите ваш ход (Строка, столбец)
2 4
player 1's turn:
 .  .  .  .  .
 .  O  X  O  .
 .  X  O  .  .
 .  .  .  X  .
 .  .  .  .  .
player -1's turn:
 .  .  .  .  .
 .  O  X  O  .
 .  X  O  .  .
 .  X  .  X  .
 .  .  .  .  .
Введите ваш ход (Строка, столбец)
4 3
player 1's turn:
 .  .  .  .  .
 .  O  X  O  .
 .  X  O  .  .
 .  X  O  X  .
 .  .  .  .  .
player -1's turn:
 .  .  .  .  .
 .  O  

In [53]:
#Игра между моделями различных эпох

o_model.model.load_weights('drive/MyDrive/My/5*5 DQN Решение/5*5 new/tictactoe__o_6000.h5')
x_model.model.load_weights('drive/MyDrive/My/5*5 DQN Решение/5*5 new/tictactoe__x_6000.h5')

In [55]:
agent_x = Agent(x_model, epsilon=0, size=5)
agent_o = Agent(o_model, epsilon=0, size=5)
game = TicTacToe(agent_x, agent_o, board_size=5, win_size=4)
transit = game.play_random_first_move(num_games=100, visualize=True)
game.wins

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
player -1's turn:
 X  O  .  .  .
 X  O  X  O  .
 X  O  X  X  X
 O  X  O  .  O
 .  .  .  .  .
player 1's turn:
 X  O  .  .  .
 X  O  X  O  .
 X  O  X  X  X
 O  X  O  O  O
 .  .  .  .  .
player -1's turn:
 X  O  .  .  .
 X  O  X  O  .
 X  O  X  X  X
 O  X  O  O  O
 X  .  .  .  .
player 1's turn:
 X  O  .  .  .
 X  O  X  O  O
 X  O  X  X  X
 O  X  O  O  O
 X  .  .  .  .
player -1's turn:
 X  O  .  .  .
 X  O  X  O  O
 X  O  X  X  X
 O  X  O  O  O
 X  X  .  .  .
player 1's turn:
 X  O  .  .  .
 X  O  X  O  O
 X  O  X  X  X
 O  X  O  O  O
 X  X  O  .  .
player -1's turn:
 X  O  X  .  .
 X  O  X  O  O
 X  O  X  X  X
 O  X  O  O  O
 X  X  O  .  .
player 1's turn:
 X  O  X  .  .
 X  O  X  O  O
 X  O  X  X  X
 O  X  O  O  O
 X  X  O  O  .
player -1's turn:
 X  O  X  .  X
 X  O  X  O  O
 X  O  X  X  X
 O  X  O  O  O
 X  X  O  O  .
player 1's turn:
 X  O  X  .  X
 X  O  X  O  O
 X  O  X  X  X
 O  X  O  O  O
 X  X  O

{-1: 3, 0: 97, 1: 0}

# Test_win

https://jessicastringham.net/2017/12/31/stride-tricks/

https://jessicastringham.net/2018/01/01/einsum/

In [None]:
import numpy as np
from numpy.lib.stride_tricks import as_strided

turns = np.array([-1, -1])
win_size = 4
board_size = 5

states = np.array([[[0,  0,  0,  0,  0],
                  [ 0,  0,  0, -1,  0],
                  [ 0,  0, -1,  0,  0],
                  [ 0, -1,  0,  0,  0],
                  [-0,  0,  0,  0,  0]],
                 [[ 0,  0,  0,  0,  0],
                  [ 0,  0,  0, -1,  0],
                  [ 0,  0, -1,  0,  0],
                  [ 0, -1,  0,  0,  0],
                  [-1,  0,  0,  0,  0]]])

states = turns[:, None, None] * states
expanded_states = as_strided(
    states,
    shape=(states.shape[0],
           states.shape[1] - win_size + 1,  # The feature map is a few pixels smaller than the input
           states.shape[2] - win_size + 1,
           win_size,
           win_size,
    ),
    strides=(
        states.strides[0],
        states.strides[1],
        states.strides[2],  # When we move one step in the 3rd dimension, we should move one step in the original data too
        states.strides[1],
        states.strides[2]
    ),
    writeable=False,  # totally use this to avoid writing to memory in weird places
)

# Ядро свертки 
kernel = np.zeros((2 * win_size + 2, win_size, win_size))
for i in range(win_size):
    kernel[i, i, :] = np.ones(win_size)
for i in range(win_size, 2 * win_size):
    kernel[i, :, i - win_size] = np.ones(win_size).T
kernel[2 * win_size] = np.eye(win_size)
kernel[2 * win_size + 1] = np.fliplr(np.eye(win_size))

feature_map = np.einsum('nxyij,sij->nsxy', expanded_states, kernel)  # n - число states
rewards = (feature_map == win_size).any(axis=(1, 2, 3)).astype('float')
rewards

array([0., 1.])

In [None]:
# Старая функция
    # Проверка на выигрыш после совершения хода
    def test_win(self, state_2d, action, turn):
        row, col = action

        for k in np.arange(self.win_size):
            vertical = state_2d.T[col][max(row - self.win_size + k + 1, 0): min(self.board_size, row + k + 1)]
            if vertical.sum() == turn * self.win_size:
                return True

            horizontal = state_2d[row][max(col - self.win_size + k + 1, 0): min(self.board_size, col + k + 1)]
            if horizontal.sum() == turn * self.win_size:
                return True

            if (row - self.win_size + k + 1 >= 0) and (row + k < self.board_size) and (col - self.win_size + k + 1 >= 0) and (col + k < self.board_size):
                diagonal_1 = np.array([state_2d[(row - self.win_size + i + k + 1, col - self.win_size + i + k + 1)] for i in np.arange(self.win_size)])
                if diagonal_1.sum() == turn * self.win_size:
                    return True

            if (row - self.win_size + k + 1 >= 0) and (row + k < self.board_size) and (col + self.win_size - k - 1 < self.board_size) and (col - k >= 0):
                diagonal_2 = np.array([state_2d[(row - self.win_size + i + k + 1, col + self.win_size - i - k - 1)] for i in np.arange(self.win_size)])
                if diagonal_2.sum() == turn * self.win_size:
                    return True
        return False

#make_actions

In [None]:
win_size = 4
board_size = 5

states = np.array([[[ 0,  0,  0,  0,  0],    # state_2d * turns
                    [ 0,  0,  0,  1,  0], 
                    [ 0,  0,  1,  0,  0],
                    [ 0,  1,  0,  0,  0],
                    [ 0,  0,  0,  0,  0]],
                   [[ 0,  0,  0,  0,  0],
                    [ 0,  0,  0,  1,  0],
                    [ 0,  0,  1,  0,  0],
                    [ 0,  1,  0,  0,  0],
                    [ 0,  0,  0,  0,  0]]], dtype=np.float)

actions = np.array([[1, 3], [4, 0]])

In [None]:
# Выполняет конкретный ход, выбранный текущим игроком, и проверяет поле на некорректный ход / выигрыш / ничью
n = actions.shape[0]
busy = states[np.arange(n), actions[:, 0], actions[:, 1]] != 0          # Проверяет клетки на возможность хода в них (True = занята, False = свободна)
next_states = states.copy()
next_states[np.arange(n), actions[:, 0], actions[:, 1]] += busy == 0    # Совершение хода
wins = game.test_wins(next_states)                                      # Проверка побед
draws = (next_states != 0).all(axis=(1, 2))                             # Проверка ничей

next_states = -next_states
dones = (busy + wins + draws == 0).astype(np.float)
rewards = -1 * busy + 0.5 * wins
transitions = []
for i in np.arange(n):
    transitions.append((states[i], actions[i], next_states[i], dones[i], rewards[i]))      
transitions                                                             # Возврат list(state, action, next_state, done, reward)

In [None]:
# Проверяет наличие хода в занятую клетку
if (state2d[(action)] != 0):  # Неправильный ход
    return next_state2d, 0, turn * -1  # Следующий игрок выиграл (# next_turn == 0 => Игра окончена)

# Совершить ход
next_state2d[action] = turn

# Проверка победы
if self.test_wins(next_state2d.reshape((1, self.board_size, self.board_size)) * turn)[0]:
    return next_state2d, 0, 0.5  # Текущий игрок побеждает (next_turn == 0 => Игра окончена)

# Ничья
if (next_state2d != 0).all():    
    return next_state2d, 0, 0  # Ничья (next_turn == 0 => Игра окончена)

# Если ни один ход не привел к победе - ход следующего игрока
return next_state2d, -turn, 0 # next_turn == -turn => Смена хода