In [1]:
import copy
import math
import random
from itertools import product

import gym
import numpy as np
import matplotlib.pyplot as plt

from collections import defaultdict

In [2]:
class TicTacToe(gym.Env):
    """Tic-tac-toe on an ``n_rows`` x ``n_cols`` board where ``n_win`` marks in a line win.

    Board cells hold ``1`` (crosses), ``-1`` (noughts) or ``0`` (empty).
    ``curTurn`` is the mark of the player who moves next; crosses start.
    """

    def __init__(self, n_rows, n_cols, n_win, clone=None):
        """Create a fresh game, or copy the position of ``clone`` when it is given.

        When ``clone`` is not None the size arguments are ignored and taken
        from ``clone`` instead.
        """
        if clone is not None:
            self.n_rows, self.n_cols, self.n_win = clone.n_rows, clone.n_cols, clone.n_win
            self.board = copy.deepcopy(clone.board)
            self.curTurn = clone.curTurn
            # Carry the game-over flag over as well; without this a cloned
            # board had no `gameOver` attribute until isTerminal() was called.
            self.gameOver = getattr(clone, 'gameOver', False)
            self.emptySpaces = None
            self.boardHash = None
        else:
            self.n_rows = n_rows
            self.n_cols = n_cols
            self.n_win = n_win

            self.reset()

    def getEmptySpaces(self):
        """Return an array of (row, col) pairs of the empty cells (cached until the next move)."""
        if self.emptySpaces is None:
            # argwhere yields the coordinates in row-major order
            self.emptySpaces = np.argwhere(self.board == 0)
        return self.emptySpaces

    def makeMove(self, player, i, j):
        """Write ``player``'s mark into cell (i, j) and invalidate the cached views."""
        self.board[i, j] = player
        self.emptySpaces = None
        self.boardHash = None

    def getHash(self):
        """Return the board as a string of digits: 0 = nought, 1 = empty, 2 = cross (cached)."""
        if self.boardHash is None:
            self.boardHash = ''.join(str(x + 1) for x in self.board.ravel())
        return self.boardHash

    def isTerminal(self):
        """Check whether the game has ended and update ``self.gameOver``.

        Only lines of the player in ``self.curTurn`` are examined, so this is
        meant to be called right after that player moved (as ``step`` does).

        Returns:
            ``self.curTurn`` if that player has ``n_win`` marks in a line,
            ``0`` if the board is full without such a line (draw),
            ``None`` if the game goes on.
        """
        cur_p = self.curTurn
        n_win = self.n_win
        rows, cols = np.where(self.board == cur_p)
        for i, j in zip(rows, cols):
            # Each mark is treated as the first cell of a candidate line; a
            # direction is only checked when the whole line fits on the board.
            fits_down = i <= self.n_rows - n_win
            fits_right = j <= self.n_cols - n_win
            fits_left = j >= n_win - 1
            win = (
                (fits_down and np.all(self.board[i:i + n_win, j] == cur_p))
                or (fits_right and np.all(self.board[i, j:j + n_win] == cur_p))
                or (fits_down and fits_right
                    and all(self.board[i + k, j + k] == cur_p for k in range(n_win)))
                or (fits_down and fits_left
                    and all(self.board[i + k, j - k] == cur_p for k in range(n_win)))
            )
            if win:
                self.gameOver = True
                return self.curTurn

        if len(self.getEmptySpaces()) == 0:
            self.gameOver = True
            return 0

        self.gameOver = False
        return None

    def printBoard(self):
        """Print the board as an ASCII grid ('x' crosses, 'o' noughts, blank for empty)."""
        tokens = {1: 'x', -1: 'o', 0: ' '}
        separator = '----' * self.n_cols + '-'
        for i in range(self.n_rows):
            print(separator)
            out = '| '
            for j in range(self.n_cols):
                # '?' marks a value that is not a valid cell state instead of
                # failing on (or silently reusing) an unset token
                out += tokens.get(int(self.board[i, j]), '?') + ' | '
            print(out)
        print(separator)

    def getState(self):
        """Return ``(board hash, empty cells, player to move)``."""
        return (self.getHash(), self.getEmptySpaces(), self.curTurn)

    def action_from_int(self, action_int):
        """Convert a flat row-major cell index into a ``(row, col)`` tuple."""
        # Integer divmod avoids the float round-trip of `/`
        row, col = divmod(int(action_int), self.n_cols)
        return (int(row), int(col))

    def int_from_action(self, action):
        """Convert a ``(row, col)`` pair into a flat row-major cell index."""
        return action[0] * self.n_cols + action[1]

    def step(self, action):
        """Place the current player's mark at ``action`` = (row, col).

        Returns:
            ``(state, reward, done, info)``. A move onto an occupied cell
            leaves the board and the turn untouched and yields reward ``-10``
            with ``done=True``. Otherwise the reward is the result of
            ``isTerminal()`` (the mover's mark on a win, ``0`` on a draw) or
            ``0`` while the game continues, and the turn passes to the
            other player.
        """
        if self.board[action[0], action[1]] != 0:
            return self.getState(), -10, True, {}
        self.makeMove(self.curTurn, action[0], action[1])
        reward = self.isTerminal()
        self.curTurn = -self.curTurn
        return self.getState(), 0 if reward is None else reward, reward is not None, {}

    def reset(self):
        """Clear the board and give the first move to crosses."""
        self.board = np.zeros((self.n_rows, self.n_cols), dtype=int)
        self.boardHash = None
        self.gameOver = False
        self.emptySpaces = None
        self.curTurn = 1

In [4]:
class AgentQ:
    """Tabular Q-learning agent that plays both sides of a TicTacToe game.

    Actions are stored as strings made of the row index followed by the
    column index (e.g. ``"01"`` for row 0, column 1), so this encoding is only
    unambiguous while both indices are single digits.

    NOTE(review): the class is unfinished - `update` plays the opening moves
    but no Q-value update is implemented yet.
    """

    def __init__(self, n_rows, n_cols, n_win, alpha=0.05, epsilon=0.1, gamma=0.99) -> None:
        """Create the environment, zero-initialised Q-tables and random default policies."""
        self.board_size = n_rows * n_cols
        self.env = TicTacToe(n_rows=n_rows, n_cols=n_cols, n_win=n_win)
        actions = [f"{i}{j}" for i in range(n_rows) for j in range(n_cols)]
        # Q-tables: state -> {action string -> value}, created lazily with zeros
        self.q_zeros = defaultdict(lambda: {a: 0 for a in actions})
        self.q_cross = defaultdict(lambda: {a: 0 for a in actions})
        # Policies: state -> action string; an unseen state gets a random action
        self.zeros_policy = defaultdict(lambda: np.random.choice(actions))
        self.cross_policy = defaultdict(lambda: np.random.choice(actions))
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma

    @staticmethod
    def empty_to_string(empty_spaces):
        """Encode an iterable of (row, col) pairs as action strings such as ``"01"``."""
        return ["".join(map(str, coor)) for coor in empty_spaces]

    @staticmethod
    def action_to_coords(action):
        """Decode a two-character action string such as ``"01"`` into ``(row, col)`` ints.

        Raises ValueError for strings that are not exactly two characters,
        i.e. for boards whose indices need more than one digit.
        """
        row, col = action
        return int(row), int(col)

    def update_cross_policy(self):
        """Make the crosses' policy greedy with respect to ``q_cross``."""
        for state, action_dict in self.q_cross.items():
            best_action = max(action_dict.items(), key=lambda x: x[1])[0]
            self.cross_policy[state] = best_action

    def update_zeros_policy(self):
        """Make the noughts' policy greedy with respect to ``q_zeros``."""
        for state, action_dict in self.q_zeros.items():
            best_action = max(action_dict.items(), key=lambda x: x[1])[0]
            self.zeros_policy[state] = best_action

    def get_action(self, state, empty_spaces, cur_turn):
        """Pick an action string epsilon-greedily for the player ``cur_turn``.

        ``cur_turn == 1`` uses the crosses' policy, anything else the noughts'.
        With probability ``epsilon`` a random empty cell is chosen instead.
        """
        spaces_strings = self.empty_to_string(empty_spaces)
        policy = self.cross_policy if cur_turn == 1 else self.zeros_policy
        return policy[state] if random.random() > self.epsilon else random.choice(spaces_strings)

    def step_zeros(self, state, empty_spaces):
        """Let noughts choose and play one move in ``self.env``.

        Returns:
            ``(new_state, empty_spaces, cur_turn, reward, -reward, done)``
            where ``reward`` is the value reported by the environment.
            NOTE(review): the original returned an undefined name here; the
            environment reward and its negation are the presumed intent -
            confirm which element belongs to which player.
        """
        action = self.get_action(state, empty_spaces, -1)
        # The environment indexes the board with integers, not with the string key
        (new_state, empty_spaces, cur_turn), env_reward, done, _ = self.env.step(
            self.action_to_coords(action)
        )

        return new_state, empty_spaces, cur_turn, env_reward, -env_reward, done

    def update(self):
        """Reset the game and play the first move of each player.

        TODO: no Q-value update is performed yet; the transitions collected
        below are currently discarded.
        """
        self.env.reset()
        state_cross, empty_spaces, cur_turn = self.env.getState()
        action_cross = self.get_action(state_cross, empty_spaces, cur_turn)
        (state_zeros, empty_spaces, cur_turn), reward_cross, done, _ = self.env.step(
            self.action_to_coords(action_cross)
        )
        if done:
            # Nothing left to play (e.g. the very first move ended the game)
            return
        action_zeros = self.get_action(state_zeros, empty_spaces, cur_turn)
        (new_state_cross, empty_spaces, cur_turn), reward_zeros, done, _ = self.env.step(
            self.action_to_coords(action_zeros)
        )
        
        

SyntaxError: invalid syntax (4292653795.py, line 10)

In [8]:
def Q_learning_episode(env, pi_cross, pi_zero, Q_cross, Q_zero, board_size, mapping, alpha=0.05, epsilon=0.0, gamma=0.9):
    """Play one self-play game and apply Q-learning updates for both players.

    Each player's table is updated once the opponent has replied, so a
    transition runs from the state in which the player moved to the state in
    which it has to move again (or to the end of the game).

    Args:
        env: game environment; ``step`` returns ``(state, reward, done, info)``
            and ``curTurn`` is ``1`` when crosses are to move.
        pi_cross, pi_zero: current policies, indexed by state index.
        Q_cross, Q_zero: action-value tables indexed as ``Q[state, action]``
            (presumably 2-D numpy arrays - ``Q[state].max()`` is used);
            they are modified in place.
        board_size: side length of the board; actions are integers in
            ``[0, board_size ** 2)``.
        mapping: maps the board hash (``env.getState()[0]``) to a state index.
        alpha: learning rate.
        epsilon: probability of choosing a uniformly random cell.
        gamma: discount factor.

    Returns:
        ``(Q_cross, Q_zero, reward_cross, reward_zero)`` with the rewards of
        the last processed move.
    """
    env.reset()
    square_size = board_size ** 2

    def pick_action(policy, state):
        # Epsilon-greedy: follow the policy, otherwise any cell on the board
        return policy[state] if np.random.rand() > epsilon else np.random.randint(0, square_size)

    def td_update(Q, state, action, reward, next_state, done):
        # A finished game has no future return, so the target must not
        # bootstrap from the value stored for the final state.
        future = 0.0 if done else gamma * Q[next_state].max()
        Q[state, action] = Q[state, action] + alpha * (reward + future - Q[state, action])

    # Opening move by crosses; noughts then pick their first action.
    s_cross = mapping[env.getState()[0]]
    a_cross = pick_action(pi_cross, s_cross)
    _, reward_cross, _, _ = env.step(square_action(a_cross, board_size))
    reward_zero = 0  # defined up front so it is never read before assignment
    s_zero = mapping[env.getState()[0]]
    a_zero = pick_action(pi_zero, s_zero)

    # One move is already on the board, so square_size iterations always
    # suffice (a fixed 9 would cut games short on boards larger than 3x3).
    for _turn in range(square_size):
        if env.curTurn == 1:
            s_prime_zero, reward_cross, done, _ = env.step(square_action(a_cross, board_size))
            if reward_cross == 1:
                # crosses just won, which is a loss for noughts
                reward_zero = -1
            s_prime_zero = mapping[s_prime_zero[0]]
            a_prime_zero = pick_action(pi_zero, s_prime_zero)
            td_update(Q_zero, s_zero, a_zero, reward_zero, s_prime_zero, done)
            s_zero, a_zero = s_prime_zero, a_prime_zero
            if done:
                # The final move's outcome is known exactly
                Q_cross[s_cross, a_cross] = reward_cross
                break
        else:
            s_prime_cross, reward_zero, done, _ = env.step(square_action(a_zero, board_size))
            if reward_zero == -1:
                # the environment reports -1 when noughts win; turn it into
                # +1 for noughts and -1 for crosses
                reward_zero = 1
                reward_cross = -1
            s_prime_cross = mapping[s_prime_cross[0]]
            a_prime_cross = pick_action(pi_cross, s_prime_cross)
            td_update(Q_cross, s_cross, a_cross, reward_cross, s_prime_cross, done)
            s_cross, a_cross = s_prime_cross, a_prime_cross
            if done:
                Q_zero[s_zero, a_zero] = reward_zero
                break
    return Q_cross, Q_zero, reward_cross, reward_zero

In [9]:
# Lookup used to turn a board hash into a state index
# (it is indexed with env.getState()[0] by the training code).
mapping = create_mapping(N_ROWS)

# One action-value table per player, each built by its own init_Q call.
Q_cross, Q_zero = (init_Q(mapping, N_ROWS) for _ in range(2))

# Initial policies derived from the freshly initialised tables.
pi_cross, pi_zero = (update_policy(table, N_ROWS) for table in (Q_cross, Q_zero))

# Environment shared by training and the interactive game.
env = TicTacToe(N_ROWS, N_COLS, N_WIN)

In [10]:
# Self-play training: run Q-learning episodes and refresh both policies
# from the updated tables after every episode.
total_episodes = 1_500_000
gamma = 0.9
rewards_cross, rewards_zero = [], []

# Arguments that stay the same for every episode
episode_settings = dict(
    env=env,
    board_size=N_ROWS,
    mapping=mapping,
    alpha=0.1,
    epsilon=0.2,
    gamma=gamma,
)

for n in range(total_episodes):
    Q_cross, Q_zero, reward_cross, reward_zero = Q_learning_episode(
        pi_cross=pi_cross,
        pi_zero=pi_zero,
        Q_cross=Q_cross,
        Q_zero=Q_zero,
        **episode_settings,
    )
    pi_cross, pi_zero = update_policy(Q_cross, N_ROWS), update_policy(Q_zero, N_ROWS)
    # Keep the final rewards of each episode for later inspection
    rewards_cross.append(reward_cross)
    rewards_zero.append(reward_zero)

In [11]:
def _ask_user_move(env):
    """Prompt until the user enters a legal move; return it as a 0-based (row, col) tuple.

    The user types the 1-based row and column separated by whitespace.
    Malformed input, cells outside the board and occupied cells are rejected
    with a message and the prompt is repeated.
    """
    while True:
        user_a = input("Your turn: ")
        try:
            row, col = (int(x) - 1 for x in user_a.split())
        except ValueError:
            # wrong number of tokens or a token that is not an integer
            print("Enter the row and the column as two numbers, e.g. 1 3")
            continue
        # Explicit bounds check: a 0 would otherwise become index -1 and
        # silently address the last row/column.
        if not (0 <= row < env.n_rows and 0 <= col < env.n_cols):
            print("That cell is outside the board")
            continue
        if env.board[row, col] != 0:
            print("That cell is already taken")
            continue
        return row, col


# Interactive games: the trained bot plays crosses, the user plays noughts.
for _game in range(50):
    done = False
    env.reset()
    while not done:
        # Bot move, taken greedily from the crosses' policy
        s = mapping[env.getState()[0]]
        a = square_action(pi_cross[s], N_COLS)
        _, reward, done, _ = env.step(a)
        print(env.board)
        if done:
            if reward == 1:
                print("You loose")
            elif reward == -10:
                # the environment reports -10 for a move onto an occupied cell
                print("Bot debil")
            else:
                print("Draw")
            break

        # User move
        action = _ask_user_move(env)
        _, reward, done, _ = env.step(action)
        print(env.board)
        if done:
            # The environment returns the mover's mark (-1 for noughts) on a
            # win and 0 on a draw, so a finished game is not always a win.
            print("You win" if reward == -1 else "Draw")
            break

[[0 0 1]
 [0 0 0]
 [0 0 0]]
