# RL and Advanced DL: Домашнее задание 2

## Часть первая: крестики-нолики при помощи Q-обучения


1. Реализуйте обычное (табличное) Q-обучение. Обучите стратегии крестиков и ноликов для доски 3х3.
2. Попробуйте обучить стратегии крестиков и ноликов для доски 4х4 и/или 5х5.


### Tic Tac Toe environment, за основу взята среда из семинара

In [None]:
import gym
import copy
import numpy as np
import matplotlib.pyplot as plt


# Values stored in board cells. The two non-empty markers also serve as the
# identifier of the player whose turn it is.
EMPTY, CROSSES_TURN, NOUGHTS_TURN = 0, 1, 2


# Terminal rewards. A win is reported as the marker of the winning side, so
# the reward value itself tells which player won.
CROSSES_WIN_REWARD, NOUGHTS_WIN_REWARD = CROSSES_TURN, NOUGHTS_TURN
DRAW_REWARD = 0
# Returned by `step` when the chosen cell is already occupied.
BAD_MOVE_REWARD = -10


class TicTacToeEnv(gym.Env):
    '''
    Tic-tac-toe on an `n_rows` x `n_cols` board where a player wins by placing
    `n_win` of their marks in an unbroken vertical, horizontal or diagonal line.

    Crosses always move first. The observable state is the tuple
    `(board_hash, allowed_actions, turn)` returned by `state()`.
    '''

    def __init__(self, n_rows, n_cols, n_win):
        self.n_rows = n_rows
        self.n_cols = n_cols
        self.n_win = n_win
        self.reset()

    # methods required by the env interface

    def reset(self):
        '''
        Clear the board, give the first move to crosses and return the
        initial state (same tuple as `state()`).
        '''
        self.board = np.full((self.n_rows, self.n_cols), EMPTY, dtype=int)
        self.allowed_actions = self._allowed_actions()
        self.board_hash = self._board_hash()
        self.turn = CROSSES_TURN
        return self.state()

    def step(self, action):
        '''
        Place the current player's mark at `action = (row, col)`.

        Returns `(state, reward, done, info)`:
          * occupied cell -> reward `BAD_MOVE_REWARD`, `done=True`, and the
            board and turn are left untouched;
          * game still running -> reward 0, `done=False`;
          * win or draw -> the terminal reward from `_reward`, `done=True`.
        '''
        if self.board[action[0], action[1]] != EMPTY:
            return self.state(), BAD_MOVE_REWARD, True, {}

        reward = self._player_move(action)
        if reward is None:
            return self.state(), 0, False, {}
        return self.state(), reward, True, {}

    def state(self):
        '''Return `(board_hash, allowed_actions, turn)` for the current position.'''
        return (self.board_hash, self.allowed_actions, self.turn)

    # helper methods

    def _other_player_turn(self):
        '''Return the marker of the player who is NOT currently to move.'''
        return CROSSES_TURN if self.turn == NOUGHTS_TURN else NOUGHTS_TURN

    def _board_hash(self):
        '''Encode the board as a string of cell values in row-major order.'''
        return ''.join(map(str, self.board.ravel()))

    def _allowed_actions(self):
        '''
        Return the (row, col) coordinates of every empty cell as an array of
        shape (k, 2), in row-major order; k is 0 when the board is full.
        '''
        return np.argwhere(self.board == EMPTY)

    def _winner(self):
        '''Return the win reward for the player stored in `self.turn`.'''
        return CROSSES_WIN_REWARD if self.turn == CROSSES_TURN else NOUGHTS_WIN_REWARD

    def _player_move(self, action):
        '''
        Put the current player's mark on the board, refresh the cached hash
        and free cells, and return the outcome of the move (see `_reward`).
        The turn is passed to the other player afterwards.
        '''
        self.board[action[0], action[1]] = self.turn
        self.board_hash = self._board_hash()
        self.allowed_actions = self._allowed_actions()
        # The outcome must be evaluated while `self.turn` still names the
        # player who has just moved; only then hand the turn over.
        reward = self._reward()
        self.turn = self._other_player_turn()
        return reward

    def _reward(self):
        '''
        Check whether the player in `self.turn` has `n_win` marks in a line.

        Returns that player's win reward, `DRAW_REWARD` if nobody has won and
        no empty cell is left, or None if the game is not finished yet.
        '''
        mark = self.turn
        n_win = self.n_win
        rows, cols = np.where(self.board == mark)

        # Treat every own mark as the top end of a potential line and look
        # down, right, down-right and down-left from it.
        for i, j in zip(rows, cols):
            fits_down = i <= self.n_rows - n_win
            fits_right = j <= self.n_cols - n_win
            fits_left = j >= n_win - 1

            # vertical line (fixed column j)
            if fits_down and np.all(self.board[i:i + n_win, j] == mark):
                return self._winner()
            # horizontal line (fixed row i)
            if fits_right and np.all(self.board[i, j:j + n_win] == mark):
                return self._winner()
            # primary diagonal (down-right)
            if fits_down and fits_right and all(
                self.board[i + k, j + k] == mark for k in range(n_win)
            ):
                return self._winner()
            # secondary diagonal (down-left)
            if fits_down and fits_left and all(
                self.board[i + k, j - k] == mark for k in range(n_win)
            ):
                return self._winner()

        # `allowed_actions` is a numpy array, so test its length explicitly.
        if len(self.allowed_actions) == 0:
            return DRAW_REWARD

        # not done yet
        return None