Tic Tac Toe
---
Two players play against each other.

<img style="float:left" src="board.png" alt="drawing" width="200"/>

In [1]:
import numpy as np

In [2]:
# Board dimensions (rows, columns) used when allocating the game board.
BOARD_ROWS, BOARD_COLS = 3, 3

### Board State
---
Reflects the board and judges the state of the game (win, tie, or still in progress).

Two players, p1 and p2: p1 marks cells with 1 and p2 marks cells with -1; an empty cell is 0.

In [1]:
class State:
    """Tic-tac-toe board together with turn and end-of-game bookkeeping.

    Cells of ``board`` hold 1 for p1, -1 for p2 and 0 when empty.
    p1 always moves first.
    """

    def __init__(self, p1, p2):
        """Create an empty board for the two given players.

        ``p1`` and ``p2`` must provide a ``feedReward(reward)`` method,
        which ``giveReward`` calls.
        """
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.p1 = p1
        self.p2 = p2
        self.isEnd = False
        # cached result of getHash(); cleared whenever the board changes
        self.boardHash = None
        # init p1 plays first
        self.playerSymbol = 1

    def getHash(self):
        """Return a unique integer for the current board.

        The flattened board is read as a base-3 number, where an empty
        cell is digit 0, p1 is digit 1 and p2 (stored as -1) is digit 2.
        The value is cached in ``boardHash`` until the next move.
        """
        if self.boardHash is None:
            board_hash = 0
            for cell in self.board.reshape(-1):
                digit = 2 if cell == -1 else int(cell)
                board_hash = board_hash * 3 + digit
            self.boardHash = board_hash
        return int(self.boardHash)

    def winner(self):
        """Judge the board and update ``isEnd``.

        Returns 1 if p1 owns a complete line, -1 if p2 does, 0 for a tie
        (no empty cell left and no winner) and None while the game is
        still in progress. Rows, columns and both diagonals are checked.
        """
        n_rows, n_cols = self.board.shape
        lines = [self.board[r, :] for r in range(n_rows)]
        lines.extend(self.board[:, c] for c in range(n_cols))
        lines.append(np.diag(self.board))
        # anti-diagonal: main diagonal of the left-right mirrored board
        lines.append(np.diag(np.fliplr(self.board)))

        for line in lines:
            # cells are in {-1, 0, 1}, so |sum| == length means the whole
            # line belongs to one player
            total = line.sum()
            if total == len(line):
                self.isEnd = True
                return 1
            if total == -len(line):
                self.isEnd = True
                return -1

        # tie: no available positions
        if not self.availablePositions():
            self.isEnd = True
            return 0
        # not end
        self.isEnd = False
        return None

    def availablePositions(self):
        """Return the empty cells as a list of ``[row, col]`` pairs in row-major order."""
        return np.argwhere(self.board == 0).tolist()

    def updateState(self, position):
        """Place the current player's symbol at ``position`` and pass the turn.

        ``position`` is a ``(row, col)`` pair; lists such as those returned
        by ``availablePositions`` are accepted.
        """
        # A list index would be treated by NumPy as an index array and
        # overwrite whole rows, so always index with a tuple.
        self.board[tuple(position)] = self.playerSymbol
        # the board changed, so the cached hash is stale
        self.boardHash = None
        # switch to another player
        self.playerSymbol = -1 if self.playerSymbol == 1 else 1

    def giveReward(self):
        """Send the end-of-game reward to both players via ``feedReward``.

        The winner receives 1 and the loser 0. For any other result
        (a tie, or a game ``winner`` does not report as won) p1 receives
        0.1 and p2 receives 0.5.
        """
        result = self.winner()
        # backpropagate reward
        if result == 1:
            self.p1.feedReward(1)
            self.p2.feedReward(0)
        elif result == -1:
            self.p1.feedReward(0)
            self.p2.feedReward(1)
        else:
            self.p1.feedReward(0.1)
            self.p2.feedReward(0.5)

In [None]:
class Player:
    """Learning agent that keeps a value table and picks moves epsilon-greedily."""

    def __init__(self):
        self.states = []  # record all positions taken
        self.lr = 0.2  # step size used in feedReward
        self.exp_rate = 0.3  # probability of taking a random action
        self.decay_gamma = 0.9  # discount applied to the propagated reward
        self.states_value = {}  # state -> value

    def chooseAction(self, positions):
        """Pick one entry of ``positions``.

        With probability ``exp_rate`` a random entry is returned; otherwise
        the entry with the highest known value is returned (unknown entries
        count as 0, ties go to the later entry). Returns None when
        ``positions`` is empty.
        """
        if not positions:
            return None

        if np.random.uniform(0, 1) <= self.exp_rate:
            # take random action; draw an index because np.random.choice
            # only accepts 1-D input and positions holds (row, col) pairs
            return positions[np.random.choice(len(positions))]

        action = None
        value_max = -999
        for p in positions:
            # lists are unhashable, so look the position up as a tuple
            # NOTE(review): states_value is filled from self.states in
            # feedReward; confirm callers record states in this tuple form.
            value = self.states_value.get(tuple(p))
            if value is None:
                value = 0
            if value >= value_max:
                value_max = value
                action = p
        return action

    # append a hash state
    def addState(self, state):
        """Record ``state`` (must be hashable) as visited in the current game."""
        self.states.append(state)

    # at the end of game, backpropagate and update states value
    def feedReward(self, reward):
        """Propagate ``reward`` backwards through the recorded states.

        Walking from the last state to the first, each value is moved
        towards ``decay_gamma * reward`` by a fraction ``lr``, and the
        updated value becomes the reward for the preceding state. States
        seen for the first time start from a value of 0.
        """
        for st in reversed(self.states):
            current = self.states_value.get(st)
            if current is None:
                current = 0
            current += self.lr * (self.decay_gamma * reward - current)
            self.states_value[st] = current
            reward = current

    def reset(self):
        """Forget the states recorded for the game just played."""
        self.states = []