Tic Tac Toe
---
Two players against each other

<img style="float:left" src="board.png" alt="drawing" width="200"/>

In [1]:
import numpy as np

In [2]:
BOARD_ROWS = 3
BOARD_COLS = 3

### Board State
---
Reflect & Judge the state

2 players p1 and p2; p1 uses symbol 1 and p2 uses symbol 2, vacancy as 0

In [71]:
class State:
    def __init__(self, p1, p2):
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.p1 = p1
        self.p2 = p2
        self.isEnd = False
        self.boardHash = None
        # init p1 plays first
        self.playerSymbol = 1
    
    # get unique hash of current board state
    def getHash(self):
        self.boardHash = str(self.board)
        return self.boardHash
    
    def winner(self):
        # row
        for i in range(BOARD_ROWS):
            if sum(self.board[i, :]) == 3:
                self.isEnd = True
                return 1
            if sum(self.board[i, :]) == -3:
                self.isEnd = True
                return -1
        # col
        for i in range(BOARD_COLS):
            if sum(self.board[:, i]) == 3:
                self.isEnd = True
                return 1
            if sum(self.board[:, i]) == -3:
                self.isEnd = True
                return -1
        # diagonal
        diag_sum = 0
        for i in range(BOARD_COLS):
            diag_sum += self.board[i][i]
        if diag_sum == 3:
            self.isEnd = True
            return 1
        if diag_sum == -3:
            self.isEnd = True
            return -1
        
        # tie
        # no available positions
        if len(self.availablePositions()) == 0:
            self.isEnd = True
            return 0
        # not end
        self.isEnd = False
        return None
    
    def availablePositions(self):
        positions = []
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                if self.board[i, j] == 0:
                    positions.append((i, j))  # need to be tuple
        return positions
    
    def updateState(self, position):
        self.board[position] = self.playerSymbol
        # switch to another player
        self.playerSymbol = -1 if self.playerSymbol == 1 else 1
    
    def giveReward(self):
        result = self.winner()
        # backpropagate reward
        if result == 3:
            self.p1.feedReward(1)
            self.p2.feedReward(0)
        elif result == -3:
            self.p1.feedReward(0)
            self.p2.feedReward(1)
        else:
            self.p1.feedReward(0.1)
            self.p2.feedReward(0.5)
    
    def play(self, rounds=100):
        while not self.isEnd:
            # Player 1
            positions = self.availablePositions()
            p1_action = self.p1.chooseAction(positions)
            # take action and upate board state
            self.updateState(p1_action)
            board_hash = self.getHash()
            self.p1.addState(board_hash)
            # check board status if it is end
            
            win = self.winner()
            if win is not None:
                print("p1 states\n", self.p1.states)
                self.showBoard()
                # ended with p1 either win or draw
                self.giveReward()
                self.p1.reset()
                self.p2.reset()
                self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
                
            else:
                # Player 2
                positions = self.availablePositions()
                p2_action = self.p2.chooseAction(positions)

                self.updateState(p2_action)
                board_hash = self.getHash()
                self.p2.addState(board_hash)

                win = self.winner()
                if win is not None:
                    print("p1 states\n", self.p1.states)
                    self.showBoard()
                    # ended with p2 either win or draw
                    self.giveReward()
                    self.p1.reset()
                    self.p2.reset()
                    self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
                        
    def showBoard(self):
        for i in range(0, BOARD_ROWS):
            print('-------------')
            out = '| '
            for j in range(0, BOARD_COLS):
                if self.board[i, j] == 1:
                    token = 'x'
                if self.board[i, j] == -1:
                    token = 'o'
                if self.board[i, j] == 0:
                    token = ' '
                out += token + ' | '
            print(out)
        print('-------------')    

In [72]:
class Player:
    def __init__(self, name):
        self.name = name
        self.states = []  # record all positions taken
        self.lr = 0.2
        self.exp_rate = 0.3
        self.decay_gamma = 0.9
        self.states_value = {}  # state -> value
    
    def chooseAction(self, positions):
        if np.random.uniform(0, 1) <= self.exp_rate:
            # take random action
            idx = np.random.choice(len(positions))
            action = positions[idx]
        else:
            value_max = -999
            for p in positions:
                value = 0 if self.states_value.get(p) is None else self.states_value.get(p)
                if value >= value_max:
                    value_max = value
                    action = p
        print("{} takes action {}".format(self.name, action))
        return action
    
    # append a hash state
    def addState(self, state):
        self.states.append(state)
    
    # at the end of game, backpropagate and update states value
    def feedReward(self, reward):
        for st in reversed(self.states):
            self.states_value[st] +=  self.lr*(self.decay_gamma*reward - self.states_value[st])
            reward = self.states_value[st]
            
    def reset(self):
        self.states = []

In [73]:
p1 = Player("p1")
p2 = Player("p2")

In [74]:
st = State(p1, p2)

In [36]:
st.availablePositions()

[(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2), (2, 0), (2, 1), (2, 2)]

In [37]:
st.isEnd

False

In [40]:
st.winner() is None

True

In [47]:
st.showBoard()

--------------
|   |   |   | 
--------------
|   |   |   | 
--------------
|   |   |   | 
--------------


In [75]:
st.play()

p1 takes action (2, 2)
p2 takes action (2, 1)
p1 takes action (2, 0)
p2 takes action (0, 0)
p1 takes action (1, 1)
p2 takes action (1, 2)
p1 takes action (1, 0)
p2 takes action (0, 2)
p1 takes action (0, 1)
p1 states
 ['[[0. 0. 0.]\n [0. 0. 0.]\n [0. 0. 1.]]', '[[ 0.  0.  0.]\n [ 0.  0.  0.]\n [ 1. -1.  1.]]', '[[-1.  0.  0.]\n [ 0.  1.  0.]\n [ 1. -1.  1.]]', '[[-1.  0.  0.]\n [ 1.  1. -1.]\n [ 1. -1.  1.]]', '[[-1.  1. -1.]\n [ 1.  1. -1.]\n [ 1. -1.  1.]]']
-------------
| o | x | o | 
-------------
| x | x | o | 
-------------
| x | o | x | 
-------------


KeyError: '[[-1.  1. -1.]\n [ 1.  1. -1.]\n [ 1. -1.  1.]]'