# Tic Tac Toe

In [0]:
#@title Run this cell to load tic-tac-toe into the Python interpreter { display-mode: "form" }
import numpy as np
import time

class BasePlayer:
    def __init__(self):
        self.reset_metrics()

    def set_n(self,n):
        self.n = n

    def set_env(self, env):
        self.env = env

    def reset_metrics(self):
        self.wins = 0
        self.losses = 0
        self.ties = 0

    def record_outcome(self, game, outcome):
        if outcome == self.n:
            self.wins += 1
        elif outcome == -self.n:
            self.losses += 1
        else:
            self.ties += 1

    def reset(self):
        pass

    def update(self, game, state, reward, done):
        pass

    def __str__(self):
        return self.__class__.__name__ + ' w/l/t=' + str(self.wins) + '/' + str(self.losses) + '/' + str(self.ties)

class EmptyPlayer(BasePlayer):
    def __init__(self):
        super().__init__()

    def move(self, game, state):
        pass

class RandomPlayer(BasePlayer):
    def __init__(self):
        super().__init__()

    def move(self, game, state):
        return game.sample(legal=True)

class HumanPlayer(BasePlayer):
    def __init__(self):
        super().__init__()

    def move(self, game, state):
        print(game)
        return int(input())

    def record_outcome(self, game, outcome):
        print(game)
        if outcome == self.n:
            print('...YOU WIN!')
        elif outcome == -self.n:
            print('...YOU LOSE...')
        else:
            print('...TIE GAME...')

class MinimaxPlayer(BasePlayer):
    # return a valid move (0..9, equal to [row * 3 + col] )
    def move(self, game, state):
        best_row = -1
        best_col = -1
        best_score = -100
        for row in range(3):
            for col in range(3):
                if game.board[row][col] == 0:
                    game.board[row][col] = self.n
                    score = self.minimax(game,False)
                    game.board[row][col] = 0
                    if score > best_score:
                        best_score = score
                        best_row = row
                        best_col = col
        return best_row * 3 + best_col

    # check if the board is full (implying that the game is over)
    def board_is_full(self, board):
        for row in range(3):
            for col in range(3):
                if board[row][col] == 0:
                    return False
        return True

    # do the work first described by Jon Von Neumann in 1928
    def minimax(self, game, isMe):

        # first, check for terminal conditions...
        scores = game.max_min()
        if scores[0] == 3:
            return scores[0] * self.n       # 'x' wins...
        elif scores[1] == -3:
            return scores[1] * self.n       # 'o' wins...
        if self.board_is_full(game.board):
            return 0                        # ...and a tie is also a terminal condition.

        if isMe:
            best = -4
            for row in range(3):
                for col in range(3):
                    if game.board[row][col] == 0:
                        game.board[row][col] = self.n
                        best = max(best, self.minimax(game, not isMe))
                        game.board[row][col] = 0
        else:
            best = 4
            for row in range(3):
                for col in range(3):
                    if game.board[row][col] == 0:
                        game.board[row][col] = -self.n
                        best = min(best, self.minimax(game, not isMe))
                        game.board[row][col] = 0
        return best

class Game:
    def __init__(self, x_player=EmptyPlayer(), o_player=MinimaxPlayer(), animation=None):
        self.x_player = x_player
        self.o_player = o_player
        self.x_player.set_n(1)
        self.o_player.set_n(-1)
        self.x_player.set_env(self)
        self.o_player.set_env(self)
        self.i = 0
        self.animation = animation
        self.reset()

    def reset(self, mode='reinforcement_learning'):
        self.board = np.zeros((3,3))
        self.x_turn = True
        self.x_player.reset()
        self.o_player.reset()
        self.moves = []
        self.states = []
        self.available = [0,1,2,3,4,5,6,7,8]
        if self.animation is not None:
            self.animation.clear()
        if mode == 'reinforcement_learning':
            return self.state(self.x_player, self.board)

    # used for reinforcement learning only... for normal game play, use move()
    def step(self, action):
        try:
            self.move(action, self.x_player)
            if self.x_wins():
                return self.state(self.x_player, self.board), 1, True
            elif len(self.available) > 0:
                action = self.o_player.move(self, self.state(self.o_player, self.board))
                outcome = self.move(action, self.o_player)
                if self.o_wins():
                    return self.state(self.x_player, self.board), -1, True
                else:
                    return self.state(self.x_player, self.board), 0, len(self.available) == 0
            else:
                return self.state(self.x_player, self.board), 0, True
        except ValueError:
            return self.state(self.x_player, self.board), -1, True

    def state_space():
        return 3**9

    def action_space():
        return 9

    def x_wins(self):
        return max(self.max_min()) == 3

    def o_wins(self):
        return min(self.max_min()) == -3

    # returns the max and min of sum of each axis + each diagonal
    def max_min(self):
        col_sum = np.sum(self.board,0)
        row_sum = np.sum(self.board,1)
        maxs = np.maximum(col_sum,row_sum)
        mins = np.minimum(col_sum,row_sum)
        diag0 = self.board.trace(0)
        diag1 = np.flip(self.board,0).trace(0)
        return max(max(maxs), diag0, diag1), min(min(mins), diag0, diag1)

    def move(self, action, player):
        n_player = 1 if player is self.x_player else -1
        row = action // 3
        col = action % 3
        if self.board[row][col] == 0:
            self.board[row][col] = n_player
            self.available.remove(action)
            self.moves.append((n_player,row,col))
            self.states.append(self.state(self.x_player, self.board)) # supports instant replay
            if self.animation is not None:
                if n_player == 1:
                    self.animation.x(row,col)
                    time.sleep(0.1)
                else:
                    self.animation.o(row,col)
                    time.sleep(0.1)
        else:
            raise ValueError("illegal move")
        x,o = self.max_min()
        return x == 3 or o == -3

    # return a random legal move (do not call if all 9 squares are taken!)
    def sample(self, legal=False):
        if legal:
            if len(self.available) == 0:
                raise ValueError('cannot sample randomly; board is full')
            else:
                return self.available[np.random.randint(len(self.available))]
        else:
            return np.random.randint(9)

    def sequential(self):
        for row in range(3):
            for col in range(3):
                if self.board[row][col] == 0:
                    return row * 3 + col
        raise ValueError('cannot sample sequentially; board is full')

    def state(self, player, board):
        i = 0
        n = 1 if player is self.x_player else -1
        for row in range(3):
            for col in range(3):
                i += (n * board[row][col] + 1) * (3 ** (row*3+col))
        return int(i)

    def construct_board(state):
        board = np.zeros((3,3))
        for row in range(2,-1,-1):
            for col in range(2,-1,-1):
                exp = 3 ** (row*3+col)
                board[row][col] = state // exp - 1
                state = state % exp
        return board

    def play(self):
        self.reset()
        self.i+=1
        while len(self.available) > 0:
            player = self.x_player if self.x_turn else self.o_player
            opponent = self.x_player if not self.x_turn else self.o_player
            state = self.state(player, self.board)
            p_row_col = player.move(self,state)
            try:
                player_wins = self.move(p_row_col,player)
            except ValueError:
                player.update(self,state,-100, True)
                opponent.update(self,state,1,True)
                break
            if player_wins:
                player.update(self,state,1, True)
                opponent.update(self,state,-1, True)
                break
            else:
                opponent.update(self,state,0, False)
            self.x_turn = not self.x_turn
        if player_wins:
            self.x_player.record_outcome(self, player.n)
            self.o_player.record_outcome(self, player.n)
        else:
            self.x_player.record_outcome(self, 0)
            self.o_player.record_outcome(self, 0)

    def game_over(self):
        return np.max(np.absolute(self.max_min())) == 3 or self.i >= 9

    def replay(self):
        print('=== REPLAY =================================')
        print(self.states)
        i = 0
        for state in self.states:
            i += 1
            print('===( ' + str(i) + ' [ state=' + str(state) + ' ] )=====================\n')
            self.board = Game.construct_board(state)
            print(Game.draw(self.board))
            if (self.x_wins()):
                print('X wins!')
            if (self.o_wins()):
                print('O wins!')
        print('\n')

    def draw(board):
        s = ''
        for i in range(3):
            for j in range(3):
                if board[i][j] == 1:
                    s += ' X'
                elif board[i][j] == -1:
                    s += ' O'
                else:
                    s += ' .'
            s += '\n'
        s += '\n'
        return s

    def __str__(self):
        s = '\n---( '
        s += str(9-len(self.available))
        s += ' )--------------------------\n\n'
        s += Game.draw(self.board)
        s += 'x state = '
        s += str(self.state(self.x_player, self.board))
        s += ', o state = '
        s += str(self.state(self.o_player, self.board))
        s += ' available: ' + str(self.available)
        return s

In [4]:
g = Game(RandomPlayer(),RandomPlayer())
g.play()
g.replay()

[9842, 7655, 7898, 7871, 8600, 8519, 8522, 1961, 1970]

 X . .
 . . .
 . . .



 X . .
 . . .
 . O .



 X . .
 . . X
 . O .



 X . .
 O . X
 . O .



 X . .
 O . X
 X O .



 X . .
 O O X
 X O .



 X X .
 O O X
 X O .



 X X .
 O O X
 X O O



 X X X
 O O X
 X O O


X wins!




In [5]:
g = Game(MinimaxPlayer(), MinimaxPlayer())
g.play()
g.replay()

[9842, 9761, 9764, 9755, 10484, 10457, 10700, 8513, 15074]

 X . .
 . . .
 . . .



 X . .
 . O .
 . . .



 X X .
 . O .
 . . .



 X X O
 . O .
 . . .



 X X O
 . O .
 X . .



 X X O
 O O .
 X . .



 X X O
 O O X
 X . .



 X X O
 O O X
 X O .



 X X O
 O O X
 X O X






In [6]:
g = Game(RandomPlayer(), MinimaxPlayer())
g.play()
g.replay()

[9868, 9867, 9870, 9789, 10032, 10023, 10752, 4191]

 . . .
 X . .
 . . .



 O . .
 X . .
 . . .



 O X .
 X . .
 . . .



 O X .
 X O .
 . . .



 O X .
 X O X
 . . .



 O X O
 X O X
 . . .



 O X O
 X O X
 X . .



 O X O
 X O X
 X . O


O wins!




In [7]:
g = Game(HumanPlayer(), MinimaxPlayer())
g.play()



---( 0 )--------------------------

 . . .
 . . .
 . . .

x state = 9841, o state = 9841 available: [0, 1, 2, 3, 4, 5, 6, 7, 8]
2

---( 2 )--------------------------

 . . X
 . O .
 . . .

x state = 9769, o state = 9913 available: [0, 1, 3, 5, 6, 7, 8]
8

---( 4 )--------------------------

 . . X
 . O O
 . . X

x state = 16087, o state = 3595 available: [0, 1, 3, 6, 7]
3

---( 6 )--------------------------

 O . X
 X O O
 . . X

x state = 16113, o state = 3569 available: [1, 6, 7]
7

---( 8 )--------------------------

 O . X
 X O O
 O X X

x state = 17571, o state = 2111 available: [1]
1

---( 9 )--------------------------

 O X X
 X O O
 O X X

x state = 17574, o state = 2108 available: []
...TIE GAME...


In [8]:
print(Game.state_space(),Game.action_space())

19683 9


In [9]:
import numpy as np

env = Game()

q = np.zeros((Game.state_space(), Game.action_space()))
explore_rate = 0.02

# not functional -- used to report progress
moves = 0

state = env.reset()
while True:
    if np.random.random()<explore_rate: 
        action = env.sample()         
    else:                               
        action = np.argmax(q[state])
        
    obs,reward,done = env.step(action)
    q[state][action] = reward+(3/4)*np.max(q[obs]) # notice the 3/4?
    state = obs

    # not functional -- used to report progress
    moves += 1

    if done:
        if reward == 0:
            break                      
        else:
            # not functional -- used to report progress
            print(moves)
            moves = 0
            state = env.reset()       

env.replay()

3
2
2
3
3
3
3
3
3
4
4
4
4
4
5
5
5
5
5
5
5
5
[9842, 9761, 9764, 9755, 10484, 10457, 10700, 8513, 15074]

 X . .
 . . .
 . . .



 X . .
 . O .
 . . .



 X X .
 . O .
 . . .



 X X O
 . O .
 . . .



 X X O
 . O .
 X . .



 X X O
 O O .
 X . .



 X X O
 O O X
 X . .



 X X O
 O O X
 X O .



 X X O
 O O X
 X O X






In [0]:
class QPlayer(BasePlayer):

    def __init__(self, q):
        super().__init__()
        self.q = q

    def move(self,game,state):
        return np.argmax(q[state])

    def update(self,env,state,reward,done):
        print('reward = ',reward)


In [11]:
g = Game(QPlayer(q), HumanPlayer())
g.play()


---( 1 )--------------------------

 X . .
 . . .
 . . .

x state = 9842, o state = 9840 available: [1, 2, 3, 4, 5, 6, 7, 8]
3
reward =  0
reward =  -100

---( 2 )--------------------------

 X . .
 O . .
 . . .

x state = 9815, o state = 9867 available: [1, 2, 4, 5, 6, 7, 8]
...TIE GAME...
