In [1]:
import numpy as np
import matplotlib.pyplot as plt
import time

In [2]:
LENGTH = 3
class Environment:
    """Tic-tac-toe board of size LENGTH x LENGTH.

    Each cell of ``board`` holds ``self.x`` (1), ``self.o`` (-1) or 0 (empty).
    ``winner`` is the symbol found by the most recent ``game_over()`` call, or
    None when that call found no completed line.
    """

    def __init__(self, board = None):
        """Create an empty board, or start from ``board`` when one is given.

        board: optional LENGTH x LENGTH array-like of 1 / -1 / 0; it is copied.
        """
        self.x = 1
        self.o = -1
        if board is None:
            self.board = np.zeros((LENGTH, LENGTH))
        else:
            # Copy so later moves do not mutate the caller's data.
            self.board = np.array(board, dtype=float).reshape((LENGTH, LENGTH))
        self.winner = None
        self.num_states = 3 ** (LENGTH * LENGTH)

    def game_over(self):
        """Return True if a player completed a line or the grid is full.

        Also refreshes ``self.winner``: it is set to the winning symbol, or to
        None when nobody has a full line on the *current* board.
        """
        # Forget the result of any earlier evaluation; without this a draw
        # checked after a decided board would still report the old winner.
        self.winner = None

        # Same order as a manual scan: rows, columns, main diagonal, anti-diagonal.
        line_sums = [
            *self.board.sum(axis=1),
            *self.board.sum(axis=0),
            self.board.trace(),
            np.fliplr(self.board).trace(),
        ]
        for line_sum in line_sums:
            if line_sum == LENGTH:
                self.winner = self.x
                return True
            if line_sum == -LENGTH:
                self.winner = self.o
                return True

        # No completed line: the game is over only if no empty cell is left.
        return not np.any(self.board == 0)

    def draw_board(self):
        """Print the grid to stdout using 'x', 'o' and blanks."""
        print('-------------------------')
        for i in range(LENGTH):
            for j in range(LENGTH):
                print('|', end = '   ')
                if self.board[i, j] == self.x:
                    print('x', end = '   ')
                elif self.board[i, j] == self.o:
                    print('o', end = '   ')
                else:
                    print(' ', end = '   ')
            print('|', end = '\n')
            print('-------------------------')

    # The state is an integer obtained by considering the grid as a number in base 3, then converting this number to the decimal base
    # (empty = 0, x = 1, o = 2; cell (0, 0) is the least significant digit, cells are read row by row)
    # e.g:
    #-------------------------
    #|   x   |       |       |
    #-------------------------       state = 3^0 * 1 + 3^1 * 0 + 3^2 * 0 + 3^3 * 0 + 3^4 * 1 + ... + 3^7 * 2 + 3^8 * 2 = 17578
    #|       |   x   |       |
    #-------------------------
    #|       |   o   |   o   |
    #-------------------------
    def get_state(self):
        """Return the board encoded as an integer in [0, num_states)."""
        result = 0
        power = 0
        for i in range(LENGTH):
            for j in range(LENGTH):
                value = 0
                if self.board[i, j] == self.x:
                    value = 1
                elif self.board[i, j] == self.o:
                    value = 2
                result += (3 ** power) * value
                power += 1
        return result

    def set_board(self, board):
        """Replace the board (stored by reference) and drop the cached winner."""
        self.board = board
        # The previous winner belonged to the previous board.
        self.winner = None

    # Converts a state (number) to a board (3x3 grid) by finding the representation of the state in base 3
    def convert_state_to_board(self, state):
        """Inverse of ``get_state``: return the LENGTH x LENGTH grid for ``state``."""
        # flat grid that will contain the base 3 representation of the state
        grid = np.zeros(LENGTH * LENGTH)
        i = 0
        # Division algorithm to go from base 10 to another base
        while state >= 3:
            grid[i] = state % 3
            state //= 3
            i += 1
        grid[i] = state
        # Replace the '1' and '2' by the actual symbols for x and o
        grid = np.where(grid == 1, self.x, grid)
        grid = np.where(grid == 2, self.o, grid)

        grid = grid.reshape((LENGTH, LENGTH))

        return grid

    def is_empty(self, i, j):
        """Return whether cell (i, j) is unoccupied."""
        return self.board[i, j] == 0

    # The player gets a reward only if he wins the game, otherwise he doesn't get any
    def reward(self, symbol):
        """Return 1 if the game is over and ``symbol`` won, else 0."""
        if self.game_over() and self.winner == symbol:
            return 1
        else:
            return 0

    def reset(self):
        """Clear the board and the winner for a new game."""
        self.winner = None
        self.board = np.zeros((LENGTH, LENGTH))

In [3]:
# Sanity check: x on the (0, 0) and (1, 1) diagonal cells, o on the last two
# cells of the bottom row. Nobody has three in a row yet, so o's reward is 0.
env = Environment()
env.board[[0, 1], [0, 1]] = env.x
env.board[2, 1:] = env.o
env.draw_board()
env.game_over()
env.reward(env.o)

-------------------------
|   x   |       |       |
-------------------------
|       |   x   |       |
-------------------------
|       |   o   |   o   |
-------------------------


0

In [4]:
class Agent:
    """Epsilon-greedy tic-tac-toe player with a tabular state-value function.

    ``value_function`` maps every encoded board state to a value: 1 for states
    this agent has won, 0 for other finished states, 0.5 for unfinished ones.
    ``update()`` then moves the values of visited states toward their successors.
    """

    def __init__(self, eps, alpha, symbol, verbose = False):
        self.eps = eps    # for epsilon-greedy algorithm
        self.alpha = alpha    # learning rate
        self.verbose = verbose    # whether or not the internal behaviours should be reported
        self.symbol = symbol    # empty, 'x' = 1 or 'o' = -1
        self.state_history = []    # all the states we went through during a game
        self.value_function = {}    # (state, value) mapping, saved as a dictionary
        self.set_value_function_iterative()    # calls the set_value_function method that finds the values of all possible states

    # helper method
    def set_value_function_recursive_helper(self):
        """Fill ``self.value_function`` with the recursive enumeration, starting at cell (0, 0)."""
        if self.verbose:
            print('Initializing value function recursively...')
        self.set_value_function_recursive(Environment(), self.value_function, 0, 0)

    def set_symbol(self, symbol):
        self.symbol = symbol

    def set_verbose(self, verbose):
        self.verbose = verbose

    def _initial_value(self, env):
        """Return the starting value of the board currently held by ``env``.

        1 if this agent has won, 0 if the game is over otherwise (loss or
        draw), 0.5 while the game is still open.
        """
        # The same env object is reused for many boards; clear the winner so a
        # drawn board cannot inherit the result of a previously examined one.
        env.winner = None
        if not env.game_over():
            return 0.5
        return 1 if env.winner == self.symbol else 0

    # finds values of all possible states, given the current environment (current board) and the coordinates of the next symbol to be placed on the grid
    def set_value_function_recursive(self, env, value_dict, i, j):
        """Enumerate every filling of the cells from (i, j) onward (row-major).

        Each resulting state and its initial value are stored in ``value_dict``.
        ``env.board`` is modified during the walk; cell (i, j) is left at 0.
        """
        # Next cell in row-major order, or None once (i, j) is the last cell.
        if j < LENGTH - 1:
            next_cell = (i, j + 1)
        elif i < LENGTH - 1:
            next_cell = (i + 1, 0)
        else:
            next_cell = None

        # 0 comes last so the cell is empty again when the loop ends.
        for symbol in [env.x, env.o, 0]:
            env.board[i, j] = symbol
            # save the (state, value) pair in the dictionary
            value_dict[env.get_state()] = self._initial_value(env)

            if next_cell is not None:
                self.set_value_function_recursive(env, value_dict, next_cell[0], next_cell[1])

    # More efficient way of finding the value function (without recursion)
    def set_value_function_iterative(self):
        """Rebuild ``self.value_function`` by visiting every state number in turn."""
        if self.verbose:
            print('Initializing value function iteratively...')
        env = Environment()
        self.value_function = {}
        for state in range(env.num_states):
            # Convert the state (number) to a board and make it the env's board
            env.set_board(env.convert_state_to_board(state))
            # Save the (state, value) pair in the value function dictionary
            self.value_function[state] = self._initial_value(env)

    def take_action(self, env, training_mode = True):
        """Place this agent's symbol on ``env.board`` and record the new state.

        With probability ``eps`` (only when ``training_mode`` is true) a random
        empty cell is chosen; otherwise the empty cell leading to the
        highest-valued state is chosen, the first one winning ties.

        Raises ValueError if the board has no empty cell.
        """
        # find the possible actions (empty cells) the agent can take
        actions = [(i, j) for i in range(LENGTH) for j in range(LENGTH) if env.board[i, j] == 0]
        if not actions:
            # Without this guard the greedy branch would overwrite cell (0, 0).
            raise ValueError('No empty cell left on the board')
        values = []

        # epsilon-greedy algorithm
        r = np.random.random()
        if r < self.eps and training_mode:
            # take a random action (exploration, only when training)
            if self.verbose:
                print('Taking random action')
            action_index = np.random.choice(len(actions))
            action = actions[action_index]
        else:
            # Find the action with the maximum value
            if self.verbose:
                print('Finding action with maximum value')
            max_value = -1
            max_action = (0, 0)
            for (i, j) in actions:
                # Try the move, look up the resulting state's value, then undo it
                env.board[i, j] = self.symbol
                value = self.value_function[env.get_state()]
                values.append(value)
                if value > max_value:
                    max_value = value
                    max_action = (i, j)
                env.board[i, j] = 0

            if self.verbose:
                # Printing board with the values of each possible action (Not  necessary)
                print('-------------------------')
                for i in range(LENGTH):
                    for j in range(LENGTH):
                        print('|', end = ' ')
                        if env.board[i, j] == env.x:
                            print('  x', end = '   ')
                        elif env.board[i, j] == env.o:
                            print('  o', end = '   ')
                        else:
                            print(' {:0.2f}'.format(values[actions.index((i, j))]), end = ' ')
                    print('|', end = '\n')
                    print('-------------------------')

            action = max_action

        # Place symbol on the selected action
        env.board[action] = self.symbol

        # Update state history
        self.update_state_history(env.get_state())

    # Saves each state reached by the agent (in order)
    def update_state_history(self, state):
        self.state_history.append(state)

    # Updates the value functions with the formula:
    #    V(s) = V(s) + alpha(V(s') - V(s))
    def update(self):
        """Back up values along ``state_history``, from the last transition to the first.

        The final state's value is left unchanged; walking backwards lets it
        propagate through the whole recorded game in one pass.
        """
        for i in range(len(self.state_history) - 2, -1, -1):
            state = self.state_history[i]
            next_state = self.state_history[i + 1]
            self.value_function[state] += self.alpha * (self.value_function[next_state] - self.value_function[state])

    def reset_history(self):
        self.state_history = []
       

In [46]:
# Play one game
def play_game(p1, p2, env, verbose = False, training_mode = True):
    """Let p1 and p2 take turns on env (p1 first) until env.game_over() is true.

    After every move the resulting state is also appended to the history of
    the player who did not move, because take_action only records it for the
    mover. With verbose set, the turn banner, the board and the final result
    are printed.
    """
    # (player to move, player only observing, banner) for each half of a round
    turns = (
        (p1, p2, 'Player 1 turn:'),
        (p2, p1, 'Player 2 turn:'),
    )

    ply = 0
    # The loop condition doubles as the "did the last move end the game?" check
    while not env.game_over():
        mover, observer, banner = turns[ply % 2]
        if verbose:
            print(banner)

        mover.take_action(env, training_mode)
        observer.update_state_history(env.get_state())

        if verbose:
            env.draw_board()
        ply += 1

    # print results
    if not verbose:
        return
    if env.winner is None:
        print('Game Over. Draw')
    elif env.winner == p1.symbol:
        print('Game Over. Player 1 wins')
    else:
        print('Game Over. Player 2 wins')

In [6]:
# Shared board plus one learner per symbol; both use exploration rate 0.3
# and learning rate 0.3 and report their value-table initialisation.
env = Environment()
p1 = Agent(eps=0.3, alpha=0.3, symbol=env.x, verbose=True)
p2 = Agent(eps=0.3, alpha=0.3, symbol=env.o, verbose=True)

Initializing value function iteratively...
Initializing value function iteratively...


In [7]:
#play_game(p1, p2, env, True)

In [8]:
# Play games for the given number of episodes and update the value functions after every game
def train_agents(p1, p2, env, episodes = 10000, verbose = False):
    """Train p1 and p2 by self-play for ``episodes`` games.

    Every episode starts from a freshly reset board and empty state histories,
    so a board or history left over from a game played before this call cannot
    leak into the first update. After each game both agents update their value
    functions from their own history. The env and histories are left reset on
    return.
    """
    p1.set_verbose(verbose)
    p2.set_verbose(verbose)
    for _ in range(episodes):
        # Start every episode from a clean slate
        env.reset()
        p1.reset_history()
        p2.reset_history()

        # Pick the starter at random, so that each player learns both how to attack and defend
        if np.random.random() < 0.5:
            play_game(p1, p2, env, verbose)
        else:
            play_game(p2, p1, env, verbose)

        # update the value functions, then reset state histories for a new game
        p1.update()
        p2.update()
        p1.reset_history()
        p2.reset_history()
        # leave the board empty for whoever uses env next
        env.reset()

In [43]:
# Self-play training run; re-run this cell to train the same agents further.
train_agents(
    p1,
    p2,
    env,
    episodes=1000,
    verbose=False,
)

In [10]:
# Watch one game between the trained agents: exploration is off
# (training_mode=False) and each agent prints the values it sees.
p1.set_verbose(True)
p2.set_verbose(True)
env.reset()
play_game(p1, p2, env, verbose=True, training_mode=False)

Player 1 turn:
Finding action with maximum value
-------------------------
|  0.45 |  0.46 |  0.45 |
-------------------------
|  0.46 |  0.58 |  0.42 |
-------------------------
|  0.46 |  0.43 |  0.46 |
-------------------------
-------------------------
|       |       |       |
-------------------------
|       |   x   |       |
-------------------------
|       |       |       |
-------------------------
Player 2 turn:
Finding action with maximum value
-------------------------
|  0.38 |  0.38 |  0.40 |
-------------------------
|  0.42 |   x   |  0.41 |
-------------------------
|  0.40 |  0.40 |  0.42 |
-------------------------
-------------------------
|       |       |       |
-------------------------
|   o   |   x   |       |
-------------------------
|       |       |       |
-------------------------
Player 1 turn:
Finding action with maximum value
-------------------------
|  0.53 |  0.55 |  0.50 |
-------------------------
|   o   |   x   |  0.50 |
---------------------

In [39]:
# Adds the take_human_action method, which is more appropriate for human players
class Human(Agent):
    """Agent whose moves are typed in by a person via take_human_action."""

    def __init__(self, symbol):
        # eps and alpha are 0: a human neither explores nor learns. The base
        # constructor still builds the full value table.
        super().__init__(0,0,symbol)

    @staticmethod
    def _parse_action(action_str):
        """Return (row, col) from the first two characters, or None if they are not two digits."""
        try:
            return (int(action_str[0]), int(action_str[1]))
        except (IndexError, ValueError):
            # Empty, too short or non-numeric input: treat as an invalid action.
            return None

    # This function will ask the human player for his actions, instead of choosing them greedily
    def take_human_action(self, env, training_mode = False):
        """Prompt until a valid empty cell is entered, then play it on ``env.board``.

        The action is typed as two digits, row then column, e.g. ``21`` for
        cell (2, 1). Malformed input is reported as invalid and asked again.
        ``training_mode`` is unused; it only mirrors ``take_action``'s signature.

        Raises ValueError if the board has no empty cell (otherwise the prompt
        could never be satisfied).
        """
        # find the possible actions (empty cells) the player can take
        actions = [(i, j) for i in range(LENGTH) for j in range(LENGTH) if env.board[i, j] == 0]
        if not actions:
            raise ValueError('No empty cell left on the board')

        # Ask for the human player's action, given as a double digit integer
        #    e.g.: 21 corresponds to action (2, 1)
        print('Enter action:', end = ' ')
        action = self._parse_action(input())
        # Keep asking for a new action if the given one is not valid
        while action not in actions:
            print('Invalid action!')
            print('Enter another action:', end = ' ')
            action = self._parse_action(input())

        # Place symbol on the selected action
        env.board[action] = self.symbol

        # Update state history (not really necessary here)
        self.update_state_history(env.get_state())

In [40]:
# Human opponent playing the 'o' symbol.
human = Human(symbol=env.o)

In [47]:
# This method will handle games between ai and human better (or human and human)
def play_vs_human(p1, p2, env, verbose = True):
    """Reset env and play one game, p1 moving first, until env.game_over() is true.

    A Human player is asked for input through take_human_action; any other
    player chooses through take_action with training mode off. With verbose
    set, the turn banner, the board and the final result are printed.
    """
    def _move(player):
        # use take_action if ai playing, or take_human_action if human playing
        if isinstance(player, Human):
            player.take_human_action(env, False)
        else:
            player.take_action(env, False)

    env.reset()
    schedule = [('Player 1 turn:', p1), ('Player 2 turn:', p2)]
    side = 0
    # Re-checking the loop condition after every single move also covers the
    # case where player 1 finishes the game before player 2 gets to play
    while not env.game_over():
        banner, player = schedule[side]
        if verbose:
            print(banner)

        _move(player)

        if verbose:
            env.draw_board()
        side = 1 - side

    # print results
    if verbose:
        if env.winner is None:
            print('Game Over. Draw')
        elif env.winner == p1.symbol:
            print('Game Over. Player 1 wins')
        else:
            print('Game Over. Player 2 wins')

In [45]:
# The human ('o') moves first as player 1; the trained agent p1 ('x') answers as player 2.
play_vs_human(human, p1, env, verbose=True)

Player 1 turn:
Enter action: 11
-------------------------
|       |       |       |
-------------------------
|       |   o   |       |
-------------------------
|       |       |       |
-------------------------
Player 2 turn:
-------------------------
|       |       |       |
-------------------------
|   x   |   o   |       |
-------------------------
|       |       |       |
-------------------------
Player 1 turn:
Enter action: 20
-------------------------
|       |       |       |
-------------------------
|   x   |   o   |       |
-------------------------
|   o   |       |       |
-------------------------
Player 2 turn:
-------------------------
|       |       |   x   |
-------------------------
|   x   |   o   |       |
-------------------------
|   o   |       |       |
-------------------------
Player 1 turn:
Enter action: 21
-------------------------
|       |       |   x   |
-------------------------
|   x   |   o   |       |
-------------------------
|   o   |   o   

In [48]:
# Two human players, one per symbol, for a human-vs-human game.
h1, h2 = Human(symbol=env.x), Human(symbol=env.o)

In [50]:
# Human-vs-human game; h1 ('x') moves first and verbose output stays at its default (on).
play_vs_human(p1=h1, p2=h2, env=env)

Player 1 turn:
Enter action: 00
-------------------------
|   x   |       |       |
-------------------------
|       |       |       |
-------------------------
|       |       |       |
-------------------------
Player 2 turn:
Enter action: 11
-------------------------
|   x   |       |       |
-------------------------
|       |   o   |       |
-------------------------
|       |       |       |
-------------------------
Player 1 turn:
Enter action: 02
-------------------------
|   x   |       |   x   |
-------------------------
|       |   o   |       |
-------------------------
|       |       |       |
-------------------------
Player 2 turn:
Enter action: 01
-------------------------
|   x   |   o   |   x   |
-------------------------
|       |   o   |       |
-------------------------
|       |       |       |
-------------------------
Player 1 turn:
Enter action: 21
-------------------------
|   x   |   o   |   x   |
-------------------------
|       |   o   |       |
--------