In [1]:
import numpy as np
from game.game_round import GameRound
from game.game_series import GameSeries
from agents.human_agent import HumanAgent
from agents.random_agent import RandomAgent
from agents.abstract_agent import AbstractAgent
from environment.board import Board
from environment.player_type import PlayerType
from game.game_reward import GameReward

# Human player

In [4]:
GameRound(HumanAgent("Alice"), HumanAgent("Bob")).play()

Human Alice's turn as X


  |   |        |   |  
---------    ---------
  | X |        | 1 |  
---------    ---------
  |   |        |   |  
Human Bob's turn as O
  |   |        |   |  
---------    ---------
  | X | O      | 1 | 2
---------    ---------
  |   |        |   |  
Human Alice's turn as X
X |   |      3 |   |  
---------    ---------
  | X | O      | 1 | 2
---------    ---------
  |   |        |   |  
Human Bob's turn as O
X |   |      3 |   |  
---------    ---------
  | X | O      | 1 | 2
---------    ---------
  | O |        | 4 |  
Human Alice's turn as X
X |   |      3 |   |  
---------    ---------
  | X | O      | 1 | 2
---------    ---------
  | O | X      | 4 | 5
Human Alice wins!


1

# Random agent player

In [2]:
GameRound(RandomAgent("Charlie"), RandomAgent("Damian")).play()

Random Agent Charlie's turn as X
  |   |        |   |  
---------    ---------
X |   |      1 |   |  
---------    ---------
  |   |        |   |  
Random Agent Damian's turn as O
  |   |        |   |  
---------    ---------
X |   |      1 |   |  
---------    ---------
O |   |      2 |   |  
Random Agent Charlie's turn as X
  |   |        |   |  
---------    ---------
X |   |      1 |   |  
---------    ---------
O |   | X    2 |   | 3
Random Agent Damian's turn as O
O |   |      4 |   |  
---------    ---------
X |   |      1 |   |  
---------    ---------
O |   | X    2 |   | 3
Random Agent Charlie's turn as X
O | X |      4 | 5 |  
---------    ---------
X |   |      1 |   |  
---------    ---------
O |   | X    2 |   | 3
Random Agent Damian's turn as O
O | X |      4 | 5 |  
---------    ---------
X |   |      1 |   |  
---------    ---------
O | O | X    2 | 6 | 3
Random Agent Charlie's turn as X
O | X |      4 | 5 |  
---------    ---------
X |   | X    1 |   | 7
---------    

1

## Versus human player

In [3]:
GameRound(HumanAgent("Alice"), RandomAgent("Charlie")).play()

Human Alice's turn as X
X |   |      1 |   |  
---------    ---------
  |   |        |   |  
---------    ---------
  |   |        |   |  
Random Agent Charlie's turn as O
X |   |      1 |   |  
---------    ---------
  |   |        |   |  
---------    ---------
O |   |      2 |   |  
Human Alice's turn as X
X |   |      1 |   |  
---------    ---------
  | X |        | 3 |  
---------    ---------
O |   |      2 |   |  
Random Agent Charlie's turn as O
X | O |      1 | 4 |  
---------    ---------
  | X |        | 3 |  
---------    ---------
O |   |      2 |   |  
Human Alice's turn as X
X | O |      1 | 4 |  
---------    ---------
  | X |        | 3 |  
---------    ---------
O |   | X    2 |   | 5
Human Alice wins!


1

# Learning agent player - values iteration

In [22]:
class EnvironmentModel:
    def get_valid_board_state_actions(self, board_state):
        valid_state_actions = {}
        for action_coords in GridFunctions.all_grid_coordinates(Board.ROWS, Board.COLUMNS):
            action_x, action_y = action_coords
            if board_state[action_x][action_y] == PlayerType.NO_PLAYER:
                valid_state_actions[action_coords] = self._get_resultant_board_state(board_state, action_coords)
        return valid_state_actions
    
    def get_reward_for_board_state(self, board_state):
        win_state = self._get_win_state(board_state)
        if win_state == None:
            return 0
        if win_state == PlayerType.NO_PLAYER:
            return GameReward.DRAW_REWARD
        return GameReward.WIN_REWARD if win_state == PlayerType.PLAYER_1 else GameReward.LOSE_REWARD

    def _get_resultant_board_state(self, board_state, action_coords):
        resultant_board_state = board_state.copy()
        resultant_board_state[action_coords[0]][action_coords[1]] = PlayerType.PLAYER_1
        return resultant_board_state    
    
    def _get_win_state(self, board_state):
        for i in range(Board.ROWS):
            player_at_intersection = board_state[i][i]
            if player_at_intersection == PlayerType.NO_PLAYER:
                continue
            is_matching_row = board_state[i][0] == board_state[i][1] == board_state[i][2]
            is_matching_column = board_state[0][i] == board_state[1][i] == board_state[2][i]
            if is_matching_row or is_matching_column:
                return player_at_intersection
        player_at_centre = board_state[1][1]
        if player_at_centre == PlayerType.NO_PLAYER:
            return None
        is_matching_diagonal = board_state[0][0] == board_state[1][1] == board_state[2][2]
        is_matching_inverse_diagonal = board_state[0][2] == board_state[1][1] == board_state[2][0]
        if is_matching_diagonal or is_matching_inverse_diagonal:
            return player_at_centre
        is_board_full = all([x != 0 for row in board_state for x in row])
        return PlayerType.NO_PLAYER if is_board_full else None

class ValuesIterationPolicy:
    def __init__(self, discount=0.9, delta_threshold=0.01, max_iterations=10):
        self._discount = discount
        self._delta_threshold = delta_threshold
        self._state_values = self._initial_states_value()
        self._model = EnvironmentModel()
        self._iterate_on_values(discount, delta_threshold, max_iterations)

    def get_action_policy(self, state):
        valid_state_actions = self._model.get_valid_board_state_actions(state)
        valid_state_actions_values = {}
        for (x, y), resultant_board_state in valid_state_actions.items():
            valid_state_actions_values[(x, y)] = self._get_state_value(self._board_state_to_state(resultant_board_state))

        actions_values_as_grid = [
            [
                valid_state_actions_values[(x,y)] if (x,y) in valid_state_actions else 0
                for x in range(Board.COLUMNS)
            ]
            for y in range(Board.ROWS)
        ]
        return GridFunctions.normalise_grid(actions_values_as_grid, Board.ROWS, Board.COLUMNS)

    def handle_reward(self, prior_state, action_taken, new_state, reward):
        # this is a dynamic programming model based approach, so learning is upfront
        pass

    def _iterate_on_values(self, discount, delta_threshold, max_iterations):
        for iteration in range(max_iterations):
            delta = 0
            # iterate over states in reverse order as this matches the reward propagation for this game and for a majority of others
            for state in reversed(self._get_all_state_combinations()):
                board_state = self._state_to_board_state(state)
                current_state_value = self._get_state_value(state)
                valid_state_actions = self._model.get_valid_board_state_actions(board_state)
                values = []
                for _, resultant_board_state in valid_state_actions.items():
                    reward = self._model.get_reward_for_board_state(resultant_board_state)
                    resultant_state = self._board_state_to_state(resultant_board_state)
                    value = reward + discount * self._get_state_value(resultant_state)
                    values.append(value)

                new_state_value = max(values) if values else 0
                self._set_state_value(state, new_state_value)
                delta = max(delta, abs(current_state_value - new_state_value))
                
            print(f"Iteration: {iteration}, delta: {delta}")
            if delta < delta_threshold:
                break

    def _initial_states_value(self):
        return np.zeros((3, 3, 3, 3, 3, 3, 3, 3, 3))

    def _get_state_value(self, state):
        return self._state_values[state]
    
    def _set_state_value(self, state, value):
        self._state_values[state] = value

    def _get_all_state_combinations(self):
        return [
            (x0y0, x1y0, x2y0, x0y1, x1y1, x2y1, x0y2, x1y2, x2y2)
            for x0y0 in range(3)
            for x1y0 in range(3)
            for x2y0 in range(3)
            for x0y1 in range(3)
            for x1y1 in range(3)
            for x2y1 in range(3)
            for x0y2 in range(3)
            for x1y2 in range(3)
            for x2y2 in range(3)
        ]
    
    def _state_to_board_state(self, state):
        return [
            [state[0], state[1], state[2]],
            [state[3], state[4], state[5]],
            [state[6], state[7], state[8]],
        ]
    
    def _board_state_to_state(self, board_state):
        return tuple([item for row in board_state for item in row])

class LearningAgent(AbstractAgent):
    MAX_ACTIONS = Board.ROWS * Board.COLUMNS

    def __init__(self, name):
        self.name = name
        self.policy = ValuesIterationPolicy()
        self.round_state_actions = []

    def get_move(self, board_plays, current_player):
        board_state = self._get_board_state(board_plays, current_player)
        action_policy = self._get_state_action_policy(self.policy.get_action_policy(board_state), board_plays)
        action_coordinates = self.random_from_grid(action_policy)
        self.round_state_actions.append((board_state, action_coordinates))
        return action_coordinates

    def random_from_grid(self, action_policy):
        actions_coordinates = GridFunctions.flatten_to_coordinates(action_policy)
        action_index = np.random.choice(range(Board.ROWS*Board.COLUMNS), 1, p=list(actions_coordinates.values())).item()
        return list(actions_coordinates.keys())[action_index]

    def handle_reward(self, reward, board_plays, current_player):
        new_board_state = self._get_board_state(board_plays, current_player)
        prior_board_state, action = self.round_state_actions[-1] if self.round_state_actions else (None, None)
        self.policy.handle_reward(prior_board_state, action, new_board_state, reward)
    
    def __str__(self):
        return f"Learning Agent {self.name}"
    
    def _get_board_state(self, board_plays, current_player):
        return self._get_board_plays_in_agent_perspective(board_plays, current_player)

    def _get_board_plays_in_agent_perspective(self, board_plays, current_player):
        if current_player == PlayerType.PLAYER_1:
            # player ordinal matches agent ordinal
            return board_plays
        player_2_to_agent_a_lookup = {
            PlayerType.NO_PLAYER: 0,
            PlayerType.PLAYER_2: 1,
            PlayerType.PLAYER_1: 2,
        }
        return [
            [player_2_to_agent_a_lookup[square] for square in row]
            for row in board_plays 
        ]
    
    def _get_state_action_policy(self, action_probabilities, board_plays):
        valid_action_probabilities = action_probabilities.copy()
        for x, y in GridFunctions.all_grid_coordinates(Board.ROWS, Board.COLUMNS):
            if board_plays[x][y] != PlayerType.NO_PLAYER:
                valid_action_probabilities[x][y] = 0
        return GridFunctions.normalise_grid(valid_action_probabilities, Board.ROWS, Board.COLUMNS)
    
class GridFunctions:
    @staticmethod
    def normalise_grid(grid, row_count, column_count):
        total = np.sum([grid[x][y] for x, y in GridFunctions.all_grid_coordinates(row_count, column_count)])
        if total == 0:
            average_value = 1 / (row_count * column_count)
            return [[average_value for _ in range(row_count)] for _ in range(column_count)]
        return [
            [grid[x][y] / total for y in range(row_count)] 
            for x in range(column_count)
        ]
    
    @staticmethod
    def all_grid_coordinates(row_count, column_count):
        return [(x, y) for x in range(row_count) for y in range(column_count)]
    
    @staticmethod
    def flatten_to_coordinates(grid):
        return {
            (row_index, column_index): value
            for row_index, row in enumerate(grid) 
            for column_index, value in enumerate(row)
        }
    

In [23]:
learning_agent_a = LearningAgent("Emma")
learning_agent_b = LearningAgent("Fred")
GameSeries(20, learning_agent_a, learning_agent_b).play()

Iteration: 0, delta: 3.0
Iteration: 1, delta: 0
Iteration: 0, delta: 3.0
Iteration: 1, delta: 0


[1, 2, 1, 2, 2, 2, 1, 0, 1, 1, 1, 0, 1, 2, 1, 1, 1, 2, 2, 2]

## Versus random agent player

In [26]:
GameSeries(20, learning_agent_a, RandomAgent("Charlie")).play()

[1, 2, 0, 2, 1, 2, 1, 1, 1, 2, 2, 1, 1, 0, 0, 1, 1, 2, 1, 0]

## Versus human player

In [25]:
GameRound(learning_agent_a, HumanAgent("Alice")).play()

Learning Agent Emma's turn as X
X |   |      1 |   |  
---------    ---------
  |   |        |   |  
---------    ---------
  |   |        |   |  
Human Alice's turn as O
X |   |      1 |   |  
---------    ---------
  | O |        | 2 |  
---------    ---------
  |   |        |   |  
Learning Agent Emma's turn as X
X |   |      1 |   |  
---------    ---------
  | O |        | 2 |  
---------    ---------
  |   | X      |   | 3
Human Alice's turn as O
X |   | O    1 |   | 4
---------    ---------
  | O |        | 2 |  
---------    ---------
  |   | X      |   | 3
Learning Agent Emma's turn as X
X |   | O    1 |   | 4
---------    ---------
  | O | X      | 2 | 5
---------    ---------
  |   | X      |   | 3
Human Alice's turn as O
X |   | O    1 |   | 4
---------    ---------
  | O | X      | 2 | 5
---------    ---------
O |   | X    6 |   | 3
Human Alice wins!


2