In [21]:
import itertools
import random

class TicTacToeEnv:
    def __init__(self):
        self.states = list(itertools.product(["X", "O", " "], repeat=9))

    @staticmethod
    def actions(state):
        return [i for i, s in enumerate(state) if s == " "]

    @staticmethod
    def transition_model(state, action, player):
        state_list = list(state)
        state_list[action] = player
        return tuple(state_list)

    @staticmethod

    def reward(state, player):
        win_positions = [(0, 1, 2), (3, 4, 5), (6, 7, 8), (0, 3, 6), (1, 4, 7), (2, 5, 8), (0, 4, 8), (2, 4, 6)]

        for pos in win_positions:
            if state[pos[0]] == state[pos[1]] == state[pos[2]] == player:
                return 1

        if " " not in state:
            return 0

        # Additional scenarios
        if player == "X":
            opponent = "O"
        else:
            opponent = "X"

        # Check for a fork (two possible ways to win)
        fork_positions = [(0, 2, 4), (0, 4, 8), (2, 4, 6), (2, 5, 8)]
        for pos in fork_positions:
            if state[pos[0]] == state[pos[1]] == player and state[pos[2]] == " ":
                return 0.9

        # Block opponent's fork
        for pos in fork_positions:
            if state[pos[0]] == state[pos[1]] == opponent and state[pos[2]] == " ":
                return 0.8

        # Check for center control
        if state[4] == player:
            return 0.7

        # Check for opposite corners
        if state[0] == player and state[8] == player:
            return 0.6

        # Check for corners
        corner_positions = [0, 2, 6, 8]
        if any(state[i] == player for i in corner_positions):
            return 0.5

        # Check for edges
        edge_positions = [1, 3, 5, 7]
        if any(state[i] == player for i in edge_positions):
            return 0.4

        return -1

    @staticmethod
    def is_terminal(state):
        return TicTacToeEnv.reward(state, "X") == 1 or TicTacToeEnv.reward(state, "O") == 1 or " " not in state

    @staticmethod
    def get_available_actions(state):
        return [i for i, s in enumerate(state) if s == " "]

if __name__ == "__main__":
    game = TicTacToeEnv()
    print("Possible States:")
    print(game.states)


Possible States:
[('X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X'), ('X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'O'), ('X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', ' '), ('X', 'X', 'X', 'X', 'X', 'X', 'X', 'O', 'X'), ('X', 'X', 'X', 'X', 'X', 'X', 'X', 'O', 'O'), ('X', 'X', 'X', 'X', 'X', 'X', 'X', 'O', ' '), ('X', 'X', 'X', 'X', 'X', 'X', 'X', ' ', 'X'), ('X', 'X', 'X', 'X', 'X', 'X', 'X', ' ', 'O'), ('X', 'X', 'X', 'X', 'X', 'X', 'X', ' ', ' '), ('X', 'X', 'X', 'X', 'X', 'X', 'O', 'X', 'X'), ('X', 'X', 'X', 'X', 'X', 'X', 'O', 'X', 'O'), ('X', 'X', 'X', 'X', 'X', 'X', 'O', 'X', ' '), ('X', 'X', 'X', 'X', 'X', 'X', 'O', 'O', 'X'), ('X', 'X', 'X', 'X', 'X', 'X', 'O', 'O', 'O'), ('X', 'X', 'X', 'X', 'X', 'X', 'O', 'O', ' '), ('X', 'X', 'X', 'X', 'X', 'X', 'O', ' ', 'X'), ('X', 'X', 'X', 'X', 'X', 'X', 'O', ' ', 'O'), ('X', 'X', 'X', 'X', 'X', 'X', 'O', ' ', ' '), ('X', 'X', 'X', 'X', 'X', 'X', ' ', 'X', 'X'), ('X', 'X', 'X', 'X', 'X', 'X', ' ', 'X', 'O'), ('X', 'X', 'X', 'X', 'X', 'X', ' ', 'X', '

### Policy Iteration

In [22]:
class PolicyIteration:
    def __init__(self, theta=0.1, discount_factor=0.99):
        self.theta = theta
        self.discount_factor = discount_factor
        self.states = TicTacToeEnv().states
        self.V = {s: 0 for s in self.states}
        self.pi = {s: 0 for s in self.states}  # Initialize policy arbitrarily

    def is_terminal(self, state):
        return TicTacToeEnv.reward(state, "X") == 1 or TicTacToeEnv.reward(state, "O") == 1 or " " not in state

    def policy_evaluation(self):
        while True:
            delta = 0
            for s in self.states:
                if self.is_terminal(s):
                    self.V[s] = 0
                    continue

                v = self.V[s]
                a = self.pi[s]
                s_prime = TicTacToeEnv.transition_model(s, a, "X")
                self.V[s] = TicTacToeEnv.reward(s_prime, "X") + self.discount_factor * self.V[s_prime]

                delta = max(delta, abs(v - self.V[s]))

            if delta <= self.theta:
                break

    def policy_improvement(self):
        policy_stable = True

        for s in self.states:
            if self.is_terminal(s):
                continue

            old_action = self.pi[s]
            # Update the policy
            self.pi[s] = max(TicTacToeEnv.actions(s), key=lambda a: sum([p * (TicTacToeEnv.reward(s_prime, "X") + self.discount_factor * self.V[s_prime])
                                                                      for p, s_prime in [(1, TicTacToeEnv.transition_model(s, a, "X"))]]))

            if old_action != self.pi[s]:
                policy_stable = False

        return policy_stable

    def print_board(self, state):
        print(state[0:3])
        print(state[3:6])
        print(state[6:9])
        print("\n")

    def play_game(self):
        # Game loop
        state = (" ",) * 9   # Initial state
        current_player = "X"
        while not self.is_terminal(state):
            self.print_board(state)
            if current_player == "X":  # "X" could be a human player
                action = int(input("Enter your move (0-8): "))
            else:  # "O" follows the current policy
                action = self.pi[state]

            # Update state
            state = TicTacToeEnv.transition_model(state, action, current_player)
            current_player = "X" if current_player == "O" else "O"

        # Final outcome
        self.print_board(state)
        if TicTacToeEnv.reward(state, "X") == 1:
            print("Player X wins!")
        elif TicTacToeEnv.reward(state, "O") == 1:
            print("Player O wins!")
        else:
            print("It's a draw!")

    def policy_iteration(self):
        is_policy_stable = False

        while not is_policy_stable:
            self.policy_evaluation()
            is_policy_stable = self.policy_improvement()

if __name__ == "__main__":
    game = PolicyIteration()
    game.policy_iteration()
    game.play_game()


(' ', ' ', ' ')
(' ', ' ', ' ')
(' ', ' ', ' ')


(' ', ' ', ' ')
(' ', 'X', ' ')
(' ', ' ', ' ')


('O', ' ', ' ')
(' ', 'X', ' ')
(' ', ' ', ' ')


('O', ' ', ' ')
(' ', 'X', 'X')
(' ', ' ', ' ')


('O', 'O', ' ')
(' ', 'X', 'X')
(' ', ' ', ' ')


('O', 'O', ' ')
('X', 'X', 'X')
(' ', ' ', ' ')


Player X wins!
