In [37]:
import itertools
import random

class TicTacToeEnv:
    def __init__(self):
        self.states = list(itertools.product(["X", "O", " "], repeat=9))

    @staticmethod
    def actions(state):
        return [i for i, s in enumerate(state) if s == " "]

    @staticmethod
    def transition_model(state, action, player):
        state_list = list(state)
        state_list[action] = player
        return tuple(state_list)


    @staticmethod
    def reward(state, player,scaling_factor = 10):
        win_positions = [(0, 1, 2), (3, 4, 5), (6, 7, 8), (0, 3, 6), (1, 4, 7), (2, 5, 8), (0, 4, 8), (2, 4, 6)]

        #scaling_factor = 10  # Adjust this factor as needed

        for pos in win_positions:
            if state[pos[0]] == state[pos[1]] == state[pos[2]] == player:
                return 1 * scaling_factor  # Increased reward for winning

        if " " not in state:
            return 0  # No additional reward for a draw

        # Additional scenarios
        if player == "X":
            opponent = "O"
        else:
            opponent = "X"

        # Check for a fork (two possible ways to win)
        fork_positions = [(0, 2, 4), (0, 4, 8), (2, 4, 6), (2, 5, 8)]
        for pos in fork_positions:
            if state[pos[0]] == state[pos[1]] == player and state[pos[2]] == " ":
                return 0.9 * scaling_factor

        # Block opponent's fork
        for pos in fork_positions:
            if state[pos[0]] == state[pos[1]] == opponent and state[pos[2]] == " ":
                return 0.8 * scaling_factor

        # Check for center control
        if state[4] == player:
            return 0.7 * scaling_factor

        # Check for opposite corners
        if state[0] == player and state[8] == player:
            return 0.6 * scaling_factor

        # Check for corners
        corner_positions = [0, 2, 6, 8]
        if any(state[i] == player for i in corner_positions):
            return 0.5 * scaling_factor

        # Check for edges
        edge_positions = [1, 3, 5, 7]
        if any(state[i] == player for i in edge_positions):
            return 0.4 * scaling_factor

        return -1 * scaling_factor


    @staticmethod
    def is_terminal(state):
        return TicTacToeEnv.reward(state, "X") == 1 or TicTacToeEnv.reward(state, "O") == 1 or " " not in state

    @staticmethod
    def get_available_actions(state):
        return [i for i, s in enumerate(state) if s == " "]

if __name__ == "__main__":
    game = TicTacToeEnv()
    print("Possible States:")
    print(game.states)


Possible States:
[('X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X'), ('X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'O'), ('X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', ' '), ('X', 'X', 'X', 'X', 'X', 'X', 'X', 'O', 'X'), ('X', 'X', 'X', 'X', 'X', 'X', 'X', 'O', 'O'), ('X', 'X', 'X', 'X', 'X', 'X', 'X', 'O', ' '), ('X', 'X', 'X', 'X', 'X', 'X', 'X', ' ', 'X'), ('X', 'X', 'X', 'X', 'X', 'X', 'X', ' ', 'O'), ('X', 'X', 'X', 'X', 'X', 'X', 'X', ' ', ' '), ('X', 'X', 'X', 'X', 'X', 'X', 'O', 'X', 'X'), ('X', 'X', 'X', 'X', 'X', 'X', 'O', 'X', 'O'), ('X', 'X', 'X', 'X', 'X', 'X', 'O', 'X', ' '), ('X', 'X', 'X', 'X', 'X', 'X', 'O', 'O', 'X'), ('X', 'X', 'X', 'X', 'X', 'X', 'O', 'O', 'O'), ('X', 'X', 'X', 'X', 'X', 'X', 'O', 'O', ' '), ('X', 'X', 'X', 'X', 'X', 'X', 'O', ' ', 'X'), ('X', 'X', 'X', 'X', 'X', 'X', 'O', ' ', 'O'), ('X', 'X', 'X', 'X', 'X', 'X', 'O', ' ', ' '), ('X', 'X', 'X', 'X', 'X', 'X', ' ', 'X', 'X'), ('X', 'X', 'X', 'X', 'X', 'X', ' ', 'X', 'O'), ('X', 'X', 'X', 'X', 'X', 'X', ' ', 'X', '

### Value Iteration

In [39]:
class ValueIteration:
    def __init__(self, theta=0.001, discount_factor=0.99):
        self.theta = theta
        self.discount_factor = discount_factor
        self.states = TicTacToeEnv().states
        self.V = {s: 0 for s in self.states}

    def value_iteration(self):
        while True:
            delta = 0
            for s in self.states:
                if TicTacToeEnv.is_terminal(s):
                    self.V[s] = 0
                    continue
                v = self.V[s]
                self.V[s] = max([sum([p * (TicTacToeEnv.reward(s_prime, "X") + self.discount_factor * self.V[s_prime])
                                     for p, s_prime in [(1, TicTacToeEnv.transition_model(s, a, "X"))]])
                                 for a in TicTacToeEnv.actions(s)])
                delta = max(delta, abs(v - self.V[s]))
            if delta < self.theta:
                break

    def get_policy(self, state):
        return max(TicTacToeEnv.actions(state), key=lambda a:
                   sum([p * (TicTacToeEnv.reward(s_prime, "X") + self.discount_factor * self.V[s_prime])
                        for p, s_prime in [(1, TicTacToeEnv.transition_model(state, a, "X"))]]))

    def policy_random(self, state):
        return random.choice(TicTacToeEnv.actions(state))

    # print the board
    def print_board(self,state):
        print(state[0:3])
        print(state[3:6])
        print(state[6:9])
        print("\n")


if __name__ == "__main__":
    game = ValueIteration()
    game.value_iteration()

    state = (" ",) * 9  # Initial state
    current_player = "X"
    while not TicTacToeEnv.is_terminal(state):
        game.print_board(state)
        if current_player == "X":  # "X" could be a human player
            action = int(input("Enter your move (0-8): "))
        else:  # "O" follows a policy
            action = game.get_policy(state)

        # Update state
        state = TicTacToeEnv.transition_model(state, action, current_player)
        current_player = "X" if current_player == "O" else "O"

    # Final outcome
    scaling_factor=10
    game.print_board(state)
    if TicTacToeEnv.reward(state, "X",scaling_factor) == 1*scaling_factor:
        print("Player X wins!")
    elif TicTacToeEnv.reward(state, "O",scaling_factor) == 1*scaling_factor:
        print("Player O wins!")
    else:
        print("It's a draw!")


(' ', ' ', ' ')
(' ', ' ', ' ')
(' ', ' ', ' ')


(' ', ' ', ' ')
(' ', 'X', ' ')
(' ', ' ', ' ')


('O', ' ', ' ')
(' ', 'X', ' ')
(' ', ' ', ' ')


('O', ' ', 'X')
(' ', 'X', ' ')
(' ', ' ', ' ')


('O', ' ', 'X')
(' ', 'X', ' ')
('O', ' ', ' ')


('O', ' ', 'X')
('X', 'X', ' ')
('O', ' ', ' ')


('O', ' ', 'X')
('X', 'X', 'O')
('O', ' ', ' ')


('O', ' ', 'X')
('X', 'X', 'O')
('O', 'X', ' ')


('O', 'O', 'X')
('X', 'X', 'O')
('O', 'X', ' ')


('O', 'O', 'X')
('X', 'X', 'O')
('O', 'X', ' ')


('O', 'O', 'X')
('X', 'X', 'O')
('O', 'X', 'O')


It's a draw!
