In [2]:
import itertools
import random

class TicTacToeEnv:
    """Tic-tac-toe environment whose boards are 9-tuples of "X", "O" or " ".

    Cells are addressed by index 0-8; indices (0, 1, 2), (3, 4, 5) and
    (6, 7, 8) form the rows. ``states`` enumerates every combination of the
    three symbols over nine cells (3 ** 9 tuples), which includes boards that
    can never occur in an actual game.
    """

    def __init__(self):
        symbols = ["X", "O", " "]
        self.states = list(itertools.product(symbols, repeat=9))

    @staticmethod
    def actions(state):
        """Return the indices of all empty cells in ``state``."""
        return [idx for idx, cell in enumerate(state) if cell == " "]

    @staticmethod
    def transition_model(state, action, player):
        """Return a new board tuple with ``player`` written into cell ``action``.

        The input ``state`` is not modified; no legality check is performed.
        """
        cells = [*state]
        cells[action] = player
        return tuple(cells)

    @staticmethod
    def reward(state, player):
        """Score ``state`` from ``player``'s point of view.

        Returns, in order of precedence:
            1     if ``player`` owns a complete line,
            -1    if the opponent owns a complete line,
            0     if the board is full (draw),
            0.5   if ``player`` holds the center cell,
            -0.2  if ``player`` holds any corner,
            -0.1  if ``player`` holds any edge,
            0     otherwise.
        """
        lines = (
            (0, 1, 2), (3, 4, 5), (6, 7, 8),  # rows
            (0, 3, 6), (1, 4, 7), (2, 5, 8),  # columns
            (0, 4, 8), (2, 4, 6),             # diagonals
        )
        opponent = "O" if player == "X" else "X"

        def owns_line(mark):
            return any(state[a] == state[b] == state[c] == mark for a, b, c in lines)

        # Terminal outcomes take priority over the positional shaping below.
        if owns_line(player):
            return 1
        if owns_line(opponent):
            return -1
        if " " not in state:
            return 0

        # Positional shaping for unfinished games: center first, then
        # corners, then edges; the first matching rule decides the reward.
        if state[4] == player:
            return 0.5
        shaping_rules = (
            ((0, 2, 6, 8), -0.2),  # corners
            ((1, 3, 5, 7), -0.1),  # edges
        )
        for cells, shaped_reward in shaping_rules:
            if any(state[i] == player for i in cells):
                return shaped_reward

        return 0

    @staticmethod
    def is_terminal(state):
        """Return True when either mark has won or no empty cell remains."""
        someone_won = any(TicTacToeEnv.reward(state, mark) == 1 for mark in ("X", "O"))
        return someone_won or " " not in state

    @staticmethod
    def get_available_actions(state):
        """Return the indices of all empty cells (same result as ``actions``)."""
        return TicTacToeEnv.actions(state)

if __name__ == "__main__":
    # Build the environment and dump its full enumerated state list
    # (every 9-cell combination of "X", "O" and " ").
    game = TicTacToeEnv()
    print("Possible States:", game.states, sep="\n")


Possible States:
[('X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X'), ('X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'O'), ('X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', ' '), ('X', 'X', 'X', 'X', 'X', 'X', 'X', 'O', 'X'), ('X', 'X', 'X', 'X', 'X', 'X', 'X', 'O', 'O'), ('X', 'X', 'X', 'X', 'X', 'X', 'X', 'O', ' '), ('X', 'X', 'X', 'X', 'X', 'X', 'X', ' ', 'X'), ('X', 'X', 'X', 'X', 'X', 'X', 'X', ' ', 'O'), ('X', 'X', 'X', 'X', 'X', 'X', 'X', ' ', ' '), ('X', 'X', 'X', 'X', 'X', 'X', 'O', 'X', 'X'), ('X', 'X', 'X', 'X', 'X', 'X', 'O', 'X', 'O'), ('X', 'X', 'X', 'X', 'X', 'X', 'O', 'X', ' '), ('X', 'X', 'X', 'X', 'X', 'X', 'O', 'O', 'X'), ('X', 'X', 'X', 'X', 'X', 'X', 'O', 'O', 'O'), ('X', 'X', 'X', 'X', 'X', 'X', 'O', 'O', ' '), ('X', 'X', 'X', 'X', 'X', 'X', 'O', ' ', 'X'), ('X', 'X', 'X', 'X', 'X', 'X', 'O', ' ', 'O'), ('X', 'X', 'X', 'X', 'X', 'X', 'O', ' ', ' '), ('X', 'X', 'X', 'X', 'X', 'X', ' ', 'X', 'X'), ('X', 'X', 'X', 'X', 'X', 'X', ' ', 'X', 'O'), ('X', 'X', 'X', 'X', 'X', 'X', ' ', 'X', '

### Monte Carlo Visit

In [7]:
class MonteCarloVisit:
    """First-visit Monte Carlo state-value estimation from random self-play.

    Values are estimated from the point of view of "X" using the rewards
    reported by ``self.game.reward(state, "X")``. The estimates are then used
    by a greedy "X" player in an interactive console game against a human "O".
    """

    def __init__(self, num_episodes=10000):
        """Store the number of self-play episodes and create the environment."""
        self.num_episodes = num_episodes
        self.game = TicTacToeEnv()

    def generate_episode(self):
        """Play one game with uniformly random moves, "X" moving first.

        Returns:
            A list of ``(state, action, player)`` tuples, one per move, where
            ``state`` is the board *before* ``action`` was applied. The final
            (terminal) board is therefore not part of the list; it can be
            rebuilt by applying the last action to the last state.
        """
        episode = []
        state = (" ",) * 9
        current_player = "X"

        while not self.game.is_terminal(state):
            available_actions = self.game.get_available_actions(state)
            action = random.choice(available_actions)
            episode.append((state, action, current_player))
            state = self.game.transition_model(state, action, current_player)
            current_player = "X" if current_player == "O" else "O"

        return episode

    def monte_carlo_visit(self):
        """Estimate state values for "X" over ``self.num_episodes`` random games.

        For each episode the return ``G`` starts at the reward of the final
        board and then accumulates the reward of every earlier board while
        walking the episode backwards. Each board's value is the running mean
        of the returns observed at its first visit in an episode.

        Returns:
            A dict mapping every state in ``self.game.states`` to its
            estimated value (0 for states that were never visited).
        """
        state_values = {s: 0 for s in self.game.states}
        state_visit_count = {s: 0 for s in self.game.states}

        def record(state, observed_return):
            # Incremental mean: avoids storing every return per state.
            state_visit_count[state] += 1
            state_values[state] += (observed_return - state_values[state]) / state_visit_count[state]

        for _ in range(self.num_episodes):
            episode = self.generate_episode()
            if not episode:
                # Only possible if the starting board is already terminal.
                continue

            # The episode stores pre-move boards only, so the board that
            # actually decides the game has to be rebuilt from the last step.
            # Without it the win/loss/draw outcome never reaches G and
            # terminal boards would keep a value of 0.
            last_state, last_action, last_player = episode[-1]
            final_state = self.game.transition_model(last_state, last_action, last_player)
            G = self.game.reward(final_state, "X")
            record(final_state, G)

            # Index of each board's first occurrence, for the first-visit test.
            first_visit = {}
            for t, (state, _, _) in enumerate(episode):
                first_visit.setdefault(state, t)

            for t in reversed(range(len(episode))):
                state = episode[t][0]
                G += self.game.reward(state, "X")  # "X" is the player we estimate for
                if first_visit[state] == t:
                    record(state, G)

        return state_values

    def print_board(self, state):
        """Print the board as three rows followed by blank lines."""
        print(state[0:3])
        print(state[3:6])
        print(state[6:9])
        print("\n")

    def human_move(self, state):
        """Prompt until the user enters the index of an empty cell; return it."""
        while True:
            try:
                action = int(input("Enter your move (0-8): "))
                if action in self.game.get_available_actions(state):
                    return action
                else:
                    print("Invalid move. Try again.")
            except ValueError:
                print("Invalid input. Please enter a number (0-8).")

    def ai_move(self, state, state_values):
        """Pick the "X" move whose resulting board has the highest estimated value.

        Boards missing from ``state_values`` count as 0. Ties keep the
        lowest-indexed move. Returns None when no move is available.
        """
        best_action = None
        best_value = -float("inf")

        for action in self.game.get_available_actions(state):
            next_state = self.game.transition_model(state, action, "X")  # "X" is the AI player
            # Single lookup with a default so unseen boards cannot raise KeyError.
            value = state_values.get(next_state, 0)
            if value > best_value:
                best_action = action
                best_value = value

        return best_action

    def play_game(self):
        """Estimate values, print them all, then play AI ("X") vs. human ("O")."""
        state_values = self.monte_carlo_visit()

        # Print estimated values for each state
        for state, value in state_values.items():
            print("State:", state)
            print("Estimated Value:", value)

        # Game loop for playing against AI
        state = (" ",) * 9  # Initial state
        current_player = "X"
        while not self.game.is_terminal(state):
            self.print_board(state)
            if current_player == "X":  # "X" is the AI player
                action = self.ai_move(state, state_values)
            else:
                action = self.human_move(state)

            # Update state
            state = self.game.transition_model(state, action, current_player)
            current_player = "X" if current_player == "O" else "O"

        # Final outcome
        self.print_board(state)
        if self.game.reward(state, "X") == 1:
            print("AI wins!")
        elif self.game.reward(state, "O") == 1:
            print("You win!")
        else:
            print("It's a draw.")


if __name__ == "__main__":
    # Train on 100000 random self-play episodes, then start the
    # interactive AI-vs-human game.
    game = MonteCarloVisit(num_episodes=10 ** 5)
    game.play_game()


State: ('X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X')
Estimated Value: 0
State: ('X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'O')
Estimated Value: 0
State: ('X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', ' ')
Estimated Value: 0
State: ('X', 'X', 'X', 'X', 'X', 'X', 'X', 'O', 'X')
Estimated Value: 0
State: ('X', 'X', 'X', 'X', 'X', 'X', 'X', 'O', 'O')
Estimated Value: 0
State: ('X', 'X', 'X', 'X', 'X', 'X', 'X', 'O', ' ')
Estimated Value: 0
State: ('X', 'X', 'X', 'X', 'X', 'X', 'X', ' ', 'X')
Estimated Value: 0
State: ('X', 'X', 'X', 'X', 'X', 'X', 'X', ' ', 'O')
Estimated Value: 0
State: ('X', 'X', 'X', 'X', 'X', 'X', 'X', ' ', ' ')
Estimated Value: 0
State: ('X', 'X', 'X', 'X', 'X', 'X', 'O', 'X', 'X')
Estimated Value: 0
State: ('X', 'X', 'X', 'X', 'X', 'X', 'O', 'X', 'O')
Estimated Value: 0
State: ('X', 'X', 'X', 'X', 'X', 'X', 'O', 'X', ' ')
Estimated Value: 0
State: ('X', 'X', 'X', 'X', 'X', 'X', 'O', 'O', 'X')
Estimated Value: 0
State: ('X', 'X', 'X', 'X', 'X', 'X', 'O', 'O', 'O')
Estimated V