**Reinforcement Learning**

In [5]:
import numpy as np
import random

In [6]:
class TicTacToe:
    def __init__(self):
        self.board = np.zeros((3, 3), dtype=int)
        self.current_player = 1

    def reset(self):
        self.board = np.zeros((3, 3), dtype=int)
        self.current_player = 1

    def make_move(self, row, col):
        if self.board[row][col] == 0:
            self.board[row][col] = self.current_player
            self.current_player = 3 - self.current_player  # Toggle between players

    def is_game_over(self):
        # Check rows, columns, and diagonals for a win
        for i in range(3):
            if self.board[i][0] == self.board[i][1] == self.board[i][2] != 0:
                return True
            if self.board[0][i] == self.board[1][i] == self.board[2][i] != 0:
                return True
        if self.board[0][0] == self.board[1][1] == self.board[2][2] != 0:
            return True
        if self.board[0][2] == self.board[1][1] == self.board[2][0] != 0:
            return True
        # Check for a draw
        if np.all(self.board != 0):
            return True
        return False

    def is_valid_move(self, row, col):
        return 0 <= row < 3 and 0 <= col < 3 and self.board[row][col] == 0

    def print_board(self):
        for row in self.board:
            print(" ".join(map(str, row)))


In [7]:
class QLearningAgent:
    def __init__(self, epsilon=0.1, alpha=0.2, gamma=0.9):
        self.q_table = {}
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.last_state = None
        self.last_action = None

    def get_q_value(self, state, action):
        return self.q_table.get((state, action), 0.0)

    def choose_action(self, state, valid_moves):
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(valid_moves)
        else:
            best_actions = []
            best_q_value = float("-inf")
            for action in valid_moves:
                q_value = self.get_q_value(state, action)
                if q_value > best_q_value:
                    best_actions = [action]
                    best_q_value = q_value
                elif q_value == best_q_value:
                    best_actions.append(action)
            return random.choice(best_actions)

    def learn(self, state, action, reward, next_state):
        old_q_value = self.get_q_value(state, action)
        next_max_q_value = max(
            [self.get_q_value(next_state, next_action) for next_action in [(i, j) for i in range(3) for j in range(3)]]
        )
        new_q_value = (1 - self.alpha) * old_q_value + self.alpha * (reward + self.gamma * next_max_q_value)
        self.q_table[(state, action)] = new_q_value


def get_reward(game, player):
    if game.is_game_over():
        if np.any(game.board == 3 - player):
            return -1  # Opponent wins
        elif np.any(game.board == player):
            return 1  # Player wins
        else:
            return 0  # Draw
    return 0  # Game ongoing


def play_game():
    game = TicTacToe()
    agent = QLearningAgent()
    player = 1

    num_episodes = 10000

    for episode in range(num_episodes):
        game.reset()
        state = str(game.board.flatten())
        done = False

        while not done:
            if player == 1:
                valid_moves = [(i, j) for i in range(3) for j in range(3) if game.is_valid_move(i, j)]
                action = agent.choose_action(state, valid_moves)
                game.make_move(action[0], action[1])
                next_state = str(game.board.flatten())
                reward = get_reward(game, player)
                agent.learn(state, action, reward, next_state)
                state = next_state
                player = 3 - player
            else:
                print("Current board:")
                game.print_board()
                row, col = map(int, input("Enter your move (row and column): ").split())
                if game.is_valid_move(row, col):
                    game.make_move(row, col)
                    player = 3 - player
                else:
                    print("Invalid move. Try again.")

            done = game.is_game_over()

        reward = get_reward(game, player)
        agent.learn(state, action, reward, None)

    print("Training complete. You can now play against the trained agent.")

    while True:
        game.reset()
        player = 1
        state = str(game.board.flatten())
        done = False

        while not done:
            if player == 1:
                valid_moves = [(i, j) for i in range(3) for j in range(3) if game.is_valid_move(i, j)]
                action = agent.choose_action(state, valid_moves)
                game.make_move(action[0], action[1])
                state = str(game.board.flatten())
                player = 3 - player
            else:
                print("Current board:")
                game.print_board()
                row, col = map(int, input("Enter your move (row and column): ").split())
                if game.is_valid_move(row, col):
                    game.make_move(row, col)
                    player = 3 - player
                else:
                    print("Invalid move. Try again.")

            done = game.is_game_over()

        game.print_board()
        reward = get_reward(game, player)

        if reward == 1:
            print("You win!")
        elif reward == -1:
            print("Agent wins!")
        else:
            print("It's a draw!")

        play_again = input("Play again? (yes/no): ").lower()
        if play_again != "yes":
            break


if __name__ == "__main__":
    play_game()

Current board:
0 0 0
0 0 0
0 0 1


Current board:
0 0 0
0 2 1
0 0 1
Current board:
1 2 0
0 2 1
0 0 1
Invalid move. Try again.
Current board:
1 2 0
0 2 1
0 0 1
Invalid move. Try again.
Current board:
1 2 0
0 2 1
0 0 1
Invalid move. Try again.
Current board:
1 2 0
0 2 1
0 0 1
Invalid move. Try again.
Current board:
1 2 0
0 2 1
0 0 1
Current board:
1 0 0
0 0 0
0 0 0


ValueError: not enough values to unpack (expected 2, got 0)