### Reinforcement Learning
Build a Tic-Tac-Toe game using reinforcement learning in Python by completing the
following tasks:

a. Setting up the environment

b. Defining the Tic-Tac-Toe game

c. Building the reinforcement learning model

d. Training the model

e. Testing the model

In [2]:
# Setting up the environment
# Defining the Tic-Tac-Toe game

import numpy as np

class TicTacToeEnv:
    """Two-player Tic-Tac-Toe environment on a 3x3 NumPy board.

    Cells hold 0 (empty), 1 (player 1) or -1 (player 2). Player 1 moves
    first and the turn alternates after every accepted move.
    """

    def __init__(self):
        self.board = np.zeros((3, 3), dtype=int)
        self.game_over = False
        # None while playing, True once any line is completed, False on a draw.
        self.winner = None
        self.current_player = 1  # Player 1 starts

    def is_valid_action(self, action):
        """Return True if ``action`` is a playable ``(row, col)`` cell.

        A move is playable only while the game is still running, when both
        indices are within 0..2 and the targeted cell is empty. Malformed or
        out-of-range actions are reported as invalid instead of raising, so
        ``step`` can answer them with its penalty.
        """
        if self.game_over:
            return False
        try:
            row, col = action
            # Negative indices are rejected explicitly; NumPy would otherwise
            # wrap them around to the opposite edge of the board.
            if not (0 <= row < 3 and 0 <= col < 3):
                return False
            return bool(self.board[row, col] == 0)
        except (TypeError, ValueError, IndexError):
            return False

    def step(self, action):
        """Play ``action`` for the current player.

        Returns:
            A ``(board_copy, reward, game_over)`` tuple. Invalid actions
            (occupied cell, out of range, or any move after the game has
            ended) leave the board and the turn untouched and yield a
            reward of -10.
        """
        if not self.is_valid_action(action):
            return self.board.copy(), -10, self.game_over  # Penalty for invalid action

        row, col = action
        self.board[row, col] = self.current_player
        self.current_player = -self.current_player  # hand the turn over
        self.check_game_over()
        return self.board.copy(), self.get_reward(), self.game_over

    def check_game_over(self):
        """Update ``game_over`` and ``winner`` from the current board."""
        # A line belongs to a single player exactly when its cells sum to +/-3.
        line_sums = (
            *self.board.sum(axis=1),         # rows
            *self.board.sum(axis=0),         # columns
            self.board.trace(),              # main diagonal
            np.fliplr(self.board).trace(),   # anti-diagonal
        )
        if any(abs(total) == 3 for total in line_sums):
            self.game_over = True
            self.winner = True
            return

        # No completed line and no empty cell left: the game is a draw.
        if not (self.board == 0).any():
            self.game_over = True
            self.winner = False

    def get_reward(self):
        """Return 1 for a finished game with a winner, 0.5 for a draw, else 0.

        The reward does not depend on which player completed the line.
        """
        if not self.game_over:
            return 0
        return 1 if self.winner else 0.5

    def reset(self):
        """Restore the initial empty position and return a copy of the board."""
        self.board = np.zeros((3, 3), dtype=int)
        self.game_over = False
        self.winner = None
        self.current_player = 1
        return self.board.copy()


In [3]:
# Building the reinforcement learning model
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Q-values live in a dict keyed by ``(str(state), action)``; pairs that
    were never updated are treated as having a value of 0.
    """

    def __init__(self, alpha=0.1, gamma=0.9, epsilon=0.1):
        self.q_table = {}
        self.alpha = alpha      # Learning rate
        self.gamma = gamma      # Discount factor
        self.epsilon = epsilon  # Exploration rate

    def get_q_value(self, state, action):
        """Return the stored Q-value for ``(state, action)``, or 0 if unseen."""
        return self.q_table.get((str(state), action), 0)

    def set_q_value(self, state, action, value):
        """Store ``value`` as the Q-value for ``(state, action)``."""
        self.q_table[(str(state), action)] = value

    def choose_action(self, state, valid_actions):
        """Pick an action from ``valid_actions`` epsilon-greedily.

        With probability ``epsilon`` a uniformly random valid action is
        returned; otherwise one of the actions with the highest Q-value is
        returned, with ties broken at random.

        Raises:
            ValueError: if ``valid_actions`` is empty.
        """
        if len(valid_actions) == 0:
            raise ValueError("valid_actions must not be empty")

        if np.random.random() < self.epsilon:  # Explore: choose a random action
            return np.random.choice(valid_actions)

        # Exploit: choose the best action
        q_values = [self.get_q_value(state, a) for a in valid_actions]
        max_q = max(q_values)
        # If multiple actions have the same max q value, choose randomly among them
        actions_with_max_q = [a for a, q in zip(valid_actions, q_values) if q == max_q]
        return np.random.choice(actions_with_max_q)

    def learn(self, state, action, reward, next_state, done):
        """Apply one Q-learning update for the observed transition.

        ``next_state`` is read as a 9-cell board in which 0 marks a free
        cell. The best Q-value among those free cells is the bootstrap term.
        On terminal transitions (``done``), or when no cell is free, that
        term is 0.
        """
        old_value = self.get_q_value(state, action)

        if done:
            # Nothing follows a terminal state. Skipping the lookup also
            # avoids taking max() over the empty move list of a full board.
            next_max = 0
        else:
            cells = next_state.reshape(9)
            next_max = max(
                (self.get_q_value(next_state, a) for a in range(9) if cells[a] == 0),
                default=0,
            )

        # Q-learning formula
        target = reward + self.gamma * next_max
        new_value = old_value + self.alpha * (target - old_value)
        self.set_q_value(state, action, new_value)


In [4]:
# Training
def train(agent, env, episodes):
    """Play ``episodes`` games, letting ``agent`` learn after every move.

    Each game starts from ``env.reset()``. The same agent picks every move
    among the cells that are still 0, the flat index 0..8 is converted to
    ``(row, col)`` for ``env.step``, and the resulting transition is passed
    to ``agent.learn``.
    """
    for _ in range(episodes):
        state = env.reset()

        while True:
            # Flat indices of the cells that are still empty.
            open_cells = [idx for idx, cell in enumerate(state.reshape(9)) if cell == 0]
            action = agent.choose_action(state, open_cells)

            row, col = divmod(action, 3)
            next_state, reward, done = env.step((row, col))
            agent.learn(state, action, reward, next_state, done)

            if done:
                break
            state = next_state


In [5]:
# Testing
def test(agent, env, episodes):
    """Play ``episodes`` games and return the average total reward per game.

    Moves are chosen with ``agent.choose_action`` among the cells that are
    still 0, and no learning takes place. Every reward returned by
    ``env.step`` is added to the total, whichever side made the move.

    Returns:
        The summed reward divided by ``episodes``, or 0.0 when ``episodes``
        is not positive (nothing was played, so there is nothing to average).
    """
    if episodes <= 0:
        return 0.0

    total_rewards = 0
    for _ in range(episodes):
        state = env.reset()
        done = False
        while not done:
            cells = state.reshape(9)
            valid_actions = [i for i in range(9) if cells[i] == 0]
            action = agent.choose_action(state, valid_actions)
            state, reward, done = env.step(divmod(action, 3))
            total_rewards += reward
    return total_rewards / episodes


# The name starts with "test", so pytest would try to collect this helper as a
# test case when it is imported into a test module; opt out explicitly.
test.__test__ = False
