# load and process the dataset

In [1]:
import pandas as pd

# Read the raw file. header=None means no line of the file is used as column
# labels, so the columns are named explicitly below.
data = pd.read_csv("../data/tic-tac-toe.data", header=None)

# Nine board cells in row-major order ("<row>-<column>"), then the label column.
data.columns = [
    f"{row}-{col}"
    for row in ("top", "middle", "bottom")
    for col in ("left", "middle", "right")
] + ["outcome"]

# Preview the first few rows
print(data.head())

  top-left top-middle top-right middle-left middle-middle middle-right  \
0        x          x         x           x             o            o   
1        x          x         x           x             o            o   
2        x          x         x           x             o            o   
3        x          x         x           x             o            o   
4        x          x         x           x             o            o   

  bottom-left bottom-middle bottom-right   outcome  
0           x             o            o  positive  
1           o             x            o  positive  
2           o             o            x  positive  
3           o             b            b  positive  
4           b             o            b  positive  


In [3]:
# Numeric codes for the cell marks ('x', 'o', 'b') and the outcome labels.
mapping = dict(x=1, o=-1, b=0, positive=1, negative=0)

# Swap every symbol for its code, then let pandas settle on concrete dtypes
# for any columns still typed as object.
data = data.replace(mapping).infer_objects(copy=False)

# Every column except the last is a feature; the last column is the target.
X = data.iloc[:, :-1].to_numpy()
y = data.iloc[:, -1].to_numpy()

print("Processed data:")
print(X[:5], y[:5])

Processed data:
[[ 1  1  1  1 -1 -1  1 -1 -1]
 [ 1  1  1  1 -1 -1 -1  1 -1]
 [ 1  1  1  1 -1 -1 -1 -1  1]
 [ 1  1  1  1 -1 -1 -1  0  0]
 [ 1  1  1  1 -1 -1  0 -1  0]] [1 1 1 1 1]


# build the environment

In [4]:
import numpy as np

class TicTacToeEnv:
    """Minimal 3x3 tic-tac-toe environment.

    Board cells hold 1 (agent), -1 (opponent) or 0 (empty).

    Attributes:
        board: 3x3 integer numpy array with the current position.
        done: True once the game has ended (win, draw, or illegal move).
        winner: 1 or -1 for a win, 0 for a draw, None while the game is
            running or when it ended through an illegal move.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Start a new game and return the (empty) board.

        The returned array is the environment's live board, not a copy:
        later calls to ``step`` mutate it in place. Take ``board.copy()``
        if a snapshot is needed.
        """
        self.board = np.zeros((3, 3), dtype=int)  # Empty board
        self.done = False
        self.winner = None
        return self.board

    def step(self, action, player):
        """Place ``player``'s mark on cell ``action``.

        Args:
            action: Cell index 0-8 in row-major order.
            player: 1 (agent) or -1 (opponent).

        Returns:
            ``(board, reward, done)``. ``board`` is the live board array.
            ``reward`` is -10 for a move onto an occupied cell, 1 when
            player 1 wins, -1 when player -1 wins, and 0 otherwise.

        Raises:
            IndexError: if ``action`` is outside 0-8. Negative values are
                rejected explicitly because numpy indexing would otherwise
                wrap them around to a different cell.
            ValueError: if ``player`` is not 1 or -1 (a 0 "player" would
                match every empty line in ``check_winner``).
        """
        if not 0 <= action < 9:
            raise IndexError(f"action must be in range 0-8, got {action!r}")
        if player not in (1, -1):
            raise ValueError(f"player must be 1 or -1, got {player!r}")

        row, col = divmod(action, 3)
        if self.board[row, col] != 0:
            # Illegal move ends the game; keep self.done in sync with the
            # returned flag. winner stays None: nobody completed a line.
            self.done = True
            return self.board, -10, True  # Invalid move penalty
        self.board[row, col] = player

        if self.check_winner(player):
            self.done = True
            self.winner = player
            return self.board, 1 if player == 1 else -1, True  # Reward for winning

        if not np.any(self.board == 0):  # Draw condition: board is full
            self.done = True
            self.winner = 0
            return self.board, 0, True

        return self.board, 0, False  # No reward, game continues

    def check_winner(self, player):
        """Return True if ``player`` fills a whole row, column or diagonal."""
        lines = list(self.board) + list(self.board.T)
        lines.append(np.diag(self.board))
        lines.append(np.diag(np.fliplr(self.board)))
        return any(bool(np.all(line == player)) for line in lines)

# train the RL agent (tabular Q-learning)

In [5]:
import random

class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Q-values are stored in a dict keyed by ``(flattened board tuple, action)``;
    pairs that were never updated are treated as 0.
    """

    def __init__(self, learning_rate=0.1, discount_factor=0.95, epsilon=1.0, epsilon_decay=0.99):
        self.q_table = {}
        self.lr = learning_rate
        self.gamma = discount_factor
        self.epsilon = epsilon
        # Only stored here; no method of this class applies the decay.
        self.epsilon_decay = epsilon_decay

    @staticmethod
    def _state_key(state):
        """Hashable key for a board array (its cells in row-major order)."""
        return tuple(state.flatten())

    def get_q_value(self, state, action):
        """Return the stored Q-value for ``(state, action)``, or 0 if unseen."""
        return self.q_table.get((self._state_key(state), action), 0)

    def update_q_value(self, state, action, reward, next_state, done=False):
        """Apply one Q-learning update for the transition.

        Args:
            state: Board before the move. It is keyed by its contents at call
                time, so pass a snapshot (e.g. ``board.copy()``) if the
                original array has been mutated since the move was chosen.
            action: Cell index 0-8 that was played.
            reward: Reward received for the move.
            next_state: Board after the move.
            done: True when the transition ended the game. A terminal state
                has no future value, so the bootstrap term is dropped.
                Defaults to False, which keeps the previous behaviour.
        """
        if done:
            max_next_q = 0
        else:
            max_next_q = max(self.get_q_value(next_state, a) for a in range(9))
        current_q = self.get_q_value(state, action)
        td_target = reward + self.gamma * max_next_q
        self.q_table[(self._state_key(state), action)] = current_q + self.lr * (td_target - current_q)

    def select_action(self, state):
        """Pick a cell index 0-8 with an epsilon-greedy rule.

        With probability ``epsilon`` a uniformly random cell is returned,
        otherwise the cell with the highest Q-value (first one on ties).
        Occupied cells are not filtered out. Always returns a plain ``int``.
        """
        if random.random() < self.epsilon:
            return random.randint(0, 8)  # Explore
        q_values = [self.get_q_value(state, a) for a in range(9)]
        return int(np.argmax(q_values))  # Exploit

# evaluate the agent against an opponent

In [6]:
def random_opponent(board):
    """Pick a uniformly random empty cell (index 0-8), or None if the board is full."""
    cells = board.flatten()
    open_cells = []
    for idx in range(9):
        if cells[idx] == 0:
            open_cells.append(idx)
    if not open_cells:
        return None
    return random.choice(open_cells)

def evaluate_agent(agent, env, num_games=100, opponent="random"):
    """Play ``num_games`` games between ``agent`` and an opponent and tally the results.

    The agent is player 1 and always moves first; the opponent is player -1.
    Agent moves come from ``agent.select_action``, so whatever exploration
    rate the agent currently has also applies during evaluation.

    A move onto an occupied cell ends the game as a forfeit by the player who
    made it: a loss if the agent did it, a win if the opponent did.

    Args:
        agent: Object with a ``select_action(state)`` method.
        env: Environment with ``reset()``, ``step(action, player)`` and a
            ``winner`` attribute.
        num_games: Number of games to play.
        opponent: ``"random"`` or ``"rule"``.

    Returns:
        Dict with the keys ``"wins"``, ``"losses"`` and ``"draws"``.

    Raises:
        ValueError: if ``opponent`` is not one of the supported names.
    """
    if opponent not in ("random", "rule"):
        raise ValueError(f"Unknown opponent {opponent!r}; expected 'random' or 'rule'")

    results = {"wins": 0, "losses": 0, "draws": 0}
    for _ in range(num_games):
        state = env.reset()
        done = False
        player_turn = 1  # Agent starts first
        forfeited_by = None  # Player who made an illegal move, if any

        while not done:
            if player_turn == 1:  # Agent's turn
                action = agent.select_action(state)
            elif opponent == "random":
                action = random_opponent(state)
            else:
                # NOTE(review): rule_based_opponent is not defined in this part
                # of the notebook; it must exist before opponent="rule" is used.
                action = rule_based_opponent(state)

            if action is None:  # No valid moves, game over
                break

            # Check legality against the position *before* the move is applied.
            illegal = state.flatten()[action] != 0
            # Track the board returned by the environment instead of relying on
            # the array from reset() being mutated in place.
            state, _reward, done = env.step(action, player_turn)
            if illegal:
                forfeited_by = player_turn
                break
            player_turn *= -1  # Alternate turns

        # Record results
        if forfeited_by == 1:
            results["losses"] += 1
        elif forfeited_by == -1:
            results["wins"] += 1
        elif env.winner == 1:
            results["wins"] += 1
        elif env.winner == -1:
            results["losses"] += 1
        else:
            results["draws"] += 1

    print(f"Evaluation Results: {results}")
    return results