In [2]:
import numpy as np
import random


In [4]:
class TicTacToe:
    def __init__(self):
        self.reset()

    def reset(self):
        self.board = np.zeros(9, dtype=int)
        self.current_player = 1
        self.done = False
        self.winner = None
        return self.board.copy()

    def available_actions(self):
        return [i for i in range(9) if self.board[i] == 0]

    def step(self, action):
        if self.board[action] != 0 or self.done:
            return self.board.copy(), -10, True  # Invalid move
        self.board[action] = self.current_player
        self.check_winner()
        reward = 1 if self.winner == self.current_player else 0
        self.current_player = 3 - self.current_player
        return self.board.copy(), reward, self.done

    def check_winner(self):
        combos = [(0,1,2),(3,4,5),(6,7,8),
                  (0,3,6),(1,4,7),(2,5,8),
                  (0,4,8),(2,4,6)]
        for (a,b,c) in combos:
            if self.board[a] == self.board[b] == self.board[c] != 0:
                self.done = True
                self.winner = self.board[a]
                return
        if np.all(self.board != 0):
            self.done = True

    def render(self):
        symbols = [' ', 'X', 'O']
        print("\nBoard:")
        for i in range(0, 9, 3):
            print(' | '.join(symbols[self.board[i + j]] for j in range(3)))
        print()


In [6]:
def encode_state(board):
    return ''.join(map(str, board))


In [8]:
class QLearningAgent:
    def __init__(self, alpha=0.1, gamma=0.9, epsilon=1.0, epsilon_decay=0.995, min_epsilon=0.01):
        self.q_table = {}
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.min_epsilon = min_epsilon

    def get_qs(self, state):
        return self.q_table.setdefault(state, np.zeros(9))

    def choose_action(self, state, available_actions):
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(available_actions)
        qs = self.get_qs(state)
        qs = np.array([qs[a] if a in available_actions else -np.inf for a in range(9)])
        return int(np.argmax(qs))

    def update(self, state, action, reward, next_state, done, available_actions):
        current_q = self.get_qs(state)[action]
        max_future_q = 0 if done else max([self.get_qs(next_state)[a] for a in available_actions], default=0)
        new_q = current_q + self.alpha * (reward + self.gamma * max_future_q - current_q)
        self.q_table[state][action] = new_q

    def decay_epsilon(self):
        self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)


In [10]:
def train(episodes=50000):
    env = TicTacToe()
    agent = QLearningAgent()

    for episode in range(episodes):
        state = encode_state(env.reset())
        done = False

        while not done:
            available = env.available_actions()
            action = agent.choose_action(state, available)
            next_state_arr, reward, done = env.step(action)
            next_state = encode_state(next_state_arr)
            agent.update(state, action, reward, next_state, done, env.available_actions())
            state = next_state

        agent.decay_epsilon()

        if (episode + 1) % 5000 == 0:
            print(f"Episode {episode + 1}/{episodes} | Epsilon: {agent.epsilon:.3f}")

    return agent


In [12]:
def test(agent, games=10):
    env = TicTacToe()
    win, lose, draw = 0, 0, 0

    for _ in range(games):
        state = encode_state(env.reset())
        done = False

        while not done:
            available = env.available_actions()
            action = agent.choose_action(state, available)
            next_state, reward, done = env.step(action)
            state = encode_state(next_state)

            if not done:
                opp_action = random.choice(env.available_actions())
                _, _, done = env.step(opp_action)
                state = encode_state(env.board)

        if env.winner == 1:
            win += 1
        elif env.winner == 2:
            lose += 1
        else:
            draw += 1

    print(f"Test results: {win} Wins, {lose} Losses, {draw} Draws")


In [14]:
# Train the agent
agent = train(episodes=50000)


Episode 5000/50000 | Epsilon: 0.010
Episode 10000/50000 | Epsilon: 0.010
Episode 15000/50000 | Epsilon: 0.010
Episode 20000/50000 | Epsilon: 0.010
Episode 25000/50000 | Epsilon: 0.010
Episode 30000/50000 | Epsilon: 0.010
Episode 35000/50000 | Epsilon: 0.010
Episode 40000/50000 | Epsilon: 0.010
Episode 45000/50000 | Epsilon: 0.010
Episode 50000/50000 | Epsilon: 0.010


In [15]:
# Test the trained agent
test(agent, games=20)


Test results: 19 Wins, 1 Losses, 0 Draws
