In [4]:
import random
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
from tqdm import trange

# Environment
class TicTacToe:
    def __init__(self):
        self.reset()
    def reset(self):
        self.board = [0]*9
        self.current_player = 1
        self.done = False
        self.winner = None
        return tuple(self.board)
    def available_actions(self):
        return [i for i,v in enumerate(self.board) if v==0]
    def step(self, action):
        if self.done:
            raise ValueError('Game already finished')
        if self.board[action] != 0:
            self.done = True
            self.winner = -self.current_player
            reward = -1.0 if self.current_player==1 else 1.0
            return tuple(self.board), reward, self.done, {}
        self.board[action] = self.current_player
        lines = [(0,1,2),(3,4,5),(6,7,8),(0,3,6),(1,4,7),(2,5,8),(0,4,8),(2,4,6)]
        for (i,j,k) in lines:
            s = self.board[i]+self.board[j]+self.board[k]
            if s == 3:
                self.done = True
                self.winner = 1
                return tuple(self.board), 1.0, True, {}
            if s == -3:
                self.done = True
                self.winner = -1
                return tuple(self.board), -1.0, True, {}
        if all(v!=0 for v in self.board):
            self.done = True
            self.winner = 0
            return tuple(self.board), 0.5, True, {}
        self.current_player *= -1
        return tuple(self.board), 0.0, False, {}

# Agent
class QAgent:
    def __init__(self, alpha=0.5, gamma=0.9, epsilon=0.1):
        self.Q = defaultdict(float)
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
    def get_Q(self, state, action):
        return self.Q[(state, action)]
    def choose_action(self, state, available_actions, training=True):
        if training and random.random() < self.epsilon:
            return random.choice(available_actions)
        qvals = [self.get_Q(state, a) for a in available_actions]
        max_q = max(qvals)
        max_actions = [a for a,q in zip(available_actions, qvals) if q==max_q]
        return random.choice(max_actions)
    def update(self, state, action, reward, next_state, next_available_actions, done):
        cur = self.get_Q(state, action)
        if done:
            target = reward
        else:
            future_qs = [self.get_Q(next_state, a) for a in next_available_actions] if next_available_actions else [0.0]
            target = reward + self.gamma * max(future_qs)
        self.Q[(state, action)] = cur + self.alpha * (target - cur)

def train(agent, episodes=20000, opponent='random', verbose=False):
    env = TicTacToe()
    stats = {'wins':0, 'losses':0, 'draws':0}
    history = {'wins':[], 'losses':[], 'draws':[]}
    report_every = max(1, episodes//50)
    for ep in trange(episodes, desc='Training'):
        state = env.reset()
        if random.random() < 0.5:
            env.current_player = 1
        else:
            env.current_player = -1
        done = False
        while not done:
            if env.current_player == 1:
                avail = env.available_actions()
                action = agent.choose_action(state, avail, training=True)
                next_state, reward, done, _ = env.step(action)
                if done:
                    agent.update(state, action, reward, next_state, [], done=True)
                    if reward == 1.0:
                        stats['wins'] += 1
                    elif reward == -1.0:
                        stats['losses'] += 1
                    else:
                        stats['draws'] += 1
                else:
                    opp_avail = env.available_actions()
                    opp_action = random.choice(opp_avail)
                    next_state2, reward2, done2, _ = env.step(opp_action)
                    if done2:
                        if reward2 == -1.0:
                            agent.update(state, action, -1.0, next_state2, [], done=True)
                            stats['losses'] += 1
                        elif reward2 == 1.0:
                            agent.update(state, action, 1.0, next_state2, [], done=True)
                            stats['wins'] += 1
                        else:
                            agent.update(state, action, 0.5, next_state2, [], done=True)
                            stats['draws'] += 1
                        done = True
                    else:
                        next_avail = env.available_actions()
                        agent.update(state, action, 0.0, next_state2, next_avail, done=False)
                        state = next_state2
            else:
                avail = env.available_actions()
                action = random.choice(avail)
                state_after, reward_after, done, _ = env.step(action)
                if done:
                    if reward_after == -1.0:
                        stats['losses'] += 1
                    elif reward_after == 1.0:
                        stats['wins'] += 1
                    else:
                        stats['draws'] += 1
                    break
                else:
                    state = state_after
        if (ep+1) % report_every == 0:
            history['wins'].append(stats['wins'])
            history['losses'].append(stats['losses'])
            history['draws'].append(stats['draws'])
            if verbose:
                print('Episode {}: W/L/D = {}/{}/{}'.format(ep+1, stats['wins'], stats['losses'], stats['draws']))
    return agent, history

if __name__ == '__main__':
    agent = QAgent(alpha=0.5, gamma=0.9, epsilon=0.2)
    agent, history = train(agent, episodes=20000, opponent='random', verbose=False)
    print('Training finished. Q-size:', len(agent.Q))
    wins, losses, draws = evaluate(agent, games=2000, opponent='random') if 'evaluate' in globals() else (None, None, None)


Training:   0%|          | 0/20000 [00:00<?, ?it/s]

Training: 100%|██████████| 20000/20000 [00:00<00:00, 24564.62it/s]

Training finished. Q-size: 13822





In [4]:
def print_board(board):
    symbols = {1: 'X', -1: 'O', 0: ' '}
    print("\nBoard:")
    for i in range(0, 9, 3):
        print(f" {symbols[board[i]]} | {symbols[board[i+1]]} | {symbols[board[i+2]]} ")
        if i < 6:
            print("---+---+---")
    print()


def play_against_agent(agent, human_starts=True):
    env = TicTacToe()
    state = env.reset()
    print("🎮 Welcome to Tic-Tac-Toe!")
    print("Positions are numbered 1–9 as follows:")
    print(" 1 | 2 | 3\n 4 | 5 | 6\n 7 | 8 | 9\n")
    print("You are 'O' (-1). The AI is 'X' (+1). Let's play!\n")

    while not env.done:
        # Human's turn
        if (env.current_player == -1 and human_starts) or (env.current_player == 1 and not human_starts):
            print_board(env.board)
            move = input("Enter your move (1–9): ")

            # Validate move
            if not move.isdigit():
                print("❌ Invalid input. Enter a number between 1 and 9.")
                continue

            move = int(move) - 1  # Convert to 0-based index

            if move not in env.available_actions():
                print("❌ That cell is already taken or invalid. Try again.")
                continue

            state, reward, done, _ = env.step(move)

        # AI's turn
        else:
            avail = env.available_actions()
            action = agent.choose_action(state, avail, training=False)
            state, reward, done, _ = env.step(action)
            print(f"\n🤖 AI played at position {action + 1}")
            print_board(env.board)

        # Check game result
        if done:
            if env.winner == 1:
                print("💻 AI wins!")
            elif env.winner == -1:
                print("🎉 You win!")
            else:
                print("🤝 It's a draw!")
            break


# --- Run this after training ---
play_against_agent(agent, human_starts=True)


🎮 Welcome to Tic-Tac-Toe!
Positions are numbered 1–9 as follows:
 1 | 2 | 3
 4 | 5 | 6
 7 | 8 | 9

You are 'O' (-1). The AI is 'X' (+1). Let's play!


🤖 AI played at position 5

Board:
   |   |   
---+---+---
   | X |   
---+---+---
   |   |   


Board:
   |   |   
---+---+---
   | X |   
---+---+---
   |   |   


🤖 AI played at position 3

Board:
 O |   | X 
---+---+---
   | X |   
---+---+---
   |   |   


Board:
 O |   | X 
---+---+---
   | X |   
---+---+---
   |   |   


🤖 AI played at position 2

Board:
 O | X | X 
---+---+---
   | X |   
---+---+---
 O |   |   


Board:
 O | X | X 
---+---+---
   | X |   
---+---+---
 O |   |   


🤖 AI played at position 4

Board:
 O | X | X 
---+---+---
 X | X |   
---+---+---
 O | O |   


Board:
 O | X | X 
---+---+---
 X | X |   
---+---+---
 O | O |   

🎉 You win!
