Practical No: 6
Build a Tic-Tac-Toe game using reinforcement learning in Python


In [3]:
# Cell 1: Import necessary libraries
import numpy as np
import random
from collections import defaultdict


In [4]:
# Cell 2: Define the Tic-Tac-Toe environment and rules
class TicTacToe:
    """3x3 Tic-Tac-Toe board stored as a flat list of nine cells.

    Cells are indexed 0-8 in row-major order and hold ' ' when empty or the
    letter of the player who claimed them. `current_winner` stays None until
    a call to `make_move` completes a line.
    """

    def __init__(self):
        self.board = [' ' for _ in range(9)]
        self.current_winner = None

    def print_board(self):
        """Print the board as three rows of the form '| a | b | c |'."""
        for row in [self.board[i*3:(i+1)*3] for i in range(3)]:
            print('| ' + ' | '.join(row) + ' |')

    def available_moves(self):
        """Return the indices of all empty cells in ascending order."""
        return [i for i, spot in enumerate(self.board) if spot == ' ']

    def empty_squares(self):
        """Return True while at least one cell is still empty."""
        return ' ' in self.board

    def make_move(self, square, letter):
        """Place `letter` on `square` if that is a legal move.

        Args:
            square: Cell index, expected in the range 0-8.
            letter: Mark to place (the notebook uses 'X' and 'O').

        Returns:
            True if the mark was placed, False if the square is out of range
            or already occupied. On success `current_winner` is set to
            `letter` when the move completes a line.
        """
        # Reject out-of-range indices explicitly. A negative index would
        # otherwise wrap around to a different cell, and `winner` would then
        # slice an empty row (all([]) is True) and report a bogus win.
        if not 0 <= square < len(self.board):
            return False
        if self.board[square] != ' ':
            return False
        self.board[square] = letter
        if self.winner(square, letter):
            self.current_winner = letter
        return True

    def winner(self, square, letter):
        """Return True if `letter` owns a complete line passing through `square`.

        Only the row, the column and (for even-indexed cells) the diagonals
        are inspected, so `square` must be a valid index in 0-8.
        """
        # Row containing the square.
        row_ind = square // 3
        row = self.board[row_ind*3:(row_ind+1)*3]
        if all(s == letter for s in row):
            return True
        # Column containing the square.
        col_ind = square % 3
        col = [self.board[col_ind+i*3] for i in range(3)]
        if all(s == letter for s in col):
            return True
        # Diagonal cells (0, 2, 4, 6, 8) all have even indices.
        if square % 2 == 0:
            diagonal1 = [self.board[i] for i in [0, 4, 8]]
            diagonal2 = [self.board[i] for i in [2, 4, 6]]
            if all(s == letter for s in diagonal1) or all(s == letter for s in diagonal2):
                return True
        return False

    def reset(self):
        """Clear the board and forget any recorded winner."""
        self.board = [' ' for _ in range(9)]
        self.current_winner = None


In [5]:
# Cell 3: Define Q-Learning agent for AI
class QLearningAgent:
    """Tabular Q-learning agent for a nine-cell Tic-Tac-Toe board.

    States are the board joined into a 9-character string; each state maps to
    an array of nine action values (one per cell), created on first access.
    """

    def __init__(self, alpha=0.3, gamma=0.9, epsilon=0.2):
        self.q_table = defaultdict(lambda: np.zeros(9))
        self.alpha = alpha    # Learning rate
        self.gamma = gamma    # Discount factor
        self.epsilon = epsilon  # Exploration rate

    def get_state(self, game):
        """Return the hashable state key for `game`: its board as one string."""
        return ''.join(game.board)

    def choose_action(self, game):
        """Pick a move for the current position with an epsilon-greedy policy.

        With probability `epsilon` a uniformly random legal move is returned;
        otherwise one of the legal moves with the highest Q-value is chosen,
        breaking ties at random.
        """
        state = self.get_state(game)
        moves = game.available_moves()
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(moves)
        q_values = self.q_table[state]
        max_q = max(q_values[a] for a in moves)
        max_actions = [a for a in moves if q_values[a] == max_q]
        return random.choice(max_actions)

    def learn(self, state, action, reward, next_state, done):
        """Apply one Q-learning update for the transition (state, action).

        Args:
            state: State key in which `action` was taken.
            action: Cell index (0-8) that was played.
            reward: Immediate reward for the transition.
            next_state: State key the agent faces next (ignored when `done`).
            done: True if the episode ended with this transition.
        """
        if done:
            future = 0.0
        else:
            # Bootstrap only from moves that are still legal in next_state.
            # Occupied cells are never played, so their Q-values stay at the
            # initial 0 and would otherwise cap the estimate at >= 0, hiding
            # positions in which every remaining move is bad.
            legal = [i for i, cell in enumerate(next_state) if cell == ' ']
            next_q = self.q_table[next_state]
            future = max((next_q[i] for i in legal), default=0.0)
        td_error = reward + self.gamma * future - self.q_table[state][action]
        self.q_table[state][action] += self.alpha * td_error


In [6]:
# Cell 4: Train the AI by playing multiple games against random moves
def train(agent, episodes=10000):
    """Train `agent` as 'X' (moving first) against a uniformly random 'O'.

    Each episode is one game. After every 'X' move the agent receives a
    single update for that move:
      * reward 1 if the move wins the game,
      * reward 0.5 if it fills the board without a winner,
      * reward -1 if the random reply by 'O' wins,
      * reward 0 otherwise, bootstrapping from the position after 'O' replied.

    Args:
        agent: Object providing get_state(game), choose_action(game) and
            learn(state, action, reward, next_state, done).
        episodes: Number of games to play.
    """
    board = TicTacToe()
    for _episode in range(episodes):
        board.reset()
        state = agent.get_state(board)

        while True:
            # Agent ('X') moves.
            action = agent.choose_action(board)
            board.make_move(action, 'X')
            after_x = agent.get_state(board)

            if board.current_winner == 'X':
                agent.learn(state, action, 1, after_x, True)
                break
            if not board.empty_squares():
                agent.learn(state, action, 0.5, after_x, True)
                break

            # Random opponent ('O') replies.
            reply = random.choice(board.available_moves())
            board.make_move(reply, 'O')
            after_o = agent.get_state(board)

            if board.current_winner == 'O':
                agent.learn(state, action, -1, after_o, True)
                break

            # Game continues: the position after O's reply is the agent's
            # next decision point, so it serves as the successor state.
            agent.learn(state, action, 0, after_o, False)
            state = after_o

# Initialize and train the agent.
# The agent is built with its default hyperparameters and plays 10,000 games
# against a random opponent; `train` updates the agent through its `learn`
# method, so `agent` carries the learned Q-table into the later cells.
agent = QLearningAgent()
train(agent, episodes=10000)


In [7]:
# Cell 5: Play the game interactively with human input
def print_board_positions():
    """Print a reference grid showing which index (0-8) names which cell."""
    print("Board positions (0-8):")
    for row_start in range(0, 9, 3):
        labels = map(str, range(row_start, row_start + 3))
        print('| ' + ' | '.join(labels) + ' |')

def play_human_vs_ai(agent):
    """Run one interactive console game: the agent plays 'X', the human 'O'.

    The agent always moves first. The human is prompted for a cell index
    until a legal move is entered; typing 'exit' (any letter case) ends the
    game immediately. The function prints the outcome and returns None.

    Args:
        agent: Object providing choose_action(game) that returns a cell index.
    """
    game = TicTacToe()
    print_board_positions()
    game.print_board()

    while game.empty_squares():
        # --- AI turn ---
        ai_square = agent.choose_action(game)
        game.make_move(ai_square, 'X')
        print("\nAI's move:")
        game.print_board()
        if game.current_winner == 'X':
            print("AI wins!")
            return
        if not game.empty_squares():
            print("It's a tie!")
            return

        # --- Human turn: keep prompting until a legal move is made ---
        while True:
            try:
                raw = input("Enter your move (0-8): ")
                if raw.lower() == 'exit':
                    print("Game exited.")
                    return
                square = int(raw)
                if square not in game.available_moves():
                    print("Invalid move! Position already taken or out of range.")
                    continue
                game.make_move(square, 'O')
                break
            except ValueError:
                # int() rejected the text that was typed.
                print("Invalid input! Enter a number between 0 and 8.")

        print("\nYour move:")
        game.print_board()
        if game.current_winner == 'O':
            print("You win!")
            return


In [8]:
# Cell 6: Start the interactive game.
# The agent plays 'X' and moves first; the human plays 'O' by typing a cell
# index from 0 to 8, or 'exit' to quit early.
play_human_vs_ai(agent)


Board positions (0-8):
| 0 | 1 | 2 |
| 3 | 4 | 5 |
| 6 | 7 | 8 |
|   |   |   |
|   |   |   |
|   |   |   |

AI's move:
|   |   |   |
|   |   |   |
|   |   | X |


Enter your move (0-8):  0



Your move:
| O |   |   |
|   |   |   |
|   |   | X |

AI's move:
| O |   |   |
|   |   | X |
|   |   | X |


Enter your move (0-8):  2



Your move:
| O |   | O |
|   |   | X |
|   |   | X |

AI's move:
| O |   | O |
| X |   | X |
|   |   | X |


Enter your move (0-8):  2


Invalid move! Position already taken or out of range.


Enter your move (0-8):  1



Your move:
| O | O | O |
| X |   | X |
|   |   | X |
You win!


In [1]:
import numpy as np


class TicTacToeEnvironment:
    """Tic-Tac-Toe board with numeric marks.

    `state` is a flat list of nine cells in row-major order, holding
    0 (empty), 1 (X) or -1 (O). `is_terminal` becomes True once `check_win`
    or `is_draw` detects that the game is over.
    """

    # Every triple of cell indices that forms a winning line.
    WINNING_LINES = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),  # rows
        (0, 3, 6), (1, 4, 7), (2, 5, 8),  # columns
        (0, 4, 8), (2, 4, 6),             # diagonals
    )

    def __init__(self):
        self.state = [0] * 9  # 0 = empty, 1 = X, -1 = O
        self.is_terminal = False

    def reset(self):
        """Clear the board and mark the game as in progress."""
        self.state = [0] * 9
        self.is_terminal = False

    def get_available_moves(self):
        """Return the indices of all empty cells in ascending order."""
        return [i for i, mark in enumerate(self.state) if mark == 0]

    def make_move(self, move, player_mark):
        """Write `player_mark` into cell `move`.

        Args:
            move: Cell index in the range 0-8.
            player_mark: Value to store (1 for X, -1 for O).

        Raises:
            ValueError: If the game is already over or the cell is occupied.
            IndexError: If `move` is outside the board. Negative indices are
                rejected as well, because list indexing would otherwise wrap
                around and silently mark a different cell.
        """
        if self.is_terminal:
            raise ValueError("Game is already over.")
        if not 0 <= move < len(self.state):
            raise IndexError(
                f"Cell {move} is out of range (expected 0-{len(self.state) - 1})."
            )
        if self.state[move] != 0:
            raise ValueError(f"Cell {move} is not empty.")
        self.state[move] = player_mark

    def check_win(self, player_mark):
        """Return True if `player_mark` fills any winning line.

        Side effect: sets `is_terminal` to True when a win is found.
        """
        for idxs in self.WINNING_LINES:
            if all(self.state[i] == player_mark for i in idxs):
                self.is_terminal = True
                return True
        return False

    def is_draw(self):
        """Return True if no empty cell remains.

        This only checks for a full board, so callers should test
        `check_win` first. Side effect: sets `is_terminal` to True when the
        board is full.
        """
        if 0 not in self.state:
            self.is_terminal = True
            return True
        return False


class QLearningAgent:
    """Tabular Q-learning agent backed by a dense (3**9, 9) float32 table.

    A board is a sequence of nine marks in {-1, 0, 1}; it is mapped to a row
    of the table by reading the shifted marks {0, 1, 2} as base-3 digits.
    Each row holds one action value per cell.
    """

    def __init__(self, learning_rate=0.9, discount_factor=0.9, exploration_rate=0.3):
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.q_table = np.zeros((3**9, 9), dtype=np.float32)

    def get_state_index(self, state):
        """Return the table row for `state` (base-3 encoding of mark + 1)."""
        state_index = 0
        for i, mark in enumerate(state):
            state_index += (3**i) * (mark + 1)
        return state_index

    def choose_action(self, state, available_moves):
        """Pick a move from `available_moves` with an epsilon-greedy policy.

        Args:
            state: Sequence of nine marks describing the board.
            available_moves: Indices of the cells that may be played.

        Returns:
            The chosen cell index.

        Raises:
            ValueError: If `available_moves` is empty.
        """
        if len(available_moves) == 0:
            raise ValueError("No available moves to choose from.")

        state_index = self.get_state_index(state)

        # Epsilon-greedy policy: explore with probability `exploration_rate`.
        if np.random.random() < self.exploration_rate:
            return int(np.random.choice(available_moves))

        # Exploit: best Q-value among the legal moves only (first one on ties).
        q_values = self.q_table[state_index, available_moves]
        best_idx_in_subset = int(np.argmax(q_values))
        return available_moves[best_idx_in_subset]

    def update_q_table(self, state, action, next_state, reward):
        """Apply one Q-learning update for the transition (state, action).

        Args:
            state: Board in which `action` was taken.
            action: Cell index that was played.
            next_state: Successor board, or None if the episode ended.
            reward: Immediate reward for the transition.
        """
        state_index = self.get_state_index(state)
        if next_state is None:
            max_next_q = 0.0
        else:
            # Bootstrap only from cells that are still empty in next_state.
            # Occupied cells are never played, so their entries stay at the
            # initial 0 and would otherwise cap the estimate at >= 0 whenever
            # every legal move has a negative value.
            legal_moves = [i for i, mark in enumerate(next_state) if mark == 0]
            if legal_moves:
                next_state_index = self.get_state_index(next_state)
                max_next_q = float(np.max(self.q_table[next_state_index, legal_moves]))
            else:
                max_next_q = 0.0

        current_q = self.q_table[state_index, action]
        target = reward + self.discount_factor * max_next_q
        self.q_table[state_index, action] = (
            1 - self.learning_rate
        ) * current_q + self.learning_rate * target


def evaluate_agents(agent1, agent2, num_episodes=1000):
    """Play `num_episodes` games between two agents, training both as they play.

    agent1 plays X (mark +1) and always moves first; agent2 plays O (mark -1).
    Both agents keep learning (and exploring) during these games, so the
    returned counts describe play during training rather than a frozen policy.

    Learning signal:
      * An agent's move is updated only once the outcome of that move is
        known: either the game ended, or the opponent has replied and the
        agent is about to move again. The successor state is therefore the
        board the agent actually decides from next, which is a row of its
        own table that it updates.
      * Win +10 for the winner's last move, -10 for the loser's last move,
        0 for both players' last moves on a draw, 0 for every other move.

    Args:
        agent1: Agent exposing choose_action(state, available_moves) and
            update_q_table(state, action, next_state, reward).
        agent2: Second agent with the same interface. Passing the same object
            as agent1 is supported; the players are told apart by turn order.
        num_episodes: Number of games to play.

    Returns:
        Tuple (agent1_wins, agent2_wins, draws).
    """
    win_reward, loss_reward, draw_reward, step_reward = 10.0, -10.0, 0.0, 0.0

    env = TicTacToeEnvironment()
    players = ((agent1, 1), (agent2, -1))
    wins = [0, 0]
    draws = 0

    for _ in range(num_episodes):
        env.reset()
        # Last (state, action) of each player that has not been updated yet.
        pending = [None, None]
        turn = 0  # index into `players`; X starts

        while not env.is_terminal:
            agent, mark = players[turn]
            rival = 1 - turn
            current_state = env.state.copy()

            # The opponent has replied and the game goes on, so the previous
            # move of this player can now bootstrap from the position it
            # faces at this point.
            if pending[turn] is not None:
                prev_state, prev_action = pending[turn]
                agent.update_q_table(prev_state, prev_action, current_state, step_reward)
                pending[turn] = None

            action = agent.choose_action(current_state, env.get_available_moves())
            env.make_move(action, mark)

            if env.check_win(mark):
                agent.update_q_table(current_state, action, None, win_reward)
                # The opponent's last move allowed this win: penalise it.
                if pending[rival] is not None:
                    rival_state, rival_action = pending[rival]
                    players[rival][0].update_q_table(
                        rival_state, rival_action, None, loss_reward
                    )
                wins[turn] += 1
                break

            if env.is_draw():
                agent.update_q_table(current_state, action, None, draw_reward)
                if pending[rival] is not None:
                    rival_state, rival_action = pending[rival]
                    players[rival][0].update_q_table(
                        rival_state, rival_action, None, draw_reward
                    )
                draws += 1
                break

            # Outcome of this move is not known yet; update it later.
            pending[turn] = (current_state, action)
            turn = rival

    return wins[0], wins[1], draws


# Create and evaluate agents.
# Two independent agents are built with their default hyperparameters;
# agent1 plays X and moves first, agent2 plays O. Note that evaluate_agents
# calls update_q_table on the agents while they play, so the counts below
# describe games played during learning.
agent1 = QLearningAgent()
agent2 = QLearningAgent()

agent1_wins, agent2_wins, draws = evaluate_agents(agent1, agent2, num_episodes=1000)

# Report the tallies returned by evaluate_agents.
print(f"Agent 1 wins: {agent1_wins}")
print(f"Agent 2 wins: {agent2_wins}")
print(f"Draws: {draws}")



Agent 1 wins: 695
Agent 2 wins: 222
Draws: 83
