<a href="https://colab.research.google.com/github/Mootha-sri-harshit/tic-tac-toe/blob/main/Copy_of_tic_tac_toe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import random

class TicTacToe:
    """Minimal Tic-Tac-Toe board stored as a flat list of nine cells.

    Cells are indexed 0-8 row by row. An empty cell holds ``' '``; an
    occupied cell holds whatever marker was passed to :meth:`make_move`.
    """

    # Every index triple that forms a line on the 3x3 grid.
    WINNING_COMBINATIONS = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),  # Rows
        (0, 3, 6), (1, 4, 7), (2, 5, 8),  # Columns
        (0, 4, 8), (2, 4, 6),             # Diagonals
    )

    def __init__(self):
        self.board = [' '] * 9  # 3x3 board, row-major

    def reset(self):
        """Clear the board for a new game and return the resulting state."""
        self.board = [' '] * 9
        return self.get_state()

    def get_state(self):
        """Return an immutable snapshot of the board as a 9-tuple."""
        return tuple(self.board)

    def available_moves(self):
        """Return the indices of all empty cells, in ascending order."""
        return [i for i, cell in enumerate(self.board) if cell == ' ']

    def make_move(self, position, player):
        """Place ``player``'s marker at ``position`` if that cell is free.

        Args:
            position: Cell index in the range 0-8.
            player: Marker to write into the cell (e.g. ``"X"`` or ``"O"``).

        Returns:
            True if the marker was placed; False if ``position`` is outside
            the board or the cell is already occupied.
        """
        # Range-check explicitly: a negative index would otherwise wrap
        # around (-1 would mark cell 8) and an index >= 9 would raise
        # IndexError instead of being reported as an invalid move.
        if not 0 <= position < len(self.board):
            return False
        if self.board[position] != ' ':
            return False
        self.board[position] = player
        return True

    def check_winner(self):
        """Report the outcome of the current position.

        Returns:
            The marker occupying a completed line, ``"Draw"`` if the board is
            full with no completed line, or None if the game is still open.
        """
        for x, y, z in self.WINNING_COMBINATIONS:
            if self.board[x] != ' ' and self.board[x] == self.board[y] == self.board[z]:
                return self.board[x]
        if ' ' not in self.board:
            return "Draw"
        return None

    def print_board(self):
        """Print the three rows of the board followed by a separator line."""
        for i in range(0, 9, 3):
            print(f"{self.board[i]} | {self.board[i+1]} | {self.board[i+2]}")
        print("-" * 9)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque

# Define the Deep Q-Network
class DQN(nn.Module):
    """Fully connected Q-network: input -> 64 -> 64 -> output, ReLU between layers.

    Args:
        input_size: Number of features in the input vector.
        output_size: Number of Q-values produced (one per action).
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        hidden_units = 64
        # Layer names and creation order are kept as fc1/fc2/fc3 so that
        # state_dict keys and seeded initialisation stay the same.
        self.fc1 = nn.Linear(input_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, output_size)

    def forward(self, x):
        """Map input features to raw (unbounded) Q-values."""
        hidden = nn.functional.relu(self.fc1(x))
        hidden = nn.functional.relu(self.fc2(hidden))
        q_values = self.fc3(hidden)
        return q_values

# DQN Agent
class DQNAgent:
    """Epsilon-greedy DQN agent for a 9-cell Tic-Tac-Toe board.

    States are 9-tuples of cell markers; actions are cell indices 0-8.
    Transitions are kept in a bounded replay buffer and the Q-network is
    trained one sampled transition at a time.

    Args:
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Initial exploration rate.
        epsilon_min: Exploration stops decaying once epsilon is at or below this.
        epsilon_decay: Multiplicative decay applied after each training call.
        learning_rate: Adam learning rate.
        memory_size: Maximum number of transitions kept in the replay buffer.
    """

    def __init__(self, gamma=0.95, epsilon=1.0, epsilon_min=0.01,
                 epsilon_decay=0.995, learning_rate=0.001, memory_size=2000):
        self.state_size = 9  # Tic-Tac-Toe board size
        self.action_size = 9  # 9 possible moves
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.learning_rate = learning_rate
        self.memory = deque(maxlen=memory_size)  # Experience replay buffer

        self.model = DQN(self.state_size, self.action_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.criterion = nn.MSELoss()

    def get_state_tensor(self, state):
        """Encode a board as a float tensor: "X" -> 1, "O" -> -1, anything else -> 0."""
        encoded = [1 if cell == "X" else -1 if cell == "O" else 0 for cell in state]
        return torch.tensor(encoded, dtype=torch.float32)

    def choose_action(self, state, available_moves):
        """Pick a move from ``available_moves`` with an epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random legal move is returned;
        otherwise the legal move with the highest predicted Q-value (the first
        one listed, on ties).

        Raises:
            IndexError: If ``available_moves`` is empty.
        """
        if not available_moves:
            raise IndexError("choose_action called with no available moves")
        if np.random.rand() <= self.epsilon:
            return random.choice(available_moves)
        state_tensor = self.get_state_tensor(state).unsqueeze(0)
        # Pure inference: no autograd graph is needed to rank the moves.
        with torch.no_grad():
            q_values = self.model(state_tensor)[0]
        return max(available_moves, key=lambda move: q_values[move].item())

    def store_experience(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer (oldest entries are evicted)."""
        self.memory.append((state, action, reward, next_state, done))

    def replay(self, batch_size=32):
        """Train on ``batch_size`` transitions sampled from the replay buffer.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        After a training pass, epsilon is decayed once if it is still above
        ``epsilon_min``.
        """
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)

        for state, action, reward, next_state, done in minibatch:
            target = float(reward)
            if not done:
                next_state_tensor = self.get_state_tensor(next_state).unsqueeze(0)
                with torch.no_grad():
                    next_q = self.model(next_state_tensor)[0]
                # Bootstrap only from playable cells: outputs for occupied
                # cells are never trained (those actions are never taken), so
                # including them in the max would inject arbitrary values.
                legal = [i for i, cell in enumerate(next_state) if cell not in ("X", "O")]
                if legal:
                    target += self.gamma * max(next_q[i].item() for i in legal)

            state_tensor = self.get_state_tensor(state).unsqueeze(0)
            prediction = self.model(state_tensor)
            # The regression target is a detached copy of the prediction with
            # only the taken action overwritten, so every other output
            # contributes zero error and no gradient flows through the target.
            target_tensor = prediction.detach().clone()
            target_tensor[0][action] = target

            self.optimizer.zero_grad()
            loss = self.criterion(prediction, target_tensor)
            loss.backward()
            self.optimizer.step()

        # Decay epsilon for less exploration over time
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


In [None]:
def train_dqn_agent(episodes=10000, seed=None):
    """Train a DQNAgent on Tic-Tac-Toe by self-play and return it.

    One agent plays both sides. Every transition is stored from the point of
    view of the player who made the move:

    * ``next_state`` is the board at that same player's *next* turn (after the
      opponent has replied), so the agent's bootstrapped value always refers
      to a position in which the same side is to move;
    * the reward is +1 for the move that wins, -1 for the preceding move of
      the side that then loses, and 0 otherwise (including draws).

    Args:
        episodes: Number of self-play games to run.
        seed: Optional integer. When given, the ``random``, NumPy and PyTorch
            generators are seeded so a run can be repeated.

    Returns:
        The trained DQNAgent.
    """
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

    env = TicTacToe()
    agent = DQNAgent()

    for episode in range(episodes):
        state = env.reset()
        player = "X"
        # Last (state, action) of each player whose outcome is not yet known:
        # it is resolved only after the opponent has replied.
        pending = {}
        done = False

        while not done:
            action = agent.choose_action(state, env.available_moves())
            env.make_move(action, player)

            winner = env.check_winner()
            next_state = env.get_state()
            opponent = "O" if player == "X" else "X"
            waiting = pending.pop(opponent, None)

            if winner is None:
                # The game goes on, so it is the opponent's turn again: their
                # previous move led here with no immediate reward.
                if waiting is not None:
                    agent.store_experience(waiting[0], waiting[1], 0, next_state, False)
                pending[player] = (state, action)
            else:
                done = True
                # Only the player who just moved can have completed a line.
                mover_reward = 1 if winner == player else 0
                agent.store_experience(state, action, mover_reward, next_state, True)
                if waiting is not None:
                    # The opponent's previous move allowed this outcome.
                    agent.store_experience(waiting[0], waiting[1], -mover_reward, next_state, True)

            state = next_state
            player = opponent

        agent.replay()

        if episode % 1000 == 0:
            print(f"Training Episode {episode}, Epsilon: {agent.epsilon}")

    print("Training complete!")
    return agent

# Train the DQN agent with the default number of episodes. Training runs when
# this cell executes; the returned agent is kept in `dqn_agent1` for the
# interactive game further down.
dqn_agent1 = train_dqn_agent()




Training Episode 0, Epsilon: 1.0
Training Episode 1000, Epsilon: 0.00998645168764533
Training Episode 2000, Epsilon: 0.00998645168764533
Training Episode 3000, Epsilon: 0.00998645168764533
Training Episode 4000, Epsilon: 0.00998645168764533
Training Episode 5000, Epsilon: 0.00998645168764533
Training Episode 6000, Epsilon: 0.00998645168764533
Training Episode 7000, Epsilon: 0.00998645168764533
Training Episode 8000, Epsilon: 0.00998645168764533
Training Episode 9000, Epsilon: 0.00998645168764533
Training complete!


In [None]:
# Function to play against the AI
def play_against_ai(agent):
    """Play one interactive game on the console: the human is X, ``agent`` is O.

    The human moves first and enters a cell index 0-8 at each prompt; invalid
    input is rejected and the prompt is repeated. The board is printed after
    every move and the function returns once the game is won or drawn.

    Args:
        agent: Object with ``choose_action(state, available_moves)``. If it has
            an ``epsilon`` attribute, exploration is switched off for the
            duration of the game and the original value is restored afterwards.
    """
    env = TicTacToe()
    state = env.reset()
    player = "X"

    # A trained agent should exploit, not explore, when facing a human:
    # with epsilon at 0 the random branch is (effectively) never taken.
    saved_epsilon = getattr(agent, "epsilon", None)
    if saved_epsilon is not None:
        agent.epsilon = 0.0

    print("Welcome to Tic-Tac-Toe AI!")
    env.print_board()

    try:
        while True:
            if player == "X":
                try:
                    move = int(input("Enter your move (0-8): "))
                except ValueError:
                    print("Invalid input! Enter a number between 0-8.")
                    continue
                if move not in env.available_moves():
                    print("Invalid move! Try again.")
                    continue
            else:
                move = agent.choose_action(state, env.available_moves())
                print(f"AI chooses: {move}")

            if not env.make_move(move, player):
                print("Invalid move! Try again.")
                continue

            env.print_board()
            winner = env.check_winner()
            if winner == "Draw":
                # A draw has no winner, so it gets its own message.
                print("Game Over! It's a draw.")
                break
            if winner:
                print(f"Game Over! Winner: {winner}")
                break
            player = "O" if player == "X" else "X"
            state = env.get_state()
    finally:
        # Restore the exploration rate even if the game is interrupted.
        if saved_epsilon is not None:
            agent.epsilon = saved_epsilon

# Play against the trained DQN agent. This starts an interactive game that
# waits on input() for each human move; the human plays X and the agent plays O.
play_against_ai(dqn_agent1)

Welcome to Tic-Tac-Toe AI!
  |   |  
  |   |  
  |   |  
---------
Enter your move (0-8): 7
  |   |  
  |   |  
  | X |  
---------
AI chooses: 1
  | O |  
  |   |  
  | X |  
---------
Enter your move (0-8): 4
  | O |  
  | X |  
  | X |  
---------
AI chooses: 5
  | O |  
  | X | O
  | X |  
---------
Enter your move (0-8): 6
  | O |  
  | X | O
X | X |  
---------
AI chooses: 8
  | O |  
  | X | O
X | X | O
---------
Enter your move (0-8): 2
  | O | X
  | X | O
X | X | O
---------
Game Over! Winner: X
