In [5]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque

In [6]:
class TicTacToe:
    def __init__(self):
        self.board = [[0 for _ in range(3)] for _ in range(3)]
        self.current_player = 1

    def reset(self):
        self.board = [[0 for _ in range(3)] for _ in range(3)]
        self.current_player = 1

    def render(self):
        players = {1: 'X', -1: 'O', 0: ' '}
        for i, row in enumerate(self.board):
            print(" " + " | ".join([players[cell] for cell in row]))
            if i < 2:
                print("---+---+---")

    def available_moves(self):
        return [(r, c) for r in range(3) for c in range(3) if self.board[r][c] == 0]

    def make_move(self, row, col):
        if self.board[row][col] == 0:
            self.board[row][col] = self.current_player
            self.current_player *= -1
            return True
        return False

    def check_winner(self):
        board = np.array(self.board)
        row_sum = np.sum(board, axis=1)
        col_sum = np.sum(board, axis=0)
        diag1_sum = np.trace(board)
        diag2_sum = np.trace(np.fliplr(board))
        all_sums = np.concatenate((row_sum, col_sum, [diag1_sum], [diag2_sum]))

        if 3 in all_sums:
            return 1
        if -3 in all_sums:
            return -1
        if not (board == 0).any():
            return 0
        return None

    def clone(self):
        cloned_game = TicTacToe()
        cloned_game.board = [row[:] for row in self.board]
        cloned_game.current_player = self.current_player
        return cloned_game

    def random_play(self):
        while self.check_winner() is None:
            moves = self.available_moves()
            move = random.choice(moves)
            self.make_move(*move)


In [9]:
class DQN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, output_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [None]:
GAMMA = 0.99
EPSILON = 1.0
EPSILON_MIN = 0.1
EPSILON_DECAY = 0.995
LEARNING_RATE = 0.001
BATCH_SIZE = 64
UPDATE_TIME = 100

In [10]:
class DDQNAgent:
    def __init__(self):
        self.gamma = GAMMA
        self.epsilon = EPSILON
        self.epsilon_min = EPSILON_MIN
        self.epsilon_decay = EPSILON_DECAY
        self.learning_rate = LEARNING_RATE
        self.batch_size = BATCH_SIZE

        self.memory = deque(maxlen=2000)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.model = DQN(9, 9).to(self.device)
        self.target_model = DQN(9, 9).to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

        self.update_time = UPDATE_TIME
        self.update_target_network()

    def update_target_network(self):
        self.target_model.load_state_dict(self.model.state_dict())

    def append_samples(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.choice(range(9))
        state = torch.FloatTensor(state).to(self.device)
        with torch.no_grad():
            q_values = self.model(state)
        return torch.argmax(q_values).item()

    def replay(self):
        if len(self.memory) < self.batch_size:
            return

        minibatch = random.sample(self.memory, self.batch_size)
        for state, action, reward, next_state, done in minibatch:
            state = torch.FloatTensor(state).to(self.device)
            next_state = torch.FloatTensor(next_state).to(self.device)

            target = reward
            if not done:
                next_action = torch.argmax(self.model(next_state)).item()
                target = reward + self.gamma * self.target_model(next_state)[next_action].item()

            target_f = self.model(state)
            target_f[action] = target

            self.optimizer.zero_grad()
            loss = nn.MSELoss()(self.model(state), target_f.detach())
            loss.backward()
            self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

In [11]:
# main
if __name__ == "__main__":
    agent = DDQNAgent()
    episodes = 1000

    for e in range(episodes):
        game = TicTacToe()
        state = np.array(game.board).flatten()
        done = False

        while not done:
            action = agent.act(state)
            row, col = divmod(action, 3)
            if not game.make_move(row, col):
                continue

            reward = 0
            winner = game.check_winner()
            if winner is not None:
                done = True
                if winner == 1:
                    reward = 1
                elif winner == -1:
                    reward = -1
                else:
                    reward = 0.3

            next_state = np.array(game.board).flatten()
            agent.append_samples(state, action, reward, next_state, done)
            state = next_state

        agent.replay()
        if e % 100 == 0:
            agent.update_target_network()

        if e % 100 == 0:
            print(f"Episode {e}/{episodes} - Epsilon: {agent.epsilon}")

Episode 0/1000 - Epsilon: 1.0
Episode 100/1000 - Epsilon: 0.6274028820538087
Episode 200/1000 - Epsilon: 0.3800621177172763
Episode 300/1000 - Epsilon: 0.23023039494318503
Episode 400/1000 - Epsilon: 0.13946676683816583
Episode 500/1000 - Epsilon: 0.0996820918179746
Episode 600/1000 - Epsilon: 0.0996820918179746
Episode 700/1000 - Epsilon: 0.0996820918179746
Episode 800/1000 - Epsilon: 0.0996820918179746
Episode 900/1000 - Epsilon: 0.0996820918179746
