In [1]:
import numpy as np
import random
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

In [2]:
class TicTacToe:
    def init(self):
        self.board = np.zeros(9) 
        self.done = False
        self.winner = None
    
    def reset(self):
        self.board = np.zeros(9)
        self.done = False
        self.winner = None
        return self.board
    
    def available_actions(self):
        return [i for i in range(9) if self.board[i] == 0]
    
    def step(self, action, player):
        if self.board[action] != 0:
            return self.board, -1, True  # Invalid move
        
        self.board[action] = player
        if self.check_win(player):
            self.winner = player
            self.done = True
            return self.board, 1, self.done  # Player wins
        elif np.all(self.board != 0):
            self.done = True
            return self.board, 0, self.done  # Draw
        else:
            return self.board, 0, self.done  # Game continues
    
    def check_win(self, player):
        win_conditions = [
            [0, 1, 2], [3, 4, 5], [6, 7, 8],  # Rows
            [0, 3, 6], [1, 4, 7], [2, 5, 8],  # Columns
            [0, 4, 8], [2, 4, 6]              # Diagonals
        ]
        for condition in win_conditions:
            if np.all(self.board[condition] == player):
                return True
        return False

In [3]:
def build_model():
    model = Sequential()
    model.add(Dense(32, input_dim=9, activation='relu')) 
    model.add(Dense(32, activation='relu'))
    model.add(Dense(9, activation='linear'))  
    model.compile(loss='mse', optimizer=Adam(learning_rate=0.01))
    return model

In [4]:
class DQNAgent:
    def __init__(self, model, gamma=0.9, epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.995, batch_size=16, memory_size=10000):
        self.model = model
        self.gamma = gamma  
        self.epsilon = epsilon  
        self.epsilon_min = epsilon_min  
        self.epsilon_decay = epsilon_decay  
        self.batch_size = batch_size
        self.memory = []  
        self.memory_size = memory_size

    def remember(self, state, action, reward, next_state, done):
        if len(self.memory) >= self.memory_size:
            self.memory.pop(0)
        self.memory.append((state, action, reward, next_state, done))
    
    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return np.random.choice([i for i in range(9) if state[i] == 0])  
        q_values = self.model.predict(np.expand_dims(state, axis=0), verbose=0)  
        return np.argmax(q_values[0])  
    
    def train(self):
        if len(self.memory) < self.batch_size:
            return

        # Выбор случайной мини-партии из памяти
        batch = random.sample(self.memory, self.batch_size)
        for state, action, reward, next_state, done in batch:
            q_values = self.model.predict(np.expand_dims(state, axis=0), verbose=0)  # Текущие Q-значения
            if done:
                q_values[0][action] = reward  # Если игра закончена, то обновляем Q-значение для последнего хода
            else:
                q_values_next = self.model.predict(np.expand_dims(next_state, axis=0), verbose=0)  # Q-значения следующего состояния
                q_values[0][action] = reward + self.gamma * np.max(q_values_next[0])  # Обновление Q-значений
            
            self.model.fit(np.expand_dims(state, axis=0), q_values, epochs=1, verbose=0)  # Обучение модели

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay  # Уменьшение вероятности случайных действий

: 

In [None]:
model = build_model()
agent = DQNAgent(model)
env = TicTacToe()

episodes = 500 
for e in tqdm(range(episodes)):
    state = env.reset()  # Сбросить игру
    done = False
    while not done:
        action = agent.act(state)  # Выбор хода
        next_state, reward, done = env.step(action, 1)  # Совершить ход (игрок 1)
        agent.remember(state, action, reward, next_state, done)  # Сохранить опыт
        agent.train()  # Обучить модель
        state = next_state  # Переход в следующее состояние
    if e % 100 == 0:
        print(f"Episode {e}, epsilon {agent.epsilon}")

  0%|          | 0/500 [00:00<?, ?it/s]2024-11-22 01:22:50.029829: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Episode 0, epsilon 1.0


 20%|██        | 101/500 [2:29:33<17:39:11, 159.28s/it]

Episode 100, epsilon 0.1518722266715875


 29%|██▉       | 145/500 [3:44:08<1:57:37, 19.88s/it]  

In [50]:
model.save(f'tic_{episodes}eps.keras')

753
