In [1]:
import numpy as np
import random
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from TicTacToe import TicTacToe

class SQNAgent:
    """Epsilon-greedy Q-learning agent backed by a small dense Keras network.

    The network maps a flat board of ``state_size`` cells to one Q-value per
    action. Actions are used directly as indices into the Q-value vector, so
    they must lie in ``range(action_size)``.
    """

    def __init__(self, state_size=9, action_size=9, gamma=0.95):
        """Create the agent, its replay buffer and its Q-network.

        Args:
            state_size: Number of cells in a flattened board.
            action_size: Number of Q-values the network outputs.
            gamma: Discount factor applied to the bootstrapped next-state value.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.learning_rate = 0.001
        self.epsilon = 1.0
        # epsilon_min / epsilon_decay are not read by any method of this
        # class; callers are expected to drive ``epsilon`` themselves.
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.9995
        self.batch_size = 32
        self.replay_buffer = deque(maxlen=10000)
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the two-hidden-layer MLP used as Q-function."""
        model = Sequential([
            Dense(64, input_dim=self.state_size, activation='relu'),
            Dense(64, activation='relu'),
            Dense(self.action_size, activation='linear')
        ])
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def process_state(self, state):
        """Return a float copy of ``state`` with cell value 1 -> -1 and 2 -> +1.

        Other values (e.g. 0 for an empty cell) are left untouched. The order
        of the two substitutions matters: 1 is remapped first so the freshly
        written +1 values are not remapped again.
        """
        processed = np.array(state).astype(float)
        processed[processed == 1] = -1
        processed[processed == 2] = 1
        return processed

    def select_action(self, state, valid_actions):
        """Pick an action epsilon-greedily among ``valid_actions``.

        Returns:
            ``None`` when ``valid_actions`` is empty; otherwise a random valid
            action with probability ``epsilon``, else the valid action with
            the highest predicted Q-value (first one wins on ties).
        """
        if not valid_actions:
            return None

        if random.random() < self.epsilon:
            return random.choice(valid_actions)

        processed_state = self.process_state(state)
        q_values = self.model.predict(processed_state.reshape(1, -1), verbose=0)[0]
        return max(valid_actions, key=lambda candidate: q_values[candidate])

    def store_experience(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer.

        ``state`` and ``next_state`` are snapshotted with ``np.array`` so that
        a caller passing its live, mutable board cannot retroactively change
        transitions that are already stored.
        """
        self.replay_buffer.append(
            (np.array(state), action, reward, np.array(next_state), done)
        )

    def train(self):
        """Run one gradient step on a random mini-batch from the replay buffer.

        Does nothing until the buffer holds at least ``batch_size``
        transitions. For each sampled transition only the Q-value of the
        taken action is moved towards its target; the other outputs keep
        their current predictions so they contribute zero loss.
        """
        if len(self.replay_buffer) < self.batch_size:
            return

        mini_batch = random.sample(self.replay_buffer, self.batch_size)
        states = np.array([self.process_state(exp[0]) for exp in mini_batch])
        next_states = np.array([self.process_state(exp[3]) for exp in mini_batch])
        # Copy so the in-place target edits below never touch Keras' buffer.
        targets = np.array(self.model.predict(states, verbose=0))
        next_q_values = self.model.predict(next_states, verbose=0)

        for i, (_, action, reward, next_state, done) in enumerate(mini_batch):
            target = reward
            if not done:
                # Bootstrap only from cells that are still empty (raw value 0)
                # in the next state; a full board contributes no future value.
                open_cells = [j for j, val in enumerate(next_state) if val == 0]
                if open_cells:
                    max_next_q = max(next_q_values[i][j] for j in open_cells)
                    target = reward + self.gamma * max_next_q
            targets[i][action] = target

        self.model.fit(states, targets, batch_size=self.batch_size, epochs=1, verbose=0)

def train_agent(episodes=10000):
    """Train a fresh SQNAgent as player 2 against TicTacToe's built-in player 1.

    Per episode: set epsilon and the optimizer learning rate from fixed
    schedules, let player 1 open, then alternate agent / player-1 moves until
    the game ends, storing one transition per agent move. One replay-buffer
    training step is run after every episode. Every 1000 episodes the
    cumulative win/loss ratio is printed and a checkpoint is saved.

    Args:
        episodes: Number of games to play.

    Returns:
        The trained SQNAgent (the final model is also saved to
        ``model14_final.h5``).
    """
    agent = SQNAgent()
    history = {'wins': 0, 'losses': 0, 'draws': 0}
    initial_epsilon = 1.0
    min_epsilon = 0.01
    decay_episodes = 1000
    target_ratio = 1.0
    prioritize_draw = False
    # Extra exploration granted at a checkpoint while the agent is losing.
    # Kept in its own variable so the per-episode schedule below does not
    # wipe it out on the next iteration.
    epsilon_boost = 0.0
    learning_rate_schedule = [0.001, 0.0005, 0.0001]  # Gradually reduce learning rate
    epsilon_step = (initial_epsilon - min_epsilon) / decay_episodes

    for episode in range(episodes):
        # Epsilon decays linearly over the first `decay_episodes` of each
        # 2000-episode cycle and restarts at 1.0 when a new cycle begins.
        cycle_position = episode % 2000
        if cycle_position == 0 and episode > 0:
            print("Resetting epsilon to 1.0")
        scheduled_epsilon = max(min_epsilon, initial_epsilon - cycle_position * epsilon_step)
        agent.epsilon = min(1.0, scheduled_epsilon + epsilon_boost)

        # Adjust learning rate based on training phase
        if episode < 3000:
            learning_rate = learning_rate_schedule[0]
        elif episode < 7000:
            learning_rate = learning_rate_schedule[1]
        else:
            learning_rate = learning_rate_schedule[2]
        agent.model.optimizer.learning_rate = learning_rate

        # Opponent strength ramps from 0 to 1 over the first 60% of training.
        smartness = min(1, episode / (episodes * 0.6))
        game = TicTacToe(smartMovePlayer1=smartness)

        game.player1_move()
        state = np.array(game.board)

        while True:
            valid_actions = game.empty_positions()
            if not valid_actions:
                history['draws'] += 1
                break

            action = agent.select_action(state, valid_actions)
            game.make_move(action, player=2)

            reward = 0
            if game.current_winner == 2:
                reward = 1.0
                history['wins'] += 1
                agent.store_experience(state, action, reward, np.array(game.board), True)
                break

            game.player1_move()
            # Snapshot the board: storing `game.board` itself would let later
            # moves rewrite this transition if the game mutates it in place.
            next_state = np.array(game.board)
            if game.current_winner == 1:
                reward = -1.0
                history['losses'] += 1
                agent.store_experience(state, action, reward, next_state, True)
                break
            elif game.is_full():
                reward = 0.2 if prioritize_draw else 0.0
                history['draws'] += 1
                agent.store_experience(state, action, reward, next_state, True)
                break

            agent.store_experience(state, action, reward, next_state, False)
            state = next_state

        agent.train()

        if episode % 1000 == 0:
            # Ratio is cumulative over the whole run, not a recent window.
            win_ratio = history['wins'] / (history['losses'] + 1e-5)
            if win_ratio < target_ratio and episode >= 100:
                prioritize_draw = True
                epsilon_boost = 0.1
            else:
                prioritize_draw = False
                epsilon_boost = 0.0
            agent.epsilon = min(1.0, scheduled_epsilon + epsilon_boost)
            print(f"Episode: {episode}, Win/Loss Ratio: {win_ratio:.2f}, Epsilon: {agent.epsilon:.3f}, smartmove: {smartness}")
            print(f"Wins: {history['wins']}, Losses: {history['losses']}, Draws: {history['draws']}")

            agent.model.save(f'model14_episode_{episode}.h5')
            print(f"Model saved at episode {episode}")

    agent.model.save('model14_final.h5')
    return agent




# Run a full training session with the default episode count as soon as this
# cell/module executes, and bind whatever train_agent() returns to `agent`.
agent = train_agent()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 9
Episode: 0, Win/Loss Ratio: 0.00, Epsilon: 1.000, smartmove: 0.0
Wins: 0, Losses: 0, Draws: 1
Model saved at episode 0
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Rando



Episode: 1000, Win/Loss Ratio: 1.07, Epsilon: 0.010, smartmove: 0.16666666666666666
Wins: 454, Losses: 425, Draws: 122
Model saved at episode 1000
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses posi



Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 4
Resetting epsilon to 1.0
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 6
Episode: 2000, Win/Loss Ratio: 1.68, Epsilon: 1.000, smartmove: 0.3333333333333333
Wins: 1111, Losses: 661, Draws: 229
Model saved at episode 2000
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 7
Player 1 (S



Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 1
Episode: 3000, Win/Loss Ratio: 1.22, Epsilon: 0.010, smartmove: 0.5
Wins: 1430, Losses: 1172, Draws: 399
Model saved at episode 3000
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 6
Player 



Player 1 (Smart/Random) chooses position 4
Resetting epsilon to 1.0
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Episode: 4000, Win/Loss Ratio: 1.31, Epsilon: 1.000, smartmove: 0.6666666666666666
Wins: 1854, Losses: 1417, Draws: 730
Model saved at episode 4000
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 4
Player 1 (



Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 7
Episode: 5000, Win/Loss Ratio: 1.05, Epsilon: 0.010, smartmove: 0.8333333333333334
Wins: 2030, Losses: 1933, Draws: 1038
Model saved at episode 5000
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses po



Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 3
Resetting epsilon to 1.0
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 1
Episode: 6000, Win/Loss Ratio: 1.05, Epsilon: 1.000, smartmove: 1
Wins: 2191, Losses: 2091, Draws: 1719
Model saved at episode 6000
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) ch



Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 2
Episode: 7000, Win/Loss Ratio: 0.83, Epsilon: 0.110, smartmove: 1
Wins: 2230, Losses: 2672, Draws: 2099
Model saved at episode 7000
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 2
Player 1



Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 2
Resetting epsilon to 1.0
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 1
Episode: 8000, Win/Loss Ratio: 0.83, Epsilon: 1.000, smartmove: 1
Wins: 2299, Losses: 2766, Draws: 2936
Model saved at episode 8000
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) ch



Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Episode: 9000, Win/Loss Ratio: 0.70, Epsilon: 0.110, smartmove: 1
Wins: 2344, Losses: 3334, Draws: 3323
Model saved at episode 9000
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 2
Player 1



Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 2
