In [1]:
import random
from TicTacToe import TicTacToe

def validBoard(game):
    x_count = game.board.count(1)
    o_count = game.board.count(2)
    
    
    if not (x_count == o_count or x_count == o_count + 1):
        return False
    
    
    if game.check_winner(1) and game.check_winner(2):
        return False
    
    # Check if O wins but X has more or equal moves
    if game.check_winner(2) and x_count != o_count:
        return False
    
    # Check if X wins but O has equal moves
    if game.check_winner(1) and x_count != o_count + 1:
        return False
    
    return True

# Example usage
game = TicTacToe()
game.board = [1, 2, 1, 2, 1, 2, 1, 0, 0]  # Example board state
print(validBoard(game))  # Output: True or False based on the board state


True


In [13]:
import numpy as np
import random
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from TicTacToe import TicTacToe

class SQNAgent:
    def __init__(self, state_size=9, action_size=9, gamma=0.95, learning_rate=0.001, batch_size=32, epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.999, replay_buffer_size=10000):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.replay_buffer = deque(maxlen=replay_buffer_size)
        
        self.model = Sequential([
            Dense(64, input_dim=self.state_size, activation='relu'),
            Dense(64, activation='relu'),
            Dense(self.action_size, activation='linear')
        ])
        self.model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))

    def select_action(self, state, valid_actions):
        if np.random.rand() < self.epsilon:
            return random.choice(valid_actions)
        q_values = self.model.predict(state.reshape(1, -1), verbose=0)[0]
        q_values = [q_values[i] if i in valid_actions else float('-inf') for i in range(self.action_size)]
        return np.argmax(q_values)

    def store_experience(self, experience):
        self.replay_buffer.append(experience)

    def train_from_experience(self):
        if len(self.replay_buffer) < self.batch_size:
            return
        mini_batch = random.sample(self.replay_buffer, self.batch_size)
        
        states = np.zeros((self.batch_size, self.state_size))
        q_values_batch = np.zeros((self.batch_size, self.action_size))

        for i, (state, action, reward, next_state, done) in enumerate(mini_batch):
            q_values = self.model.predict(state.reshape(1, -1), verbose=0)[0]
            if done:
                q_target = reward
            else:
                next_q_values = self.model.predict(next_state.reshape(1, -1), verbose=0)[0]
                valid_actions = TicTacToe().empty_positions()
                max_next_q = max(next_q_values[a] for a in valid_actions) if valid_actions else 0
                q_target = reward + self.gamma * max_next_q
            
            q_values[action] = q_target
            states[i] = state
            q_values_batch[i] = q_values

        self.model.fit(states, q_values_batch, epochs=1, verbose=0)
        
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def save_model(self, filepath):
        self.model.save(filepath)

def train_sqn_agent(num_episodes=1000, save_interval=200, model_path="trained_sqn_model.h5"):
    agent = SQNAgent()
    loss_count = 0
    win_count = 0
    draw_count = 0

    for episode in range(num_episodes):
        # Linearly scale smartMovePlayer1_prob and epsilon based on progress in episodes
        smartMovePlayer1_prob = min(1.0, episode / num_episodes)  # Increase from 0 to 1 over the episodes
        agent.epsilon = max(agent.epsilon_min, 1.0 - (episode / num_episodes) * (1.0 - agent.epsilon_min))  # Decay from 1.0 to epsilon_min

        game = TicTacToe(smartMovePlayer1=smartMovePlayer1_prob)
        state = np.array(game.board)
        done = False
        
        if episode % 10 == 0:
            print(f"Episode: {episode}, Wins: {win_count}, Losses: {loss_count}, Draws: {draw_count}, "
                  f"SmartMoveProb: {smartMovePlayer1_prob:.2f}, Epsilon: {agent.epsilon:.4f}")

        while not done:
            valid_actions = game.empty_positions()
            action = agent.select_action(state, valid_actions)
            
            game.player1_move()
            next_state = np.array(game.board)
            done = game.current_winner is not None or game.is_full()
            reward = game.get_reward() if done else 0
            
            agent.store_experience((state, action, reward, next_state, done))
            state = next_state
            
            if done:
                if game.current_winner == 1:
                    loss_count += 1
                elif game.is_full():
                    draw_count += 1
                break

            game.make_move(action, player=2)
            done = game.current_winner is not None or game.is_full()
            state = np.array(game.board)
            
            if game.current_winner == 2:
                win_count += 1
                done = True

        agent.train_from_experience()

        if episode % save_interval == 0:
            agent.save_model(f"{model_path.split('.')[0]}_episode_{episode}.h5")

    agent.save_model(model_path)
    print("Training complete and model saved")
    return agent

train_sqn_agent()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Episode: 0, Wins: 0, Losses: 0, Draws: 0, SmartMoveProb: 0.00, Epsilon: 1.0000
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Ra



Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 6
Player 1 (S



Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 9
Player 1 (S



Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 6
Player 1 (S



Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 9
Player 1 (S



Training complete and model saved


<__main__.SQNAgent at 0x34e501b80>

In [29]:
import numpy as np
import random
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from TicTacToe import TicTacToe

class SQNAgent:
    def __init__(self, state_size=9, action_size=9, gamma=0.95, learning_rate=0.001, batch_size=32, epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.999, replay_buffer_size=10000):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.replay_buffer = deque(maxlen=replay_buffer_size)
        
        self.model = Sequential([
            Dense(64, input_dim=self.state_size, activation='relu'),
            Dense(64, activation='relu'),
            Dense(self.action_size, activation='linear')
        ])
        self.model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))

    def select_action(self, state, valid_actions):
        processed_state = self.process_state(state)
        if np.random.rand() < self.epsilon:
            return random.choice(valid_actions)
        
        for action in valid_actions:
            temp_state = processed_state.copy()
            temp_state[action] = 1
            if self.check_winner(temp_state, 1):
                return action

        for action in valid_actions:
            temp_state = processed_state.copy()
            temp_state[action] = -1
            if self.check_winner(temp_state, -1):
                return action

        q_values = self.model.predict(processed_state.reshape(1, -1), verbose=0)[0]
        q_values = [q_values[i] if i in valid_actions else float('-inf') for i in range(self.action_size)]
        return np.argmax(q_values)

    def store_experience(self, experience):
        self.replay_buffer.append(experience)

    def train_from_experience(self):
        if len(self.replay_buffer) < self.batch_size:
            return
        mini_batch = random.sample(self.replay_buffer, self.batch_size)
        
        states = np.zeros((self.batch_size, self.state_size))
        q_values_batch = np.zeros((self.batch_size, self.action_size))

        for i, (state, action, reward, next_state, done) in enumerate(mini_batch):
            state = self.process_state(state)
            next_state = self.process_state(next_state)
            
            q_values = self.model.predict(state.reshape(1, -1), verbose=0)[0]
            if done:
                q_target = reward
            else:
                next_q_values = self.model.predict(next_state.reshape(1, -1), verbose=0)[0]
                valid_actions = TicTacToe().empty_positions()
                max_next_q = max(next_q_values[a] for a in valid_actions) if valid_actions else 0
                q_target = reward + self.gamma * max_next_q
            
            q_values[action] = q_target
            states[i] = state
            q_values_batch[i] = q_values

        self.model.fit(states, q_values_batch, epochs=2, verbose=0)
        
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def process_state(self, state):
        return np.array([-1 if x == 1 else 1 if x == 2 else 0 for x in state])

    def check_winner(self, board, player):
        win_conditions = [
            [0, 1, 2], [3, 4, 5], [6, 7, 8],
            [0, 3, 6], [1, 4, 7], [2, 5, 8],
            [0, 4, 8], [2, 4, 6]
        ]
        for condition in win_conditions:
            if all(board[i] == player for i in condition):
                return True
        return False

    def save_model(self, filepath):
        self.model.save(filepath)

def train_sqn_agent(num_episodes=1000, save_interval=200, model_path="trained_sqn_model_action.h5"):
    agent = SQNAgent()
    loss_count = 0
    win_count = 0
    draw_count = 0

    for episode in range(num_episodes):
        smartMovePlayer1_prob = min(1.0, episode / num_episodes)
        agent.epsilon = max(agent.epsilon_min, 1.0 - (episode / num_episodes) * (1.0 - agent.epsilon_min))

        game = TicTacToe(smartMovePlayer1=smartMovePlayer1_prob)
        state = np.array(game.board)
        done = False
        
        if episode % 10 == 0:
            print(f"Episode: {episode}, Wins: {win_count}, Losses: {loss_count}, Draws: {draw_count}, "
                  f"SmartMoveProb: {smartMovePlayer1_prob:.2f}, Epsilon: {agent.epsilon:.4f}")

        while not done:
            valid_actions = game.empty_positions()
            action = agent.select_action(state, valid_actions)
            
            game.player1_move()
            next_state = np.array(game.board)
            done = game.current_winner is not None or game.is_full()
            reward = game.get_reward() if done else 0
            
            agent.store_experience((state, action, reward, next_state, done))
            state = next_state
            
            if done:
                if game.current_winner == 1:
                    loss_count += 1
                elif game.is_full():
                    draw_count += 1
                break

            game.make_move(action, player=2)
            done = game.current_winner is not None or game.is_full()
            state = np.array(game.board)
            
            if game.current_winner == 2:
                win_count += 1
                reward = game.get_reward()
                done = True
            elif game.current_winner == 1:
                reward = game.get_reward()
            elif game.is_full():
                reward = 0

            agent.store_experience((state, action, reward, next_state, done))

        agent.train_from_experience()

        if episode % save_interval == 0:
            agent.save_model(f"{model_path.split('.')[0]}_episode_{episode}.h5")

    agent.save_model(model_path)
    print("Training complete and model saved")
    return agent

train_sqn_agent()




Episode: 0, Wins: 0, Losses: 0, Draws: 0, SmartMoveProb: 0.00, Epsilon: 1.0000
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Ra

KeyboardInterrupt: 

In [None]:
import numpy as np
import random
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from TicTacToe import TicTacToe

class SQNAgent:
    def __init__(self, state_size=9, action_size=9, gamma=0.99, learning_rate=0.001, batch_size=64, epsilon=1.0, epsilon_min=0.05, epsilon_decay=0.995, replay_buffer_size=20000):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.replay_buffer = deque(maxlen=replay_buffer_size)
        
        self.model = Sequential([
            Dense(128, input_dim=self.state_size, activation='relu'),
            Dense(128, activation='relu'),
            Dense(self.action_size, activation='linear')
        ])
        self.model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))

    def select_action(self, state, valid_actions):
        processed_state = self.process_state(state)
        
        for action in valid_actions:
            temp_state = processed_state.copy()
            temp_state[action] = 1
            if self.check_winning_move(temp_state):
                return action

        for action in valid_actions:
            temp_state = processed_state.copy()
            temp_state[action] = -1
            if self.check_winning_move(temp_state):
                return action

        if len(valid_actions) == 8:  
            if state[4] == 1:
                corners = [pos for pos in valid_actions if pos in [0, 2, 6, 8]]
                if corners:
                    return random.choice(corners)
            if state[0] == 1 or state[2] == 1 or state[6] == 1 or state[8] == 1:
                if 4 in valid_actions:
                    return 4

        if len(valid_actions) == 6:
            opposite_corners = {0: 8, 2: 6, 6: 2, 8: 0}
            for corner in [0, 2, 6, 8]:
                if state[corner] == 1 and opposite_corners[corner] in valid_actions:
                    return opposite_corners[corner]

        if np.random.rand() < self.epsilon:
            strategic_moves = []
            if 4 in valid_actions:
                strategic_moves.append(4)
            corners = [pos for pos in valid_actions if pos in [0, 2, 6, 8]]
            strategic_moves.extend(corners)
            edges = [pos for pos in valid_actions if pos in [1, 3, 5, 7]]
            strategic_moves.extend(edges)
            
            return strategic_moves[0] if strategic_moves else random.choice(valid_actions)
        
        q_values = self.model.predict(processed_state.reshape(1, -1), verbose=0)[0]
        valid_q_values = [(q_values[action], action) for action in valid_actions]
        return max(valid_q_values, key=lambda x: x[0])[1]

    def store_experience(self, experience):
        self.replay_buffer.append(experience)

    def train_from_experience(self):
        if len(self.replay_buffer) < self.batch_size:
            return
            
        mini_batch = random.sample(self.replay_buffer, self.batch_size)
        states = np.array([self.process_state(exp[0]) for exp in mini_batch])
        next_states = np.array([self.process_state(exp[3]) for exp in mini_batch])
        
        current_q_values = self.model.predict(states, verbose=0)
        next_q_values = self.model.predict(next_states, verbose=0)
        
        x = []
        y = []
        
        for i, (state, action, reward, next_state, done) in enumerate(mini_batch):
            target = reward if done else reward + self.gamma * np.max(next_q_values[i])
            current_q = current_q_values[i].copy()
            current_q[action] = target
            x.append(self.process_state(state))
            y.append(current_q)
        
        self.model.fit(np.array(x), np.array(y), batch_size=self.batch_size, epochs=1, verbose=0)
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def process_state(self, state):
        return np.array([1 if x == 2 else -1 if x == 1 else 0 for x in state])

    def check_winning_move(self, board):
        win_conditions = [
            [0, 1, 2], [3, 4, 5], [6, 7, 8],
            [0, 3, 6], [1, 4, 7], [2, 5, 8],
            [0, 4, 8], [2, 4, 6]
        ]
        return any(sum(board[i] for i in condition) == 3 for condition in win_conditions)

    def save_model(self, filepath):
        self.model.save(filepath)

def train_sqn_agent(num_episodes=5000, save_interval=500, model_path="trained_sqn_model_v3.h5"):
    agent = SQNAgent()
    win_count = draw_count = loss_count = 0
    last_100_games = deque(maxlen=100)

    for episode in range(num_episodes):
        smartMovePlayer1_prob = min(0.9, episode / (num_episodes * 0.5))
        game = TicTacToe(smartMovePlayer1=smartMovePlayer1_prob)
        state = np.array(game.board)
        done = False

        game.player1_move()
        state = np.array(game.board)

        while not done:
            valid_actions = game.empty_positions()
            action = agent.select_action(state, valid_actions)
            
            game.make_move(action, player=2)
            reward = 0
            
            if game.current_winner == 2:
                reward = 2.0
                done = True
            elif game.is_full():
                reward = 0.5
                done = True
            else:
                game.player1_move()
                if game.current_winner == 1:
                    reward = -2.0
                    done = True
                elif game.is_full():
                    reward = 0.5
                    done = True

            next_state = np.array(game.board)
            agent.store_experience((state, action, reward, next_state, done))
            state = next_state

            if done:
                if game.current_winner == 2:
                    win_count += 1
                    last_100_games.append(1)
                elif game.current_winner == 1:
                    loss_count += 1
                    last_100_games.append(-1)
                else:
                    draw_count += 1
                    last_100_games.append(0)

        agent.train_from_experience()

        if episode % 100 == 0:
            win_rate = sum(1 for x in last_100_games if x == 1) / len(last_100_games) if last_100_games else 0
            draw_rate = sum(1 for x in last_100_games if x == 0) / len(last_100_games) if last_100_games else 0
            print(f"Episode: {episode}, Win Rate: {win_rate:.2f}, Draw Rate: {draw_rate:.2f}, Wins: {win_count}, Draws: {draw_count}")

        if episode % save_interval == 0:
            agent.save_model(f"{model_path.split('.')[0]}_episode_{episode}.h5")

    agent.save_model(model_path)
    return agent

train_sqn_agent()



Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 8
Episode: 0, Win Rate: 1.00, Draw Rate: 0.00, Wins: 1, Draws: 0
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses po



Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 4
Episode: 500, Win Rate: 0.57, Draw Rate: 0.06, Wins: 252, Draws: 28
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) choos



Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 6
Episode: 1000, Win Rate: 0.44, Draw Rate: 0.08, Wins: 465, Draws: 57
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) choo



Episode: 1500, Win Rate: 0.30, Draw Rate: 0.06, Wins: 615, Draws: 84
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) choo



Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 2
Episode: 2000, Win Rate: 0.17, Draw Rate: 0.03, Wins: 724, Draws: 103
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) cho



Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 8
Episode: 2500, Win Rate: 0.09, Draw Rate: 0.04, Wins: 789, Draws: 123
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) cho



Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 3
Episode: 3000, Win Rate: 0.13, Draw Rate: 0.09, Wins: 852, Draws: 150
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) cho



Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 9
Episode: 3500, Win Rate: 0.18, Draw Rate: 0.06, Wins: 913, Draws: 179
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) cho



Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 8
Episode: 4000, Win Rate: 0.12, Draw Rate: 0.07, Wins: 967, Draws: 213
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) cho



Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 7
Episode: 4500, Win Rate: 0.10, Draw Rate: 0.08, Wins: 1023, Draws: 238
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) ch



Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 7


<__main__.SQNAgent at 0x17eb40bc0>

In [24]:
import numpy as np
import random
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from TicTacToe import TicTacToe

class SQNAgent:
    def __init__(self, state_size=9, action_size=9, gamma=0.99, learning_rate=0.001, batch_size=64, epsilon=1.0, epsilon_min=0.05, epsilon_decay=0.995, replay_buffer_size=20000):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.replay_buffer = deque(maxlen=replay_buffer_size)
        
        self.model = Sequential([
            Dense(64, input_dim=self.state_size, activation='relu'),
            Dense(64, activation='relu'),
            Dense(64, activation='relu'),
            Dense(self.action_size, activation='linear')
        ])
        self.model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))

    def select_action(self, state, valid_actions):
        processed_state = self.process_state(state)
        
        for action in valid_actions:
            temp_state = processed_state.copy()
            temp_state[action] = 1
            if self.check_winning_move(temp_state):
                return action

        for action in valid_actions:
            temp_state = processed_state.copy()
            temp_state[action] = -1
            if self.check_winning_move(temp_state):
                return action

        if len(valid_actions) == 8:  
            if state[4] == 1:
                corners = [pos for pos in valid_actions if pos in [0, 2, 6, 8]]
                if corners:
                    return random.choice(corners)
            if state[0] == 1 or state[2] == 1 or state[6] == 1 or state[8] == 1:
                if 4 in valid_actions:
                    return 4

        if len(valid_actions) == 6:
            opposite_corners = {0: 8, 2: 6, 6: 2, 8: 0}
            for corner in [0, 2, 6, 8]:
                if state[corner] == 1 and opposite_corners[corner] in valid_actions:
                    return opposite_corners[corner]

        if np.random.rand() < self.epsilon:
            strategic_moves = []
            if 4 in valid_actions:
                strategic_moves.append(4)
            corners = [pos for pos in valid_actions if pos in [0, 2, 6, 8]]
            strategic_moves.extend(corners)
            edges = [pos for pos in valid_actions if pos in [1, 3, 5, 7]]
            strategic_moves.extend(edges)
            
            return strategic_moves[0] if strategic_moves else random.choice(valid_actions)
        
        q_values = self.model.predict(processed_state.reshape(1, -1), verbose=0)[0]
        valid_q_values = [(q_values[action], action) for action in valid_actions]
        return max(valid_q_values, key=lambda x: x[0])[1]

    def store_experience(self, experience):
        self.replay_buffer.append(experience)

    def train_from_experience(self):
        if len(self.replay_buffer) < self.batch_size:
            return
            
        mini_batch = random.sample(self.replay_buffer, self.batch_size)
        states = np.array([self.process_state(exp[0]) for exp in mini_batch])
        next_states = np.array([self.process_state(exp[3]) for exp in mini_batch])
        
        current_q_values = self.model.predict(states, verbose=0)
        next_q_values = self.model.predict(next_states, verbose=0)
        
        x = []
        y = []
        
        for i, (state, action, reward, next_state, done) in enumerate(mini_batch):
            target = reward if done else reward + self.gamma * np.max(next_q_values[i])
            current_q = current_q_values[i].copy()
            current_q[action] = target
            x.append(self.process_state(state))
            y.append(current_q)
        
        self.model.fit(np.array(x), np.array(y), batch_size=self.batch_size, epochs=1, verbose=0)
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def process_state(self, state):
        return np.array([1 if x == 2 else -1 if x == 1 else 0 for x in state])

    def check_winning_move(self, board):
        win_conditions = [
            [0, 1, 2], [3, 4, 5], [6, 7, 8],
            [0, 3, 6], [1, 4, 7], [2, 5, 8],
            [0, 4, 8], [2, 4, 6]
        ]
        return any(sum(board[i] for i in condition) == 3 for condition in win_conditions)

    def save_model(self, filepath):
        self.model.save(filepath)

def train_sqn_agent(num_episodes=5000, save_interval=500, model_path="trained_sqn_model_v4.h5"):
    agent = SQNAgent()
    win_count = draw_count = loss_count = 0
    last_100_games = deque(maxlen=100)

    for episode in range(num_episodes):
        smartMovePlayer1_prob = min(0.9, episode / (num_episodes * 0.5))
        game = TicTacToe(smartMovePlayer1=smartMovePlayer1_prob)
        state = np.array(game.board)
        done = False

        game.player1_move()
        state = np.array(game.board)

        while not done:
            valid_actions = game.empty_positions()
            action = agent.select_action(state, valid_actions)
            
            game.make_move(action, player=2)
            reward = 0
            
            if game.current_winner == 2:
                reward = 2.0
                done = True
            elif game.is_full():
                reward = 0.5
                done = True
            else:
                game.player1_move()
                if game.current_winner == 1:
                    reward = -2.0
                    done = True
                elif game.is_full():
                    reward = 0.5
                    done = True

            next_state = np.array(game.board)
            agent.store_experience((state, action, reward, next_state, done))
            state = next_state

            if done:
                if game.current_winner == 2:
                    win_count += 1
                    last_100_games.append(1)
                elif game.current_winner == 1:
                    loss_count += 1
                    last_100_games.append(-1)
                else:
                    draw_count += 1
                    last_100_games.append(0)

        agent.train_from_experience()

        if episode % 100 == 0:
            win_rate = sum(1 for x in last_100_games if x == 1) / len(last_100_games) if last_100_games else 0
            draw_rate = sum(1 for x in last_100_games if x == 0) / len(last_100_games) if last_100_games else 0
            print(f"Episode: {episode}, Win Rate: {win_rate:.2f}, Draw Rate: {draw_rate:.2f}, Wins: {win_count}, Draws: {draw_count}")

        if episode % save_interval == 0:
            agent.epsilon = 1.0  # Reset epsilon after every save_interval episodes
            agent.save_model(f"{model_path.split('.')[0]}_episode_{episode}.h5")

    agent.save_model(model_path)
    return agent

train_sqn_agent()




Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 4
Episode: 0, Win Rate: 0.00, Draw Rate: 0.00, Wins: 0, Draws: 0
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses po



Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 4
Episode: 500, Win Rate: 0.49, Draw Rate: 0.12, Wins: 269, Draws: 33
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) choos



Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 4
Episode: 1000, Win Rate: 0.39, Draw Rate: 0.07, Wins: 475, Draws: 64
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) choo



Episode: 1500, Win Rate: 0.32, Draw Rate: 0.06, Wins: 626, Draws: 81
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) choo



Episode: 2000, Win Rate: 0.15, Draw Rate: 0.07, Wins: 739, Draws: 105
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) cho



Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 3
Episode: 2500, Win Rate: 0.10, Draw Rate: 0.05, Wins: 796, Draws: 136
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) cho



Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 7
Episode: 3000, Win Rate: 0.09, Draw Rate: 0.10, Wins: 857, Draws: 164
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) cho



Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 8
Episode: 3500, Win Rate: 0.10, Draw Rate: 0.04, Wins: 911, Draws: 182
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) cho



Episode: 4000, Win Rate: 0.13, Draw Rate: 0.06, Wins: 966, Draws: 204
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) cho



Episode: 4500, Win Rate: 0.13, Draw Rate: 0.07, Wins: 1018, Draws: 224
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) ch



Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 7


<__main__.SQNAgent at 0x17980ce30>

In [27]:
import numpy as np
import random
from collections import deque
from tensorflow.keras.models import Sequential, clone_model
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from TicTacToe import TicTacToe

class SQNAgentV5:
    def __init__(self, state_size=9, action_size=9, gamma=0.99, learning_rate=0.001, batch_size=64, 
                 epsilon=1.0, epsilon_min=0.05, epsilon_decay=0.997, replay_buffer_size=20000):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.replay_buffer = deque(maxlen=replay_buffer_size)
        
        # Main network
        self.model = Sequential([
            Dense(64, input_dim=self.state_size, activation='relu'),
            Dense(64, activation='relu'),
            Dense(64, activation='relu'),
            Dense(self.action_size, activation='linear')
        ])
        self.model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        
        # Target network
        self.target_model = clone_model(self.model)
        self.target_model.set_weights(self.model.get_weights())

    def update_target_model(self):
        """Update target model weights from main model."""
        self.target_model.set_weights(self.model.get_weights())

    def select_action(self, state, valid_actions):
        processed_state = self.process_state(state)
        
        if np.random.rand() < self.epsilon:
            strategic_moves = []
            if 4 in valid_actions:
                strategic_moves.append(4)
            corners = [pos for pos in valid_actions if pos in [0, 2, 6, 8]]
            strategic_moves.extend(corners)
            edges = [pos for pos in valid_actions if pos in [1, 3, 5, 7]]
            strategic_moves.extend(edges)
            
            return strategic_moves[0] if strategic_moves else random.choice(valid_actions)
        
        q_values = self.model.predict(processed_state.reshape(1, -1), verbose=0)[0]
        valid_q_values = [(q_values[action], action) for action in valid_actions]
        return max(valid_q_values, key=lambda x: x[0])[1]

    def store_experience(self, experience):
        self.replay_buffer.append(experience)

    def train_from_experience(self):
        if len(self.replay_buffer) < self.batch_size:
            return
        
        # Filter prioritized experiences
        prioritized_experiences = [exp for exp in self.replay_buffer if exp[2] < 0 or exp[2] == 2.0]
        
        # Ensure enough samples exist for both prioritized and general experiences
        num_prioritized = min(len(prioritized_experiences), self.batch_size // 2)
        num_regular = self.batch_size - num_prioritized
        
        # Randomly sample prioritized experiences and fill the rest with regular experiences
        sample_batch = random.sample(prioritized_experiences, num_prioritized) + \
                       random.sample(self.replay_buffer, num_regular)
        
        states = np.array([self.process_state(exp[0]) for exp in sample_batch])
        next_states = np.array([self.process_state(exp[3]) for exp in sample_batch])
        
        current_q_values = self.model.predict(states, verbose=0)
        next_q_values = self.target_model.predict(next_states, verbose=0)
        
        x = []
        y = []
        
        for i, (state, action, reward, next_state, done) in enumerate(sample_batch):
            target = reward if done else reward + self.gamma * np.max(next_q_values[i])
            current_q = current_q_values[i].copy()
            current_q[action] = target
            x.append(self.process_state(state))
            y.append(current_q)
        
        # Train model with adjusted epochs
        self.model.fit(np.array(x), np.array(y), batch_size=self.batch_size, epochs=2, verbose=0)
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)


    def process_state(self, state):
        return np.array([1 if x == 2 else -1 if x == 1 else 0 for x in state])

    def save_model(self, filepath):
        self.model.save(filepath)

def train_sqn_agent_v5(num_episodes=5000, save_interval=500, model_path="trained_sqn_model_v5.h5"):
    agent = SQNAgentV5()
    win_count = draw_count = loss_count = 0
    last_100_games = deque(maxlen=100)

    for episode in range(num_episodes):
        smartMovePlayer1_prob = min(0.9, episode / (num_episodes * 0.5))
        game = TicTacToe(smartMovePlayer1=smartMovePlayer1_prob)
        state = np.array(game.board)
        done = False

        game.player1_move()
        state = np.array(game.board)

        while not done:
            valid_actions = game.empty_positions()
            action = agent.select_action(state, valid_actions)
            
            game.make_move(action, player=2)
            reward = 0
            
            if game.current_winner == 2:
                reward = 2.0
                done = True
            elif game.is_full():
                reward = 0.5
                done = True
            else:
                game.player1_move()
                if game.current_winner == 1:
                    reward = -2.5  # Increased penalty for loss
                    done = True
                elif game.is_full():
                    reward = 0.5
                    done = True

            next_state = np.array(game.board)
            agent.store_experience((state, action, reward, next_state, done))
            state = next_state

            if done:
                if game.current_winner == 2:
                    win_count += 1
                    last_100_games.append(1)
                elif game.current_winner == 1:
                    loss_count += 1
                    last_100_games.append(-1)
                else:
                    draw_count += 1
                    last_100_games.append(0)

        agent.train_from_experience()

        if episode % 10 == 0:  # Update target model every 10 episodes
            agent.update_target_model()

        if episode % 100 == 0:
            win_rate = sum(1 for x in last_100_games if x == 1) / len(last_100_games) if last_100_games else 0
            draw_rate = sum(1 for x in last_100_games if x == 0) / len(last_100_games) if last_100_games else 0
            print(f"Episode: {episode}, Win Rate: {win_rate:.2f}, Draw Rate: {draw_rate:.2f}, Wins: {win_count}, Draws: {draw_count}")

        if episode % save_interval == 0:
            agent.epsilon = 1.0  # Reset epsilon every save_interval episodes
            agent.save_model(f"{model_path.split('.')[0]}_episode_{episode}.h5")

    agent.save_model(model_path)
    return agent

train_sqn_agent_v5()




Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 9
Episode: 0, Win Rate: 1.00, Draw Rate: 0.00, Wins: 1, Draws: 0
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses po



Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 8
Episode: 500, Win Rate: 0.58, Draw Rate: 0.11, Wins: 284, Draws: 38
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) choos



Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 9
Episode: 1000, Win Rate: 0.31, Draw Rate: 0.25, Wins: 496, Draws: 114
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) cho



Episode: 1500, Win Rate: 0.29, Draw Rate: 0.31, Wins: 668, Draws: 211
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) cho



Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 8
Episode: 2000, Win Rate: 0.23, Draw Rate: 0.45, Wins: 785, Draws: 342
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) cho



Player 1 (Smart/Random) chooses position 9
Episode: 2500, Win Rate: 0.17, Draw Rate: 0.48, Wins: 855, Draws: 490
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) cho



Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 8
Episode: 3000, Win Rate: 0.12, Draw Rate: 0.59, Wins: 920, Draws: 652
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) cho



Episode: 3500, Win Rate: 0.06, Draw Rate: 0.45, Wins: 972, Draws: 797
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) cho



Episode: 4000, Win Rate: 0.13, Draw Rate: 0.44, Wins: 1034, Draws: 938
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) ch



Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Episode: 4500, Win Rate: 0.07, Draw Rate: 0.57, Wins: 1082, Draws: 1120
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) c



Player 1 (Smart/Random) chooses position 1


<__main__.SQNAgentV5 at 0x309d3f530>

In [31]:
import numpy as np
import random
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from TicTacToe import TicTacToe

class SQNAgent:
    def __init__(self, state_size=9, action_size=9, gamma=0.99, learning_rate=0.001, batch_size=64, epsilon=1.0, epsilon_min=0.05, epsilon_decay=0.995, replay_buffer_size=20000):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.replay_buffer = deque(maxlen=replay_buffer_size)
        
        self.model = Sequential([
            Dense(64, input_dim=self.state_size, activation='relu'),
            Dense(64, activation='relu'),
            Dense(64, activation='relu'),
            Dense(64, activation='relu'),
            Dense(self.action_size, activation='linear')
        ])
        self.model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))

    def select_action(self, state, valid_actions):
        processed_state = self.process_state(state)
        
        for action in valid_actions:
            temp_state = processed_state.copy()
            temp_state[action] = 1
            if self.check_winning_move(temp_state):
                return action

        for action in valid_actions:
            temp_state = processed_state.copy()
            temp_state[action] = -1
            if self.check_winning_move(temp_state):
                return action

        if len(valid_actions) == 8:  
            if state[4] == 1:
                corners = [pos for pos in valid_actions if pos in [0, 2, 6, 8]]
                if corners:
                    return random.choice(corners)
            if state[0] == 1 or state[2] == 1 or state[6] == 1 or state[8] == 1:
                if 4 in valid_actions:
                    return 4

        if len(valid_actions) == 6:
            opposite_corners = {0: 8, 2: 6, 6: 2, 8: 0}
            for corner in [0, 2, 6, 8]:
                if state[corner] == 1 and opposite_corners[corner] in valid_actions:
                    return opposite_corners[corner]

        if np.random.rand() < self.epsilon:
            strategic_moves = []
            if 4 in valid_actions:
                strategic_moves.append(4)
            corners = [pos for pos in valid_actions if pos in [0, 2, 6, 8]]
            strategic_moves.extend(corners)
            edges = [pos for pos in valid_actions if pos in [1, 3, 5, 7]]
            strategic_moves.extend(edges)
            
            return strategic_moves[0] if strategic_moves else random.choice(valid_actions)
        
        q_values = self.model.predict(processed_state.reshape(1, -1), verbose=0)[0]
        valid_q_values = [(q_values[action], action) for action in valid_actions]
        return max(valid_q_values, key=lambda x: x[0])[1]

    def store_experience(self, experience):
        self.replay_buffer.append(experience)

    def train_from_experience(self):
        if len(self.replay_buffer) < self.batch_size:
            return
            
        mini_batch = random.sample(self.replay_buffer, self.batch_size)
        states = np.array([self.process_state(exp[0]) for exp in mini_batch])
        next_states = np.array([self.process_state(exp[3]) for exp in mini_batch])
        
        current_q_values = self.model.predict(states, verbose=0)
        next_q_values = self.model.predict(next_states, verbose=0)
        
        x = []
        y = []
        
        for i, (state, action, reward, next_state, done) in enumerate(mini_batch):
            target = reward if done else reward + self.gamma * np.max(next_q_values[i])
            current_q = current_q_values[i].copy()
            current_q[action] = target
            x.append(self.process_state(state))
            y.append(current_q)
        
        self.model.fit(np.array(x), np.array(y), batch_size=self.batch_size, epochs=1, verbose=0)
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def process_state(self, state):
        return np.array([1 if x == 2 else -1 if x == 1 else 0 for x in state])

    def check_winning_move(self, board):
        win_conditions = [
            [0, 1, 2], [3, 4, 5], [6, 7, 8],
            [0, 3, 6], [1, 4, 7], [2, 5, 8],
            [0, 4, 8], [2, 4, 6]
        ]
        return any(sum(board[i] for i in condition) == 3 for condition in win_conditions)

    def save_model(self, filepath):
        self.model.save(filepath)

def train_sqn_agent(num_episodes=5000, save_interval=500, model_path="trained_sqn_model_v7.h5"):
    agent = SQNAgent()
    win_count = draw_count = loss_count = 0
    last_100_games = deque(maxlen=100)

    for episode in range(num_episodes):
        smartMovePlayer1_prob = min(0.9, episode / (num_episodes * 0.5))
        game = TicTacToe(smartMovePlayer1=smartMovePlayer1_prob)
        state = np.array(game.board)
        done = False

        game.player1_move()
        state = np.array(game.board)

        while not done:
            valid_actions = game.empty_positions()
            action = agent.select_action(state, valid_actions)
            
            game.make_move(action, player=2)
            reward = 0
            
            if game.current_winner == 2:
                reward = 1
                done = True
            elif game.is_full():
                reward = 0  # Draw reward given only if board is full after Player 1's move
                done = True
            else:
                game.player1_move()
                if game.current_winner == 1:
                    reward = -1
                    done = True
                elif game.is_full():
                    reward = 0  # Draw reward when board is full after Player 1's move
                    done = True

            next_state = np.array(game.board)
            agent.store_experience((state, action, reward, next_state, done))
            state = next_state

            if done:
                if game.current_winner == 2:
                    win_count += 1
                    last_100_games.append(1)
                elif game.current_winner == 1:
                    loss_count += 1
                    last_100_games.append(-1)
                else:
                    draw_count += 1
                    last_100_games.append(0)

        agent.train_from_experience()

        if episode % 100 == 0:
            win_rate = sum(1 for x in last_100_games if x == 1) / len(last_100_games) if last_100_games else 0
            draw_rate = sum(1 for x in last_100_games if x == 0) / len(last_100_games) if last_100_games else 0
            print(f"Episode: {episode}, Win Rate: {win_rate:.2f}, Draw Rate: {draw_rate:.2f}, Wins: {win_count}, Draws: {draw_count}")

        if episode % save_interval == 0:
            agent.epsilon = 1.0  # Reset epsilon after every save_interval episodes
            agent.save_model(f"{model_path.split('.')[0]}_episode_{episode}.h5")

    agent.save_model(model_path)
    return agent

train_sqn_agent()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 6
Episode: 0, Win Rate: 1.00, Draw Rate: 0.00, Wins: 1, Draws: 0
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses po



Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 9
Episode: 500, Win Rate: 0.42, Draw Rate: 0.04, Wins: 270, Draws: 26
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) choos



Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 8
Episode: 1000, Win Rate: 0.35, Draw Rate: 0.06, Wins: 483, Draws: 59
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) choo



Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 7
Episode: 1500, Win Rate: 0.30, Draw Rate: 0.05, Wins: 652, Draws: 83
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) choo



Episode: 2000, Win Rate: 0.13, Draw Rate: 0.01, Wins: 752, Draws: 105
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) cho



Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 4
Episode: 2500, Win Rate: 0.08, Draw Rate: 0.03, Wins: 810, Draws: 129
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) cho



Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Episode: 3000, Win Rate: 0.05, Draw Rate: 0.07, Wins: 857, Draws: 155
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) cho



Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 8
Episode: 3500, Win Rate: 0.12, Draw Rate: 0.04, Wins: 906, Draws: 174
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) cho



Episode: 4000, Win Rate: 0.09, Draw Rate: 0.02, Wins: 950, Draws: 197
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) cho



Episode: 4500, Win Rate: 0.04, Draw Rate: 0.05, Wins: 994, Draws: 225
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) cho



Player 1 (Smart/Random) chooses position 8


<__main__.SQNAgent at 0x3049b81d0>