In [5]:
import numpy as np
import random
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from TicTacToe import TicTacToe

class SQNAgent:
    def __init__(self, state_size=9, action_size=9, gamma=0.95):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma  
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.9995
        self.batch_size = 32
        self.replay_buffer = deque(maxlen=10000)
        self.model = self._build_model()

    def _build_model(self):
        model = Sequential([
            Dense(64, input_dim=self.state_size, activation='relu'),
            Dense(64, activation='relu'),
            Dense(self.action_size, activation='linear')
        ])
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def process_state(self, state):
        """
        Converts the state from [0,1,2] format to [-1,0,1] format for better learning.
        """
        processed_state = np.array(state).copy()
        processed_state[processed_state == 0] = -1
        processed_state[processed_state == 1] = 0
        processed_state[processed_state == 2] = 1
        return processed_state

    def select_action(self, state, valid_actions):
        if not valid_actions:
            return None

        if random.random() < self.epsilon:
            return random.choice(valid_actions)
        
        processed_state = self.process_state(state)
        q_values = self.model.predict(processed_state.reshape(1, -1), verbose=0)[0]
        valid_q_values = [(q_values[action], action) for action in valid_actions]
        return max(valid_q_values, key=lambda x: x[0])[1]

    def store_experience(self, state, action, reward, next_state, done):
        self.replay_buffer.append((state, action, reward, next_state, done))

    def train(self):
        if len(self.replay_buffer) < self.batch_size:
            return

        mini_batch = random.sample(self.replay_buffer, self.batch_size)
        states = np.array([self.process_state(exp[0]) for exp in mini_batch])
        next_states = np.array([self.process_state(exp[3]) for exp in mini_batch])
        current_q_values = self.model.predict(states, verbose=0)
        next_q_values = self.model.predict(next_states, verbose=0)

        x = []
        y = []

        for i, (state, action, reward, next_state, done) in enumerate(mini_batch):
            if done:
                target = reward
            else:
                next_valid_actions = [j for j, val in enumerate(next_state) if val == 0]
                if next_valid_actions:
                    max_next_q = max(next_q_values[i][action] for action in next_valid_actions)
                    target = reward + self.gamma * max_next_q
                else:
                    target = reward
            
            current_q = current_q_values[i].copy()
            current_q[action] = target
            x.append(self.process_state(state))
            y.append(current_q)

        self.model.fit(np.array(x), np.array(y), batch_size=32, epochs=1, verbose=0)

def train_agent(episodes=10000):
    agent = SQNAgent()
    history = {'wins': 0, 'losses': 0, 'draws': 0}
    initial_epsilon = 1.0
    min_epsilon = 0.01
    
    
    for episode in range(episodes):
        agent.epsilon=min(min_epsilon,agent.epsilon*agent.epsilon_decay)
        
        smartness = min(1, episode / (episodes * 0.6))
        game = TicTacToe(smartMovePlayer1=smartness)
        state = np.array(game.board)
        
        game.player1_move()
        state = np.array(game.board)
        
        while True:
            valid_actions = game.empty_positions()
            if not valid_actions:
                history['draws'] += 1
                break
                
            action = agent.select_action(state, valid_actions)
            game.make_move(action, player=2)
            
            reward = 0
            if game.current_winner == 2:
                reward = 1.0
                history['wins'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            
            game.player1_move()
            if game.current_winner == 1:
                reward = -1.0
                history['losses'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            elif game.is_full():
                reward = 0.0
                history['draws'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            
            agent.store_experience(state, action, reward, game.board, False)
            state = np.array(game.board)
        
        agent.train()
        
        if episode % 100 == 0:
            win_rate = history['wins'] / (episode + 1)
            print(f"Episode: {episode}, Win Rate: {win_rate:.2f}, Epsilon: {agent.epsilon:.3f}, smartmove{smartness}")
            print(f"Wins: {history['wins']}, Losses: {history['losses']}, Draws: {history['draws']}")
        
        if episode % 1000 == 0:
            agent.model.save(f'model1_episode_{episode}.h5')
    agent.model.save('model1.h5')
agent = train_agent()





Episode: 0, Win Rate: 0.00, Epsilon: 0.010, smartmove0.0
Wins: 0, Losses: 1, Draws: 0
Episode: 100, Win Rate: 0.30, Epsilon: 0.010, smartmove0.016666666666666666
Wins: 30, Losses: 56, Draws: 15
Episode: 200, Win Rate: 0.29, Epsilon: 0.009, smartmove0.03333333333333333
Wins: 58, Losses: 106, Draws: 37
Episode: 300, Win Rate: 0.33, Epsilon: 0.009, smartmove0.05
Wins: 98, Losses: 155, Draws: 48
Episode: 400, Win Rate: 0.32, Epsilon: 0.008, smartmove0.06666666666666667
Wins: 130, Losses: 210, Draws: 61
Episode: 500, Win Rate: 0.34, Epsilon: 0.008, smartmove0.08333333333333333
Wins: 169, Losses: 257, Draws: 75
Episode: 600, Win Rate: 0.34, Epsilon: 0.007, smartmove0.1
Wins: 202, Losses: 315, Draws: 84
Episode: 700, Win Rate: 0.33, Epsilon: 0.007, smartmove0.11666666666666667
Wins: 232, Losses: 371, Draws: 98
Episode: 800, Win Rate: 0.33, Epsilon: 0.007, smartmove0.13333333333333333
Wins: 268, Losses: 431, Draws: 102
Episode: 900, Win Rate: 0.33, Epsilon: 0.006, smartmove0.15
Wins: 297, Loss



Episode: 1000, Win Rate: 0.32, Epsilon: 0.006, smartmove0.16666666666666666
Wins: 324, Losses: 542, Draws: 135
Episode: 1100, Win Rate: 0.32, Epsilon: 0.006, smartmove0.18333333333333332
Wins: 351, Losses: 600, Draws: 150
Episode: 1200, Win Rate: 0.32, Epsilon: 0.005, smartmove0.2
Wins: 384, Losses: 656, Draws: 161
Episode: 1300, Win Rate: 0.32, Epsilon: 0.005, smartmove0.21666666666666667
Wins: 410, Losses: 709, Draws: 182
Episode: 1400, Win Rate: 0.31, Epsilon: 0.005, smartmove0.23333333333333334
Wins: 428, Losses: 776, Draws: 197
Episode: 1500, Win Rate: 0.30, Epsilon: 0.005, smartmove0.25
Wins: 447, Losses: 837, Draws: 217
Episode: 1600, Win Rate: 0.30, Epsilon: 0.004, smartmove0.26666666666666666
Wins: 478, Losses: 888, Draws: 235
Episode: 1700, Win Rate: 0.30, Epsilon: 0.004, smartmove0.2833333333333333
Wins: 503, Losses: 944, Draws: 254
Episode: 1800, Win Rate: 0.30, Epsilon: 0.004, smartmove0.3
Wins: 533, Losses: 997, Draws: 271
Episode: 1900, Win Rate: 0.29, Epsilon: 0.004, sm



Episode: 2000, Win Rate: 0.28, Epsilon: 0.004, smartmove0.3333333333333333
Wins: 566, Losses: 1134, Draws: 301
Episode: 2100, Win Rate: 0.28, Epsilon: 0.003, smartmove0.35
Wins: 594, Losses: 1193, Draws: 314
Episode: 2200, Win Rate: 0.28, Epsilon: 0.003, smartmove0.36666666666666664
Wins: 617, Losses: 1262, Draws: 322
Episode: 2300, Win Rate: 0.28, Epsilon: 0.003, smartmove0.38333333333333336
Wins: 641, Losses: 1320, Draws: 340
Episode: 2400, Win Rate: 0.28, Epsilon: 0.003, smartmove0.4
Wins: 661, Losses: 1377, Draws: 363
Episode: 2500, Win Rate: 0.28, Epsilon: 0.003, smartmove0.4166666666666667
Wins: 690, Losses: 1433, Draws: 378
Episode: 2600, Win Rate: 0.27, Epsilon: 0.003, smartmove0.43333333333333335
Wins: 710, Losses: 1494, Draws: 397
Episode: 2700, Win Rate: 0.27, Epsilon: 0.003, smartmove0.45
Wins: 728, Losses: 1554, Draws: 419
Episode: 2800, Win Rate: 0.27, Epsilon: 0.002, smartmove0.4666666666666667
Wins: 747, Losses: 1611, Draws: 443
Episode: 2900, Win Rate: 0.27, Epsilon: 0



Episode: 3000, Win Rate: 0.27, Epsilon: 0.002, smartmove0.5
Wins: 799, Losses: 1732, Draws: 470
Episode: 3100, Win Rate: 0.27, Epsilon: 0.002, smartmove0.5166666666666667
Wins: 823, Losses: 1790, Draws: 488
Episode: 3200, Win Rate: 0.26, Epsilon: 0.002, smartmove0.5333333333333333
Wins: 845, Losses: 1849, Draws: 507
Episode: 3300, Win Rate: 0.26, Epsilon: 0.002, smartmove0.55
Wins: 861, Losses: 1901, Draws: 539
Episode: 3400, Win Rate: 0.26, Epsilon: 0.002, smartmove0.5666666666666667
Wins: 879, Losses: 1963, Draws: 559
Episode: 3500, Win Rate: 0.26, Epsilon: 0.002, smartmove0.5833333333333334
Wins: 896, Losses: 2018, Draws: 587
Episode: 3600, Win Rate: 0.25, Epsilon: 0.002, smartmove0.6
Wins: 910, Losses: 2084, Draws: 607
Episode: 3700, Win Rate: 0.25, Epsilon: 0.002, smartmove0.6166666666666667
Wins: 920, Losses: 2144, Draws: 637
Episode: 3800, Win Rate: 0.25, Epsilon: 0.001, smartmove0.6333333333333333
Wins: 940, Losses: 2201, Draws: 660
Episode: 3900, Win Rate: 0.24, Epsilon: 0.001



Episode: 4000, Win Rate: 0.24, Epsilon: 0.001, smartmove0.6666666666666666
Wins: 966, Losses: 2322, Draws: 713
Episode: 4100, Win Rate: 0.24, Epsilon: 0.001, smartmove0.6833333333333333
Wins: 983, Losses: 2378, Draws: 740
Episode: 4200, Win Rate: 0.24, Epsilon: 0.001, smartmove0.7
Wins: 995, Losses: 2438, Draws: 768
Episode: 4300, Win Rate: 0.23, Epsilon: 0.001, smartmove0.7166666666666667
Wins: 1009, Losses: 2495, Draws: 797
Episode: 4400, Win Rate: 0.23, Epsilon: 0.001, smartmove0.7333333333333333
Wins: 1026, Losses: 2543, Draws: 832
Episode: 4500, Win Rate: 0.23, Epsilon: 0.001, smartmove0.75
Wins: 1040, Losses: 2590, Draws: 871
Episode: 4600, Win Rate: 0.23, Epsilon: 0.001, smartmove0.7666666666666667
Wins: 1054, Losses: 2643, Draws: 904
Episode: 4700, Win Rate: 0.23, Epsilon: 0.001, smartmove0.7833333333333333
Wins: 1066, Losses: 2708, Draws: 927
Episode: 4800, Win Rate: 0.22, Epsilon: 0.001, smartmove0.8
Wins: 1079, Losses: 2758, Draws: 964
Episode: 4900, Win Rate: 0.22, Epsilon:



Episode: 5000, Win Rate: 0.22, Epsilon: 0.001, smartmove0.8333333333333334
Wins: 1106, Losses: 2876, Draws: 1019
Episode: 5100, Win Rate: 0.22, Epsilon: 0.001, smartmove0.85
Wins: 1111, Losses: 2937, Draws: 1053
Episode: 5200, Win Rate: 0.22, Epsilon: 0.001, smartmove0.8666666666666667
Wins: 1120, Losses: 3001, Draws: 1080
Episode: 5300, Win Rate: 0.21, Epsilon: 0.001, smartmove0.8833333333333333
Wins: 1133, Losses: 3058, Draws: 1110
Episode: 5400, Win Rate: 0.21, Epsilon: 0.001, smartmove0.9
Wins: 1138, Losses: 3117, Draws: 1146
Episode: 5500, Win Rate: 0.21, Epsilon: 0.001, smartmove0.9166666666666666
Wins: 1149, Losses: 3178, Draws: 1174
Episode: 5600, Win Rate: 0.21, Epsilon: 0.001, smartmove0.9333333333333333
Wins: 1156, Losses: 3243, Draws: 1202
Episode: 5700, Win Rate: 0.20, Epsilon: 0.001, smartmove0.95
Wins: 1162, Losses: 3310, Draws: 1229
Episode: 5800, Win Rate: 0.20, Epsilon: 0.001, smartmove0.9666666666666667
Wins: 1164, Losses: 3382, Draws: 1255
Episode: 5900, Win Rate: 0



Episode: 6000, Win Rate: 0.19, Epsilon: 0.000, smartmove1
Wins: 1168, Losses: 3531, Draws: 1302
Episode: 6100, Win Rate: 0.19, Epsilon: 0.000, smartmove1
Wins: 1173, Losses: 3587, Draws: 1341
Episode: 6200, Win Rate: 0.19, Epsilon: 0.000, smartmove1
Wins: 1174, Losses: 3653, Draws: 1374
Episode: 6300, Win Rate: 0.19, Epsilon: 0.000, smartmove1
Wins: 1179, Losses: 3713, Draws: 1409
Episode: 6400, Win Rate: 0.18, Epsilon: 0.000, smartmove1
Wins: 1183, Losses: 3779, Draws: 1439
Episode: 6500, Win Rate: 0.18, Epsilon: 0.000, smartmove1
Wins: 1188, Losses: 3846, Draws: 1467
Episode: 6600, Win Rate: 0.18, Epsilon: 0.000, smartmove1
Wins: 1193, Losses: 3913, Draws: 1495
Episode: 6700, Win Rate: 0.18, Epsilon: 0.000, smartmove1
Wins: 1196, Losses: 3972, Draws: 1533
Episode: 6800, Win Rate: 0.18, Epsilon: 0.000, smartmove1
Wins: 1201, Losses: 4028, Draws: 1572
Episode: 6900, Win Rate: 0.17, Epsilon: 0.000, smartmove1
Wins: 1204, Losses: 4095, Draws: 1602




Episode: 7000, Win Rate: 0.17, Epsilon: 0.000, smartmove1
Wins: 1206, Losses: 4150, Draws: 1645
Episode: 7100, Win Rate: 0.17, Epsilon: 0.000, smartmove1
Wins: 1210, Losses: 4214, Draws: 1677
Episode: 7200, Win Rate: 0.17, Epsilon: 0.000, smartmove1
Wins: 1220, Losses: 4273, Draws: 1708
Episode: 7300, Win Rate: 0.17, Epsilon: 0.000, smartmove1
Wins: 1227, Losses: 4332, Draws: 1742
Episode: 7400, Win Rate: 0.17, Epsilon: 0.000, smartmove1
Wins: 1232, Losses: 4390, Draws: 1779
Episode: 7500, Win Rate: 0.16, Epsilon: 0.000, smartmove1
Wins: 1234, Losses: 4460, Draws: 1807
Episode: 7600, Win Rate: 0.16, Epsilon: 0.000, smartmove1
Wins: 1234, Losses: 4540, Draws: 1827
Episode: 7700, Win Rate: 0.16, Epsilon: 0.000, smartmove1
Wins: 1238, Losses: 4603, Draws: 1860
Episode: 7800, Win Rate: 0.16, Epsilon: 0.000, smartmove1
Wins: 1242, Losses: 4677, Draws: 1882
Episode: 7900, Win Rate: 0.16, Epsilon: 0.000, smartmove1
Wins: 1243, Losses: 4746, Draws: 1912




Episode: 8000, Win Rate: 0.16, Epsilon: 0.000, smartmove1
Wins: 1247, Losses: 4810, Draws: 1944
Episode: 8100, Win Rate: 0.15, Epsilon: 0.000, smartmove1
Wins: 1251, Losses: 4883, Draws: 1967
Episode: 8200, Win Rate: 0.15, Epsilon: 0.000, smartmove1
Wins: 1254, Losses: 4947, Draws: 2000
Episode: 8300, Win Rate: 0.15, Epsilon: 0.000, smartmove1
Wins: 1264, Losses: 5013, Draws: 2024
Episode: 8400, Win Rate: 0.15, Epsilon: 0.000, smartmove1
Wins: 1271, Losses: 5078, Draws: 2052
Episode: 8500, Win Rate: 0.15, Epsilon: 0.000, smartmove1
Wins: 1279, Losses: 5146, Draws: 2076
Episode: 8600, Win Rate: 0.15, Epsilon: 0.000, smartmove1
Wins: 1283, Losses: 5211, Draws: 2107
Episode: 8700, Win Rate: 0.15, Epsilon: 0.000, smartmove1
Wins: 1286, Losses: 5285, Draws: 2130
Episode: 8800, Win Rate: 0.15, Epsilon: 0.000, smartmove1
Wins: 1290, Losses: 5354, Draws: 2157
Episode: 8900, Win Rate: 0.15, Epsilon: 0.000, smartmove1
Wins: 1293, Losses: 5425, Draws: 2183




Episode: 9000, Win Rate: 0.14, Epsilon: 0.000, smartmove1
Wins: 1300, Losses: 5498, Draws: 2203
Episode: 9100, Win Rate: 0.14, Epsilon: 0.000, smartmove1
Wins: 1308, Losses: 5572, Draws: 2221
Episode: 9200, Win Rate: 0.14, Epsilon: 0.000, smartmove1
Wins: 1315, Losses: 5638, Draws: 2248
Episode: 9300, Win Rate: 0.14, Epsilon: 0.000, smartmove1
Wins: 1324, Losses: 5706, Draws: 2271
Episode: 9400, Win Rate: 0.14, Epsilon: 0.000, smartmove1
Wins: 1330, Losses: 5778, Draws: 2293
Episode: 9500, Win Rate: 0.14, Epsilon: 0.000, smartmove1
Wins: 1336, Losses: 5842, Draws: 2323
Episode: 9600, Win Rate: 0.14, Epsilon: 0.000, smartmove1
Wins: 1341, Losses: 5913, Draws: 2347
Episode: 9700, Win Rate: 0.14, Epsilon: 0.000, smartmove1
Wins: 1347, Losses: 5972, Draws: 2382
Episode: 9800, Win Rate: 0.14, Epsilon: 0.000, smartmove1
Wins: 1351, Losses: 6049, Draws: 2401
Episode: 9900, Win Rate: 0.14, Epsilon: 0.000, smartmove1
Wins: 1356, Losses: 6122, Draws: 2423




In [7]:
import numpy as np
import random
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from TicTacToe import TicTacToe

class SQNAgent:
    def __init__(self, state_size=9, action_size=9, gamma=0.95):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma  
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.9995
        self.batch_size = 32
        self.replay_buffer = deque(maxlen=10000)
        self.model = self._build_model()

    def _build_model(self):
        model = Sequential([
            Dense(64, input_dim=self.state_size, activation='relu'),
            Dense(64, activation='relu'),
            Dense(self.action_size, activation='linear')
        ])
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def process_state(self, state):
        """
        Converts the state from [0,1,2] format to [-1,0,1] format for better learning.
        """
        processed_state = np.array(state).copy()
        processed_state[processed_state == 0] = -1
        processed_state[processed_state == 1] = 0
        processed_state[processed_state == 2] = 1
        return processed_state

    def select_action(self, state, valid_actions):
        if not valid_actions:
            return None

        if random.random() < self.epsilon:
            return random.choice(valid_actions)
        
        processed_state = self.process_state(state)
        q_values = self.model.predict(processed_state.reshape(1, -1), verbose=0)[0]
        valid_q_values = [(q_values[action], action) for action in valid_actions]
        return max(valid_q_values, key=lambda x: x[0])[1]

    def store_experience(self, state, action, reward, next_state, done):
        self.replay_buffer.append((state, action, reward, next_state, done))

    def train(self):
        if len(self.replay_buffer) < self.batch_size:
            return

        mini_batch = random.sample(self.replay_buffer, self.batch_size)
        states = np.array([self.process_state(exp[0]) for exp in mini_batch])
        next_states = np.array([self.process_state(exp[3]) for exp in mini_batch])
        current_q_values = self.model.predict(states, verbose=0)
        next_q_values = self.model.predict(next_states, verbose=0)

        x = []
        y = []

        for i, (state, action, reward, next_state, done) in enumerate(mini_batch):
            if done:
                target = reward
            else:
                next_valid_actions = [j for j, val in enumerate(next_state) if val == 0]
                if next_valid_actions:
                    max_next_q = max(next_q_values[i][action] for action in next_valid_actions)
                    target = reward + self.gamma * max_next_q
                else:
                    target = reward
            
            current_q = current_q_values[i].copy()
            current_q[action] = target
            x.append(self.process_state(state))
            y.append(current_q)

        self.model.fit(np.array(x), np.array(y), batch_size=32, epochs=1, verbose=0)

def train_agent(episodes=10000):
    agent = SQNAgent()
    history = {'wins': 0, 'losses': 0, 'draws': 0}
    initial_epsilon = 1.0
    min_epsilon = 0.01
    decay_episodes = 1000
    
    for episode in range(episodes):
        if episode % 2000 == 0 and episode > 0:
            agent.epsilon +=0.4
            print("Increasing Epsillion")
        else:
            epsilon_decay = (initial_epsilon - min_epsilon) / decay_episodes
            agent.epsilon = max(min_epsilon, initial_epsilon - (episode % 2000) * epsilon_decay)
        
        smartness = min(1, episode / (episodes * 0.6))
        game = TicTacToe(smartMovePlayer1=smartness)
        state = np.array(game.board)
        
        game.player1_move()
        state = np.array(game.board)
        
        while True:
            valid_actions = game.empty_positions()
            if not valid_actions:
                history['draws'] += 1
                break
                
            action = agent.select_action(state, valid_actions)
            game.make_move(action, player=2)
            
            reward = 0
            if game.current_winner == 2:
                reward = 1.0
                history['wins'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            
            game.player1_move()
            if game.current_winner == 1:
                reward = -1.0
                history['losses'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            elif game.is_full():
                reward = 0.0
                history['draws'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            
            agent.store_experience(state, action, reward, game.board, False)
            state = np.array(game.board)
        
        agent.train()
        
        if episode % 100 == 0:
            win_rate = history['wins'] / (episode + 1)
            print(f"Episode: {episode}, Win Rate: {win_rate:.2f}, Epsilon: {agent.epsilon:.3f}, smartmove{smartness}")
            print(f"Wins: {history['wins']}, Losses: {history['losses']}, Draws: {history['draws']}")
        
        if episode % 1000 == 0:
            agent.model.save(f'model2_episode_{episode}.h5')
    agent.model.save('model2.h5')
agent = train_agent()





Episode: 0, Win Rate: 0.00, Epsilon: 1.000, smartmove0.0
Wins: 0, Losses: 1, Draws: 0
Episode: 100, Win Rate: 0.29, Epsilon: 0.901, smartmove0.016666666666666666
Wins: 29, Losses: 57, Draws: 15
Episode: 200, Win Rate: 0.28, Epsilon: 0.802, smartmove0.03333333333333333
Wins: 57, Losses: 116, Draws: 28
Episode: 300, Win Rate: 0.28, Epsilon: 0.703, smartmove0.05
Wins: 85, Losses: 180, Draws: 36
Episode: 400, Win Rate: 0.30, Epsilon: 0.604, smartmove0.06666666666666667
Wins: 120, Losses: 238, Draws: 43
Episode: 500, Win Rate: 0.30, Epsilon: 0.505, smartmove0.08333333333333333
Wins: 152, Losses: 297, Draws: 52
Episode: 600, Win Rate: 0.32, Epsilon: 0.406, smartmove0.1
Wins: 192, Losses: 350, Draws: 59
Episode: 700, Win Rate: 0.32, Epsilon: 0.307, smartmove0.11666666666666667
Wins: 226, Losses: 408, Draws: 67
Episode: 800, Win Rate: 0.31, Epsilon: 0.208, smartmove0.13333333333333333
Wins: 252, Losses: 471, Draws: 78
Episode: 900, Win Rate: 0.31, Epsilon: 0.109, smartmove0.15
Wins: 282, Losse



Episode: 1000, Win Rate: 0.32, Epsilon: 0.010, smartmove0.16666666666666666
Wins: 320, Losses: 583, Draws: 98
Episode: 1100, Win Rate: 0.32, Epsilon: 0.010, smartmove0.18333333333333332
Wins: 352, Losses: 643, Draws: 106
Episode: 1200, Win Rate: 0.32, Epsilon: 0.010, smartmove0.2
Wins: 382, Losses: 703, Draws: 116
Episode: 1300, Win Rate: 0.31, Epsilon: 0.010, smartmove0.21666666666666667
Wins: 409, Losses: 765, Draws: 127
Episode: 1400, Win Rate: 0.32, Epsilon: 0.010, smartmove0.23333333333333334
Wins: 444, Losses: 818, Draws: 139
Episode: 1500, Win Rate: 0.32, Epsilon: 0.010, smartmove0.25
Wins: 482, Losses: 874, Draws: 145
Episode: 1600, Win Rate: 0.32, Epsilon: 0.010, smartmove0.26666666666666666
Wins: 515, Losses: 936, Draws: 150
Episode: 1700, Win Rate: 0.32, Epsilon: 0.010, smartmove0.2833333333333333
Wins: 538, Losses: 1001, Draws: 162
Episode: 1800, Win Rate: 0.31, Epsilon: 0.010, smartmove0.3
Wins: 563, Losses: 1064, Draws: 174
Episode: 1900, Win Rate: 0.31, Epsilon: 0.010, s



Increasing Epsillion
Episode: 2000, Win Rate: 0.31, Epsilon: 0.410, smartmove0.3333333333333333
Wins: 625, Losses: 1187, Draws: 189
Episode: 2100, Win Rate: 0.31, Epsilon: 0.901, smartmove0.35
Wins: 651, Losses: 1253, Draws: 197
Episode: 2200, Win Rate: 0.30, Epsilon: 0.802, smartmove0.36666666666666664
Wins: 668, Losses: 1322, Draws: 211
Episode: 2300, Win Rate: 0.30, Epsilon: 0.703, smartmove0.38333333333333336
Wins: 690, Losses: 1387, Draws: 224
Episode: 2400, Win Rate: 0.30, Epsilon: 0.604, smartmove0.4
Wins: 712, Losses: 1458, Draws: 231
Episode: 2500, Win Rate: 0.30, Epsilon: 0.505, smartmove0.4166666666666667
Wins: 739, Losses: 1520, Draws: 242
Episode: 2600, Win Rate: 0.29, Epsilon: 0.406, smartmove0.43333333333333335
Wins: 765, Losses: 1587, Draws: 249
Episode: 2700, Win Rate: 0.29, Epsilon: 0.307, smartmove0.45
Wins: 787, Losses: 1656, Draws: 258
Episode: 2800, Win Rate: 0.29, Epsilon: 0.208, smartmove0.4666666666666667
Wins: 814, Losses: 1722, Draws: 265
Episode: 2900, Win R



Episode: 3000, Win Rate: 0.28, Epsilon: 0.010, smartmove0.5
Wins: 854, Losses: 1862, Draws: 285
Episode: 3100, Win Rate: 0.28, Epsilon: 0.010, smartmove0.5166666666666667
Wins: 873, Losses: 1933, Draws: 295
Episode: 3200, Win Rate: 0.28, Epsilon: 0.010, smartmove0.5333333333333333
Wins: 904, Losses: 1999, Draws: 298
Episode: 3300, Win Rate: 0.28, Epsilon: 0.010, smartmove0.55
Wins: 936, Losses: 2055, Draws: 310
Episode: 3400, Win Rate: 0.28, Epsilon: 0.010, smartmove0.5666666666666667
Wins: 947, Losses: 2129, Draws: 325
Episode: 3500, Win Rate: 0.27, Epsilon: 0.010, smartmove0.5833333333333334
Wins: 961, Losses: 2205, Draws: 335
Episode: 3600, Win Rate: 0.27, Epsilon: 0.010, smartmove0.6
Wins: 986, Losses: 2260, Draws: 355
Episode: 3700, Win Rate: 0.27, Epsilon: 0.010, smartmove0.6166666666666667
Wins: 1013, Losses: 2322, Draws: 366
Episode: 3800, Win Rate: 0.27, Epsilon: 0.010, smartmove0.6333333333333333
Wins: 1034, Losses: 2383, Draws: 384
Episode: 3900, Win Rate: 0.27, Epsilon: 0.0



Increasing Epsillion
Episode: 4000, Win Rate: 0.27, Epsilon: 0.410, smartmove0.6666666666666666
Wins: 1073, Losses: 2514, Draws: 414
Episode: 4100, Win Rate: 0.26, Epsilon: 0.901, smartmove0.6833333333333333
Wins: 1076, Losses: 2598, Draws: 427
Episode: 4200, Win Rate: 0.26, Epsilon: 0.802, smartmove0.7
Wins: 1086, Losses: 2675, Draws: 440
Episode: 4300, Win Rate: 0.25, Epsilon: 0.703, smartmove0.7166666666666667
Wins: 1094, Losses: 2760, Draws: 447
Episode: 4400, Win Rate: 0.25, Epsilon: 0.604, smartmove0.7333333333333333
Wins: 1108, Losses: 2833, Draws: 460
Episode: 4500, Win Rate: 0.25, Epsilon: 0.505, smartmove0.75
Wins: 1120, Losses: 2913, Draws: 468
Episode: 4600, Win Rate: 0.25, Epsilon: 0.406, smartmove0.7666666666666667
Wins: 1128, Losses: 2986, Draws: 487
Episode: 4700, Win Rate: 0.24, Epsilon: 0.307, smartmove0.7833333333333333
Wins: 1140, Losses: 3056, Draws: 505
Episode: 4800, Win Rate: 0.24, Epsilon: 0.208, smartmove0.8
Wins: 1160, Losses: 3114, Draws: 527
Episode: 4900, 



Episode: 5000, Win Rate: 0.24, Epsilon: 0.010, smartmove0.8333333333333334
Wins: 1187, Losses: 3246, Draws: 568
Episode: 5100, Win Rate: 0.24, Epsilon: 0.010, smartmove0.85
Wins: 1203, Losses: 3299, Draws: 599
Episode: 5200, Win Rate: 0.23, Epsilon: 0.010, smartmove0.8666666666666667
Wins: 1218, Losses: 3365, Draws: 618
Episode: 5300, Win Rate: 0.23, Epsilon: 0.010, smartmove0.8833333333333333
Wins: 1234, Losses: 3420, Draws: 647
Episode: 5400, Win Rate: 0.23, Epsilon: 0.010, smartmove0.9
Wins: 1251, Losses: 3475, Draws: 675
Episode: 5500, Win Rate: 0.23, Epsilon: 0.010, smartmove0.9166666666666666
Wins: 1267, Losses: 3519, Draws: 715
Episode: 5600, Win Rate: 0.23, Epsilon: 0.010, smartmove0.9333333333333333
Wins: 1281, Losses: 3575, Draws: 745
Episode: 5700, Win Rate: 0.23, Epsilon: 0.010, smartmove0.95
Wins: 1292, Losses: 3632, Draws: 777
Episode: 5800, Win Rate: 0.23, Epsilon: 0.010, smartmove0.9666666666666667
Wins: 1307, Losses: 3687, Draws: 807
Episode: 5900, Win Rate: 0.22, Epsi



Increasing Epsillion
Episode: 6000, Win Rate: 0.22, Epsilon: 0.410, smartmove1
Wins: 1323, Losses: 3795, Draws: 883
Episode: 6100, Win Rate: 0.22, Epsilon: 0.901, smartmove1
Wins: 1324, Losses: 3881, Draws: 896
Episode: 6200, Win Rate: 0.21, Epsilon: 0.802, smartmove1
Wins: 1325, Losses: 3967, Draws: 909
Episode: 6300, Win Rate: 0.21, Epsilon: 0.703, smartmove1
Wins: 1328, Losses: 4053, Draws: 920
Episode: 6400, Win Rate: 0.21, Epsilon: 0.604, smartmove1
Wins: 1329, Losses: 4138, Draws: 934
Episode: 6500, Win Rate: 0.21, Epsilon: 0.505, smartmove1
Wins: 1333, Losses: 4216, Draws: 952
Episode: 6600, Win Rate: 0.20, Epsilon: 0.406, smartmove1
Wins: 1336, Losses: 4291, Draws: 974
Episode: 6700, Win Rate: 0.20, Epsilon: 0.307, smartmove1
Wins: 1337, Losses: 4369, Draws: 995
Episode: 6800, Win Rate: 0.20, Epsilon: 0.208, smartmove1
Wins: 1344, Losses: 4436, Draws: 1021
Episode: 6900, Win Rate: 0.20, Epsilon: 0.109, smartmove1
Wins: 1348, Losses: 4497, Draws: 1056




Episode: 7000, Win Rate: 0.19, Epsilon: 0.010, smartmove1
Wins: 1352, Losses: 4557, Draws: 1092
Episode: 7100, Win Rate: 0.19, Epsilon: 0.010, smartmove1
Wins: 1360, Losses: 4604, Draws: 1137
Episode: 7200, Win Rate: 0.19, Epsilon: 0.010, smartmove1
Wins: 1364, Losses: 4655, Draws: 1182
Episode: 7300, Win Rate: 0.19, Epsilon: 0.010, smartmove1
Wins: 1371, Losses: 4698, Draws: 1232
Episode: 7400, Win Rate: 0.19, Epsilon: 0.010, smartmove1
Wins: 1378, Losses: 4749, Draws: 1274
Episode: 7500, Win Rate: 0.18, Epsilon: 0.010, smartmove1
Wins: 1382, Losses: 4789, Draws: 1330
Episode: 7600, Win Rate: 0.18, Epsilon: 0.010, smartmove1
Wins: 1388, Losses: 4828, Draws: 1385
Episode: 7700, Win Rate: 0.18, Epsilon: 0.010, smartmove1
Wins: 1397, Losses: 4876, Draws: 1428
Episode: 7800, Win Rate: 0.18, Epsilon: 0.010, smartmove1
Wins: 1405, Losses: 4921, Draws: 1475
Episode: 7900, Win Rate: 0.18, Epsilon: 0.010, smartmove1
Wins: 1409, Losses: 4961, Draws: 1531




Increasing Epsillion
Episode: 8000, Win Rate: 0.18, Epsilon: 0.410, smartmove1
Wins: 1412, Losses: 5006, Draws: 1583
Episode: 8100, Win Rate: 0.17, Epsilon: 0.901, smartmove1
Wins: 1413, Losses: 5095, Draws: 1593
Episode: 8200, Win Rate: 0.17, Epsilon: 0.802, smartmove1
Wins: 1413, Losses: 5182, Draws: 1606
Episode: 8300, Win Rate: 0.17, Epsilon: 0.703, smartmove1
Wins: 1415, Losses: 5263, Draws: 1623
Episode: 8400, Win Rate: 0.17, Epsilon: 0.604, smartmove1
Wins: 1415, Losses: 5350, Draws: 1636
Episode: 8500, Win Rate: 0.17, Epsilon: 0.505, smartmove1
Wins: 1416, Losses: 5436, Draws: 1649
Episode: 8600, Win Rate: 0.16, Epsilon: 0.406, smartmove1
Wins: 1418, Losses: 5512, Draws: 1671
Episode: 8700, Win Rate: 0.16, Epsilon: 0.307, smartmove1
Wins: 1421, Losses: 5574, Draws: 1706
Episode: 8800, Win Rate: 0.16, Epsilon: 0.208, smartmove1
Wins: 1423, Losses: 5633, Draws: 1745
Episode: 8900, Win Rate: 0.16, Epsilon: 0.109, smartmove1
Wins: 1429, Losses: 5686, Draws: 1786




Episode: 9000, Win Rate: 0.16, Epsilon: 0.010, smartmove1
Wins: 1433, Losses: 5728, Draws: 1840
Episode: 9100, Win Rate: 0.16, Epsilon: 0.010, smartmove1
Wins: 1439, Losses: 5756, Draws: 1906
Episode: 9200, Win Rate: 0.16, Epsilon: 0.010, smartmove1
Wins: 1445, Losses: 5799, Draws: 1957
Episode: 9300, Win Rate: 0.16, Epsilon: 0.010, smartmove1
Wins: 1453, Losses: 5836, Draws: 2012
Episode: 9400, Win Rate: 0.16, Epsilon: 0.010, smartmove1
Wins: 1459, Losses: 5868, Draws: 2074
Episode: 9500, Win Rate: 0.15, Epsilon: 0.010, smartmove1
Wins: 1464, Losses: 5909, Draws: 2128
Episode: 9600, Win Rate: 0.15, Epsilon: 0.010, smartmove1
Wins: 1469, Losses: 5948, Draws: 2184
Episode: 9700, Win Rate: 0.15, Epsilon: 0.010, smartmove1
Wins: 1477, Losses: 5973, Draws: 2251
Episode: 9800, Win Rate: 0.15, Epsilon: 0.010, smartmove1
Wins: 1482, Losses: 6005, Draws: 2314
Episode: 9900, Win Rate: 0.15, Epsilon: 0.010, smartmove1
Wins: 1492, Losses: 6042, Draws: 2367




In [8]:
import numpy as np
import random
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from TicTacToe import TicTacToe

class SQNAgent:
    def __init__(self, state_size=9, action_size=9, gamma=0.95):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma  
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.9995
        self.batch_size = 32
        self.replay_buffer = deque(maxlen=10000)
        self.model = self._build_model()

    def _build_model(self):
        model = Sequential([
            Dense(64, input_dim=self.state_size, activation='relu'),
            Dense(64, activation='relu'),
            Dense(self.action_size, activation='linear')
        ])
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def process_state(self, state):
        """
        Converts the state from [0,1,2] format to [-1,0,1] format for better learning.
        """
        processed_state = np.array(state).copy()
        processed_state[processed_state == 0] = -1
        processed_state[processed_state == 1] = 0
        processed_state[processed_state == 2] = 1
        return processed_state

    def select_action(self, state, valid_actions):
        if not valid_actions:
            return None

        if random.random() < self.epsilon:
            return random.choice(valid_actions)
        
        processed_state = self.process_state(state)
        q_values = self.model.predict(processed_state.reshape(1, -1), verbose=0)[0]
        valid_q_values = [(q_values[action], action) for action in valid_actions]
        return max(valid_q_values, key=lambda x: x[0])[1]

    def store_experience(self, state, action, reward, next_state, done):
        self.replay_buffer.append((state, action, reward, next_state, done))

    def train(self):
        if len(self.replay_buffer) < self.batch_size:
            return

        mini_batch = random.sample(self.replay_buffer, self.batch_size)
        states = np.array([self.process_state(exp[0]) for exp in mini_batch])
        next_states = np.array([self.process_state(exp[3]) for exp in mini_batch])
        current_q_values = self.model.predict(states, verbose=0)
        next_q_values = self.model.predict(next_states, verbose=0)

        x = []
        y = []

        for i, (state, action, reward, next_state, done) in enumerate(mini_batch):
            if done:
                target = reward
            else:
                next_valid_actions = [j for j, val in enumerate(next_state) if val == 0]
                if next_valid_actions:
                    max_next_q = max(next_q_values[i][action] for action in next_valid_actions)
                    target = reward + self.gamma * max_next_q
                else:
                    target = reward
            
            current_q = current_q_values[i].copy()
            current_q[action] = target
            x.append(self.process_state(state))
            y.append(current_q)

        self.model.fit(np.array(x), np.array(y), batch_size=32, epochs=1, verbose=0)

def train_agent(episodes=10000):
    agent = SQNAgent()
    history = {'wins': 0, 'losses': 0, 'draws': 0}
    initial_epsilon = 1.0
    min_epsilon = 0.01
    decay_episodes = 1000
    
    for episode in range(episodes):
        if episode % 2000 == 0 and episode > 0:
            agent.epsilon = 1.0
            print("Resetting epsilon to 1.0")
        else:
            epsilon_decay = (initial_epsilon - min_epsilon) / decay_episodes
            agent.epsilon = max(min_epsilon, initial_epsilon - (episode % 2000) * epsilon_decay)
        
        smartness = min(1, episode / (episodes * 0.6))
        game = TicTacToe(smartMovePlayer1=smartness)
        state = np.array(game.board)
        
        game.player1_move()
        state = np.array(game.board)
        
        while True:
            valid_actions = game.empty_positions()
            if not valid_actions:
                history['draws'] += 1
                break
                
            action = agent.select_action(state, valid_actions)
            game.make_move(action, player=2)
            
            reward = 0
            if game.current_winner == 2:
                reward = 1.0
                history['wins'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            
            game.player1_move()
            if game.current_winner == 1:
                reward = -1.0
                history['losses'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            elif game.is_full():
                reward = 0.0
                history['draws'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            
            agent.store_experience(state, action, reward, game.board, False)
            state = np.array(game.board)
        
        agent.train()
        
        if episode % 100 == 0:
            win_rate = history['wins'] / (episode + 1)
            print(f"Episode: {episode}, Win Rate: {win_rate:.2f}, Epsilon: {agent.epsilon:.3f}, smartmove{smartness}")
            print(f"Wins: {history['wins']}, Losses: {history['losses']}, Draws: {history['draws']}")
        
        if episode % 1000 == 0:
            agent.model.save(f'model3_episode_{episode}.h5')
    agent.model.save('model3.h5')
agent = train_agent()





Episode: 0, Win Rate: 1.00, Epsilon: 1.000, smartmove0.0
Wins: 1, Losses: 0, Draws: 0
Episode: 100, Win Rate: 0.26, Epsilon: 0.901, smartmove0.016666666666666666
Wins: 26, Losses: 59, Draws: 16
Episode: 200, Win Rate: 0.27, Epsilon: 0.802, smartmove0.03333333333333333
Wins: 54, Losses: 120, Draws: 27
Episode: 300, Win Rate: 0.27, Epsilon: 0.703, smartmove0.05
Wins: 82, Losses: 179, Draws: 40
Episode: 400, Win Rate: 0.29, Epsilon: 0.604, smartmove0.06666666666666667
Wins: 115, Losses: 239, Draws: 47
Episode: 500, Win Rate: 0.29, Epsilon: 0.505, smartmove0.08333333333333333
Wins: 146, Losses: 296, Draws: 59
Episode: 600, Win Rate: 0.29, Epsilon: 0.406, smartmove0.1
Wins: 177, Losses: 358, Draws: 66
Episode: 700, Win Rate: 0.30, Epsilon: 0.307, smartmove0.11666666666666667
Wins: 207, Losses: 422, Draws: 72
Episode: 800, Win Rate: 0.30, Epsilon: 0.208, smartmove0.13333333333333333
Wins: 244, Losses: 477, Draws: 80
Episode: 900, Win Rate: 0.31, Epsilon: 0.109, smartmove0.15
Wins: 283, Losse



Episode: 1000, Win Rate: 0.32, Epsilon: 0.010, smartmove0.16666666666666666
Wins: 325, Losses: 581, Draws: 95
Episode: 1100, Win Rate: 0.33, Epsilon: 0.010, smartmove0.18333333333333332
Wins: 364, Losses: 632, Draws: 105
Episode: 1200, Win Rate: 0.33, Epsilon: 0.010, smartmove0.2
Wins: 398, Losses: 691, Draws: 112
Episode: 1300, Win Rate: 0.33, Epsilon: 0.010, smartmove0.21666666666666667
Wins: 430, Losses: 754, Draws: 117
Episode: 1400, Win Rate: 0.33, Epsilon: 0.010, smartmove0.23333333333333334
Wins: 459, Losses: 815, Draws: 127
Episode: 1500, Win Rate: 0.33, Epsilon: 0.010, smartmove0.25
Wins: 495, Losses: 865, Draws: 141
Episode: 1600, Win Rate: 0.33, Epsilon: 0.010, smartmove0.26666666666666666
Wins: 524, Losses: 924, Draws: 153
Episode: 1700, Win Rate: 0.33, Epsilon: 0.010, smartmove0.2833333333333333
Wins: 564, Losses: 973, Draws: 164
Episode: 1800, Win Rate: 0.33, Epsilon: 0.010, smartmove0.3
Wins: 603, Losses: 1023, Draws: 175
Episode: 1900, Win Rate: 0.34, Epsilon: 0.010, sm



Resetting epsilon to 1.0
Episode: 2000, Win Rate: 0.34, Epsilon: 1.000, smartmove0.3333333333333333
Wins: 673, Losses: 1139, Draws: 189
Episode: 2100, Win Rate: 0.33, Epsilon: 0.901, smartmove0.35
Wins: 698, Losses: 1210, Draws: 193
Episode: 2200, Win Rate: 0.33, Epsilon: 0.802, smartmove0.36666666666666664
Wins: 716, Losses: 1276, Draws: 209
Episode: 2300, Win Rate: 0.32, Epsilon: 0.703, smartmove0.38333333333333336
Wins: 730, Losses: 1351, Draws: 220
Episode: 2400, Win Rate: 0.31, Epsilon: 0.604, smartmove0.4
Wins: 746, Losses: 1426, Draws: 229
Episode: 2500, Win Rate: 0.31, Epsilon: 0.505, smartmove0.4166666666666667
Wins: 772, Losses: 1488, Draws: 241
Episode: 2600, Win Rate: 0.31, Epsilon: 0.406, smartmove0.43333333333333335
Wins: 797, Losses: 1557, Draws: 247
Episode: 2700, Win Rate: 0.30, Epsilon: 0.307, smartmove0.45
Wins: 819, Losses: 1630, Draws: 252
Episode: 2800, Win Rate: 0.30, Epsilon: 0.208, smartmove0.4666666666666667
Wins: 846, Losses: 1698, Draws: 257
Episode: 2900, W



Episode: 3000, Win Rate: 0.30, Epsilon: 0.010, smartmove0.5
Wins: 901, Losses: 1822, Draws: 278
Episode: 3100, Win Rate: 0.30, Epsilon: 0.010, smartmove0.5166666666666667
Wins: 916, Losses: 1899, Draws: 286
Episode: 3200, Win Rate: 0.29, Epsilon: 0.010, smartmove0.5333333333333333
Wins: 942, Losses: 1962, Draws: 297
Episode: 3300, Win Rate: 0.29, Epsilon: 0.010, smartmove0.55
Wins: 967, Losses: 2028, Draws: 306
Episode: 3400, Win Rate: 0.29, Epsilon: 0.010, smartmove0.5666666666666667
Wins: 998, Losses: 2087, Draws: 316
Episode: 3500, Win Rate: 0.29, Epsilon: 0.010, smartmove0.5833333333333334
Wins: 1013, Losses: 2157, Draws: 331
Episode: 3600, Win Rate: 0.29, Epsilon: 0.010, smartmove0.6
Wins: 1027, Losses: 2231, Draws: 343
Episode: 3700, Win Rate: 0.28, Epsilon: 0.010, smartmove0.6166666666666667
Wins: 1048, Losses: 2303, Draws: 350
Episode: 3800, Win Rate: 0.28, Epsilon: 0.010, smartmove0.6333333333333333
Wins: 1072, Losses: 2368, Draws: 361
Episode: 3900, Win Rate: 0.28, Epsilon: 0



Resetting epsilon to 1.0
Episode: 4000, Win Rate: 0.28, Epsilon: 1.000, smartmove0.6666666666666666
Wins: 1106, Losses: 2501, Draws: 394
Episode: 4100, Win Rate: 0.27, Epsilon: 0.901, smartmove0.6833333333333333
Wins: 1117, Losses: 2582, Draws: 402
Episode: 4200, Win Rate: 0.27, Epsilon: 0.802, smartmove0.7
Wins: 1129, Losses: 2663, Draws: 409
Episode: 4300, Win Rate: 0.27, Epsilon: 0.703, smartmove0.7166666666666667
Wins: 1142, Losses: 2739, Draws: 420
Episode: 4400, Win Rate: 0.26, Epsilon: 0.604, smartmove0.7333333333333333
Wins: 1155, Losses: 2808, Draws: 438
Episode: 4500, Win Rate: 0.26, Epsilon: 0.505, smartmove0.75
Wins: 1164, Losses: 2892, Draws: 445
Episode: 4600, Win Rate: 0.26, Epsilon: 0.406, smartmove0.7666666666666667
Wins: 1174, Losses: 2973, Draws: 454
Episode: 4700, Win Rate: 0.25, Epsilon: 0.307, smartmove0.7833333333333333
Wins: 1180, Losses: 3051, Draws: 470
Episode: 4800, Win Rate: 0.25, Epsilon: 0.208, smartmove0.8
Wins: 1187, Losses: 3127, Draws: 487
Episode: 49



Episode: 5000, Win Rate: 0.24, Epsilon: 0.010, smartmove0.8333333333333334
Wins: 1210, Losses: 3270, Draws: 521
Episode: 5100, Win Rate: 0.24, Epsilon: 0.010, smartmove0.85
Wins: 1224, Losses: 3332, Draws: 545
Episode: 5200, Win Rate: 0.24, Epsilon: 0.010, smartmove0.8666666666666667
Wins: 1241, Losses: 3390, Draws: 570
Episode: 5300, Win Rate: 0.24, Epsilon: 0.010, smartmove0.8833333333333333
Wins: 1253, Losses: 3462, Draws: 586
Episode: 5400, Win Rate: 0.24, Epsilon: 0.010, smartmove0.9
Wins: 1270, Losses: 3518, Draws: 613
Episode: 5500, Win Rate: 0.23, Epsilon: 0.010, smartmove0.9166666666666666
Wins: 1287, Losses: 3568, Draws: 646
Episode: 5600, Win Rate: 0.23, Epsilon: 0.010, smartmove0.9333333333333333
Wins: 1291, Losses: 3644, Draws: 666
Episode: 5700, Win Rate: 0.23, Epsilon: 0.010, smartmove0.95
Wins: 1296, Losses: 3714, Draws: 691
Episode: 5800, Win Rate: 0.22, Epsilon: 0.010, smartmove0.9666666666666667
Wins: 1299, Losses: 3780, Draws: 722
Episode: 5900, Win Rate: 0.22, Epsi



Resetting epsilon to 1.0
Episode: 6000, Win Rate: 0.22, Epsilon: 1.000, smartmove1
Wins: 1305, Losses: 3907, Draws: 789
Episode: 6100, Win Rate: 0.21, Epsilon: 0.901, smartmove1
Wins: 1305, Losses: 3998, Draws: 798
Episode: 6200, Win Rate: 0.21, Epsilon: 0.802, smartmove1
Wins: 1308, Losses: 4080, Draws: 813
Episode: 6300, Win Rate: 0.21, Epsilon: 0.703, smartmove1
Wins: 1310, Losses: 4168, Draws: 823
Episode: 6400, Win Rate: 0.21, Epsilon: 0.604, smartmove1
Wins: 1314, Losses: 4247, Draws: 840
Episode: 6500, Win Rate: 0.20, Epsilon: 0.505, smartmove1
Wins: 1316, Losses: 4329, Draws: 856
Episode: 6600, Win Rate: 0.20, Epsilon: 0.406, smartmove1
Wins: 1317, Losses: 4402, Draws: 882
Episode: 6700, Win Rate: 0.20, Epsilon: 0.307, smartmove1
Wins: 1322, Losses: 4466, Draws: 913
Episode: 6800, Win Rate: 0.19, Epsilon: 0.208, smartmove1
Wins: 1325, Losses: 4534, Draws: 942
Episode: 6900, Win Rate: 0.19, Epsilon: 0.109, smartmove1
Wins: 1326, Losses: 4608, Draws: 967




Episode: 7000, Win Rate: 0.19, Epsilon: 0.010, smartmove1
Wins: 1329, Losses: 4664, Draws: 1008
Episode: 7100, Win Rate: 0.19, Epsilon: 0.010, smartmove1
Wins: 1331, Losses: 4712, Draws: 1058
Episode: 7200, Win Rate: 0.19, Epsilon: 0.010, smartmove1
Wins: 1334, Losses: 4761, Draws: 1106
Episode: 7300, Win Rate: 0.18, Epsilon: 0.010, smartmove1
Wins: 1342, Losses: 4816, Draws: 1143
Episode: 7400, Win Rate: 0.18, Epsilon: 0.010, smartmove1
Wins: 1349, Losses: 4853, Draws: 1199
Episode: 7500, Win Rate: 0.18, Epsilon: 0.010, smartmove1
Wins: 1354, Losses: 4900, Draws: 1247
Episode: 7600, Win Rate: 0.18, Epsilon: 0.010, smartmove1
Wins: 1359, Losses: 4939, Draws: 1303
Episode: 7700, Win Rate: 0.18, Epsilon: 0.010, smartmove1
Wins: 1360, Losses: 4977, Draws: 1364
Episode: 7800, Win Rate: 0.17, Epsilon: 0.010, smartmove1
Wins: 1364, Losses: 5016, Draws: 1421
Episode: 7900, Win Rate: 0.17, Epsilon: 0.010, smartmove1
Wins: 1367, Losses: 5061, Draws: 1473




Resetting epsilon to 1.0
Episode: 8000, Win Rate: 0.17, Epsilon: 1.000, smartmove1
Wins: 1373, Losses: 5101, Draws: 1527
Episode: 8100, Win Rate: 0.17, Epsilon: 0.901, smartmove1
Wins: 1375, Losses: 5192, Draws: 1534
Episode: 8200, Win Rate: 0.17, Epsilon: 0.802, smartmove1
Wins: 1377, Losses: 5275, Draws: 1549
Episode: 8300, Win Rate: 0.17, Epsilon: 0.703, smartmove1
Wins: 1380, Losses: 5357, Draws: 1564
Episode: 8400, Win Rate: 0.16, Epsilon: 0.604, smartmove1
Wins: 1382, Losses: 5439, Draws: 1580
Episode: 8500, Win Rate: 0.16, Epsilon: 0.505, smartmove1
Wins: 1382, Losses: 5520, Draws: 1599
Episode: 8600, Win Rate: 0.16, Epsilon: 0.406, smartmove1
Wins: 1385, Losses: 5590, Draws: 1626
Episode: 8700, Win Rate: 0.16, Epsilon: 0.307, smartmove1
Wins: 1388, Losses: 5653, Draws: 1660
Episode: 8800, Win Rate: 0.16, Epsilon: 0.208, smartmove1
Wins: 1392, Losses: 5712, Draws: 1697
Episode: 8900, Win Rate: 0.16, Epsilon: 0.109, smartmove1
Wins: 1398, Losses: 5752, Draws: 1751




Episode: 9000, Win Rate: 0.16, Epsilon: 0.010, smartmove1
Wins: 1405, Losses: 5788, Draws: 1808
Episode: 9100, Win Rate: 0.16, Epsilon: 0.010, smartmove1
Wins: 1416, Losses: 5818, Draws: 1867
Episode: 9200, Win Rate: 0.15, Epsilon: 0.010, smartmove1
Wins: 1420, Losses: 5849, Draws: 1932
Episode: 9300, Win Rate: 0.15, Epsilon: 0.010, smartmove1
Wins: 1430, Losses: 5874, Draws: 1997
Episode: 9400, Win Rate: 0.15, Epsilon: 0.010, smartmove1
Wins: 1437, Losses: 5899, Draws: 2065
Episode: 9500, Win Rate: 0.15, Epsilon: 0.010, smartmove1
Wins: 1441, Losses: 5922, Draws: 2138
Episode: 9600, Win Rate: 0.15, Epsilon: 0.010, smartmove1
Wins: 1443, Losses: 5949, Draws: 2209
Episode: 9700, Win Rate: 0.15, Epsilon: 0.010, smartmove1
Wins: 1447, Losses: 5978, Draws: 2276
Episode: 9800, Win Rate: 0.15, Epsilon: 0.010, smartmove1
Wins: 1455, Losses: 6004, Draws: 2342
Episode: 9900, Win Rate: 0.15, Epsilon: 0.010, smartmove1
Wins: 1461, Losses: 6030, Draws: 2410




In [9]:
import numpy as np
import random
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from TicTacToe import TicTacToe

class SQNAgent:
    def __init__(self, state_size=9, action_size=9, gamma=0.95):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma  
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.9995
        self.batch_size = 32
        self.replay_buffer = deque(maxlen=10000)
        self.model = self._build_model()

    def _build_model(self):
        model = Sequential([
            Dense(64, input_dim=self.state_size, activation='relu'),
            Dense(64, activation='relu'),
            Dense(self.action_size, activation='linear')
        ])
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def process_state(self, state):
        """
        Converts the state from [0,1,2] format to [-1,0,1] format for better learning.
        """
        processed_state = np.array(state).copy()
        processed_state[processed_state == 0] = -1
        processed_state[processed_state == 1] = 0
        processed_state[processed_state == 2] = 1
        return processed_state

    def select_action(self, state, valid_actions):
        if not valid_actions:
            return None

        if random.random() < self.epsilon:
            return random.choice(valid_actions)
        
        processed_state = self.process_state(state)
        q_values = self.model.predict(processed_state.reshape(1, -1), verbose=0)[0]
        valid_q_values = [(q_values[action], action) for action in valid_actions]
        return max(valid_q_values, key=lambda x: x[0])[1]

    def store_experience(self, state, action, reward, next_state, done):
        self.replay_buffer.append((state, action, reward, next_state, done))

    def train(self):
        if len(self.replay_buffer) < self.batch_size:
            return

        mini_batch = random.sample(self.replay_buffer, self.batch_size)
        states = np.array([self.process_state(exp[0]) for exp in mini_batch])
        next_states = np.array([self.process_state(exp[3]) for exp in mini_batch])
        current_q_values = self.model.predict(states, verbose=0)
        next_q_values = self.model.predict(next_states, verbose=0)

        x = []
        y = []

        for i, (state, action, reward, next_state, done) in enumerate(mini_batch):
            if done:
                target = reward
            else:
                next_valid_actions = [j for j, val in enumerate(next_state) if val == 0]
                if next_valid_actions:
                    max_next_q = max(next_q_values[i][action] for action in next_valid_actions)
                    target = reward + self.gamma * max_next_q
                else:
                    target = reward
            
            current_q = current_q_values[i].copy()
            current_q[action] = target
            x.append(self.process_state(state))
            y.append(current_q)

        self.model.fit(np.array(x), np.array(y), batch_size=32, epochs=1, verbose=0)

def train_agent(episodes=10000):
    agent = SQNAgent()
    history = {'wins': 0, 'losses': 0, 'draws': 0}
    initial_epsilon = 1.0
    min_epsilon = 0.01
    decay_episodes = 1000
    
    for episode in range(episodes):
        if episode % 2000 == 0 and episode > 0:
            agent.epsilon = 1.0
            print("Resetting epsilon to 1.0")
        else:
            epsilon_decay = (initial_epsilon - min_epsilon) / decay_episodes
            agent.epsilon = max(min_epsilon, initial_epsilon - (episode % 2000) * epsilon_decay)
        
        smartness = min(1, episode / (episodes * 0.6))
        game = TicTacToe(smartMovePlayer1=smartness)
        state = np.array(game.board)
        
        game.player1_move()
        state = np.array(game.board)
        
        while True:
            valid_actions = game.empty_positions()
            if not valid_actions:
                history['draws'] += 1
                break
                
            action = agent.select_action(state, valid_actions)
            game.make_move(action, player=2)
            
            reward = 0
            if game.current_winner == 2:
                reward = 10.0
                history['wins'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            
            game.player1_move()
            if game.current_winner == 1:
                reward = -10.0
                history['losses'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            elif game.is_full():
                reward = 0.0
                history['draws'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            
            agent.store_experience(state, action, reward, game.board, False)
            state = np.array(game.board)
        
        agent.train()
        
        if episode % 100 == 0:
            win_rate = history['wins'] / (episode + 1)
            print(f"Episode: {episode}, Win Rate: {win_rate:.2f}, Epsilon: {agent.epsilon:.3f}, smartmove{smartness}")
            print(f"Wins: {history['wins']}, Losses: {history['losses']}, Draws: {history['draws']}")
        
        if episode % 1000 == 0:
            agent.model.save(f'model4_episode_{episode}.h5')
    agent.model.save('model4.h5')
agent = train_agent()





Episode: 0, Win Rate: 0.00, Epsilon: 1.000, smartmove0.0
Wins: 0, Losses: 0, Draws: 1
Episode: 100, Win Rate: 0.35, Epsilon: 0.901, smartmove0.016666666666666666
Wins: 35, Losses: 54, Draws: 12
Episode: 200, Win Rate: 0.32, Epsilon: 0.802, smartmove0.03333333333333333
Wins: 64, Losses: 114, Draws: 23
Episode: 300, Win Rate: 0.31, Epsilon: 0.703, smartmove0.05
Wins: 92, Losses: 173, Draws: 36
Episode: 400, Win Rate: 0.34, Epsilon: 0.604, smartmove0.06666666666666667
Wins: 136, Losses: 222, Draws: 43
Episode: 500, Win Rate: 0.34, Epsilon: 0.505, smartmove0.08333333333333333
Wins: 169, Losses: 277, Draws: 55
Episode: 600, Win Rate: 0.33, Epsilon: 0.406, smartmove0.1
Wins: 200, Losses: 339, Draws: 62
Episode: 700, Win Rate: 0.34, Epsilon: 0.307, smartmove0.11666666666666667
Wins: 236, Losses: 393, Draws: 72
Episode: 800, Win Rate: 0.35, Epsilon: 0.208, smartmove0.13333333333333333
Wins: 278, Losses: 442, Draws: 81
Episode: 900, Win Rate: 0.35, Epsilon: 0.109, smartmove0.15
Wins: 318, Losse



Episode: 1000, Win Rate: 0.37, Epsilon: 0.010, smartmove0.16666666666666666
Wins: 369, Losses: 543, Draws: 89
Episode: 1100, Win Rate: 0.37, Epsilon: 0.010, smartmove0.18333333333333332
Wins: 408, Losses: 601, Draws: 92
Episode: 1200, Win Rate: 0.38, Epsilon: 0.010, smartmove0.2
Wins: 457, Losses: 644, Draws: 100
Episode: 1300, Win Rate: 0.39, Epsilon: 0.010, smartmove0.21666666666666667
Wins: 505, Losses: 690, Draws: 106
Episode: 1400, Win Rate: 0.38, Epsilon: 0.010, smartmove0.23333333333333334
Wins: 534, Losses: 755, Draws: 112
Episode: 1500, Win Rate: 0.37, Epsilon: 0.010, smartmove0.25
Wins: 562, Losses: 817, Draws: 122
Episode: 1600, Win Rate: 0.37, Epsilon: 0.010, smartmove0.26666666666666666
Wins: 598, Losses: 872, Draws: 131
Episode: 1700, Win Rate: 0.37, Epsilon: 0.010, smartmove0.2833333333333333
Wins: 629, Losses: 935, Draws: 137
Episode: 1800, Win Rate: 0.36, Epsilon: 0.010, smartmove0.3
Wins: 653, Losses: 998, Draws: 150
Episode: 1900, Win Rate: 0.36, Epsilon: 0.010, smar



Resetting epsilon to 1.0
Episode: 2000, Win Rate: 0.36, Epsilon: 1.000, smartmove0.3333333333333333
Wins: 718, Losses: 1121, Draws: 162
Episode: 2100, Win Rate: 0.35, Epsilon: 0.901, smartmove0.35
Wins: 735, Losses: 1192, Draws: 174
Episode: 2200, Win Rate: 0.34, Epsilon: 0.802, smartmove0.36666666666666664
Wins: 753, Losses: 1264, Draws: 184
Episode: 2300, Win Rate: 0.34, Epsilon: 0.703, smartmove0.38333333333333336
Wins: 774, Losses: 1335, Draws: 192
Episode: 2400, Win Rate: 0.33, Epsilon: 0.604, smartmove0.4
Wins: 786, Losses: 1412, Draws: 203
Episode: 2500, Win Rate: 0.32, Epsilon: 0.505, smartmove0.4166666666666667
Wins: 801, Losses: 1487, Draws: 213
Episode: 2600, Win Rate: 0.32, Epsilon: 0.406, smartmove0.43333333333333335
Wins: 826, Losses: 1557, Draws: 218
Episode: 2700, Win Rate: 0.31, Epsilon: 0.307, smartmove0.45
Wins: 842, Losses: 1635, Draws: 224
Episode: 2800, Win Rate: 0.30, Epsilon: 0.208, smartmove0.4666666666666667
Wins: 854, Losses: 1714, Draws: 233
Episode: 2900, W



Episode: 3000, Win Rate: 0.30, Epsilon: 0.010, smartmove0.5
Wins: 896, Losses: 1855, Draws: 250
Episode: 3100, Win Rate: 0.30, Epsilon: 0.010, smartmove0.5166666666666667
Wins: 921, Losses: 1923, Draws: 257
Episode: 3200, Win Rate: 0.30, Epsilon: 0.010, smartmove0.5333333333333333
Wins: 945, Losses: 1989, Draws: 267
Episode: 3300, Win Rate: 0.29, Epsilon: 0.010, smartmove0.55
Wins: 967, Losses: 2055, Draws: 279
Episode: 3400, Win Rate: 0.29, Epsilon: 0.010, smartmove0.5666666666666667
Wins: 987, Losses: 2129, Draws: 285
Episode: 3500, Win Rate: 0.29, Epsilon: 0.010, smartmove0.5833333333333334
Wins: 1003, Losses: 2200, Draws: 298
Episode: 3600, Win Rate: 0.29, Epsilon: 0.010, smartmove0.6
Wins: 1031, Losses: 2262, Draws: 308
Episode: 3700, Win Rate: 0.29, Epsilon: 0.010, smartmove0.6166666666666667
Wins: 1061, Losses: 2324, Draws: 316
Episode: 3800, Win Rate: 0.28, Epsilon: 0.010, smartmove0.6333333333333333
Wins: 1081, Losses: 2390, Draws: 330
Episode: 3900, Win Rate: 0.28, Epsilon: 0



Resetting epsilon to 1.0
Episode: 4000, Win Rate: 0.28, Epsilon: 1.000, smartmove0.6666666666666666
Wins: 1128, Losses: 2526, Draws: 347
Episode: 4100, Win Rate: 0.28, Epsilon: 0.901, smartmove0.6833333333333333
Wins: 1137, Losses: 2614, Draws: 350
Episode: 4200, Win Rate: 0.27, Epsilon: 0.802, smartmove0.7
Wins: 1148, Losses: 2698, Draws: 355
Episode: 4300, Win Rate: 0.27, Epsilon: 0.703, smartmove0.7166666666666667
Wins: 1156, Losses: 2782, Draws: 363
Episode: 4400, Win Rate: 0.27, Epsilon: 0.604, smartmove0.7333333333333333
Wins: 1169, Losses: 2856, Draws: 376
Episode: 4500, Win Rate: 0.26, Epsilon: 0.505, smartmove0.75
Wins: 1181, Losses: 2933, Draws: 387
Episode: 4600, Win Rate: 0.26, Epsilon: 0.406, smartmove0.7666666666666667
Wins: 1191, Losses: 3011, Draws: 399
Episode: 4700, Win Rate: 0.26, Epsilon: 0.307, smartmove0.7833333333333333
Wins: 1208, Losses: 3082, Draws: 411
Episode: 4800, Win Rate: 0.25, Epsilon: 0.208, smartmove0.8
Wins: 1221, Losses: 3163, Draws: 417
Episode: 49



Episode: 5000, Win Rate: 0.25, Epsilon: 0.010, smartmove0.8333333333333334
Wins: 1250, Losses: 3311, Draws: 440
Episode: 5100, Win Rate: 0.25, Epsilon: 0.010, smartmove0.85
Wins: 1259, Losses: 3392, Draws: 450
Episode: 5200, Win Rate: 0.24, Epsilon: 0.010, smartmove0.8666666666666667
Wins: 1273, Losses: 3469, Draws: 459
Episode: 5300, Win Rate: 0.24, Epsilon: 0.010, smartmove0.8833333333333333
Wins: 1294, Losses: 3535, Draws: 472
Episode: 5400, Win Rate: 0.24, Epsilon: 0.010, smartmove0.9
Wins: 1307, Losses: 3610, Draws: 484
Episode: 5500, Win Rate: 0.24, Epsilon: 0.010, smartmove0.9166666666666666
Wins: 1320, Losses: 3674, Draws: 507
Episode: 5600, Win Rate: 0.24, Epsilon: 0.010, smartmove0.9333333333333333
Wins: 1327, Losses: 3744, Draws: 530
Episode: 5700, Win Rate: 0.23, Epsilon: 0.010, smartmove0.95
Wins: 1334, Losses: 3822, Draws: 545
Episode: 5800, Win Rate: 0.23, Epsilon: 0.010, smartmove0.9666666666666667
Wins: 1345, Losses: 3889, Draws: 567
Episode: 5900, Win Rate: 0.23, Epsi



Resetting epsilon to 1.0
Episode: 6000, Win Rate: 0.23, Epsilon: 1.000, smartmove1
Wins: 1360, Losses: 4027, Draws: 614
Episode: 6100, Win Rate: 0.22, Epsilon: 0.901, smartmove1
Wins: 1361, Losses: 4116, Draws: 624
Episode: 6200, Win Rate: 0.22, Epsilon: 0.802, smartmove1
Wins: 1362, Losses: 4209, Draws: 630
Episode: 6300, Win Rate: 0.22, Epsilon: 0.703, smartmove1
Wins: 1364, Losses: 4296, Draws: 641
Episode: 6400, Win Rate: 0.21, Epsilon: 0.604, smartmove1
Wins: 1365, Losses: 4379, Draws: 657
Episode: 6500, Win Rate: 0.21, Epsilon: 0.505, smartmove1
Wins: 1370, Losses: 4457, Draws: 674
Episode: 6600, Win Rate: 0.21, Epsilon: 0.406, smartmove1
Wins: 1373, Losses: 4539, Draws: 689
Episode: 6700, Win Rate: 0.21, Epsilon: 0.307, smartmove1
Wins: 1376, Losses: 4618, Draws: 707
Episode: 6800, Win Rate: 0.20, Epsilon: 0.208, smartmove1
Wins: 1379, Losses: 4697, Draws: 725
Episode: 6900, Win Rate: 0.20, Epsilon: 0.109, smartmove1
Wins: 1384, Losses: 4770, Draws: 747




Episode: 7000, Win Rate: 0.20, Epsilon: 0.010, smartmove1
Wins: 1390, Losses: 4843, Draws: 768
Episode: 7100, Win Rate: 0.20, Epsilon: 0.010, smartmove1
Wins: 1399, Losses: 4904, Draws: 798
Episode: 7200, Win Rate: 0.20, Epsilon: 0.010, smartmove1
Wins: 1409, Losses: 4963, Draws: 829
Episode: 7300, Win Rate: 0.19, Epsilon: 0.010, smartmove1
Wins: 1412, Losses: 5027, Draws: 862
Episode: 7400, Win Rate: 0.19, Epsilon: 0.010, smartmove1
Wins: 1417, Losses: 5084, Draws: 900
Episode: 7500, Win Rate: 0.19, Epsilon: 0.010, smartmove1
Wins: 1428, Losses: 5142, Draws: 931
Episode: 7600, Win Rate: 0.19, Epsilon: 0.010, smartmove1
Wins: 1437, Losses: 5190, Draws: 974
Episode: 7700, Win Rate: 0.19, Epsilon: 0.010, smartmove1
Wins: 1443, Losses: 5250, Draws: 1008
Episode: 7800, Win Rate: 0.19, Epsilon: 0.010, smartmove1
Wins: 1449, Losses: 5299, Draws: 1053
Episode: 7900, Win Rate: 0.18, Epsilon: 0.010, smartmove1
Wins: 1459, Losses: 5350, Draws: 1092




Resetting epsilon to 1.0
Episode: 8000, Win Rate: 0.18, Epsilon: 1.000, smartmove1
Wins: 1466, Losses: 5401, Draws: 1134
Episode: 8100, Win Rate: 0.18, Epsilon: 0.901, smartmove1
Wins: 1467, Losses: 5493, Draws: 1141
Episode: 8200, Win Rate: 0.18, Epsilon: 0.802, smartmove1
Wins: 1467, Losses: 5589, Draws: 1145
Episode: 8300, Win Rate: 0.18, Epsilon: 0.703, smartmove1
Wins: 1468, Losses: 5682, Draws: 1151
Episode: 8400, Win Rate: 0.17, Epsilon: 0.604, smartmove1
Wins: 1469, Losses: 5767, Draws: 1165
Episode: 8500, Win Rate: 0.17, Epsilon: 0.505, smartmove1
Wins: 1472, Losses: 5850, Draws: 1179
Episode: 8600, Win Rate: 0.17, Epsilon: 0.406, smartmove1
Wins: 1475, Losses: 5924, Draws: 1202
Episode: 8700, Win Rate: 0.17, Epsilon: 0.307, smartmove1
Wins: 1478, Losses: 6004, Draws: 1219
Episode: 8800, Win Rate: 0.17, Epsilon: 0.208, smartmove1
Wins: 1487, Losses: 6068, Draws: 1246
Episode: 8900, Win Rate: 0.17, Epsilon: 0.109, smartmove1
Wins: 1494, Losses: 6123, Draws: 1284




Episode: 9000, Win Rate: 0.17, Epsilon: 0.010, smartmove1
Wins: 1503, Losses: 6172, Draws: 1326
Episode: 9100, Win Rate: 0.17, Epsilon: 0.010, smartmove1
Wins: 1516, Losses: 6219, Draws: 1366
Episode: 9200, Win Rate: 0.17, Epsilon: 0.010, smartmove1
Wins: 1527, Losses: 6259, Draws: 1415
Episode: 9300, Win Rate: 0.17, Epsilon: 0.010, smartmove1
Wins: 1537, Losses: 6299, Draws: 1465
Episode: 9400, Win Rate: 0.16, Epsilon: 0.010, smartmove1
Wins: 1549, Losses: 6343, Draws: 1509
Episode: 9500, Win Rate: 0.16, Epsilon: 0.010, smartmove1
Wins: 1557, Losses: 6395, Draws: 1549
Episode: 9600, Win Rate: 0.16, Epsilon: 0.010, smartmove1
Wins: 1565, Losses: 6436, Draws: 1600
Episode: 9700, Win Rate: 0.16, Epsilon: 0.010, smartmove1
Wins: 1575, Losses: 6473, Draws: 1653
Episode: 9800, Win Rate: 0.16, Epsilon: 0.010, smartmove1
Wins: 1589, Losses: 6520, Draws: 1692
Episode: 9900, Win Rate: 0.16, Epsilon: 0.010, smartmove1
Wins: 1599, Losses: 6570, Draws: 1732




In [10]:
import numpy as np
import random
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from TicTacToe import TicTacToe

class SQNAgent:
    def __init__(self, state_size=9, action_size=9, gamma=0.95):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma  
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.9995
        self.batch_size = 32
        self.replay_buffer = deque(maxlen=10000)
        self.model = self._build_model()

    def _build_model(self):
        model = Sequential([
            Dense(64, input_dim=self.state_size, activation='relu'),
            Dense(64, activation='relu'),
            Dense(self.action_size, activation='linear')
        ])
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def process_state(self, state):
        """
        Converts the state from [0,1,2] format to [-1,0,1] format for better learning.
        """
        processed_state = np.array(state).copy()
        processed_state[processed_state == 0] = -1
        processed_state[processed_state == 1] = 0
        processed_state[processed_state == 2] = 1
        return processed_state

    def select_action(self, state, valid_actions):
        if not valid_actions:
            return None

        if random.random() < self.epsilon:
            return random.choice(valid_actions)
        
        processed_state = self.process_state(state)
        q_values = self.model.predict(processed_state.reshape(1, -1), verbose=0)[0]
        valid_q_values = [(q_values[action], action) for action in valid_actions]
        return max(valid_q_values, key=lambda x: x[0])[1]

    def store_experience(self, state, action, reward, next_state, done):
        self.replay_buffer.append((state, action, reward, next_state, done))

    def train(self):
        if len(self.replay_buffer) < self.batch_size:
            return

        mini_batch = random.sample(self.replay_buffer, self.batch_size)
        states = np.array([self.process_state(exp[0]) for exp in mini_batch])
        next_states = np.array([self.process_state(exp[3]) for exp in mini_batch])
        current_q_values = self.model.predict(states, verbose=0)
        next_q_values = self.model.predict(next_states, verbose=0)

        x = []
        y = []

        for i, (state, action, reward, next_state, done) in enumerate(mini_batch):
            if done:
                target = reward
            else:
                next_valid_actions = [j for j, val in enumerate(next_state) if val == 0]
                if next_valid_actions:
                    max_next_q = max(next_q_values[i][action] for action in next_valid_actions)
                    target = reward + self.gamma * max_next_q
                else:
                    target = reward
            
            current_q = current_q_values[i].copy()
            current_q[action] = target
            x.append(self.process_state(state))
            y.append(current_q)

        self.model.fit(np.array(x), np.array(y), batch_size=32, epochs=1, verbose=0)

def train_agent(episodes=10000):
    agent = SQNAgent()
    history = {'wins': 0, 'losses': 0, 'draws': 0}
    initial_epsilon = 1.0
    min_epsilon = 0.01
    decay_episodes = 1000
    
    for episode in range(episodes):
        if episode % 2000 == 0 and episode > 0:
            agent.epsilon = 1.0
            print("Resetting epsilon to 1.0")
        else:
            epsilon_decay = (initial_epsilon - min_epsilon) / decay_episodes
            agent.epsilon = max(min_epsilon, initial_epsilon - (episode % 2000) * epsilon_decay)
        
        smartness = min(1, episode / (episodes * 0.6))
        game = TicTacToe(smartMovePlayer1=smartness)
        state = np.array(game.board)
        
        game.player1_move()
        state = np.array(game.board)
        
        while True:
            valid_actions = game.empty_positions()
            if not valid_actions:
                history['draws'] += 1
                break
                
            action = agent.select_action(state, valid_actions)
            game.make_move(action, player=2)
            
            reward = 0
            if game.current_winner == 2:
                reward = 1.0
                history['wins'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            
            game.player1_move()
            if game.current_winner == 1:
                reward = -1.0
                history['losses'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            elif game.is_full():
                reward = -0.5
                history['draws'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            
            agent.store_experience(state, action, reward, game.board, False)
            state = np.array(game.board)
        
        agent.train()
        
        if episode % 100 == 0:
            win_rate = history['wins'] / (episode + 1)
            print(f"Episode: {episode}, Win Rate: {win_rate:.2f}, Epsilon: {agent.epsilon:.3f}, smartmove{smartness}")
            print(f"Wins: {history['wins']}, Losses: {history['losses']}, Draws: {history['draws']}")
        
        if episode % 1000 == 0:
            agent.model.save(f'model5_episode_{episode}.h5')
    agent.model.save('model5.h5')
agent = train_agent()



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Episode: 0, Win Rate: 1.00, Epsilon: 1.000, smartmove0.0
Wins: 1, Losses: 0, Draws: 0
Episode: 100, Win Rate: 0.32, Epsilon: 0.901, smartmove0.016666666666666666
Wins: 32, Losses: 59, Draws: 10
Episode: 200, Win Rate: 0.32, Epsilon: 0.802, smartmove0.03333333333333333
Wins: 65, Losses: 118, Draws: 18
Episode: 300, Win Rate: 0.31, Epsilon: 0.703, smartmove0.05
Wins: 93, Losses: 180, Draws: 28
Episode: 400, Win Rate: 0.30, Epsilon: 0.604, smartmove0.06666666666666667
Wins: 120, Losses: 243, Draws: 38
Episode: 500, Win Rate: 0.29, Epsilon: 0.505, smartmove0.08333333333333333
Wins: 145, Losses: 310, Draws: 46
Episode: 600, Win Rate: 0.29, Epsilon: 0.406, smartmove0.1
Wins: 175, Losses: 370, Draws: 56
Episode: 700, Win Rate: 0.29, Epsilon: 0.307, smartmove0.11666666666666667
Wins: 202, Losses: 435, Draws: 64
Episode: 800, Win Rate: 0.29, Epsilon: 0.208, smartmove0.13333333333333333
Wins: 231, Losses: 500, Draws: 70
Episode: 900, Win Rate: 0.30, Epsilon: 0.109, smartmove0.15
Wins: 266, Losse



Episode: 1000, Win Rate: 0.30, Epsilon: 0.010, smartmove0.16666666666666666
Wins: 299, Losses: 622, Draws: 80
Episode: 1100, Win Rate: 0.30, Epsilon: 0.010, smartmove0.18333333333333332
Wins: 328, Losses: 688, Draws: 85
Episode: 1200, Win Rate: 0.30, Epsilon: 0.010, smartmove0.2
Wins: 362, Losses: 750, Draws: 89
Episode: 1300, Win Rate: 0.30, Epsilon: 0.010, smartmove0.21666666666666667
Wins: 391, Losses: 817, Draws: 93
Episode: 1400, Win Rate: 0.30, Epsilon: 0.010, smartmove0.23333333333333334
Wins: 422, Losses: 884, Draws: 95
Episode: 1500, Win Rate: 0.29, Epsilon: 0.010, smartmove0.25
Wins: 439, Losses: 953, Draws: 109
Episode: 1600, Win Rate: 0.29, Epsilon: 0.010, smartmove0.26666666666666666
Wins: 463, Losses: 1021, Draws: 117
Episode: 1700, Win Rate: 0.29, Epsilon: 0.010, smartmove0.2833333333333333
Wins: 489, Losses: 1083, Draws: 129
Episode: 1800, Win Rate: 0.29, Epsilon: 0.010, smartmove0.3
Wins: 515, Losses: 1149, Draws: 137
Episode: 1900, Win Rate: 0.28, Epsilon: 0.010, smar



Resetting epsilon to 1.0
Episode: 2000, Win Rate: 0.28, Epsilon: 1.000, smartmove0.3333333333333333
Wins: 567, Losses: 1281, Draws: 153
Episode: 2100, Win Rate: 0.28, Epsilon: 0.901, smartmove0.35
Wins: 588, Losses: 1347, Draws: 166
Episode: 2200, Win Rate: 0.28, Epsilon: 0.802, smartmove0.36666666666666664
Wins: 610, Losses: 1414, Draws: 177
Episode: 2300, Win Rate: 0.27, Epsilon: 0.703, smartmove0.38333333333333336
Wins: 630, Losses: 1489, Draws: 182
Episode: 2400, Win Rate: 0.27, Epsilon: 0.604, smartmove0.4
Wins: 648, Losses: 1558, Draws: 195
Episode: 2500, Win Rate: 0.27, Epsilon: 0.505, smartmove0.4166666666666667
Wins: 667, Losses: 1629, Draws: 205
Episode: 2600, Win Rate: 0.26, Epsilon: 0.406, smartmove0.43333333333333335
Wins: 689, Losses: 1700, Draws: 212
Episode: 2700, Win Rate: 0.26, Epsilon: 0.307, smartmove0.45
Wins: 710, Losses: 1768, Draws: 223
Episode: 2800, Win Rate: 0.26, Epsilon: 0.208, smartmove0.4666666666666667
Wins: 728, Losses: 1839, Draws: 234
Episode: 2900, W



Episode: 3000, Win Rate: 0.26, Epsilon: 0.010, smartmove0.5
Wins: 768, Losses: 1985, Draws: 248
Episode: 3100, Win Rate: 0.25, Epsilon: 0.010, smartmove0.5166666666666667
Wins: 785, Losses: 2062, Draws: 254
Episode: 3200, Win Rate: 0.25, Epsilon: 0.010, smartmove0.5333333333333333
Wins: 811, Losses: 2130, Draws: 260
Episode: 3300, Win Rate: 0.25, Epsilon: 0.010, smartmove0.55
Wins: 834, Losses: 2201, Draws: 266
Episode: 3400, Win Rate: 0.25, Epsilon: 0.010, smartmove0.5666666666666667
Wins: 852, Losses: 2270, Draws: 279
Episode: 3500, Win Rate: 0.25, Epsilon: 0.010, smartmove0.5833333333333334
Wins: 878, Losses: 2335, Draws: 288
Episode: 3600, Win Rate: 0.25, Epsilon: 0.010, smartmove0.6
Wins: 897, Losses: 2408, Draws: 296
Episode: 3700, Win Rate: 0.25, Epsilon: 0.010, smartmove0.6166666666666667
Wins: 919, Losses: 2471, Draws: 311
Episode: 3800, Win Rate: 0.24, Epsilon: 0.010, smartmove0.6333333333333333
Wins: 926, Losses: 2549, Draws: 326
Episode: 3900, Win Rate: 0.24, Epsilon: 0.010



Resetting epsilon to 1.0
Episode: 4000, Win Rate: 0.24, Epsilon: 1.000, smartmove0.6666666666666666
Wins: 961, Losses: 2697, Draws: 343
Episode: 4100, Win Rate: 0.24, Epsilon: 0.901, smartmove0.6833333333333333
Wins: 967, Losses: 2782, Draws: 352
Episode: 4200, Win Rate: 0.23, Epsilon: 0.802, smartmove0.7
Wins: 977, Losses: 2865, Draws: 359
Episode: 4300, Win Rate: 0.23, Epsilon: 0.703, smartmove0.7166666666666667
Wins: 989, Losses: 2944, Draws: 368
Episode: 4400, Win Rate: 0.23, Epsilon: 0.604, smartmove0.7333333333333333
Wins: 998, Losses: 3022, Draws: 381
Episode: 4500, Win Rate: 0.23, Epsilon: 0.505, smartmove0.75
Wins: 1014, Losses: 3097, Draws: 390
Episode: 4600, Win Rate: 0.22, Epsilon: 0.406, smartmove0.7666666666666667
Wins: 1022, Losses: 3176, Draws: 403
Episode: 4700, Win Rate: 0.22, Epsilon: 0.307, smartmove0.7833333333333333
Wins: 1032, Losses: 3259, Draws: 410
Episode: 4800, Win Rate: 0.22, Epsilon: 0.208, smartmove0.8
Wins: 1042, Losses: 3339, Draws: 420
Episode: 4900, W



Episode: 5000, Win Rate: 0.21, Epsilon: 0.010, smartmove0.8333333333333334
Wins: 1060, Losses: 3485, Draws: 456
Episode: 5100, Win Rate: 0.21, Epsilon: 0.010, smartmove0.85
Wins: 1070, Losses: 3553, Draws: 478
Episode: 5200, Win Rate: 0.21, Epsilon: 0.010, smartmove0.8666666666666667
Wins: 1092, Losses: 3616, Draws: 493
Episode: 5300, Win Rate: 0.21, Epsilon: 0.010, smartmove0.8833333333333333
Wins: 1105, Losses: 3684, Draws: 512
Episode: 5400, Win Rate: 0.21, Epsilon: 0.010, smartmove0.9
Wins: 1121, Losses: 3750, Draws: 530
Episode: 5500, Win Rate: 0.21, Epsilon: 0.010, smartmove0.9166666666666666
Wins: 1134, Losses: 3818, Draws: 549
Episode: 5600, Win Rate: 0.20, Epsilon: 0.010, smartmove0.9333333333333333
Wins: 1146, Losses: 3882, Draws: 573
Episode: 5700, Win Rate: 0.20, Epsilon: 0.010, smartmove0.95
Wins: 1151, Losses: 3946, Draws: 604
Episode: 5800, Win Rate: 0.20, Epsilon: 0.010, smartmove0.9666666666666667
Wins: 1156, Losses: 4005, Draws: 640
Episode: 5900, Win Rate: 0.20, Epsi



Resetting epsilon to 1.0
Episode: 6000, Win Rate: 0.20, Epsilon: 1.000, smartmove1
Wins: 1179, Losses: 4123, Draws: 699
Episode: 6100, Win Rate: 0.19, Epsilon: 0.901, smartmove1
Wins: 1183, Losses: 4205, Draws: 713
Episode: 6200, Win Rate: 0.19, Epsilon: 0.802, smartmove1
Wins: 1184, Losses: 4294, Draws: 723
Episode: 6300, Win Rate: 0.19, Epsilon: 0.703, smartmove1
Wins: 1185, Losses: 4383, Draws: 733
Episode: 6400, Win Rate: 0.19, Epsilon: 0.604, smartmove1
Wins: 1186, Losses: 4472, Draws: 743
Episode: 6500, Win Rate: 0.18, Epsilon: 0.505, smartmove1
Wins: 1188, Losses: 4557, Draws: 756
Episode: 6600, Win Rate: 0.18, Epsilon: 0.406, smartmove1
Wins: 1193, Losses: 4627, Draws: 781
Episode: 6700, Win Rate: 0.18, Epsilon: 0.307, smartmove1
Wins: 1197, Losses: 4700, Draws: 804
Episode: 6800, Win Rate: 0.18, Epsilon: 0.208, smartmove1
Wins: 1199, Losses: 4781, Draws: 821
Episode: 6900, Win Rate: 0.17, Epsilon: 0.109, smartmove1
Wins: 1200, Losses: 4846, Draws: 855




Episode: 7000, Win Rate: 0.17, Epsilon: 0.010, smartmove1
Wins: 1201, Losses: 4915, Draws: 885
Episode: 7100, Win Rate: 0.17, Epsilon: 0.010, smartmove1
Wins: 1212, Losses: 4955, Draws: 934
Episode: 7200, Win Rate: 0.17, Epsilon: 0.010, smartmove1
Wins: 1221, Losses: 5007, Draws: 973
Episode: 7300, Win Rate: 0.17, Epsilon: 0.010, smartmove1
Wins: 1228, Losses: 5056, Draws: 1017
Episode: 7400, Win Rate: 0.17, Epsilon: 0.010, smartmove1
Wins: 1230, Losses: 5114, Draws: 1057
Episode: 7500, Win Rate: 0.16, Epsilon: 0.010, smartmove1
Wins: 1236, Losses: 5155, Draws: 1110
Episode: 7600, Win Rate: 0.16, Epsilon: 0.010, smartmove1
Wins: 1240, Losses: 5201, Draws: 1160
Episode: 7700, Win Rate: 0.16, Epsilon: 0.010, smartmove1
Wins: 1241, Losses: 5246, Draws: 1214
Episode: 7800, Win Rate: 0.16, Epsilon: 0.010, smartmove1
Wins: 1245, Losses: 5288, Draws: 1268
Episode: 7900, Win Rate: 0.16, Epsilon: 0.010, smartmove1
Wins: 1251, Losses: 5342, Draws: 1308




Resetting epsilon to 1.0
Episode: 8000, Win Rate: 0.16, Epsilon: 1.000, smartmove1
Wins: 1254, Losses: 5391, Draws: 1356
Episode: 8100, Win Rate: 0.16, Epsilon: 0.901, smartmove1
Wins: 1257, Losses: 5479, Draws: 1365
Episode: 8200, Win Rate: 0.15, Epsilon: 0.802, smartmove1
Wins: 1258, Losses: 5569, Draws: 1374
Episode: 8300, Win Rate: 0.15, Epsilon: 0.703, smartmove1
Wins: 1262, Losses: 5653, Draws: 1386
Episode: 8400, Win Rate: 0.15, Epsilon: 0.604, smartmove1
Wins: 1264, Losses: 5734, Draws: 1403
Episode: 8500, Win Rate: 0.15, Epsilon: 0.505, smartmove1
Wins: 1265, Losses: 5809, Draws: 1427
Episode: 8600, Win Rate: 0.15, Epsilon: 0.406, smartmove1
Wins: 1268, Losses: 5885, Draws: 1448
Episode: 8700, Win Rate: 0.15, Epsilon: 0.307, smartmove1
Wins: 1268, Losses: 5964, Draws: 1469
Episode: 8800, Win Rate: 0.14, Epsilon: 0.208, smartmove1
Wins: 1272, Losses: 6025, Draws: 1504
Episode: 8900, Win Rate: 0.14, Epsilon: 0.109, smartmove1
Wins: 1280, Losses: 6066, Draws: 1555




Episode: 9000, Win Rate: 0.14, Epsilon: 0.010, smartmove1
Wins: 1281, Losses: 6109, Draws: 1611
Episode: 9100, Win Rate: 0.14, Epsilon: 0.010, smartmove1
Wins: 1283, Losses: 6142, Draws: 1676
Episode: 9200, Win Rate: 0.14, Epsilon: 0.010, smartmove1
Wins: 1288, Losses: 6173, Draws: 1740
Episode: 9300, Win Rate: 0.14, Epsilon: 0.010, smartmove1
Wins: 1292, Losses: 6208, Draws: 1801
Episode: 9400, Win Rate: 0.14, Epsilon: 0.010, smartmove1
Wins: 1292, Losses: 6248, Draws: 1861
Episode: 9500, Win Rate: 0.14, Epsilon: 0.010, smartmove1
Wins: 1295, Losses: 6288, Draws: 1918
Episode: 9600, Win Rate: 0.14, Epsilon: 0.010, smartmove1
Wins: 1298, Losses: 6317, Draws: 1986
Episode: 9700, Win Rate: 0.13, Epsilon: 0.010, smartmove1
Wins: 1299, Losses: 6349, Draws: 2053
Episode: 9800, Win Rate: 0.13, Epsilon: 0.010, smartmove1
Wins: 1304, Losses: 6383, Draws: 2114
Episode: 9900, Win Rate: 0.13, Epsilon: 0.010, smartmove1
Wins: 1305, Losses: 6427, Draws: 2169




In [14]:
import numpy as np
import random
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from TicTacToe import TicTacToe

class SQNAgent:
    def __init__(self, state_size=9, action_size=9, gamma=0.95):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma  
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.9995
        self.batch_size = 32
        self.replay_buffer = deque(maxlen=10000)
        self.model = self._build_model()

    def _build_model(self):
        model = Sequential([
            Dense(64, input_dim=self.state_size, activation='relu'),
            Dense(64, activation='relu'),
            Dense(self.action_size, activation='linear')
        ])
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def process_state(self, state):
        """
        Converts the state from [0,1,2] format to [-1,0,1] format for better learning.
        """
        processed_state = np.array(state).copy()
        processed_state[processed_state == 0] = -1
        processed_state[processed_state == 1] = 0
        processed_state[processed_state == 2] = 1
        return processed_state

    def select_action(self, state, valid_actions):
        if not valid_actions:
            return None

        if random.random() < self.epsilon:
            return random.choice(valid_actions)
        
        processed_state = self.process_state(state)
        q_values = self.model.predict(processed_state.reshape(1, -1), verbose=0)[0]
        valid_q_values = [(q_values[action], action) for action in valid_actions]
        return max(valid_q_values, key=lambda x: x[0])[1]

    def store_experience(self, state, action, reward, next_state, done):
        self.replay_buffer.append((state, action, reward, next_state, done))

    def train(self):
        if len(self.replay_buffer) < self.batch_size:
            return

        mini_batch = random.sample(self.replay_buffer, self.batch_size)
        states = np.array([self.process_state(exp[0]) for exp in mini_batch])
        next_states = np.array([self.process_state(exp[3]) for exp in mini_batch])
        current_q_values = self.model.predict(states, verbose=0)
        next_q_values = self.model.predict(next_states, verbose=0)

        x = []
        y = []

        for i, (state, action, reward, next_state, done) in enumerate(mini_batch):
            if done:
                target = reward
            else:
                next_valid_actions = [j for j, val in enumerate(next_state) if val == 0]
                if next_valid_actions:
                    max_next_q = max(next_q_values[i][action] for action in next_valid_actions)
                    target = reward + self.gamma * max_next_q
                else:
                    target = reward
            
            current_q = current_q_values[i].copy()
            current_q[action] = target
            x.append(self.process_state(state))
            y.append(current_q)

        self.model.fit(np.array(x), np.array(y), batch_size=32, epochs=1, verbose=0)

def train_agent(episodes=10000):
    agent = SQNAgent()
    history = {'wins': 0, 'losses': 0, 'draws': 0}
    initial_epsilon = 1.0
    min_epsilon = 0.01
    decay_episodes = 1000
    
    for episode in range(episodes):
        if episode % 1000 == 0 and episode > 0:
            agent.epsilon = 1.0
            print("Resetting epsilon to 1.0")
        else:
            epsilon_decay = (initial_epsilon - min_epsilon) / decay_episodes
            agent.epsilon = max(min_epsilon, initial_epsilon - (episode % 1000) * epsilon_decay)
        
        smartness = min(1, episode / (episodes * 0.6))
        game = TicTacToe(smartMovePlayer1=smartness)
        state = np.array(game.board)
        
        game.player1_move()
        state = np.array(game.board)
        
        while True:
            valid_actions = game.empty_positions()
            if not valid_actions:
                history['draws'] += 1
                break
                
            action = agent.select_action(state, valid_actions)
            game.make_move(action, player=2)
            
            reward = 0
            if game.current_winner == 2:
                reward = 1.0
                history['wins'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            
            game.player1_move()
            if game.current_winner == 1:
                reward = -1.0
                history['losses'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            elif game.is_full():
                reward = min(-0.1,-0.5*smartness)
                history['draws'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            
            agent.store_experience(state, action, reward, game.board, False)
            state = np.array(game.board)
        
        agent.train()
        
        if episode % 100 == 0:
            win_rate = history['wins'] / (episode + 1)
            print(f"Episode: {episode}, Win Rate: {win_rate:.2f}, Epsilon: {agent.epsilon:.3f}, smartmove{smartness}")
            print(f"Wins: {history['wins']}, Losses: {history['losses']}, Draws: {history['draws']}")
        
        if episode % 1000 == 0:
            agent.model.save(f'model6_episode_{episode}.h5')
    agent.model.save('model6.h5')
agent = train_agent()





Episode: 0, Win Rate: 0.00, Epsilon: 1.000, smartmove0.0
Wins: 0, Losses: 1, Draws: 0
Episode: 100, Win Rate: 0.30, Epsilon: 0.901, smartmove0.016666666666666666
Wins: 30, Losses: 63, Draws: 8
Episode: 200, Win Rate: 0.28, Epsilon: 0.802, smartmove0.03333333333333333
Wins: 57, Losses: 127, Draws: 17
Episode: 300, Win Rate: 0.31, Epsilon: 0.703, smartmove0.05
Wins: 94, Losses: 173, Draws: 34
Episode: 400, Win Rate: 0.31, Epsilon: 0.604, smartmove0.06666666666666667
Wins: 123, Losses: 239, Draws: 39
Episode: 500, Win Rate: 0.32, Epsilon: 0.505, smartmove0.08333333333333333
Wins: 158, Losses: 291, Draws: 52
Episode: 600, Win Rate: 0.32, Epsilon: 0.406, smartmove0.1
Wins: 190, Losses: 345, Draws: 66
Episode: 700, Win Rate: 0.32, Epsilon: 0.307, smartmove0.11666666666666667
Wins: 221, Losses: 405, Draws: 75
Episode: 800, Win Rate: 0.32, Epsilon: 0.208, smartmove0.13333333333333333
Wins: 257, Losses: 461, Draws: 83
Episode: 900, Win Rate: 0.32, Epsilon: 0.109, smartmove0.15
Wins: 285, Losses



Resetting epsilon to 1.0
Episode: 1000, Win Rate: 0.32, Epsilon: 1.000, smartmove0.16666666666666666
Wins: 318, Losses: 575, Draws: 108
Episode: 1100, Win Rate: 0.31, Epsilon: 0.901, smartmove0.18333333333333332
Wins: 341, Losses: 639, Draws: 121
Episode: 1200, Win Rate: 0.30, Epsilon: 0.802, smartmove0.2
Wins: 361, Losses: 704, Draws: 136
Episode: 1300, Win Rate: 0.30, Epsilon: 0.703, smartmove0.21666666666666667
Wins: 385, Losses: 769, Draws: 147
Episode: 1400, Win Rate: 0.29, Epsilon: 0.604, smartmove0.23333333333333334
Wins: 412, Losses: 830, Draws: 159
Episode: 1500, Win Rate: 0.30, Epsilon: 0.505, smartmove0.25
Wins: 447, Losses: 884, Draws: 170
Episode: 1600, Win Rate: 0.30, Epsilon: 0.406, smartmove0.26666666666666666
Wins: 473, Losses: 947, Draws: 181
Episode: 1700, Win Rate: 0.30, Epsilon: 0.307, smartmove0.2833333333333333
Wins: 502, Losses: 1011, Draws: 188
Episode: 1800, Win Rate: 0.30, Epsilon: 0.208, smartmove0.3
Wins: 534, Losses: 1074, Draws: 193
Episode: 1900, Win Rat



Resetting epsilon to 1.0
Episode: 2000, Win Rate: 0.29, Epsilon: 1.000, smartmove0.3333333333333333
Wins: 582, Losses: 1209, Draws: 210
Episode: 2100, Win Rate: 0.29, Epsilon: 0.901, smartmove0.35
Wins: 601, Losses: 1277, Draws: 223
Episode: 2200, Win Rate: 0.28, Epsilon: 0.802, smartmove0.36666666666666664
Wins: 624, Losses: 1343, Draws: 234
Episode: 2300, Win Rate: 0.28, Epsilon: 0.703, smartmove0.38333333333333336
Wins: 644, Losses: 1411, Draws: 246
Episode: 2400, Win Rate: 0.28, Epsilon: 0.604, smartmove0.4
Wins: 671, Losses: 1472, Draws: 258
Episode: 2500, Win Rate: 0.28, Epsilon: 0.505, smartmove0.4166666666666667
Wins: 693, Losses: 1547, Draws: 261
Episode: 2600, Win Rate: 0.28, Epsilon: 0.406, smartmove0.43333333333333335
Wins: 718, Losses: 1613, Draws: 270
Episode: 2700, Win Rate: 0.28, Epsilon: 0.307, smartmove0.45
Wins: 748, Losses: 1666, Draws: 287
Episode: 2800, Win Rate: 0.27, Epsilon: 0.208, smartmove0.4666666666666667
Wins: 766, Losses: 1736, Draws: 299
Episode: 2900, W



Resetting epsilon to 1.0
Episode: 3000, Win Rate: 0.27, Epsilon: 1.000, smartmove0.5
Wins: 803, Losses: 1884, Draws: 314
Episode: 3100, Win Rate: 0.26, Epsilon: 0.901, smartmove0.5166666666666667
Wins: 819, Losses: 1959, Draws: 323
Episode: 3200, Win Rate: 0.26, Epsilon: 0.802, smartmove0.5333333333333333
Wins: 837, Losses: 2030, Draws: 334
Episode: 3300, Win Rate: 0.26, Epsilon: 0.703, smartmove0.55
Wins: 859, Losses: 2096, Draws: 346
Episode: 3400, Win Rate: 0.26, Epsilon: 0.604, smartmove0.5666666666666667
Wins: 875, Losses: 2175, Draws: 351
Episode: 3500, Win Rate: 0.25, Epsilon: 0.505, smartmove0.5833333333333334
Wins: 891, Losses: 2249, Draws: 361
Episode: 3600, Win Rate: 0.25, Epsilon: 0.406, smartmove0.6
Wins: 908, Losses: 2328, Draws: 365
Episode: 3700, Win Rate: 0.25, Epsilon: 0.307, smartmove0.6166666666666667
Wins: 920, Losses: 2399, Draws: 382
Episode: 3800, Win Rate: 0.25, Epsilon: 0.208, smartmove0.6333333333333333
Wins: 935, Losses: 2466, Draws: 400
Episode: 3900, Win R



Resetting epsilon to 1.0
Episode: 4000, Win Rate: 0.24, Epsilon: 1.000, smartmove0.6666666666666666
Wins: 974, Losses: 2607, Draws: 420
Episode: 4100, Win Rate: 0.24, Epsilon: 0.901, smartmove0.6833333333333333
Wins: 986, Losses: 2689, Draws: 426
Episode: 4200, Win Rate: 0.24, Epsilon: 0.802, smartmove0.7
Wins: 994, Losses: 2770, Draws: 437
Episode: 4300, Win Rate: 0.23, Epsilon: 0.703, smartmove0.7166666666666667
Wins: 1004, Losses: 2846, Draws: 451
Episode: 4400, Win Rate: 0.23, Epsilon: 0.604, smartmove0.7333333333333333
Wins: 1011, Losses: 2931, Draws: 459
Episode: 4500, Win Rate: 0.23, Epsilon: 0.505, smartmove0.75
Wins: 1025, Losses: 3011, Draws: 465
Episode: 4600, Win Rate: 0.23, Epsilon: 0.406, smartmove0.7666666666666667
Wins: 1036, Losses: 3086, Draws: 479
Episode: 4700, Win Rate: 0.22, Epsilon: 0.307, smartmove0.7833333333333333
Wins: 1049, Losses: 3165, Draws: 487
Episode: 4800, Win Rate: 0.22, Epsilon: 0.208, smartmove0.8
Wins: 1058, Losses: 3242, Draws: 501
Episode: 4900,



Resetting epsilon to 1.0
Episode: 5000, Win Rate: 0.21, Epsilon: 1.000, smartmove0.8333333333333334
Wins: 1073, Losses: 3383, Draws: 545
Episode: 5100, Win Rate: 0.21, Epsilon: 0.901, smartmove0.85
Wins: 1078, Losses: 3468, Draws: 555
Episode: 5200, Win Rate: 0.21, Epsilon: 0.802, smartmove0.8666666666666667
Wins: 1084, Losses: 3553, Draws: 564
Episode: 5300, Win Rate: 0.20, Epsilon: 0.703, smartmove0.8833333333333333
Wins: 1084, Losses: 3640, Draws: 577
Episode: 5400, Win Rate: 0.20, Epsilon: 0.604, smartmove0.9
Wins: 1093, Losses: 3716, Draws: 592
Episode: 5500, Win Rate: 0.20, Epsilon: 0.505, smartmove0.9166666666666666
Wins: 1099, Losses: 3800, Draws: 602
Episode: 5600, Win Rate: 0.20, Epsilon: 0.406, smartmove0.9333333333333333
Wins: 1103, Losses: 3882, Draws: 616
Episode: 5700, Win Rate: 0.19, Epsilon: 0.307, smartmove0.95
Wins: 1108, Losses: 3960, Draws: 633
Episode: 5800, Win Rate: 0.19, Epsilon: 0.208, smartmove0.9666666666666667
Wins: 1112, Losses: 4042, Draws: 647
Episode: 5



Resetting epsilon to 1.0
Episode: 6000, Win Rate: 0.19, Epsilon: 1.000, smartmove1
Wins: 1116, Losses: 4194, Draws: 691
Episode: 6100, Win Rate: 0.18, Epsilon: 0.901, smartmove1
Wins: 1118, Losses: 4284, Draws: 699
Episode: 6200, Win Rate: 0.18, Epsilon: 0.802, smartmove1
Wins: 1120, Losses: 4373, Draws: 708
Episode: 6300, Win Rate: 0.18, Epsilon: 0.703, smartmove1
Wins: 1120, Losses: 4459, Draws: 722
Episode: 6400, Win Rate: 0.18, Epsilon: 0.604, smartmove1
Wins: 1121, Losses: 4539, Draws: 741
Episode: 6500, Win Rate: 0.17, Epsilon: 0.505, smartmove1
Wins: 1122, Losses: 4623, Draws: 756
Episode: 6600, Win Rate: 0.17, Epsilon: 0.406, smartmove1
Wins: 1125, Losses: 4710, Draws: 766
Episode: 6700, Win Rate: 0.17, Epsilon: 0.307, smartmove1
Wins: 1128, Losses: 4783, Draws: 790
Episode: 6800, Win Rate: 0.17, Epsilon: 0.208, smartmove1
Wins: 1130, Losses: 4858, Draws: 813
Episode: 6900, Win Rate: 0.16, Epsilon: 0.109, smartmove1
Wins: 1133, Losses: 4923, Draws: 845




Resetting epsilon to 1.0
Episode: 7000, Win Rate: 0.16, Epsilon: 1.000, smartmove1
Wins: 1138, Losses: 4988, Draws: 875
Episode: 7100, Win Rate: 0.16, Epsilon: 0.901, smartmove1
Wins: 1139, Losses: 5079, Draws: 883
Episode: 7200, Win Rate: 0.16, Epsilon: 0.802, smartmove1
Wins: 1143, Losses: 5165, Draws: 893
Episode: 7300, Win Rate: 0.16, Epsilon: 0.703, smartmove1
Wins: 1145, Losses: 5254, Draws: 902
Episode: 7400, Win Rate: 0.15, Epsilon: 0.604, smartmove1
Wins: 1146, Losses: 5337, Draws: 918
Episode: 7500, Win Rate: 0.15, Epsilon: 0.505, smartmove1
Wins: 1149, Losses: 5416, Draws: 936
Episode: 7600, Win Rate: 0.15, Epsilon: 0.406, smartmove1
Wins: 1149, Losses: 5500, Draws: 952
Episode: 7700, Win Rate: 0.15, Epsilon: 0.307, smartmove1
Wins: 1155, Losses: 5557, Draws: 989
Episode: 7800, Win Rate: 0.15, Epsilon: 0.208, smartmove1
Wins: 1158, Losses: 5618, Draws: 1025
Episode: 7900, Win Rate: 0.15, Epsilon: 0.109, smartmove1
Wins: 1165, Losses: 5672, Draws: 1064




Resetting epsilon to 1.0
Episode: 8000, Win Rate: 0.15, Epsilon: 1.000, smartmove1
Wins: 1172, Losses: 5729, Draws: 1100
Episode: 8100, Win Rate: 0.15, Epsilon: 0.901, smartmove1
Wins: 1175, Losses: 5812, Draws: 1114
Episode: 8200, Win Rate: 0.14, Epsilon: 0.802, smartmove1
Wins: 1177, Losses: 5896, Draws: 1128
Episode: 8300, Win Rate: 0.14, Epsilon: 0.703, smartmove1
Wins: 1180, Losses: 5981, Draws: 1140
Episode: 8400, Win Rate: 0.14, Epsilon: 0.604, smartmove1
Wins: 1184, Losses: 6057, Draws: 1160
Episode: 8500, Win Rate: 0.14, Epsilon: 0.505, smartmove1
Wins: 1188, Losses: 6130, Draws: 1183
Episode: 8600, Win Rate: 0.14, Epsilon: 0.406, smartmove1
Wins: 1190, Losses: 6202, Draws: 1209
Episode: 8700, Win Rate: 0.14, Epsilon: 0.307, smartmove1
Wins: 1194, Losses: 6253, Draws: 1254
Episode: 8800, Win Rate: 0.14, Epsilon: 0.208, smartmove1
Wins: 1200, Losses: 6304, Draws: 1297
Episode: 8900, Win Rate: 0.14, Epsilon: 0.109, smartmove1
Wins: 1207, Losses: 6354, Draws: 1340




Resetting epsilon to 1.0
Episode: 9000, Win Rate: 0.13, Epsilon: 1.000, smartmove1
Wins: 1211, Losses: 6403, Draws: 1387
Episode: 9100, Win Rate: 0.13, Epsilon: 0.901, smartmove1
Wins: 1211, Losses: 6491, Draws: 1399
Episode: 9200, Win Rate: 0.13, Epsilon: 0.802, smartmove1
Wins: 1213, Losses: 6578, Draws: 1410
Episode: 9300, Win Rate: 0.13, Epsilon: 0.703, smartmove1
Wins: 1216, Losses: 6663, Draws: 1422
Episode: 9400, Win Rate: 0.13, Epsilon: 0.604, smartmove1
Wins: 1217, Losses: 6744, Draws: 1440
Episode: 9500, Win Rate: 0.13, Epsilon: 0.505, smartmove1
Wins: 1219, Losses: 6821, Draws: 1461
Episode: 9600, Win Rate: 0.13, Epsilon: 0.406, smartmove1
Wins: 1224, Losses: 6885, Draws: 1492
Episode: 9700, Win Rate: 0.13, Epsilon: 0.307, smartmove1
Wins: 1229, Losses: 6948, Draws: 1524
Episode: 9800, Win Rate: 0.13, Epsilon: 0.208, smartmove1
Wins: 1233, Losses: 7004, Draws: 1564
Episode: 9900, Win Rate: 0.13, Epsilon: 0.109, smartmove1
Wins: 1239, Losses: 7056, Draws: 1606




In [2]:
import numpy as np
import random
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from TicTacToe import TicTacToe

class SQNAgent:
    def __init__(self, state_size=9, action_size=9, gamma=0.95):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma  
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.9995
        self.batch_size = 32
        self.replay_buffer = deque(maxlen=10000)
        self.model = self._build_model()

    def _build_model(self):
        model = Sequential([
            Dense(64, input_dim=self.state_size, activation='relu'),
            Dense(64, activation='relu'),
            Dense(self.action_size, activation='linear')
        ])
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def process_state(self, state):
        """
        Converts the state from [0,1,2] format to [-1,0,1] format for better learning.
        """
        processed_state = np.array(state).copy()
        processed_state[processed_state == 0] = -1
        processed_state[processed_state == 1] = 0
        processed_state[processed_state == 2] = 1
        return processed_state

    def select_action(self, state, valid_actions):
        if not valid_actions:
            return None

        if random.random() < self.epsilon:
            return random.choice(valid_actions)
        
        processed_state = self.process_state(state)
        q_values = self.model.predict(processed_state.reshape(1, -1), verbose=0)[0]
        valid_q_values = [(q_values[action], action) for action in valid_actions]
        return max(valid_q_values, key=lambda x: x[0])[1]

    def store_experience(self, state, action, reward, next_state, done):
        self.replay_buffer.append((state, action, reward, next_state, done))

    def train(self):
        if len(self.replay_buffer) < self.batch_size:
            return

        mini_batch = random.sample(self.replay_buffer, self.batch_size)
        states = np.array([self.process_state(exp[0]) for exp in mini_batch])
        next_states = np.array([self.process_state(exp[3]) for exp in mini_batch])
        current_q_values = self.model.predict(states, verbose=0)
        next_q_values = self.model.predict(next_states, verbose=0)

        x = []
        y = []

        for i, (state, action, reward, next_state, done) in enumerate(mini_batch):
            if done:
                target = reward
            else:
                next_valid_actions = [j for j, val in enumerate(next_state) if val == 0]
                if next_valid_actions:
                    max_next_q = max(next_q_values[i][action] for action in next_valid_actions)
                    target = reward + self.gamma * max_next_q
                else:
                    target = reward
            
            current_q = current_q_values[i].copy()
            current_q[action] = target
            x.append(self.process_state(state))
            y.append(current_q)

        self.model.fit(np.array(x), np.array(y), batch_size=32, epochs=1, verbose=0)

def train_agent(episodes=10000):
    agent = SQNAgent()
    history = {'wins': 0, 'losses': 0, 'draws': 0}
    initial_epsilon = 1.0
    min_epsilon = 0.01
    decay_episodes = 1000
    
    for episode in range(episodes):
        if episode % 1000 == 0 and episode > 0:
            agent.epsilon = 1.0
            print("Resetting epsilon to 1.0")
        else:
            epsilon_decay = (initial_epsilon - min_epsilon) / decay_episodes
            agent.epsilon = max(min_epsilon, initial_epsilon - (episode % 2000) * epsilon_decay)
        
        smartness = min(1, episode / (episodes))
        game = TicTacToe(smartMovePlayer1=smartness)
        state = np.array(game.board)
        
        game.player1_move()
        state = np.array(game.board)
        
        while True:
            valid_actions = game.empty_positions()
            if not valid_actions:
                history['draws'] += 1
                break
                
            action = agent.select_action(state, valid_actions)
            game.make_move(action, player=2)
            
            reward = 0
            if game.current_winner == 2:
                reward = 1
                history['wins'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            
            game.player1_move()
            if game.current_winner == 1:
                reward = -1
                history['losses'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            elif game.is_full():
                reward = -max(1,0.5*smartness)*8/10
                history['draws'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            
            agent.store_experience(state, action, reward, game.board, False)
            state = np.array(game.board)
        
        agent.train()
        
        if episode % 100 == 0:
            win_rate = history['wins'] / (episode + 1)
            print(f"Episode: {episode}, Win Rate: {win_rate:.2f}, Epsilon: {agent.epsilon:.3f}, smartmove{smartness}")
            print(f"Wins: {history['wins']}, Losses: {history['losses']}, Draws: {history['draws']}")
        
        if episode % 1000 == 0:
            agent.model.save(f'model7_episode_{episode}.h5')
    agent.model.save('model7.h5')
agent = train_agent()



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Episode: 0, Win Rate: 1.00, Epsilon: 1.000, smartmove0.0
Wins: 1, Losses: 0, Draws: 0
Episode: 100, Win Rate: 0.27, Epsilon: 0.901, smartmove0.01
Wins: 27, Losses: 58, Draws: 16
Episode: 200, Win Rate: 0.27, Epsilon: 0.802, smartmove0.02
Wins: 54, Losses: 116, Draws: 31
Episode: 300, Win Rate: 0.28, Epsilon: 0.703, smartmove0.03
Wins: 85, Losses: 173, Draws: 43
Episode: 400, Win Rate: 0.27, Epsilon: 0.604, smartmove0.04
Wins: 108, Losses: 239, Draws: 54
Episode: 500, Win Rate: 0.28, Epsilon: 0.505, smartmove0.05
Wins: 141, Losses: 299, Draws: 61
Episode: 600, Win Rate: 0.29, Epsilon: 0.406, smartmove0.06
Wins: 174, Losses: 356, Draws: 71
Episode: 700, Win Rate: 0.29, Epsilon: 0.307, smartmove0.07
Wins: 203, Losses: 420, Draws: 78
Episode: 800, Win Rate: 0.29, Epsilon: 0.208, smartmove0.08
Wins: 230, Losses: 486, Draws: 85
Episode: 900, Win Rate: 0.29, Epsilon: 0.109, smartmove0.09
Wins: 264, Losses: 548, Draws: 89




Resetting epsilon to 1.0
Episode: 1000, Win Rate: 0.29, Epsilon: 1.000, smartmove0.1
Wins: 289, Losses: 617, Draws: 95
Episode: 1100, Win Rate: 0.29, Epsilon: 0.010, smartmove0.11
Wins: 318, Losses: 684, Draws: 99
Episode: 1200, Win Rate: 0.29, Epsilon: 0.010, smartmove0.12
Wins: 351, Losses: 743, Draws: 107
Episode: 1300, Win Rate: 0.30, Epsilon: 0.010, smartmove0.13
Wins: 390, Losses: 803, Draws: 108
Episode: 1400, Win Rate: 0.30, Epsilon: 0.010, smartmove0.14
Wins: 426, Losses: 863, Draws: 112
Episode: 1500, Win Rate: 0.30, Epsilon: 0.010, smartmove0.15
Wins: 455, Losses: 931, Draws: 115
Episode: 1600, Win Rate: 0.32, Epsilon: 0.010, smartmove0.16
Wins: 507, Losses: 977, Draws: 117
Episode: 1700, Win Rate: 0.32, Epsilon: 0.010, smartmove0.17
Wins: 547, Losses: 1035, Draws: 119
Episode: 1800, Win Rate: 0.32, Epsilon: 0.010, smartmove0.18
Wins: 570, Losses: 1106, Draws: 125
Episode: 1900, Win Rate: 0.32, Epsilon: 0.010, smartmove0.19
Wins: 604, Losses: 1171, Draws: 126




Resetting epsilon to 1.0
Episode: 2000, Win Rate: 0.32, Epsilon: 1.000, smartmove0.2
Wins: 644, Losses: 1227, Draws: 130
Episode: 2100, Win Rate: 0.32, Epsilon: 0.901, smartmove0.21
Wins: 669, Losses: 1294, Draws: 138
Episode: 2200, Win Rate: 0.31, Epsilon: 0.802, smartmove0.22
Wins: 693, Losses: 1360, Draws: 148
Episode: 2300, Win Rate: 0.31, Epsilon: 0.703, smartmove0.23
Wins: 721, Losses: 1424, Draws: 156
Episode: 2400, Win Rate: 0.31, Epsilon: 0.604, smartmove0.24
Wins: 750, Losses: 1488, Draws: 163
Episode: 2500, Win Rate: 0.31, Epsilon: 0.505, smartmove0.25
Wins: 786, Losses: 1537, Draws: 178
Episode: 2600, Win Rate: 0.31, Epsilon: 0.406, smartmove0.26
Wins: 809, Losses: 1605, Draws: 187
Episode: 2700, Win Rate: 0.31, Epsilon: 0.307, smartmove0.27
Wins: 841, Losses: 1664, Draws: 196
Episode: 2800, Win Rate: 0.31, Epsilon: 0.208, smartmove0.28
Wins: 867, Losses: 1731, Draws: 203
Episode: 2900, Win Rate: 0.31, Epsilon: 0.109, smartmove0.29
Wins: 896, Losses: 1796, Draws: 209




Resetting epsilon to 1.0
Episode: 3000, Win Rate: 0.31, Epsilon: 1.000, smartmove0.3
Wins: 926, Losses: 1863, Draws: 212
Episode: 3100, Win Rate: 0.30, Epsilon: 0.010, smartmove0.31
Wins: 945, Losses: 1936, Draws: 220
Episode: 3200, Win Rate: 0.30, Epsilon: 0.010, smartmove0.32
Wins: 973, Losses: 1998, Draws: 230
Episode: 3300, Win Rate: 0.30, Epsilon: 0.010, smartmove0.33
Wins: 1004, Losses: 2060, Draws: 237
Episode: 3400, Win Rate: 0.30, Epsilon: 0.010, smartmove0.34
Wins: 1032, Losses: 2120, Draws: 249
Episode: 3500, Win Rate: 0.30, Epsilon: 0.010, smartmove0.35
Wins: 1057, Losses: 2188, Draws: 256
Episode: 3600, Win Rate: 0.30, Epsilon: 0.010, smartmove0.36
Wins: 1086, Losses: 2247, Draws: 268
Episode: 3700, Win Rate: 0.30, Epsilon: 0.010, smartmove0.37
Wins: 1107, Losses: 2320, Draws: 274
Episode: 3800, Win Rate: 0.30, Epsilon: 0.010, smartmove0.38
Wins: 1132, Losses: 2387, Draws: 282
Episode: 3900, Win Rate: 0.30, Epsilon: 0.010, smartmove0.39
Wins: 1166, Losses: 2445, Draws: 290



Resetting epsilon to 1.0
Episode: 4000, Win Rate: 0.30, Epsilon: 1.000, smartmove0.4
Wins: 1186, Losses: 2513, Draws: 302
Episode: 4100, Win Rate: 0.29, Epsilon: 0.901, smartmove0.41
Wins: 1206, Losses: 2586, Draws: 309
Episode: 4200, Win Rate: 0.29, Epsilon: 0.802, smartmove0.42
Wins: 1221, Losses: 2663, Draws: 317
Episode: 4300, Win Rate: 0.29, Epsilon: 0.703, smartmove0.43
Wins: 1242, Losses: 2733, Draws: 326
Episode: 4400, Win Rate: 0.29, Epsilon: 0.604, smartmove0.44
Wins: 1262, Losses: 2807, Draws: 332
Episode: 4500, Win Rate: 0.29, Epsilon: 0.505, smartmove0.45
Wins: 1284, Losses: 2875, Draws: 342
Episode: 4600, Win Rate: 0.28, Epsilon: 0.406, smartmove0.46
Wins: 1301, Losses: 2947, Draws: 353
Episode: 4700, Win Rate: 0.28, Epsilon: 0.307, smartmove0.47
Wins: 1321, Losses: 3013, Draws: 367
Episode: 4800, Win Rate: 0.28, Epsilon: 0.208, smartmove0.48
Wins: 1339, Losses: 3085, Draws: 377
Episode: 4900, Win Rate: 0.28, Epsilon: 0.109, smartmove0.49
Wins: 1361, Losses: 3152, Draws: 



Resetting epsilon to 1.0
Episode: 5000, Win Rate: 0.28, Epsilon: 1.000, smartmove0.5
Wins: 1383, Losses: 3215, Draws: 403
Episode: 5100, Win Rate: 0.28, Epsilon: 0.010, smartmove0.51
Wins: 1410, Losses: 3278, Draws: 413
Episode: 5200, Win Rate: 0.28, Epsilon: 0.010, smartmove0.52
Wins: 1439, Losses: 3338, Draws: 424
Episode: 5300, Win Rate: 0.28, Epsilon: 0.010, smartmove0.53
Wins: 1463, Losses: 3400, Draws: 438
Episode: 5400, Win Rate: 0.27, Epsilon: 0.010, smartmove0.54
Wins: 1479, Losses: 3465, Draws: 457
Episode: 5500, Win Rate: 0.28, Epsilon: 0.010, smartmove0.55
Wins: 1514, Losses: 3522, Draws: 465
Episode: 5600, Win Rate: 0.28, Epsilon: 0.010, smartmove0.56
Wins: 1544, Losses: 3581, Draws: 476
Episode: 5700, Win Rate: 0.28, Epsilon: 0.010, smartmove0.57
Wins: 1571, Losses: 3642, Draws: 488
Episode: 5800, Win Rate: 0.28, Epsilon: 0.010, smartmove0.58
Wins: 1599, Losses: 3699, Draws: 503
Episode: 5900, Win Rate: 0.27, Epsilon: 0.010, smartmove0.59
Wins: 1620, Losses: 3761, Draws: 



Resetting epsilon to 1.0
Episode: 6000, Win Rate: 0.27, Epsilon: 1.000, smartmove0.6
Wins: 1643, Losses: 3825, Draws: 533
Episode: 6100, Win Rate: 0.27, Epsilon: 0.901, smartmove0.61
Wins: 1651, Losses: 3904, Draws: 546
Episode: 6200, Win Rate: 0.27, Epsilon: 0.802, smartmove0.62
Wins: 1670, Losses: 3973, Draws: 558
Episode: 6300, Win Rate: 0.27, Epsilon: 0.703, smartmove0.63
Wins: 1675, Losses: 4058, Draws: 568
Episode: 6400, Win Rate: 0.26, Epsilon: 0.604, smartmove0.64
Wins: 1686, Losses: 4137, Draws: 578
Episode: 6500, Win Rate: 0.26, Epsilon: 0.505, smartmove0.65
Wins: 1701, Losses: 4208, Draws: 592
Episode: 6600, Win Rate: 0.26, Epsilon: 0.406, smartmove0.66
Wins: 1720, Losses: 4276, Draws: 605
Episode: 6700, Win Rate: 0.26, Epsilon: 0.307, smartmove0.67
Wins: 1736, Losses: 4349, Draws: 616
Episode: 6800, Win Rate: 0.26, Epsilon: 0.208, smartmove0.68
Wins: 1752, Losses: 4416, Draws: 633
Episode: 6900, Win Rate: 0.26, Epsilon: 0.109, smartmove0.69
Wins: 1777, Losses: 4475, Draws: 



Resetting epsilon to 1.0
Episode: 7000, Win Rate: 0.26, Epsilon: 1.000, smartmove0.7
Wins: 1797, Losses: 4538, Draws: 666
Episode: 7100, Win Rate: 0.25, Epsilon: 0.010, smartmove0.71
Wins: 1806, Losses: 4609, Draws: 686
Episode: 7200, Win Rate: 0.25, Epsilon: 0.010, smartmove0.72
Wins: 1827, Losses: 4672, Draws: 702
Episode: 7300, Win Rate: 0.25, Epsilon: 0.010, smartmove0.73
Wins: 1844, Losses: 4736, Draws: 721
Episode: 7400, Win Rate: 0.25, Epsilon: 0.010, smartmove0.74
Wins: 1866, Losses: 4794, Draws: 741
Episode: 7500, Win Rate: 0.25, Epsilon: 0.010, smartmove0.75
Wins: 1892, Losses: 4848, Draws: 761
Episode: 7600, Win Rate: 0.25, Epsilon: 0.010, smartmove0.76
Wins: 1910, Losses: 4901, Draws: 790
Episode: 7700, Win Rate: 0.25, Epsilon: 0.010, smartmove0.77
Wins: 1931, Losses: 4956, Draws: 814
Episode: 7800, Win Rate: 0.25, Epsilon: 0.010, smartmove0.78
Wins: 1946, Losses: 5018, Draws: 837
Episode: 7900, Win Rate: 0.25, Epsilon: 0.010, smartmove0.79
Wins: 1961, Losses: 5081, Draws: 



Resetting epsilon to 1.0
Episode: 8000, Win Rate: 0.25, Epsilon: 1.000, smartmove0.8
Wins: 1977, Losses: 5140, Draws: 884
Episode: 8100, Win Rate: 0.24, Epsilon: 0.901, smartmove0.81
Wins: 1981, Losses: 5223, Draws: 897
Episode: 8200, Win Rate: 0.24, Epsilon: 0.802, smartmove0.82
Wins: 1989, Losses: 5300, Draws: 912
Episode: 8300, Win Rate: 0.24, Epsilon: 0.703, smartmove0.83
Wins: 1997, Losses: 5381, Draws: 923
Episode: 8400, Win Rate: 0.24, Epsilon: 0.604, smartmove0.84
Wins: 2005, Losses: 5464, Draws: 932
Episode: 8500, Win Rate: 0.24, Epsilon: 0.505, smartmove0.85
Wins: 2016, Losses: 5535, Draws: 950
Episode: 8600, Win Rate: 0.24, Epsilon: 0.406, smartmove0.86
Wins: 2026, Losses: 5609, Draws: 966
Episode: 8700, Win Rate: 0.23, Epsilon: 0.307, smartmove0.87
Wins: 2035, Losses: 5677, Draws: 989
Episode: 8800, Win Rate: 0.23, Epsilon: 0.208, smartmove0.88
Wins: 2042, Losses: 5756, Draws: 1003
Episode: 8900, Win Rate: 0.23, Epsilon: 0.109, smartmove0.89
Wins: 2054, Losses: 5817, Draws:



Resetting epsilon to 1.0
Episode: 9000, Win Rate: 0.23, Epsilon: 1.000, smartmove0.9
Wins: 2061, Losses: 5872, Draws: 1068
Episode: 9100, Win Rate: 0.23, Epsilon: 0.010, smartmove0.91
Wins: 2073, Losses: 5926, Draws: 1102
Episode: 9200, Win Rate: 0.23, Epsilon: 0.010, smartmove0.92
Wins: 2085, Losses: 5981, Draws: 1135
Episode: 9300, Win Rate: 0.22, Epsilon: 0.010, smartmove0.93
Wins: 2089, Losses: 6040, Draws: 1172
Episode: 9400, Win Rate: 0.22, Epsilon: 0.010, smartmove0.94
Wins: 2099, Losses: 6104, Draws: 1198
Episode: 9500, Win Rate: 0.22, Epsilon: 0.010, smartmove0.95
Wins: 2103, Losses: 6170, Draws: 1228
Episode: 9600, Win Rate: 0.22, Epsilon: 0.010, smartmove0.96
Wins: 2106, Losses: 6234, Draws: 1261
Episode: 9700, Win Rate: 0.22, Epsilon: 0.010, smartmove0.97
Wins: 2115, Losses: 6288, Draws: 1298
Episode: 9800, Win Rate: 0.22, Epsilon: 0.010, smartmove0.98
Wins: 2127, Losses: 6338, Draws: 1336
Episode: 9900, Win Rate: 0.22, Epsilon: 0.010, smartmove0.99
Wins: 2131, Losses: 6385



In [3]:
import numpy as np
import random
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import BatchNormalization, Dropout
from TicTacToe import TicTacToe

class SQNAgent:
    def __init__(self, state_size=9, action_size=9, gamma=0.99):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma  
        self.learning_rate = 0.0005  # Reduced learning rate for more stable learning
        self.epsilon = 1.0
        self.epsilon_min = 0.05
        self.epsilon_decay = 0.85
        self.batch_size = 64  # Increased batch size
        self.replay_buffer = deque(maxlen=50000)  # Increased buffer size
        self.model = self._build_model()
        self.target_model = self._build_model()
        self.update_target_counter = 0
        self.update_target_frequency = 1000

    def _build_model(self):
        model = Sequential([
            Dense(64, input_dim=self.state_size, activation='relu'),
            Dense(64, activation='relu'),
            Dense(self.action_size, activation='linear')
        ])
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def process_state(self, state):
        processed_state = np.array(state).copy()
        processed_state[processed_state == 0] = -1
        processed_state[processed_state == 1] = 0
        processed_state[processed_state == 2] = 1
        return processed_state

    def select_action(self, state, valid_actions):
        if not valid_actions:
            return None

        if random.random() < self.epsilon:
            return random.choice(valid_actions)
        
        processed_state = self.process_state(state)
        q_values = self.model.predict(processed_state.reshape(1, -1), verbose=0)[0]
        
        # Prioritize winning moves and block opponent wins
        temp_board = np.array(state).copy()
        for action in valid_actions:
            # Check for winning move
            temp_board[action] = 2
            if self._check_win(temp_board, 2):
                return action
            temp_board[action] = 0
            
            # Check for blocking opponent win
            temp_board[action] = 1
            if self._check_win(temp_board, 1):
                return action
            temp_board[action] = 0

        # If no immediate winning/blocking moves, use Q-values
        valid_q_values = [(q_values[action], action) for action in valid_actions]
        return max(valid_q_values, key=lambda x: x[0])[1]

    def _check_win(self, board, player):
        win_combinations = [
            [0, 1, 2], [3, 4, 5], [6, 7, 8],  # Rows
            [0, 3, 6], [1, 4, 7], [2, 5, 8],  # Columns
            [0, 4, 8], [2, 4, 6]  # Diagonals
        ]
        return any(all(board[i] == player for i in combo) for combo in win_combinations)

    def store_experience(self, state, action, reward, next_state, done):
        self.replay_buffer.append((state, action, reward, next_state, done))

    def train(self):
        if len(self.replay_buffer) < self.batch_size:
            return

        mini_batch = random.sample(self.replay_buffer, self.batch_size)
        states = np.array([self.process_state(exp[0]) for exp in mini_batch])
        next_states = np.array([self.process_state(exp[3]) for exp in mini_batch])
        
        # Use target network for more stable training
        current_q_values = self.model.predict(states, verbose=0)
        next_q_values = self.target_model.predict(next_states, verbose=0)

        x = []
        y = []

        for i, (state, action, reward, next_state, done) in enumerate(mini_batch):
            if done:
                target = reward
            else:
                next_valid_actions = [j for j, val in enumerate(next_state) if val == 0]
                if next_valid_actions:
                    # Double DQN: Select action using online network, evaluate using target network
                    next_q = self.model.predict(next_states[i:i+1], verbose=0)[0]
                    best_action = max([(next_q[a], a) for a in next_valid_actions], key=lambda x: x[0])[1]
                    target = reward + self.gamma * next_q_values[i][best_action]
                else:
                    target = reward
            
            current_q = current_q_values[i].copy()
            current_q[action] = target
            x.append(self.process_state(state))
            y.append(current_q)

        self.model.fit(np.array(x), np.array(y), batch_size=self.batch_size, epochs=1, verbose=0)
        
        # Update target network periodically
        self.update_target_counter += 1
        if self.update_target_counter >= self.update_target_frequency:
            self.target_model.set_weights(self.model.get_weights())
            self.update_target_counter = 0

def train_agent(episodes=10000):
    agent = SQNAgent()
    history = {'wins': 0, 'losses': 0, 'draws': 0}
    best_win_rate = 0
    no_improvement_counter = 0
    smartness = 0
    for episode in range(episodes):
        # Decrease epsilon every 100 iterations
        if episode % 100 == 0 and episode%1000!=0:
            agent.epsilon = max(agent.epsilon_min, agent.epsilon * agent.epsilon_decay)

        # Increase smartness and partially reset epsilon every 2000 iterations
        if episode % 1000 == 0 and episode > 0:
            smartness = min(0.8, smartness + 0.1)
            agent.epsilon = 1  # Partial epsilon reset
            
        game = TicTacToe(smartMovePlayer1=smartness)
        state = np.array(game.board)
        
        
        
        game.player1_move()
        state = np.array(game.board)
        
        while True:
            valid_actions = game.empty_positions()
            if not valid_actions:
                history['draws'] += 1
                break
                
            action = agent.select_action(state, valid_actions)
            game.make_move(action, player=2)
            
            reward = 0
            if game.current_winner == 2:
                reward = 1 + (0.1 * smartness)  # Higher reward for winning against smarter opponent
                history['wins'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            
            if not game.is_full():
                game.player1_move()
                
            if game.current_winner == 1:
                reward = -1
                history['losses'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            elif game.is_full():
                reward = -0.1 * smartness  # Small negative reward for draws against smart opponent
                history['draws'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            
            agent.store_experience(state, action, reward, game.board, False)
            state = np.array(game.board)
        
        agent.train()
        
        # Evaluation and model saving logic
        if episode % 100 == 0:
            win_rate = history['wins'] / (episode + 1)
            print(f"Episode: {episode}, Win Rate: {win_rate:.2f}, Epsilon: {agent.epsilon:.3f}, Smartness: {smartness}")
            print(f"Wins: {history['wins']}, Losses: {history['losses']}, Draws: {history['draws']}")
            
            if win_rate > best_win_rate:
                best_win_rate = win_rate
                no_improvement_counter = 0
            else:
                no_improvement_counter += 1
            
            if episode % 1000 == 0:
                agent.model.save(f'model9_episode_{episode}.h5')
        
    agent.model.save('model9.h5')
    return agent

agent = train_agent()




Episode: 0, Win Rate: 0.00, Epsilon: 1.000, Smartness: 0
Wins: 0, Losses: 1, Draws: 0
Episode: 100, Win Rate: 0.24, Epsilon: 0.850, Smartness: 0
Wins: 24, Losses: 60, Draws: 17
Episode: 200, Win Rate: 0.26, Epsilon: 0.722, Smartness: 0
Wins: 52, Losses: 120, Draws: 29
Episode: 300, Win Rate: 0.28, Epsilon: 0.614, Smartness: 0
Wins: 85, Losses: 169, Draws: 47
Episode: 400, Win Rate: 0.30, Epsilon: 0.522, Smartness: 0
Wins: 122, Losses: 210, Draws: 69
Episode: 500, Win Rate: 0.33, Epsilon: 0.444, Smartness: 0
Wins: 164, Losses: 240, Draws: 97
Episode: 600, Win Rate: 0.33, Epsilon: 0.377, Smartness: 0
Wins: 198, Losses: 277, Draws: 126
Episode: 700, Win Rate: 0.35, Epsilon: 0.321, Smartness: 0
Wins: 247, Losses: 301, Draws: 153
Episode: 800, Win Rate: 0.36, Epsilon: 0.272, Smartness: 0
Wins: 291, Losses: 332, Draws: 178
Episode: 900, Win Rate: 0.37, Epsilon: 0.232, Smartness: 0
Wins: 335, Losses: 356, Draws: 210




Episode: 1000, Win Rate: 0.39, Epsilon: 1.000, Smartness: 0.1
Wins: 391, Losses: 382, Draws: 228
Episode: 1100, Win Rate: 0.38, Epsilon: 0.850, Smartness: 0.1
Wins: 420, Losses: 441, Draws: 240
Episode: 1200, Win Rate: 0.37, Epsilon: 0.722, Smartness: 0.1
Wins: 449, Losses: 498, Draws: 254
Episode: 1300, Win Rate: 0.38, Epsilon: 0.614, Smartness: 0.1
Wins: 488, Losses: 547, Draws: 266
Episode: 1400, Win Rate: 0.37, Epsilon: 0.522, Smartness: 0.1
Wins: 525, Losses: 591, Draws: 285
Episode: 1500, Win Rate: 0.37, Epsilon: 0.444, Smartness: 0.1
Wins: 560, Losses: 631, Draws: 310
Episode: 1600, Win Rate: 0.38, Epsilon: 0.377, Smartness: 0.1
Wins: 612, Losses: 657, Draws: 332
Episode: 1700, Win Rate: 0.39, Epsilon: 0.321, Smartness: 0.1
Wins: 657, Losses: 691, Draws: 353
Episode: 1800, Win Rate: 0.39, Epsilon: 0.272, Smartness: 0.1
Wins: 697, Losses: 720, Draws: 384
Episode: 1900, Win Rate: 0.39, Epsilon: 0.232, Smartness: 0.1
Wins: 748, Losses: 747, Draws: 406




Episode: 2000, Win Rate: 0.39, Epsilon: 1.000, Smartness: 0.2
Wins: 789, Losses: 775, Draws: 437
Episode: 2100, Win Rate: 0.38, Epsilon: 0.850, Smartness: 0.2
Wins: 808, Losses: 848, Draws: 445
Episode: 2200, Win Rate: 0.38, Epsilon: 0.722, Smartness: 0.2
Wins: 838, Losses: 903, Draws: 460
Episode: 2300, Win Rate: 0.38, Epsilon: 0.614, Smartness: 0.2
Wins: 868, Losses: 954, Draws: 479
Episode: 2400, Win Rate: 0.37, Epsilon: 0.522, Smartness: 0.2
Wins: 897, Losses: 1010, Draws: 494
Episode: 2500, Win Rate: 0.37, Epsilon: 0.444, Smartness: 0.2
Wins: 928, Losses: 1056, Draws: 517
Episode: 2600, Win Rate: 0.37, Epsilon: 0.377, Smartness: 0.2
Wins: 961, Losses: 1103, Draws: 537
Episode: 2700, Win Rate: 0.37, Epsilon: 0.321, Smartness: 0.2
Wins: 1010, Losses: 1135, Draws: 556
Episode: 2800, Win Rate: 0.38, Epsilon: 0.272, Smartness: 0.2
Wins: 1059, Losses: 1167, Draws: 575
Episode: 2900, Win Rate: 0.38, Epsilon: 0.232, Smartness: 0.2
Wins: 1106, Losses: 1193, Draws: 602




Episode: 3000, Win Rate: 0.38, Epsilon: 1.000, Smartness: 0.30000000000000004
Wins: 1150, Losses: 1226, Draws: 625
Episode: 3100, Win Rate: 0.38, Epsilon: 0.850, Smartness: 0.30000000000000004
Wins: 1174, Losses: 1294, Draws: 633
Episode: 3200, Win Rate: 0.37, Epsilon: 0.722, Smartness: 0.30000000000000004
Wins: 1189, Losses: 1362, Draws: 650
Episode: 3300, Win Rate: 0.37, Epsilon: 0.614, Smartness: 0.30000000000000004
Wins: 1220, Losses: 1416, Draws: 665
Episode: 3400, Win Rate: 0.37, Epsilon: 0.522, Smartness: 0.30000000000000004
Wins: 1250, Losses: 1471, Draws: 680
Episode: 3500, Win Rate: 0.37, Epsilon: 0.444, Smartness: 0.30000000000000004
Wins: 1282, Losses: 1509, Draws: 710
Episode: 3600, Win Rate: 0.36, Epsilon: 0.377, Smartness: 0.30000000000000004
Wins: 1313, Losses: 1552, Draws: 736
Episode: 3700, Win Rate: 0.36, Epsilon: 0.321, Smartness: 0.30000000000000004
Wins: 1347, Losses: 1593, Draws: 761
Episode: 3800, Win Rate: 0.36, Epsilon: 0.272, Smartness: 0.30000000000000004
Wi



Episode: 4000, Win Rate: 0.37, Epsilon: 1.000, Smartness: 0.4
Wins: 1466, Losses: 1704, Draws: 831
Episode: 4100, Win Rate: 0.36, Epsilon: 0.850, Smartness: 0.4
Wins: 1479, Losses: 1776, Draws: 846
Episode: 4200, Win Rate: 0.36, Epsilon: 0.722, Smartness: 0.4
Wins: 1497, Losses: 1846, Draws: 858
Episode: 4300, Win Rate: 0.35, Epsilon: 0.614, Smartness: 0.4
Wins: 1520, Losses: 1909, Draws: 872
Episode: 4400, Win Rate: 0.35, Epsilon: 0.522, Smartness: 0.4
Wins: 1546, Losses: 1966, Draws: 889
Episode: 4500, Win Rate: 0.35, Epsilon: 0.444, Smartness: 0.4
Wins: 1571, Losses: 2011, Draws: 919
Episode: 4600, Win Rate: 0.35, Epsilon: 0.377, Smartness: 0.4
Wins: 1603, Losses: 2052, Draws: 946
Episode: 4700, Win Rate: 0.35, Epsilon: 0.321, Smartness: 0.4
Wins: 1637, Losses: 2092, Draws: 972
Episode: 4800, Win Rate: 0.35, Epsilon: 0.272, Smartness: 0.4
Wins: 1675, Losses: 2130, Draws: 996
Episode: 4900, Win Rate: 0.35, Epsilon: 0.232, Smartness: 0.4
Wins: 1708, Losses: 2162, Draws: 1031




Episode: 5000, Win Rate: 0.35, Epsilon: 1.000, Smartness: 0.5
Wins: 1741, Losses: 2203, Draws: 1057
Episode: 5100, Win Rate: 0.34, Epsilon: 0.850, Smartness: 0.5
Wins: 1746, Losses: 2289, Draws: 1066
Episode: 5200, Win Rate: 0.34, Epsilon: 0.722, Smartness: 0.5
Wins: 1765, Losses: 2357, Draws: 1079
Episode: 5300, Win Rate: 0.34, Epsilon: 0.614, Smartness: 0.5
Wins: 1784, Losses: 2421, Draws: 1096
Episode: 5400, Win Rate: 0.33, Epsilon: 0.522, Smartness: 0.5
Wins: 1807, Losses: 2476, Draws: 1118
Episode: 5500, Win Rate: 0.33, Epsilon: 0.444, Smartness: 0.5
Wins: 1831, Losses: 2527, Draws: 1143
Episode: 5600, Win Rate: 0.33, Epsilon: 0.377, Smartness: 0.5
Wins: 1865, Losses: 2574, Draws: 1162
Episode: 5700, Win Rate: 0.33, Epsilon: 0.321, Smartness: 0.5
Wins: 1895, Losses: 2611, Draws: 1195
Episode: 5800, Win Rate: 0.33, Epsilon: 0.272, Smartness: 0.5
Wins: 1924, Losses: 2652, Draws: 1225
Episode: 5900, Win Rate: 0.33, Epsilon: 0.232, Smartness: 0.5
Wins: 1950, Losses: 2695, Draws: 1256




Episode: 6000, Win Rate: 0.33, Epsilon: 1.000, Smartness: 0.6
Wins: 1985, Losses: 2729, Draws: 1287
Episode: 6100, Win Rate: 0.33, Epsilon: 0.850, Smartness: 0.6
Wins: 1999, Losses: 2799, Draws: 1303
Episode: 6200, Win Rate: 0.32, Epsilon: 0.722, Smartness: 0.6
Wins: 2012, Losses: 2872, Draws: 1317
Episode: 6300, Win Rate: 0.32, Epsilon: 0.614, Smartness: 0.6
Wins: 2029, Losses: 2939, Draws: 1333
Episode: 6400, Win Rate: 0.32, Epsilon: 0.522, Smartness: 0.6
Wins: 2044, Losses: 3007, Draws: 1350
Episode: 6500, Win Rate: 0.32, Epsilon: 0.444, Smartness: 0.6
Wins: 2069, Losses: 3063, Draws: 1369
Episode: 6600, Win Rate: 0.32, Epsilon: 0.377, Smartness: 0.6
Wins: 2092, Losses: 3113, Draws: 1396
Episode: 6700, Win Rate: 0.32, Epsilon: 0.321, Smartness: 0.6
Wins: 2112, Losses: 3165, Draws: 1424
Episode: 6800, Win Rate: 0.31, Epsilon: 0.272, Smartness: 0.6
Wins: 2135, Losses: 3216, Draws: 1450
Episode: 6900, Win Rate: 0.31, Epsilon: 0.232, Smartness: 0.6
Wins: 2159, Losses: 3259, Draws: 1483




Episode: 7000, Win Rate: 0.31, Epsilon: 1.000, Smartness: 0.7
Wins: 2190, Losses: 3298, Draws: 1513
Episode: 7100, Win Rate: 0.31, Epsilon: 0.850, Smartness: 0.7
Wins: 2198, Losses: 3381, Draws: 1522
Episode: 7200, Win Rate: 0.31, Epsilon: 0.722, Smartness: 0.7
Wins: 2209, Losses: 3455, Draws: 1537
Episode: 7300, Win Rate: 0.30, Epsilon: 0.614, Smartness: 0.7
Wins: 2218, Losses: 3531, Draws: 1552
Episode: 7400, Win Rate: 0.30, Epsilon: 0.522, Smartness: 0.7
Wins: 2229, Losses: 3596, Draws: 1576
Episode: 7500, Win Rate: 0.30, Epsilon: 0.444, Smartness: 0.7
Wins: 2249, Losses: 3648, Draws: 1604
Episode: 7600, Win Rate: 0.30, Epsilon: 0.377, Smartness: 0.7
Wins: 2268, Losses: 3704, Draws: 1629
Episode: 7700, Win Rate: 0.30, Epsilon: 0.321, Smartness: 0.7
Wins: 2281, Losses: 3763, Draws: 1657
Episode: 7800, Win Rate: 0.29, Epsilon: 0.272, Smartness: 0.7
Wins: 2301, Losses: 3804, Draws: 1696
Episode: 7900, Win Rate: 0.29, Epsilon: 0.232, Smartness: 0.7
Wins: 2322, Losses: 3835, Draws: 1744




Episode: 8000, Win Rate: 0.29, Epsilon: 1.000, Smartness: 0.7999999999999999
Wins: 2337, Losses: 3881, Draws: 1783
Episode: 8100, Win Rate: 0.29, Epsilon: 0.850, Smartness: 0.7999999999999999
Wins: 2344, Losses: 3961, Draws: 1796
Episode: 8200, Win Rate: 0.29, Epsilon: 0.722, Smartness: 0.7999999999999999
Wins: 2354, Losses: 4037, Draws: 1810
Episode: 8300, Win Rate: 0.28, Epsilon: 0.614, Smartness: 0.7999999999999999
Wins: 2363, Losses: 4102, Draws: 1836
Episode: 8400, Win Rate: 0.28, Epsilon: 0.522, Smartness: 0.7999999999999999
Wins: 2378, Losses: 4169, Draws: 1854
Episode: 8500, Win Rate: 0.28, Epsilon: 0.444, Smartness: 0.7999999999999999
Wins: 2388, Losses: 4233, Draws: 1880
Episode: 8600, Win Rate: 0.28, Epsilon: 0.377, Smartness: 0.7999999999999999
Wins: 2396, Losses: 4295, Draws: 1910
Episode: 8700, Win Rate: 0.28, Epsilon: 0.321, Smartness: 0.7999999999999999
Wins: 2408, Losses: 4352, Draws: 1941
Episode: 8800, Win Rate: 0.28, Epsilon: 0.272, Smartness: 0.7999999999999999
Win



Episode: 9000, Win Rate: 0.27, Epsilon: 1.000, Smartness: 0.8
Wins: 2438, Losses: 4497, Draws: 2066
Episode: 9100, Win Rate: 0.27, Epsilon: 0.850, Smartness: 0.8
Wins: 2442, Losses: 4585, Draws: 2074
Episode: 9200, Win Rate: 0.27, Epsilon: 0.722, Smartness: 0.8
Wins: 2448, Losses: 4660, Draws: 2093
Episode: 9300, Win Rate: 0.26, Epsilon: 0.614, Smartness: 0.8
Wins: 2457, Losses: 4735, Draws: 2109
Episode: 9400, Win Rate: 0.26, Epsilon: 0.522, Smartness: 0.8
Wins: 2469, Losses: 4799, Draws: 2133
Episode: 9500, Win Rate: 0.26, Epsilon: 0.444, Smartness: 0.8
Wins: 2482, Losses: 4850, Draws: 2169
Episode: 9600, Win Rate: 0.26, Epsilon: 0.377, Smartness: 0.8
Wins: 2490, Losses: 4908, Draws: 2203
Episode: 9700, Win Rate: 0.26, Epsilon: 0.321, Smartness: 0.8
Wins: 2503, Losses: 4963, Draws: 2235
Episode: 9800, Win Rate: 0.26, Epsilon: 0.272, Smartness: 0.8
Wins: 2513, Losses: 5012, Draws: 2276
Episode: 9900, Win Rate: 0.26, Epsilon: 0.232, Smartness: 0.8
Wins: 2529, Losses: 5056, Draws: 2316




In [4]:
import numpy as np
import random
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from TicTacToe import TicTacToe

class SQNAgent:
    def __init__(self, state_size=9, action_size=9, gamma=0.99):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma  
        self.learning_rate = 0.0005  # Reduced learning rate for more stable learning
        self.epsilon = 1.0
        self.epsilon_min = 0.05
        self.epsilon_decay = 0.85
        self.batch_size = 64  # Increased batch size
        self.replay_buffer = deque(maxlen=50000)  # Increased buffer size
        self.model = self._build_model()
        self.target_model = self._build_model()
        self.update_target_counter = 0
        self.update_target_frequency = 1000

    def _build_model(self):
        model = Sequential([
            Dense(64, input_dim=self.state_size, activation='relu'),
            Dense(64, activation='relu'),
            Dense(self.action_size, activation='linear')
        ])
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def process_state(self, state):
        processed_state = np.array(state).copy()
        processed_state[processed_state == 0] = -1
        processed_state[processed_state == 1] = 0
        processed_state[processed_state == 2] = 1
        return processed_state

    def select_action(self, state, valid_actions):
        if not valid_actions:
            return None

        if random.random() < self.epsilon:
            return random.choice(valid_actions)
        
        processed_state = self.process_state(state)
        q_values = self.model.predict(processed_state.reshape(1, -1), verbose=0)[0]
        
        # Prioritize winning moves and block opponent wins
        temp_board = np.array(state).copy()
        for action in valid_actions:
            # Check for winning move
            temp_board[action] = 2
            if self._check_win(temp_board, 2):
                return action
            temp_board[action] = 0
            
            # Check for blocking opponent win
            temp_board[action] = 1
            if self._check_win(temp_board, 1):
                return action
            temp_board[action] = 0

        # If no immediate winning/blocking moves, use Q-values
        valid_q_values = [(q_values[action], action) for action in valid_actions]
        return max(valid_q_values, key=lambda x: x[0])[1]

    def _check_win(self, board, player):
        win_combinations = [
            [0, 1, 2], [3, 4, 5], [6, 7, 8],  # Rows
            [0, 3, 6], [1, 4, 7], [2, 5, 8],  # Columns
            [0, 4, 8], [2, 4, 6]  # Diagonals
        ]
        return any(all(board[i] == player for i in combo) for combo in win_combinations)

    def store_experience(self, state, action, reward, next_state, done):
        self.replay_buffer.append((state, action, reward, next_state, done))

    def train(self):
        if len(self.replay_buffer) < self.batch_size:
            return

        mini_batch = random.sample(self.replay_buffer, self.batch_size)
        states = np.array([self.process_state(exp[0]) for exp in mini_batch])
        next_states = np.array([self.process_state(exp[3]) for exp in mini_batch])
        
        # Directly use the target network's Q-values
        current_q_values = self.model.predict(states, verbose=0)
        next_q_values = self.target_model.predict(next_states, verbose=0)

        x = []
        y = []

        for i, (state, action, reward, next_state, done) in enumerate(mini_batch):
            if done:
                target = reward
            else:
                target = reward + self.gamma * max(next_q_values[i])
            
            current_q = current_q_values[i].copy()
            current_q[action] = target
            x.append(self.process_state(state))
            y.append(current_q)

        self.model.fit(np.array(x), np.array(y), batch_size=self.batch_size, epochs=1, verbose=0)
        
        # Update target network periodically
        self.update_target_counter += 1
        if self.update_target_counter >= self.update_target_frequency:
            self.target_model.set_weights(self.model.get_weights())
            self.update_target_counter = 0

def train_agent(episodes=10000):
    agent = SQNAgent()
    history = {'wins': 0, 'losses': 0, 'draws': 0}
    best_win_rate = 0
    no_improvement_counter = 0
    smartness = 0
    for episode in range(episodes):
        # Decrease epsilon every 100 iterations
        if episode % 100 == 0 and episode % 1000 != 0:
            agent.epsilon = max(agent.epsilon_min, agent.epsilon * agent.epsilon_decay)

        # Increase smartness and partially reset epsilon every 2000 iterations
        if episode % 1000 == 0 and episode > 0:
            smartness = min(0.8, smartness + 0.1)
            agent.epsilon = 1  # Partial epsilon reset
            
        game = TicTacToe(smartMovePlayer1=smartness)
        state = np.array(game.board)
        
        game.player1_move()
        state = np.array(game.board)
        
        while True:
            valid_actions = game.empty_positions()
            if not valid_actions:
                history['draws'] += 1
                break
                
            action = agent.select_action(state, valid_actions)
            game.make_move(action, player=2)
            
            reward = 0
            if game.current_winner == 2:
                reward = 1 + (0.1 * smartness)  # Higher reward for winning against smarter opponent
                history['wins'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            
            if not game.is_full():
                game.player1_move()
                
            if game.current_winner == 1:
                reward = -1
                history['losses'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            elif game.is_full():
                reward = -0.1 * smartness  # Small negative reward for draws against smart opponent
                history['draws'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            
            agent.store_experience(state, action, reward, game.board, False)
            state = np.array(game.board)
        
        agent.train()
        
        # Evaluation and model saving logic
        if episode % 100 == 0:
            win_rate = history['wins'] / (episode + 1)
            print(f"Episode: {episode}, Win Rate: {win_rate:.2f}, Epsilon: {agent.epsilon:.3f}, Smartness: {smartness}")
            print(f"Wins: {history['wins']}, Losses: {history['losses']}, Draws: {history['draws']}")
            
            if win_rate > best_win_rate:
                best_win_rate = win_rate
                no_improvement_counter = 0
            else:
                no_improvement_counter += 1
            
            if episode % 1000 == 0:
                agent.model.save(f'model10_episode_{episode}.h5')
        
    agent.model.save('model10.h5')
    return agent

agent = train_agent()




Episode: 0, Win Rate: 0.00, Epsilon: 1.000, Smartness: 0
Wins: 0, Losses: 0, Draws: 1
Episode: 100, Win Rate: 0.28, Epsilon: 0.850, Smartness: 0
Wins: 28, Losses: 57, Draws: 16
Episode: 200, Win Rate: 0.28, Epsilon: 0.722, Smartness: 0
Wins: 57, Losses: 108, Draws: 36
Episode: 300, Win Rate: 0.32, Epsilon: 0.614, Smartness: 0
Wins: 95, Losses: 157, Draws: 49
Episode: 400, Win Rate: 0.34, Epsilon: 0.522, Smartness: 0
Wins: 136, Losses: 198, Draws: 67
Episode: 500, Win Rate: 0.36, Epsilon: 0.444, Smartness: 0
Wins: 180, Losses: 238, Draws: 83
Episode: 600, Win Rate: 0.38, Epsilon: 0.377, Smartness: 0
Wins: 231, Losses: 270, Draws: 100
Episode: 700, Win Rate: 0.39, Epsilon: 0.321, Smartness: 0
Wins: 274, Losses: 301, Draws: 126
Episode: 800, Win Rate: 0.39, Epsilon: 0.272, Smartness: 0
Wins: 310, Losses: 336, Draws: 155
Episode: 900, Win Rate: 0.39, Epsilon: 0.232, Smartness: 0
Wins: 350, Losses: 367, Draws: 184




Episode: 1000, Win Rate: 0.39, Epsilon: 1.000, Smartness: 0.1
Wins: 393, Losses: 397, Draws: 211
Episode: 1100, Win Rate: 0.38, Epsilon: 0.850, Smartness: 0.1
Wins: 417, Losses: 466, Draws: 218
Episode: 1200, Win Rate: 0.38, Epsilon: 0.722, Smartness: 0.1
Wins: 452, Losses: 517, Draws: 232
Episode: 1300, Win Rate: 0.38, Epsilon: 0.614, Smartness: 0.1
Wins: 491, Losses: 563, Draws: 247
Episode: 1400, Win Rate: 0.38, Epsilon: 0.522, Smartness: 0.1
Wins: 534, Losses: 604, Draws: 263
Episode: 1500, Win Rate: 0.38, Epsilon: 0.444, Smartness: 0.1
Wins: 568, Losses: 651, Draws: 282
Episode: 1600, Win Rate: 0.39, Epsilon: 0.377, Smartness: 0.1
Wins: 617, Losses: 679, Draws: 305
Episode: 1700, Win Rate: 0.39, Epsilon: 0.321, Smartness: 0.1
Wins: 656, Losses: 712, Draws: 333
Episode: 1800, Win Rate: 0.39, Epsilon: 0.272, Smartness: 0.1
Wins: 710, Losses: 736, Draws: 355
Episode: 1900, Win Rate: 0.41, Epsilon: 0.232, Smartness: 0.1
Wins: 773, Losses: 758, Draws: 370




Episode: 2000, Win Rate: 0.42, Epsilon: 1.000, Smartness: 0.2
Wins: 835, Losses: 775, Draws: 391
Episode: 2100, Win Rate: 0.41, Epsilon: 0.850, Smartness: 0.2
Wins: 857, Losses: 846, Draws: 398
Episode: 2200, Win Rate: 0.40, Epsilon: 0.722, Smartness: 0.2
Wins: 882, Losses: 907, Draws: 412
Episode: 2300, Win Rate: 0.40, Epsilon: 0.614, Smartness: 0.2
Wins: 922, Losses: 952, Draws: 427
Episode: 2400, Win Rate: 0.40, Epsilon: 0.522, Smartness: 0.2
Wins: 963, Losses: 990, Draws: 448
Episode: 2500, Win Rate: 0.40, Epsilon: 0.444, Smartness: 0.2
Wins: 997, Losses: 1032, Draws: 472
Episode: 2600, Win Rate: 0.40, Epsilon: 0.377, Smartness: 0.2
Wins: 1040, Losses: 1068, Draws: 493
Episode: 2700, Win Rate: 0.40, Epsilon: 0.321, Smartness: 0.2
Wins: 1083, Losses: 1096, Draws: 522
Episode: 2800, Win Rate: 0.40, Epsilon: 0.272, Smartness: 0.2
Wins: 1128, Losses: 1125, Draws: 548
Episode: 2900, Win Rate: 0.41, Epsilon: 0.232, Smartness: 0.2
Wins: 1177, Losses: 1146, Draws: 578




Episode: 3000, Win Rate: 0.41, Epsilon: 1.000, Smartness: 0.30000000000000004
Wins: 1228, Losses: 1171, Draws: 602
Episode: 3100, Win Rate: 0.40, Epsilon: 0.850, Smartness: 0.30000000000000004
Wins: 1254, Losses: 1234, Draws: 613
Episode: 3200, Win Rate: 0.40, Epsilon: 0.722, Smartness: 0.30000000000000004
Wins: 1269, Losses: 1300, Draws: 632
Episode: 3300, Win Rate: 0.40, Epsilon: 0.614, Smartness: 0.30000000000000004
Wins: 1304, Losses: 1345, Draws: 652
Episode: 3400, Win Rate: 0.39, Epsilon: 0.522, Smartness: 0.30000000000000004
Wins: 1334, Losses: 1396, Draws: 671
Episode: 3500, Win Rate: 0.39, Epsilon: 0.444, Smartness: 0.30000000000000004
Wins: 1363, Losses: 1451, Draws: 687
Episode: 3600, Win Rate: 0.39, Epsilon: 0.377, Smartness: 0.30000000000000004
Wins: 1402, Losses: 1489, Draws: 710
Episode: 3700, Win Rate: 0.39, Epsilon: 0.321, Smartness: 0.30000000000000004
Wins: 1440, Losses: 1532, Draws: 729
Episode: 3800, Win Rate: 0.39, Epsilon: 0.272, Smartness: 0.30000000000000004
Wi



Episode: 4000, Win Rate: 0.39, Epsilon: 1.000, Smartness: 0.4
Wins: 1575, Losses: 1607, Draws: 819
Episode: 4100, Win Rate: 0.39, Epsilon: 0.850, Smartness: 0.4
Wins: 1589, Losses: 1682, Draws: 830
Episode: 4200, Win Rate: 0.38, Epsilon: 0.722, Smartness: 0.4
Wins: 1610, Losses: 1741, Draws: 850
Episode: 4300, Win Rate: 0.38, Epsilon: 0.614, Smartness: 0.4
Wins: 1636, Losses: 1797, Draws: 868
Episode: 4400, Win Rate: 0.38, Epsilon: 0.522, Smartness: 0.4
Wins: 1663, Losses: 1847, Draws: 891
Episode: 4500, Win Rate: 0.38, Epsilon: 0.444, Smartness: 0.4
Wins: 1699, Losses: 1891, Draws: 911
Episode: 4600, Win Rate: 0.38, Epsilon: 0.377, Smartness: 0.4
Wins: 1732, Losses: 1934, Draws: 935
Episode: 4700, Win Rate: 0.38, Epsilon: 0.321, Smartness: 0.4
Wins: 1763, Losses: 1970, Draws: 968
Episode: 4800, Win Rate: 0.38, Epsilon: 0.272, Smartness: 0.4
Wins: 1803, Losses: 2008, Draws: 990
Episode: 4900, Win Rate: 0.38, Epsilon: 0.232, Smartness: 0.4
Wins: 1840, Losses: 2042, Draws: 1019




Episode: 5000, Win Rate: 0.38, Epsilon: 1.000, Smartness: 0.5
Wins: 1883, Losses: 2069, Draws: 1049
Episode: 5100, Win Rate: 0.37, Epsilon: 0.850, Smartness: 0.5
Wins: 1893, Losses: 2148, Draws: 1060
Episode: 5200, Win Rate: 0.37, Epsilon: 0.722, Smartness: 0.5
Wins: 1907, Losses: 2221, Draws: 1073
Episode: 5300, Win Rate: 0.36, Epsilon: 0.614, Smartness: 0.5
Wins: 1920, Losses: 2296, Draws: 1085
Episode: 5400, Win Rate: 0.36, Epsilon: 0.522, Smartness: 0.5
Wins: 1946, Losses: 2344, Draws: 1111
Episode: 5500, Win Rate: 0.36, Epsilon: 0.444, Smartness: 0.5
Wins: 1971, Losses: 2398, Draws: 1132
Episode: 5600, Win Rate: 0.36, Epsilon: 0.377, Smartness: 0.5
Wins: 1998, Losses: 2448, Draws: 1155
Episode: 5700, Win Rate: 0.36, Epsilon: 0.321, Smartness: 0.5
Wins: 2025, Losses: 2498, Draws: 1178
Episode: 5800, Win Rate: 0.35, Epsilon: 0.272, Smartness: 0.5
Wins: 2058, Losses: 2539, Draws: 1204
Episode: 5900, Win Rate: 0.35, Epsilon: 0.232, Smartness: 0.5
Wins: 2088, Losses: 2573, Draws: 1240




Episode: 6000, Win Rate: 0.35, Epsilon: 1.000, Smartness: 0.6
Wins: 2122, Losses: 2608, Draws: 1271
Episode: 6100, Win Rate: 0.35, Epsilon: 0.850, Smartness: 0.6
Wins: 2134, Losses: 2685, Draws: 1282
Episode: 6200, Win Rate: 0.35, Epsilon: 0.722, Smartness: 0.6
Wins: 2145, Losses: 2760, Draws: 1296
Episode: 6300, Win Rate: 0.34, Epsilon: 0.614, Smartness: 0.6
Wins: 2162, Losses: 2824, Draws: 1315
Episode: 6400, Win Rate: 0.34, Epsilon: 0.522, Smartness: 0.6
Wins: 2177, Losses: 2890, Draws: 1334
Episode: 6500, Win Rate: 0.34, Epsilon: 0.444, Smartness: 0.6
Wins: 2202, Losses: 2941, Draws: 1358
Episode: 6600, Win Rate: 0.34, Epsilon: 0.377, Smartness: 0.6
Wins: 2232, Losses: 2988, Draws: 1381
Episode: 6700, Win Rate: 0.34, Epsilon: 0.321, Smartness: 0.6
Wins: 2259, Losses: 3037, Draws: 1405
Episode: 6800, Win Rate: 0.34, Epsilon: 0.272, Smartness: 0.6
Wins: 2281, Losses: 3091, Draws: 1429
Episode: 6900, Win Rate: 0.33, Epsilon: 0.232, Smartness: 0.6
Wins: 2307, Losses: 3138, Draws: 1456




Episode: 7000, Win Rate: 0.33, Epsilon: 1.000, Smartness: 0.7
Wins: 2337, Losses: 3175, Draws: 1489
Episode: 7100, Win Rate: 0.33, Epsilon: 0.850, Smartness: 0.7
Wins: 2346, Losses: 3252, Draws: 1503
Episode: 7200, Win Rate: 0.33, Epsilon: 0.722, Smartness: 0.7
Wins: 2360, Losses: 3324, Draws: 1517
Episode: 7300, Win Rate: 0.32, Epsilon: 0.614, Smartness: 0.7
Wins: 2370, Losses: 3394, Draws: 1537
Episode: 7400, Win Rate: 0.32, Epsilon: 0.522, Smartness: 0.7
Wins: 2379, Losses: 3467, Draws: 1555
Episode: 7500, Win Rate: 0.32, Epsilon: 0.444, Smartness: 0.7
Wins: 2397, Losses: 3531, Draws: 1573
Episode: 7600, Win Rate: 0.32, Epsilon: 0.377, Smartness: 0.7
Wins: 2412, Losses: 3586, Draws: 1603
Episode: 7700, Win Rate: 0.32, Epsilon: 0.321, Smartness: 0.7
Wins: 2428, Losses: 3634, Draws: 1639
Episode: 7800, Win Rate: 0.31, Epsilon: 0.272, Smartness: 0.7
Wins: 2447, Losses: 3678, Draws: 1676
Episode: 7900, Win Rate: 0.31, Epsilon: 0.232, Smartness: 0.7
Wins: 2473, Losses: 3721, Draws: 1707




Episode: 8000, Win Rate: 0.31, Epsilon: 1.000, Smartness: 0.7999999999999999
Wins: 2493, Losses: 3767, Draws: 1741
Episode: 8100, Win Rate: 0.31, Epsilon: 0.850, Smartness: 0.7999999999999999
Wins: 2499, Losses: 3855, Draws: 1747
Episode: 8200, Win Rate: 0.31, Epsilon: 0.722, Smartness: 0.7999999999999999
Wins: 2507, Losses: 3934, Draws: 1760
Episode: 8300, Win Rate: 0.30, Epsilon: 0.614, Smartness: 0.7999999999999999
Wins: 2516, Losses: 4002, Draws: 1783
Episode: 8400, Win Rate: 0.30, Epsilon: 0.522, Smartness: 0.7999999999999999
Wins: 2527, Losses: 4070, Draws: 1804
Episode: 8500, Win Rate: 0.30, Epsilon: 0.444, Smartness: 0.7999999999999999
Wins: 2538, Losses: 4136, Draws: 1827
Episode: 8600, Win Rate: 0.30, Epsilon: 0.377, Smartness: 0.7999999999999999
Wins: 2545, Losses: 4201, Draws: 1855
Episode: 8700, Win Rate: 0.29, Epsilon: 0.321, Smartness: 0.7999999999999999
Wins: 2560, Losses: 4252, Draws: 1889
Episode: 8800, Win Rate: 0.29, Epsilon: 0.272, Smartness: 0.7999999999999999
Win



Episode: 9000, Win Rate: 0.29, Epsilon: 1.000, Smartness: 0.8
Wins: 2609, Losses: 4394, Draws: 1998
Episode: 9100, Win Rate: 0.29, Epsilon: 0.850, Smartness: 0.8
Wins: 2612, Losses: 4481, Draws: 2008
Episode: 9200, Win Rate: 0.29, Epsilon: 0.722, Smartness: 0.8
Wins: 2625, Losses: 4554, Draws: 2022
Episode: 9300, Win Rate: 0.28, Epsilon: 0.614, Smartness: 0.8
Wins: 2633, Losses: 4632, Draws: 2036
Episode: 9400, Win Rate: 0.28, Epsilon: 0.522, Smartness: 0.8
Wins: 2645, Losses: 4698, Draws: 2058
Episode: 9500, Win Rate: 0.28, Epsilon: 0.444, Smartness: 0.8
Wins: 2657, Losses: 4759, Draws: 2085
Episode: 9600, Win Rate: 0.28, Epsilon: 0.377, Smartness: 0.8
Wins: 2666, Losses: 4822, Draws: 2113
Episode: 9700, Win Rate: 0.28, Epsilon: 0.321, Smartness: 0.8
Wins: 2674, Losses: 4873, Draws: 2154
Episode: 9800, Win Rate: 0.27, Epsilon: 0.272, Smartness: 0.8
Wins: 2688, Losses: 4923, Draws: 2190
Episode: 9900, Win Rate: 0.27, Epsilon: 0.232, Smartness: 0.8
Wins: 2706, Losses: 4971, Draws: 2224




In [8]:
import numpy as np
import random
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from TicTacToe import TicTacToe

class SQNAgent:
    def __init__(self, state_size=9, action_size=9, gamma=0.95):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma  
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.9995
        self.batch_size = 32
        self.replay_buffer = deque(maxlen=10000)
        self.model = self._build_model()

    def _build_model(self):
        model = Sequential([
            Dense(64, input_dim=self.state_size, activation='relu'),
            Dense(64, activation='relu'),
            Dense(self.action_size, activation='linear')
        ])
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def process_state(self, state):
        """
        Converts the state from [0,1,2] format to [-1,0,1] format for better learning.
        """
        processed_state = np.array(state).copy()
        processed_state[processed_state == 0] = -1
        processed_state[processed_state == 1] = 0
        processed_state[processed_state == 2] = 1
        return processed_state

    def select_action(self, state, valid_actions):
        if not valid_actions:
            return None

        if random.random() < self.epsilon:
            return random.choice(valid_actions)
        
        processed_state = self.process_state(state)
        q_values = self.model.predict(processed_state.reshape(1, -1), verbose=0)[0]
        valid_q_values = [(q_values[action], action) for action in valid_actions]
        return max(valid_q_values, key=lambda x: x[0])[1]

    def store_experience(self, state, action, reward, next_state, done):
        self.replay_buffer.append((state, action, reward, next_state, done))

    def train(self):
        if len(self.replay_buffer) < self.batch_size:
            return

        mini_batch = random.sample(self.replay_buffer, self.batch_size)
        states = np.array([self.process_state(exp[0]) for exp in mini_batch])
        next_states = np.array([self.process_state(exp[3]) for exp in mini_batch])
        current_q_values = self.model.predict(states, verbose=0)
        next_q_values = self.model.predict(next_states, verbose=0)

        x = []
        y = []

        for i, (state, action, reward, next_state, done) in enumerate(mini_batch):
            if done:
                target = reward
            else:
                next_valid_actions = [j for j, val in enumerate(next_state) if val == 0]
                if next_valid_actions:
                    max_next_q = max(next_q_values[i][action] for action in next_valid_actions)
                    target = reward + self.gamma * max_next_q
                else:
                    target = reward
            
            current_q = current_q_values[i].copy()
            current_q[action] = target
            x.append(self.process_state(state))
            y.append(current_q)

        self.model.fit(np.array(x), np.array(y), batch_size=32, epochs=1, verbose=0)

def train_agent(episodes=10000):
    agent = SQNAgent()
    history = {'wins': 0, 'losses': 0, 'draws': 0}
    initial_epsilon = 1.0
    min_epsilon = 0.01
    decay_episodes = 1000
    smartness=0
    for episode in range(episodes):
        if episode % 1000 == 0 and episode > 0:
            agent.epsilon = 1.0
            print("Resetting epsilon to 1.0")
        else:
            epsilon_decay = (initial_epsilon - min_epsilon) / decay_episodes
            agent.epsilon = max(min_epsilon, initial_epsilon - (episode % 1000) * epsilon_decay)
        if episode%1000==0 and episode>0:
            smartness = min(0.8, episode / (episodes * 0.9))
            
        game = TicTacToe(smartMovePlayer1=smartness)
        state = np.array(game.board)
        
        game.player1_move()
        state = np.array(game.board)
        
        while True:
            valid_actions = game.empty_positions()
            if not valid_actions:
                history['draws'] += 1
                break
                
            action = agent.select_action(state, valid_actions)
            game.make_move(action, player=2)
            
            reward = 0
            if game.current_winner == 2:
                reward = 1.0+smartness
                history['wins'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            
            game.player1_move()
            if game.current_winner == 1:
                reward = -1.0-smartness
                history['losses'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            elif game.is_full():
                reward = min(-0.1,-0.5*smartness)
                history['draws'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            
            agent.store_experience(state, action, reward, game.board, False)
            state = np.array(game.board)
        
        agent.train()
        
        if episode % 100 == 0:
            win_rate = history['wins'] / (episode + 1)
            print(f"Episode: {episode}, Win Rate: {win_rate:.2f}, Epsilon: {agent.epsilon:.3f}, smartmove{smartness}")
            print(f"Wins: {history['wins']}, Losses: {history['losses']}, Draws: {history['draws']}")
        
        if episode % 1000 == 0:
            agent.model.save(f'model11_episode_{episode}.h5')
    agent.model.save('model11.h5')
agent = train_agent()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Episode: 0, Win Rate: 0.00, Epsilon: 1.000, smartmove0
Wins: 0, Losses: 1, Draws: 0
Episode: 100, Win Rate: 0.27, Epsilon: 0.901, smartmove0
Wins: 27, Losses: 56, Draws: 18
Episode: 200, Win Rate: 0.27, Epsilon: 0.802, smartmove0
Wins: 55, Losses: 113, Draws: 33
Episode: 300, Win Rate: 0.27, Epsilon: 0.703, smartmove0
Wins: 81, Losses: 169, Draws: 51
Episode: 400, Win Rate: 0.29, Epsilon: 0.604, smartmove0
Wins: 116, Losses: 222, Draws: 63
Episode: 500, Win Rate: 0.30, Epsilon: 0.505, smartmove0
Wins: 149, Losses: 281, Draws: 71
Episode: 600, Win Rate: 0.30, Epsilon: 0.406, smartmove0
Wins: 183, Losses: 341, Draws: 77
Episode: 700, Win Rate: 0.31, Epsilon: 0.307, smartmove0
Wins: 216, Losses: 398, Draws: 87
Episode: 800, Win Rate: 0.32, Epsilon: 0.208, smartmove0
Wins: 255, Losses: 451, Draws: 95
Episode: 900, Win Rate: 0.32, Epsilon: 0.109, smartmove0
Wins: 291, Losses: 505, Draws: 105




Resetting epsilon to 1.0
Episode: 1000, Win Rate: 0.32, Epsilon: 1.000, smartmove0.1111111111111111
Wins: 323, Losses: 565, Draws: 113
Episode: 1100, Win Rate: 0.32, Epsilon: 0.901, smartmove0.1111111111111111
Wins: 348, Losses: 627, Draws: 126
Episode: 1200, Win Rate: 0.31, Epsilon: 0.802, smartmove0.1111111111111111
Wins: 377, Losses: 691, Draws: 133
Episode: 1300, Win Rate: 0.31, Epsilon: 0.703, smartmove0.1111111111111111
Wins: 407, Losses: 749, Draws: 145
Episode: 1400, Win Rate: 0.31, Epsilon: 0.604, smartmove0.1111111111111111
Wins: 431, Losses: 816, Draws: 154
Episode: 1500, Win Rate: 0.31, Epsilon: 0.505, smartmove0.1111111111111111
Wins: 463, Losses: 873, Draws: 165
Episode: 1600, Win Rate: 0.32, Epsilon: 0.406, smartmove0.1111111111111111
Wins: 505, Losses: 920, Draws: 176
Episode: 1700, Win Rate: 0.32, Epsilon: 0.307, smartmove0.1111111111111111
Wins: 543, Losses: 973, Draws: 185
Episode: 1800, Win Rate: 0.32, Epsilon: 0.208, smartmove0.1111111111111111
Wins: 579, Losses: 1



Resetting epsilon to 1.0
Episode: 2000, Win Rate: 0.33, Epsilon: 1.000, smartmove0.2222222222222222
Wins: 659, Losses: 1132, Draws: 210
Episode: 2100, Win Rate: 0.32, Epsilon: 0.901, smartmove0.2222222222222222
Wins: 678, Losses: 1202, Draws: 221
Episode: 2200, Win Rate: 0.32, Epsilon: 0.802, smartmove0.2222222222222222
Wins: 698, Losses: 1269, Draws: 234
Episode: 2300, Win Rate: 0.32, Epsilon: 0.703, smartmove0.2222222222222222
Wins: 730, Losses: 1328, Draws: 243
Episode: 2400, Win Rate: 0.31, Epsilon: 0.604, smartmove0.2222222222222222
Wins: 752, Losses: 1394, Draws: 255
Episode: 2500, Win Rate: 0.31, Epsilon: 0.505, smartmove0.2222222222222222
Wins: 784, Losses: 1455, Draws: 262
Episode: 2600, Win Rate: 0.31, Epsilon: 0.406, smartmove0.2222222222222222
Wins: 818, Losses: 1512, Draws: 271
Episode: 2700, Win Rate: 0.32, Epsilon: 0.307, smartmove0.2222222222222222
Wins: 858, Losses: 1568, Draws: 275
Episode: 2800, Win Rate: 0.32, Epsilon: 0.208, smartmove0.2222222222222222
Wins: 884, L



Resetting epsilon to 1.0
Episode: 3000, Win Rate: 0.32, Epsilon: 1.000, smartmove0.3333333333333333
Wins: 947, Losses: 1751, Draws: 303
Episode: 3100, Win Rate: 0.31, Epsilon: 0.901, smartmove0.3333333333333333
Wins: 972, Losses: 1813, Draws: 316
Episode: 3200, Win Rate: 0.31, Epsilon: 0.802, smartmove0.3333333333333333
Wins: 987, Losses: 1889, Draws: 325
Episode: 3300, Win Rate: 0.30, Epsilon: 0.703, smartmove0.3333333333333333
Wins: 1005, Losses: 1957, Draws: 339
Episode: 3400, Win Rate: 0.30, Epsilon: 0.604, smartmove0.3333333333333333
Wins: 1032, Losses: 2018, Draws: 351
Episode: 3500, Win Rate: 0.30, Epsilon: 0.505, smartmove0.3333333333333333
Wins: 1052, Losses: 2086, Draws: 363
Episode: 3600, Win Rate: 0.30, Epsilon: 0.406, smartmove0.3333333333333333
Wins: 1083, Losses: 2146, Draws: 372
Episode: 3700, Win Rate: 0.30, Epsilon: 0.307, smartmove0.3333333333333333
Wins: 1116, Losses: 2203, Draws: 382
Episode: 3800, Win Rate: 0.30, Epsilon: 0.208, smartmove0.3333333333333333
Wins: 1



Resetting epsilon to 1.0
Episode: 4000, Win Rate: 0.30, Epsilon: 1.000, smartmove0.4444444444444444
Wins: 1201, Losses: 2391, Draws: 409
Episode: 4100, Win Rate: 0.30, Epsilon: 0.901, smartmove0.4444444444444444
Wins: 1214, Losses: 2467, Draws: 420
Episode: 4200, Win Rate: 0.29, Epsilon: 0.802, smartmove0.4444444444444444
Wins: 1225, Losses: 2537, Draws: 439
Episode: 4300, Win Rate: 0.29, Epsilon: 0.703, smartmove0.4444444444444444
Wins: 1248, Losses: 2605, Draws: 448
Episode: 4400, Win Rate: 0.29, Epsilon: 0.604, smartmove0.4444444444444444
Wins: 1262, Losses: 2682, Draws: 457
Episode: 4500, Win Rate: 0.29, Epsilon: 0.505, smartmove0.4444444444444444
Wins: 1294, Losses: 2742, Draws: 465
Episode: 4600, Win Rate: 0.29, Epsilon: 0.406, smartmove0.4444444444444444
Wins: 1320, Losses: 2806, Draws: 475
Episode: 4700, Win Rate: 0.29, Epsilon: 0.307, smartmove0.4444444444444444
Wins: 1341, Losses: 2871, Draws: 489
Episode: 4800, Win Rate: 0.28, Epsilon: 0.208, smartmove0.4444444444444444
Wins



Resetting epsilon to 1.0
Episode: 5000, Win Rate: 0.28, Epsilon: 1.000, smartmove0.5555555555555556
Wins: 1410, Losses: 3066, Draws: 525
Episode: 5100, Win Rate: 0.28, Epsilon: 0.901, smartmove0.5555555555555556
Wins: 1421, Losses: 3144, Draws: 536
Episode: 5200, Win Rate: 0.28, Epsilon: 0.802, smartmove0.5555555555555556
Wins: 1435, Losses: 3224, Draws: 542
Episode: 5300, Win Rate: 0.27, Epsilon: 0.703, smartmove0.5555555555555556
Wins: 1445, Losses: 3296, Draws: 560
Episode: 5400, Win Rate: 0.27, Epsilon: 0.604, smartmove0.5555555555555556
Wins: 1460, Losses: 3369, Draws: 572
Episode: 5500, Win Rate: 0.27, Epsilon: 0.505, smartmove0.5555555555555556
Wins: 1481, Losses: 3433, Draws: 587
Episode: 5600, Win Rate: 0.27, Epsilon: 0.406, smartmove0.5555555555555556
Wins: 1499, Losses: 3506, Draws: 596
Episode: 5700, Win Rate: 0.27, Epsilon: 0.307, smartmove0.5555555555555556
Wins: 1520, Losses: 3574, Draws: 607
Episode: 5800, Win Rate: 0.27, Epsilon: 0.208, smartmove0.5555555555555556
Wins



Resetting epsilon to 1.0
Episode: 6000, Win Rate: 0.27, Epsilon: 1.000, smartmove0.6666666666666666
Wins: 1592, Losses: 3763, Draws: 646
Episode: 6100, Win Rate: 0.26, Epsilon: 0.901, smartmove0.6666666666666666
Wins: 1599, Losses: 3845, Draws: 657
Episode: 6200, Win Rate: 0.26, Epsilon: 0.802, smartmove0.6666666666666666
Wins: 1609, Losses: 3926, Draws: 666
Episode: 6300, Win Rate: 0.26, Epsilon: 0.703, smartmove0.6666666666666666
Wins: 1617, Losses: 4007, Draws: 677
Episode: 6400, Win Rate: 0.25, Epsilon: 0.604, smartmove0.6666666666666666
Wins: 1628, Losses: 4080, Draws: 693
Episode: 6500, Win Rate: 0.25, Epsilon: 0.505, smartmove0.6666666666666666
Wins: 1650, Losses: 4148, Draws: 703
Episode: 6600, Win Rate: 0.25, Epsilon: 0.406, smartmove0.6666666666666666
Wins: 1664, Losses: 4222, Draws: 715
Episode: 6700, Win Rate: 0.25, Epsilon: 0.307, smartmove0.6666666666666666
Wins: 1683, Losses: 4295, Draws: 723
Episode: 6800, Win Rate: 0.25, Epsilon: 0.208, smartmove0.6666666666666666
Wins



Resetting epsilon to 1.0
Episode: 7000, Win Rate: 0.25, Epsilon: 1.000, smartmove0.7777777777777778
Wins: 1745, Losses: 4472, Draws: 784
Episode: 7100, Win Rate: 0.25, Epsilon: 0.901, smartmove0.7777777777777778
Wins: 1748, Losses: 4561, Draws: 792
Episode: 7200, Win Rate: 0.24, Epsilon: 0.802, smartmove0.7777777777777778
Wins: 1762, Losses: 4634, Draws: 805
Episode: 7300, Win Rate: 0.24, Epsilon: 0.703, smartmove0.7777777777777778
Wins: 1768, Losses: 4713, Draws: 820
Episode: 7400, Win Rate: 0.24, Epsilon: 0.604, smartmove0.7777777777777778
Wins: 1771, Losses: 4797, Draws: 833
Episode: 7500, Win Rate: 0.24, Epsilon: 0.505, smartmove0.7777777777777778
Wins: 1784, Losses: 4869, Draws: 848
Episode: 7600, Win Rate: 0.24, Epsilon: 0.406, smartmove0.7777777777777778
Wins: 1794, Losses: 4943, Draws: 864
Episode: 7700, Win Rate: 0.23, Epsilon: 0.307, smartmove0.7777777777777778
Wins: 1808, Losses: 5015, Draws: 878
Episode: 7800, Win Rate: 0.23, Epsilon: 0.208, smartmove0.7777777777777778
Wins



Resetting epsilon to 1.0
Episode: 8000, Win Rate: 0.23, Epsilon: 1.000, smartmove0.8
Wins: 1852, Losses: 5206, Draws: 943
Episode: 8100, Win Rate: 0.23, Epsilon: 0.901, smartmove0.8
Wins: 1859, Losses: 5291, Draws: 951
Episode: 8200, Win Rate: 0.23, Epsilon: 0.802, smartmove0.8
Wins: 1870, Losses: 5371, Draws: 960
Episode: 8300, Win Rate: 0.23, Epsilon: 0.703, smartmove0.8
Wins: 1884, Losses: 5447, Draws: 970
Episode: 8400, Win Rate: 0.22, Epsilon: 0.604, smartmove0.8
Wins: 1890, Losses: 5525, Draws: 986
Episode: 8500, Win Rate: 0.22, Epsilon: 0.505, smartmove0.8
Wins: 1901, Losses: 5596, Draws: 1004
Episode: 8600, Win Rate: 0.22, Epsilon: 0.406, smartmove0.8
Wins: 1913, Losses: 5670, Draws: 1018
Episode: 8700, Win Rate: 0.22, Epsilon: 0.307, smartmove0.8
Wins: 1925, Losses: 5739, Draws: 1037
Episode: 8800, Win Rate: 0.22, Epsilon: 0.208, smartmove0.8
Wins: 1931, Losses: 5815, Draws: 1055
Episode: 8900, Win Rate: 0.22, Epsilon: 0.109, smartmove0.8
Wins: 1949, Losses: 5876, Draws: 1076




Resetting epsilon to 1.0
Episode: 9000, Win Rate: 0.22, Epsilon: 1.000, smartmove0.8
Wins: 1964, Losses: 5937, Draws: 1100
Episode: 9100, Win Rate: 0.22, Epsilon: 0.901, smartmove0.8
Wins: 1968, Losses: 6025, Draws: 1108
Episode: 9200, Win Rate: 0.21, Epsilon: 0.802, smartmove0.8
Wins: 1975, Losses: 6105, Draws: 1121
Episode: 9300, Win Rate: 0.21, Epsilon: 0.703, smartmove0.8
Wins: 1981, Losses: 6183, Draws: 1137
Episode: 9400, Win Rate: 0.21, Epsilon: 0.604, smartmove0.8
Wins: 1991, Losses: 6254, Draws: 1156
Episode: 9500, Win Rate: 0.21, Epsilon: 0.505, smartmove0.8
Wins: 1996, Losses: 6332, Draws: 1173
Episode: 9600, Win Rate: 0.21, Epsilon: 0.406, smartmove0.8
Wins: 2008, Losses: 6403, Draws: 1190
Episode: 9700, Win Rate: 0.21, Epsilon: 0.307, smartmove0.8
Wins: 2021, Losses: 6471, Draws: 1209
Episode: 9800, Win Rate: 0.21, Epsilon: 0.208, smartmove0.8
Wins: 2037, Losses: 6528, Draws: 1236
Episode: 9900, Win Rate: 0.21, Epsilon: 0.109, smartmove0.8
Wins: 2056, Losses: 6589, Draws: 



In [12]:
import numpy as np
import random
from collections import deque
from tensorflow.keras.models import load_model, Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from TicTacToe import TicTacToe
import os

class SQNAgent:
    def __init__(self, state_size=9, action_size=9, gamma=0.95, model_path=None):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma  
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.9995
        self.batch_size = 32
        self.replay_buffer = deque(maxlen=10000)
        
        if model_path and os.path.isfile(model_path):
            print(f"Loading model from {model_path}")
            self.model = load_model(model_path)
        else:
            print("No existing model found; initializing a new model.")
            self.model = self._build_model()

    def _build_model(self):
        model = Sequential([
            Dense(64, input_dim=self.state_size, activation='relu'),
            Dense(64, activation='relu'),
            Dense(self.action_size, activation='linear')
        ])
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def process_state(self, state):
        """
        Converts the state from [0,1,2] format to [-1,0,1] format for better learning.
        """
        processed_state = np.array(state).copy()
        processed_state[processed_state == 0] = -1
        processed_state[processed_state == 1] = 0
        processed_state[processed_state == 2] = 1
        return processed_state

    def select_action(self, state, valid_actions):
        if not valid_actions:
            return None

        if random.random() < self.epsilon:
            return random.choice(valid_actions)
        
        processed_state = self.process_state(state)
        q_values = self.model.predict(processed_state.reshape(1, -1), verbose=0)[0]
        valid_q_values = [(q_values[action], action) for action in valid_actions]
        return max(valid_q_values, key=lambda x: x[0])[1]

    def store_experience(self, state, action, reward, next_state, done):
        self.replay_buffer.append((state, action, reward, next_state, done))

    def train(self):
        if len(self.replay_buffer) < self.batch_size:
            return

        mini_batch = random.sample(self.replay_buffer, self.batch_size)
        states = np.array([self.process_state(exp[0]) for exp in mini_batch])
        next_states = np.array([self.process_state(exp[3]) for exp in mini_batch])
        current_q_values = self.model.predict(states, verbose=0)
        next_q_values = self.model.predict(next_states, verbose=0)

        x = []
        y = []

        for i, (state, action, reward, next_state, done) in enumerate(mini_batch):
            if done:
                target = reward
            else:
                next_valid_actions = [j for j, val in enumerate(next_state) if val == 0]
                if next_valid_actions:
                    max_next_q = max(next_q_values[i][action] for action in next_valid_actions)
                    target = reward + self.gamma * max_next_q
                else:
                    target = reward
            
            current_q = current_q_values[i].copy()
            current_q[action] = target
            x.append(self.process_state(state))
            y.append(current_q)

        self.model.fit(np.array(x), np.array(y), batch_size=32, epochs=1, verbose=0)

def train_agent(episodes=20000):
    agent = SQNAgent()
    history = {'wins': 0, 'losses': 0, 'draws': 0}
    initial_epsilon = 1.0
    min_epsilon = 0.01
    decay_episodes = 1000
    smartness=0
    for episode in range(episodes):
        if episode % 2000 == 0 and episode > 0:
            agent.epsilon = 1.0
            print("Resetting epsilon to 1.0")
        else:
            epsilon_decay = (initial_epsilon - min_epsilon) / decay_episodes
            agent.epsilon = max(min_epsilon, initial_epsilon - (episode % 2000) * epsilon_decay)
        if episode%2000==0 and episode>0:
            smartness = min(0.8, episode / (episodes * 0.9))
            
        game = TicTacToe(smartMovePlayer1=smartness)
        state = np.array(game.board)
        
        game.player1_move()
        state = np.array(game.board)
        
        while True:
            valid_actions = game.empty_positions()
            if not valid_actions:
                history['draws'] += 1
                break
                
            action = agent.select_action(state, valid_actions)
            game.make_move(action, player=2)
            
            reward = 0
            if game.current_winner == 2:
                reward = 1.0+smartness
                history['wins'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            
            game.player1_move()
            if game.current_winner == 1:
                reward = -1.0-smartness
                history['losses'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            elif game.is_full():
                reward = min(-0.1,-0.5*smartness)
                history['draws'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            
            agent.store_experience(state, action, reward, game.board, False)
            state = np.array(game.board)
        
        agent.train()
        
        if episode % 100 == 0:
            win_rate = history['wins'] / (episode + 1)
            print(f"Episode: {episode}, Win Rate: {win_rate:.2f}, Epsilon: {agent.epsilon:.3f}, smartmove{smartness}")
            print(f"Wins: {history['wins']}, Losses: {history['losses']}, Draws: {history['draws']}")
        
        if episode % 1000 == 0:
            agent.model.save(f'model11_retrain_episode_{episode}.h5')
    agent.model.save('model11_retrain.h5')
agent = train_agent()




No existing model found; initializing a new model.
Episode: 0, Win Rate: 0.00, Epsilon: 1.000, smartmove0
Wins: 0, Losses: 1, Draws: 0
Episode: 100, Win Rate: 0.30, Epsilon: 0.901, smartmove0
Wins: 30, Losses: 61, Draws: 10
Episode: 200, Win Rate: 0.27, Epsilon: 0.802, smartmove0
Wins: 54, Losses: 121, Draws: 26
Episode: 300, Win Rate: 0.28, Epsilon: 0.703, smartmove0
Wins: 85, Losses: 180, Draws: 36
Episode: 400, Win Rate: 0.30, Epsilon: 0.604, smartmove0
Wins: 122, Losses: 235, Draws: 44
Episode: 500, Win Rate: 0.32, Epsilon: 0.505, smartmove0
Wins: 159, Losses: 284, Draws: 58
Episode: 600, Win Rate: 0.31, Epsilon: 0.406, smartmove0
Wins: 185, Losses: 347, Draws: 69
Episode: 700, Win Rate: 0.32, Epsilon: 0.307, smartmove0
Wins: 223, Losses: 402, Draws: 76
Episode: 800, Win Rate: 0.32, Epsilon: 0.208, smartmove0
Wins: 258, Losses: 459, Draws: 84
Episode: 900, Win Rate: 0.33, Epsilon: 0.109, smartmove0
Wins: 299, Losses: 508, Draws: 94




Episode: 1000, Win Rate: 0.34, Epsilon: 0.010, smartmove0
Wins: 339, Losses: 561, Draws: 101
Episode: 1100, Win Rate: 0.34, Epsilon: 0.010, smartmove0
Wins: 377, Losses: 614, Draws: 110
Episode: 1200, Win Rate: 0.35, Epsilon: 0.010, smartmove0
Wins: 419, Losses: 666, Draws: 116
Episode: 1300, Win Rate: 0.36, Epsilon: 0.010, smartmove0
Wins: 463, Losses: 713, Draws: 125
Episode: 1400, Win Rate: 0.36, Epsilon: 0.010, smartmove0
Wins: 503, Losses: 765, Draws: 133
Episode: 1500, Win Rate: 0.37, Epsilon: 0.010, smartmove0
Wins: 549, Losses: 811, Draws: 141
Episode: 1600, Win Rate: 0.37, Epsilon: 0.010, smartmove0
Wins: 592, Losses: 858, Draws: 151
Episode: 1700, Win Rate: 0.37, Epsilon: 0.010, smartmove0
Wins: 621, Losses: 918, Draws: 162
Episode: 1800, Win Rate: 0.36, Epsilon: 0.010, smartmove0
Wins: 652, Losses: 981, Draws: 168
Episode: 1900, Win Rate: 0.36, Epsilon: 0.010, smartmove0
Wins: 690, Losses: 1031, Draws: 180




Resetting epsilon to 1.0
Episode: 2000, Win Rate: 0.36, Epsilon: 1.000, smartmove0.1111111111111111
Wins: 727, Losses: 1081, Draws: 193
Episode: 2100, Win Rate: 0.35, Epsilon: 0.901, smartmove0.1111111111111111
Wins: 744, Losses: 1146, Draws: 211
Episode: 2200, Win Rate: 0.35, Epsilon: 0.802, smartmove0.1111111111111111
Wins: 773, Losses: 1206, Draws: 222
Episode: 2300, Win Rate: 0.35, Epsilon: 0.703, smartmove0.1111111111111111
Wins: 801, Losses: 1265, Draws: 235
Episode: 2400, Win Rate: 0.35, Epsilon: 0.604, smartmove0.1111111111111111
Wins: 831, Losses: 1326, Draws: 244
Episode: 2500, Win Rate: 0.34, Epsilon: 0.505, smartmove0.1111111111111111
Wins: 862, Losses: 1386, Draws: 253
Episode: 2600, Win Rate: 0.34, Epsilon: 0.406, smartmove0.1111111111111111
Wins: 888, Losses: 1451, Draws: 262
Episode: 2700, Win Rate: 0.34, Epsilon: 0.307, smartmove0.1111111111111111
Wins: 922, Losses: 1500, Draws: 279
Episode: 2800, Win Rate: 0.34, Epsilon: 0.208, smartmove0.1111111111111111
Wins: 954, L



Episode: 3000, Win Rate: 0.34, Epsilon: 0.010, smartmove0.1111111111111111
Wins: 1028, Losses: 1647, Draws: 326
Episode: 3100, Win Rate: 0.35, Epsilon: 0.010, smartmove0.1111111111111111
Wins: 1074, Losses: 1691, Draws: 336
Episode: 3200, Win Rate: 0.35, Epsilon: 0.010, smartmove0.1111111111111111
Wins: 1110, Losses: 1743, Draws: 348
Episode: 3300, Win Rate: 0.35, Epsilon: 0.010, smartmove0.1111111111111111
Wins: 1153, Losses: 1790, Draws: 358
Episode: 3400, Win Rate: 0.35, Epsilon: 0.010, smartmove0.1111111111111111
Wins: 1196, Losses: 1831, Draws: 374
Episode: 3500, Win Rate: 0.35, Epsilon: 0.010, smartmove0.1111111111111111
Wins: 1237, Losses: 1873, Draws: 391
Episode: 3600, Win Rate: 0.36, Epsilon: 0.010, smartmove0.1111111111111111
Wins: 1281, Losses: 1917, Draws: 403
Episode: 3700, Win Rate: 0.35, Epsilon: 0.010, smartmove0.1111111111111111
Wins: 1312, Losses: 1967, Draws: 422
Episode: 3800, Win Rate: 0.36, Epsilon: 0.010, smartmove0.1111111111111111
Wins: 1359, Losses: 2012, Dra



Resetting epsilon to 1.0
Episode: 4000, Win Rate: 0.36, Epsilon: 1.000, smartmove0.2222222222222222
Wins: 1431, Losses: 2111, Draws: 459
Episode: 4100, Win Rate: 0.35, Epsilon: 0.901, smartmove0.2222222222222222
Wins: 1450, Losses: 2181, Draws: 470
Episode: 4200, Win Rate: 0.35, Epsilon: 0.802, smartmove0.2222222222222222
Wins: 1468, Losses: 2247, Draws: 486
Episode: 4300, Win Rate: 0.35, Epsilon: 0.703, smartmove0.2222222222222222
Wins: 1496, Losses: 2308, Draws: 497
Episode: 4400, Win Rate: 0.34, Epsilon: 0.604, smartmove0.2222222222222222
Wins: 1518, Losses: 2379, Draws: 504
Episode: 4500, Win Rate: 0.34, Epsilon: 0.505, smartmove0.2222222222222222
Wins: 1547, Losses: 2439, Draws: 515
Episode: 4600, Win Rate: 0.34, Epsilon: 0.406, smartmove0.2222222222222222
Wins: 1570, Losses: 2498, Draws: 533
Episode: 4700, Win Rate: 0.34, Epsilon: 0.307, smartmove0.2222222222222222
Wins: 1594, Losses: 2565, Draws: 542
Episode: 4800, Win Rate: 0.34, Epsilon: 0.208, smartmove0.2222222222222222
Wins



Episode: 5000, Win Rate: 0.34, Epsilon: 0.010, smartmove0.2222222222222222
Wins: 1677, Losses: 2734, Draws: 590
Episode: 5100, Win Rate: 0.34, Epsilon: 0.010, smartmove0.2222222222222222
Wins: 1712, Losses: 2777, Draws: 612
Episode: 5200, Win Rate: 0.34, Epsilon: 0.010, smartmove0.2222222222222222
Wins: 1749, Losses: 2823, Draws: 629
Episode: 5300, Win Rate: 0.33, Epsilon: 0.010, smartmove0.2222222222222222
Wins: 1773, Losses: 2876, Draws: 652
Episode: 5400, Win Rate: 0.34, Epsilon: 0.010, smartmove0.2222222222222222
Wins: 1814, Losses: 2923, Draws: 664
Episode: 5500, Win Rate: 0.34, Epsilon: 0.010, smartmove0.2222222222222222
Wins: 1859, Losses: 2961, Draws: 681
Episode: 5600, Win Rate: 0.34, Epsilon: 0.010, smartmove0.2222222222222222
Wins: 1894, Losses: 2999, Draws: 708
Episode: 5700, Win Rate: 0.34, Epsilon: 0.010, smartmove0.2222222222222222
Wins: 1930, Losses: 3048, Draws: 723
Episode: 5800, Win Rate: 0.34, Epsilon: 0.010, smartmove0.2222222222222222
Wins: 1971, Losses: 3090, Dra



Resetting epsilon to 1.0
Episode: 6000, Win Rate: 0.34, Epsilon: 1.000, smartmove0.3333333333333333
Wins: 2041, Losses: 3185, Draws: 775
Episode: 6100, Win Rate: 0.34, Epsilon: 0.901, smartmove0.3333333333333333
Wins: 2058, Losses: 3253, Draws: 790
Episode: 6200, Win Rate: 0.34, Epsilon: 0.802, smartmove0.3333333333333333
Wins: 2087, Losses: 3307, Draws: 807
Episode: 6300, Win Rate: 0.33, Epsilon: 0.703, smartmove0.3333333333333333
Wins: 2106, Losses: 3378, Draws: 817
Episode: 6400, Win Rate: 0.33, Epsilon: 0.604, smartmove0.3333333333333333
Wins: 2133, Losses: 3435, Draws: 833
Episode: 6500, Win Rate: 0.33, Epsilon: 0.505, smartmove0.3333333333333333
Wins: 2164, Losses: 3489, Draws: 848
Episode: 6600, Win Rate: 0.33, Epsilon: 0.406, smartmove0.3333333333333333
Wins: 2190, Losses: 3544, Draws: 867
Episode: 6700, Win Rate: 0.33, Epsilon: 0.307, smartmove0.3333333333333333
Wins: 2219, Losses: 3592, Draws: 890
Episode: 6800, Win Rate: 0.33, Epsilon: 0.208, smartmove0.3333333333333333
Wins



Episode: 7000, Win Rate: 0.33, Epsilon: 0.010, smartmove0.3333333333333333
Wins: 2313, Losses: 3748, Draws: 940
Episode: 7100, Win Rate: 0.33, Epsilon: 0.010, smartmove0.3333333333333333
Wins: 2344, Losses: 3798, Draws: 959
Episode: 7200, Win Rate: 0.33, Epsilon: 0.010, smartmove0.3333333333333333
Wins: 2384, Losses: 3844, Draws: 973
Episode: 7300, Win Rate: 0.33, Epsilon: 0.010, smartmove0.3333333333333333
Wins: 2408, Losses: 3901, Draws: 992
Episode: 7400, Win Rate: 0.33, Epsilon: 0.010, smartmove0.3333333333333333
Wins: 2442, Losses: 3947, Draws: 1012
Episode: 7500, Win Rate: 0.33, Epsilon: 0.010, smartmove0.3333333333333333
Wins: 2479, Losses: 3991, Draws: 1031
Episode: 7600, Win Rate: 0.33, Epsilon: 0.010, smartmove0.3333333333333333
Wins: 2508, Losses: 4042, Draws: 1051
Episode: 7700, Win Rate: 0.33, Epsilon: 0.010, smartmove0.3333333333333333
Wins: 2532, Losses: 4099, Draws: 1070
Episode: 7800, Win Rate: 0.33, Epsilon: 0.010, smartmove0.3333333333333333
Wins: 2558, Losses: 4149,



Resetting epsilon to 1.0
Episode: 8000, Win Rate: 0.33, Epsilon: 1.000, smartmove0.4444444444444444
Wins: 2617, Losses: 4253, Draws: 1131
Episode: 8100, Win Rate: 0.33, Epsilon: 0.901, smartmove0.4444444444444444
Wins: 2633, Losses: 4325, Draws: 1143
Episode: 8200, Win Rate: 0.32, Epsilon: 0.802, smartmove0.4444444444444444
Wins: 2651, Losses: 4389, Draws: 1161
Episode: 8300, Win Rate: 0.32, Epsilon: 0.703, smartmove0.4444444444444444
Wins: 2680, Losses: 4452, Draws: 1169
Episode: 8400, Win Rate: 0.32, Epsilon: 0.604, smartmove0.4444444444444444
Wins: 2700, Losses: 4522, Draws: 1179
Episode: 8500, Win Rate: 0.32, Epsilon: 0.505, smartmove0.4444444444444444
Wins: 2724, Losses: 4586, Draws: 1191
Episode: 8600, Win Rate: 0.32, Epsilon: 0.406, smartmove0.4444444444444444
Wins: 2758, Losses: 4639, Draws: 1204
Episode: 8700, Win Rate: 0.32, Epsilon: 0.307, smartmove0.4444444444444444
Wins: 2775, Losses: 4705, Draws: 1221
Episode: 8800, Win Rate: 0.32, Epsilon: 0.208, smartmove0.4444444444444



Episode: 9000, Win Rate: 0.32, Epsilon: 0.010, smartmove0.4444444444444444
Wins: 2849, Losses: 4877, Draws: 1275
Episode: 9100, Win Rate: 0.32, Epsilon: 0.010, smartmove0.4444444444444444
Wins: 2887, Losses: 4920, Draws: 1294
Episode: 9200, Win Rate: 0.32, Epsilon: 0.010, smartmove0.4444444444444444
Wins: 2928, Losses: 4967, Draws: 1306
Episode: 9300, Win Rate: 0.32, Epsilon: 0.010, smartmove0.4444444444444444
Wins: 2952, Losses: 5014, Draws: 1335
Episode: 9400, Win Rate: 0.32, Epsilon: 0.010, smartmove0.4444444444444444
Wins: 2983, Losses: 5058, Draws: 1360
Episode: 9500, Win Rate: 0.32, Epsilon: 0.010, smartmove0.4444444444444444
Wins: 3009, Losses: 5113, Draws: 1379
Episode: 9600, Win Rate: 0.32, Epsilon: 0.010, smartmove0.4444444444444444
Wins: 3039, Losses: 5163, Draws: 1399
Episode: 9700, Win Rate: 0.32, Epsilon: 0.010, smartmove0.4444444444444444
Wins: 3076, Losses: 5200, Draws: 1425
Episode: 9800, Win Rate: 0.32, Epsilon: 0.010, smartmove0.4444444444444444
Wins: 3099, Losses: 5



Resetting epsilon to 1.0
Episode: 10000, Win Rate: 0.32, Epsilon: 1.000, smartmove0.5555555555555556
Wins: 3157, Losses: 5360, Draws: 1484
Episode: 10100, Win Rate: 0.31, Epsilon: 0.901, smartmove0.5555555555555556
Wins: 3169, Losses: 5436, Draws: 1496
Episode: 10200, Win Rate: 0.31, Epsilon: 0.802, smartmove0.5555555555555556
Wins: 3184, Losses: 5511, Draws: 1506
Episode: 10300, Win Rate: 0.31, Epsilon: 0.703, smartmove0.5555555555555556
Wins: 3198, Losses: 5588, Draws: 1515
Episode: 10400, Win Rate: 0.31, Epsilon: 0.604, smartmove0.5555555555555556
Wins: 3214, Losses: 5659, Draws: 1528
Episode: 10500, Win Rate: 0.31, Epsilon: 0.505, smartmove0.5555555555555556
Wins: 3228, Losses: 5733, Draws: 1540
Episode: 10600, Win Rate: 0.31, Epsilon: 0.406, smartmove0.5555555555555556
Wins: 3257, Losses: 5787, Draws: 1557
Episode: 10700, Win Rate: 0.31, Epsilon: 0.307, smartmove0.5555555555555556
Wins: 3278, Losses: 5848, Draws: 1575
Episode: 10800, Win Rate: 0.31, Epsilon: 0.208, smartmove0.5555



Episode: 11000, Win Rate: 0.30, Epsilon: 0.010, smartmove0.5555555555555556
Wins: 3337, Losses: 6020, Draws: 1644
Episode: 11100, Win Rate: 0.30, Epsilon: 0.010, smartmove0.5555555555555556
Wins: 3367, Losses: 6066, Draws: 1668
Episode: 11200, Win Rate: 0.30, Epsilon: 0.010, smartmove0.5555555555555556
Wins: 3394, Losses: 6121, Draws: 1686
Episode: 11300, Win Rate: 0.30, Epsilon: 0.010, smartmove0.5555555555555556
Wins: 3424, Losses: 6160, Draws: 1717
Episode: 11400, Win Rate: 0.30, Epsilon: 0.010, smartmove0.5555555555555556
Wins: 3454, Losses: 6198, Draws: 1749
Episode: 11500, Win Rate: 0.30, Epsilon: 0.010, smartmove0.5555555555555556
Wins: 3484, Losses: 6243, Draws: 1774
Episode: 11600, Win Rate: 0.30, Epsilon: 0.010, smartmove0.5555555555555556
Wins: 3513, Losses: 6291, Draws: 1797
Episode: 11700, Win Rate: 0.30, Epsilon: 0.010, smartmove0.5555555555555556
Wins: 3535, Losses: 6342, Draws: 1824
Episode: 11800, Win Rate: 0.30, Epsilon: 0.010, smartmove0.5555555555555556
Wins: 3560, 



Resetting epsilon to 1.0
Episode: 12000, Win Rate: 0.30, Epsilon: 1.000, smartmove0.6666666666666666
Wins: 3627, Losses: 6471, Draws: 1903
Episode: 12100, Win Rate: 0.30, Epsilon: 0.901, smartmove0.6666666666666666
Wins: 3635, Losses: 6550, Draws: 1916
Episode: 12200, Win Rate: 0.30, Epsilon: 0.802, smartmove0.6666666666666666
Wins: 3647, Losses: 6628, Draws: 1926
Episode: 12300, Win Rate: 0.30, Epsilon: 0.703, smartmove0.6666666666666666
Wins: 3665, Losses: 6694, Draws: 1942
Episode: 12400, Win Rate: 0.30, Epsilon: 0.604, smartmove0.6666666666666666
Wins: 3676, Losses: 6765, Draws: 1960
Episode: 12500, Win Rate: 0.30, Epsilon: 0.505, smartmove0.6666666666666666
Wins: 3689, Losses: 6830, Draws: 1982
Episode: 12600, Win Rate: 0.29, Epsilon: 0.406, smartmove0.6666666666666666
Wins: 3699, Losses: 6903, Draws: 1999
Episode: 12700, Win Rate: 0.29, Epsilon: 0.307, smartmove0.6666666666666666
Wins: 3717, Losses: 6962, Draws: 2022
Episode: 12800, Win Rate: 0.29, Epsilon: 0.208, smartmove0.6666



Episode: 13000, Win Rate: 0.29, Epsilon: 0.010, smartmove0.6666666666666666
Wins: 3775, Losses: 7140, Draws: 2086
Episode: 13100, Win Rate: 0.29, Epsilon: 0.010, smartmove0.6666666666666666
Wins: 3804, Losses: 7188, Draws: 2109
Episode: 13200, Win Rate: 0.29, Epsilon: 0.010, smartmove0.6666666666666666
Wins: 3821, Losses: 7236, Draws: 2144
Episode: 13300, Win Rate: 0.29, Epsilon: 0.010, smartmove0.6666666666666666
Wins: 3834, Losses: 7293, Draws: 2174
Episode: 13400, Win Rate: 0.29, Epsilon: 0.010, smartmove0.6666666666666666
Wins: 3859, Losses: 7350, Draws: 2192
Episode: 13500, Win Rate: 0.29, Epsilon: 0.010, smartmove0.6666666666666666
Wins: 3874, Losses: 7411, Draws: 2216
Episode: 13600, Win Rate: 0.29, Epsilon: 0.010, smartmove0.6666666666666666
Wins: 3903, Losses: 7455, Draws: 2243
Episode: 13700, Win Rate: 0.29, Epsilon: 0.010, smartmove0.6666666666666666
Wins: 3928, Losses: 7499, Draws: 2274
Episode: 13800, Win Rate: 0.29, Epsilon: 0.010, smartmove0.6666666666666666
Wins: 3958, 



Resetting epsilon to 1.0
Episode: 14000, Win Rate: 0.28, Epsilon: 1.000, smartmove0.7777777777777778
Wins: 3985, Losses: 7651, Draws: 2365
Episode: 14100, Win Rate: 0.28, Epsilon: 0.901, smartmove0.7777777777777778
Wins: 3996, Losses: 7733, Draws: 2372
Episode: 14200, Win Rate: 0.28, Epsilon: 0.802, smartmove0.7777777777777778
Wins: 4005, Losses: 7821, Draws: 2375
Episode: 14300, Win Rate: 0.28, Epsilon: 0.703, smartmove0.7777777777777778
Wins: 4012, Losses: 7903, Draws: 2386
Episode: 14400, Win Rate: 0.28, Epsilon: 0.604, smartmove0.7777777777777778
Wins: 4023, Losses: 7983, Draws: 2395
Episode: 14500, Win Rate: 0.28, Epsilon: 0.505, smartmove0.7777777777777778
Wins: 4040, Losses: 8047, Draws: 2414
Episode: 14600, Win Rate: 0.28, Epsilon: 0.406, smartmove0.7777777777777778
Wins: 4056, Losses: 8114, Draws: 2431
Episode: 14700, Win Rate: 0.28, Epsilon: 0.307, smartmove0.7777777777777778
Wins: 4068, Losses: 8181, Draws: 2452
Episode: 14800, Win Rate: 0.28, Epsilon: 0.208, smartmove0.7777



Episode: 15000, Win Rate: 0.27, Epsilon: 0.010, smartmove0.7777777777777778
Wins: 4108, Losses: 8362, Draws: 2531
Episode: 15100, Win Rate: 0.27, Epsilon: 0.010, smartmove0.7777777777777778
Wins: 4117, Losses: 8421, Draws: 2563
Episode: 15200, Win Rate: 0.27, Epsilon: 0.010, smartmove0.7777777777777778
Wins: 4127, Losses: 8475, Draws: 2599
Episode: 15300, Win Rate: 0.27, Epsilon: 0.010, smartmove0.7777777777777778
Wins: 4141, Losses: 8527, Draws: 2633
Episode: 15400, Win Rate: 0.27, Epsilon: 0.010, smartmove0.7777777777777778
Wins: 4158, Losses: 8587, Draws: 2656
Episode: 15500, Win Rate: 0.27, Epsilon: 0.010, smartmove0.7777777777777778
Wins: 4173, Losses: 8648, Draws: 2680
Episode: 15600, Win Rate: 0.27, Epsilon: 0.010, smartmove0.7777777777777778
Wins: 4196, Losses: 8706, Draws: 2699
Episode: 15700, Win Rate: 0.27, Epsilon: 0.010, smartmove0.7777777777777778
Wins: 4216, Losses: 8762, Draws: 2723
Episode: 15800, Win Rate: 0.27, Epsilon: 0.010, smartmove0.7777777777777778
Wins: 4242, 



Resetting epsilon to 1.0
Episode: 16000, Win Rate: 0.27, Epsilon: 1.000, smartmove0.8
Wins: 4287, Losses: 8921, Draws: 2793
Episode: 16100, Win Rate: 0.27, Epsilon: 0.901, smartmove0.8
Wins: 4291, Losses: 9008, Draws: 2802
Episode: 16200, Win Rate: 0.27, Epsilon: 0.802, smartmove0.8
Wins: 4295, Losses: 9094, Draws: 2812
Episode: 16300, Win Rate: 0.26, Epsilon: 0.703, smartmove0.8
Wins: 4305, Losses: 9173, Draws: 2823
Episode: 16400, Win Rate: 0.26, Epsilon: 0.604, smartmove0.8
Wins: 4320, Losses: 9248, Draws: 2833
Episode: 16500, Win Rate: 0.26, Epsilon: 0.505, smartmove0.8
Wins: 4327, Losses: 9321, Draws: 2853
Episode: 16600, Win Rate: 0.26, Epsilon: 0.406, smartmove0.8
Wins: 4334, Losses: 9392, Draws: 2875
Episode: 16700, Win Rate: 0.26, Epsilon: 0.307, smartmove0.8
Wins: 4343, Losses: 9453, Draws: 2905
Episode: 16800, Win Rate: 0.26, Epsilon: 0.208, smartmove0.8
Wins: 4355, Losses: 9516, Draws: 2930
Episode: 16900, Win Rate: 0.26, Epsilon: 0.109, smartmove0.8
Wins: 4368, Losses: 957



Episode: 17000, Win Rate: 0.26, Epsilon: 0.010, smartmove0.8
Wins: 4384, Losses: 9621, Draws: 2996
Episode: 17100, Win Rate: 0.26, Epsilon: 0.010, smartmove0.8
Wins: 4406, Losses: 9663, Draws: 3032
Episode: 17200, Win Rate: 0.26, Epsilon: 0.010, smartmove0.8
Wins: 4426, Losses: 9708, Draws: 3067
Episode: 17300, Win Rate: 0.26, Epsilon: 0.010, smartmove0.8
Wins: 4442, Losses: 9749, Draws: 3110
Episode: 17400, Win Rate: 0.26, Epsilon: 0.010, smartmove0.8
Wins: 4460, Losses: 9803, Draws: 3138
Episode: 17500, Win Rate: 0.26, Epsilon: 0.010, smartmove0.8
Wins: 4481, Losses: 9844, Draws: 3176
Episode: 17600, Win Rate: 0.26, Epsilon: 0.010, smartmove0.8
Wins: 4505, Losses: 9883, Draws: 3213
Episode: 17700, Win Rate: 0.26, Epsilon: 0.010, smartmove0.8
Wins: 4539, Losses: 9916, Draws: 3246
Episode: 17800, Win Rate: 0.26, Epsilon: 0.010, smartmove0.8
Wins: 4570, Losses: 9954, Draws: 3277
Episode: 17900, Win Rate: 0.26, Epsilon: 0.010, smartmove0.8
Wins: 4593, Losses: 9993, Draws: 3315




Resetting epsilon to 1.0
Episode: 18000, Win Rate: 0.26, Epsilon: 1.000, smartmove0.8
Wins: 4610, Losses: 10032, Draws: 3359
Episode: 18100, Win Rate: 0.26, Epsilon: 0.901, smartmove0.8
Wins: 4618, Losses: 10114, Draws: 3369
Episode: 18200, Win Rate: 0.25, Epsilon: 0.802, smartmove0.8
Wins: 4624, Losses: 10190, Draws: 3387
Episode: 18300, Win Rate: 0.25, Epsilon: 0.703, smartmove0.8
Wins: 4633, Losses: 10265, Draws: 3403
Episode: 18400, Win Rate: 0.25, Epsilon: 0.604, smartmove0.8
Wins: 4641, Losses: 10344, Draws: 3416
Episode: 18500, Win Rate: 0.25, Epsilon: 0.505, smartmove0.8
Wins: 4654, Losses: 10415, Draws: 3432
Episode: 18600, Win Rate: 0.25, Epsilon: 0.406, smartmove0.8
Wins: 4664, Losses: 10486, Draws: 3451
Episode: 18700, Win Rate: 0.25, Epsilon: 0.307, smartmove0.8
Wins: 4676, Losses: 10549, Draws: 3476
Episode: 18800, Win Rate: 0.25, Epsilon: 0.208, smartmove0.8
Wins: 4688, Losses: 10611, Draws: 3502
Episode: 18900, Win Rate: 0.25, Epsilon: 0.109, smartmove0.8
Wins: 4705, Lo



Episode: 19000, Win Rate: 0.25, Epsilon: 0.010, smartmove0.8
Wins: 4724, Losses: 10695, Draws: 3582
Episode: 19100, Win Rate: 0.25, Epsilon: 0.010, smartmove0.8
Wins: 4744, Losses: 10739, Draws: 3618
Episode: 19200, Win Rate: 0.25, Epsilon: 0.010, smartmove0.8
Wins: 4762, Losses: 10777, Draws: 3662
Episode: 19300, Win Rate: 0.25, Epsilon: 0.010, smartmove0.8
Wins: 4785, Losses: 10813, Draws: 3703
Episode: 19400, Win Rate: 0.25, Epsilon: 0.010, smartmove0.8
Wins: 4807, Losses: 10844, Draws: 3750
Episode: 19500, Win Rate: 0.25, Epsilon: 0.010, smartmove0.8
Wins: 4827, Losses: 10879, Draws: 3795
Episode: 19600, Win Rate: 0.25, Epsilon: 0.010, smartmove0.8
Wins: 4844, Losses: 10925, Draws: 3832
Episode: 19700, Win Rate: 0.25, Epsilon: 0.010, smartmove0.8
Wins: 4869, Losses: 10961, Draws: 3871
Episode: 19800, Win Rate: 0.25, Epsilon: 0.010, smartmove0.8
Wins: 4889, Losses: 10997, Draws: 3915
Episode: 19900, Win Rate: 0.25, Epsilon: 0.010, smartmove0.8
Wins: 4919, Losses: 11026, Draws: 3956




In [None]:
import numpy as np
import random
from collections import deque
from tensorflow.keras.models import load_model, Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from TicTacToe import TicTacToe
import os

class SQNAgent:
    def __init__(self, model_path=None, state_size=9, action_size=9, gamma=0.95):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma  
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.9995
        self.batch_size = 32
        self.replay_buffer = deque(maxlen=10000)

        if model_path and os.path.isfile(model_path):
            print(f"Loading model from {model_path}")
            self.model = load_model(model_path)
        else:
            print("No existing model found; initializing a new model.")
            self.model = self._build_model()

    def _build_model(self):
        model = Sequential([
            Dense(64, input_dim=self.state_size, activation='relu'),
            Dense(64, activation='relu'),
            Dense(self.action_size, activation='linear')
        ])
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def process_state(self, state):
        processed_state = np.array(state).copy()
        processed_state[processed_state == 0] = -1
        processed_state[processed_state == 1] = 0
        processed_state[processed_state == 2] = 1
        return processed_state

    def select_action(self, state, valid_actions):
        if not valid_actions:
            return None

        if random.random() < self.epsilon:
            return random.choice(valid_actions)
        
        processed_state = self.process_state(state)
        q_values = self.model.predict(processed_state.reshape(1, -1), verbose=0)[0]
        valid_q_values = [(q_values[action], action) for action in valid_actions]
        return max(valid_q_values, key=lambda x: x[0])[1]

    def store_experience(self, state, action, reward, next_state, done):
        self.replay_buffer.append((state, action, reward, next_state, done))

    def train(self):
        if len(self.replay_buffer) < self.batch_size:
            return

        mini_batch = random.sample(self.replay_buffer, self.batch_size)
        states = np.array([self.process_state(exp[0]) for exp in mini_batch])
        next_states = np.array([self.process_state(exp[3]) for exp in mini_batch])
        current_q_values = self.model.predict(states, verbose=0)
        next_q_values = self.model.predict(next_states, verbose=0)

        x = []
        y = []

        for i, (state, action, reward, next_state, done) in enumerate(mini_batch):
            if done:
                target = reward
            else:
                next_valid_actions = [j for j, val in enumerate(next_state) if val == 0]
                if next_valid_actions:
                    max_next_q = max(next_q_values[i][action] for action in next_valid_actions)
                    target = reward + self.gamma * max_next_q
                else:
                    target = reward
            
            current_q = current_q_values[i].copy()
            current_q[action] = target
            x.append(self.process_state(state))
            y.append(current_q)

        self.model.fit(np.array(x), np.array(y), batch_size=self.batch_size, epochs=1, verbose=0)

def retrain_agent(model_path='model6.h5', episodes=10000):
    agent = SQNAgent(model_path=model_path)
    history = {'wins': 0, 'losses': 0, 'draws': 0}
    initial_epsilon = agent.epsilon
    min_epsilon = agent.epsilon_min
    decay_episodes = 1000
    smartness = 0
    
    for episode in range(episodes):
        if episode % 2000 == 0 and episode > 0:
            agent.epsilon = 1.0
            print("Resetting epsilon to 1.0")
        else:
            epsilon_decay = (initial_epsilon - min_epsilon) / decay_episodes
            agent.epsilon = max(min_epsilon, initial_epsilon - (episode % 1000) * epsilon_decay)

        
        smartness = min(1, episode / (episodes * 0.6))
            
        game = TicTacToe(smartMovePlayer1=smartness)
        state = np.array(game.board)
        
        game.player1_move()
        state = np.array(game.board)
        
        while True:
            valid_actions = game.empty_positions()
            if not valid_actions:
                history['draws'] += 1
                break
                
            action = agent.select_action(state, valid_actions)
            game.make_move(action, player=2)
            
            reward = 0
            if game.current_winner == 2:
                reward = 1.0 + smartness
                history['wins'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            
            game.player1_move()
            if game.current_winner == 1:
                reward = -1.0 - smartness
                history['losses'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            elif game.is_full():
                reward = min(-0.1, -0.5 * smartness)
                history['draws'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            
            agent.store_experience(state, action, reward, game.board, False)
            state = np.array(game.board)
        
        agent.train()
        
        if episode % 100 == 0:
            win_rate = history['wins'] / (episode + 1)
            print(f"Episode: {episode}, Win Rate: {win_rate:.2f}, Epsilon: {agent.epsilon:.3f}, Smartness: {smartness}")
            print(f"Wins: {history['wins']}, Losses: {history['losses']}, Draws: {history['draws']}")
        
        if episode % 1000 == 0:
            agent.model.save(f'model6_retrain_episode_{episode}.h5')
    
    agent.model.save('model6_retrained.h5')
    return agent

# Retrain the agent using the existing model file 'model6.h5' for 10,000 more episodes
agent = retrain_agent(model_path='model6.h5', episodes=20000)


Loading model from model6.h5


TypeError: Could not locate function 'mse'. Make sure custom classes are decorated with `@keras.saving.register_keras_serializable()`. Full object config: {'module': 'keras.metrics', 'class_name': 'function', 'config': 'mse', 'registered_name': 'mse'}

In [None]:
import numpy as np
import random
from collections import deque
from tensorflow.keras.models import load_model, Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from TicTacToe import TicTacToe
import os

class SQNAgent:
    def __init__(self, model_path=None, state_size=9, action_size=9, gamma=0.99):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma  
        self.learning_rate = 0.0005
        self.epsilon = 1.0
        self.epsilon_min = 0.05
        self.epsilon_decay = 0.85
        self.batch_size = 64
        self.replay_buffer = deque(maxlen=50000)
        
        # Load model if path exists, otherwise create a new one
        if model_path and os.path.isfile(model_path):
            print(f"Loading model from {model_path}")
            self.model = load_model(model_path)
        else:
            print("No existing model found; initializing a new model.")
            self.model = self._build_model()
        
        self.target_model = self._build_model()
        self.target_model.set_weights(self.model.get_weights())  # Synchronize target with main model
        self.update_target_counter = 0
        self.update_target_frequency = 1000

    def _build_model(self):
        model = Sequential([
            Dense(64, input_dim=self.state_size, activation='relu'),
            Dense(64, activation='relu'),
            Dense(self.action_size, activation='linear')
        ])
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def process_state(self, state):
        processed_state = np.array(state).copy()
        processed_state[processed_state == 0] = -1
        processed_state[processed_state == 1] = 0
        processed_state[processed_state == 2] = 1
        return processed_state

    def select_action(self, state, valid_actions):
        if not valid_actions:
            return None

        if random.random() < self.epsilon:
            return random.choice(valid_actions)
        
        processed_state = self.process_state(state)
        q_values = self.model.predict(processed_state.reshape(1, -1), verbose=0)[0]
        
        # Prioritize winning moves and block opponent wins
        temp_board = np.array(state).copy()
        for action in valid_actions:
            temp_board[action] = 2
            if self._check_win(temp_board, 2):
                return action
            temp_board[action] = 0
            
            temp_board[action] = 1
            if self._check_win(temp_board, 1):
                return action
            temp_board[action] = 0

        valid_q_values = [(q_values[action], action) for action in valid_actions]
        return max(valid_q_values, key=lambda x: x[0])[1]

    def _check_win(self, board, player):
        win_combinations = [
            [0, 1, 2], [3, 4, 5], [6, 7, 8],
            [0, 3, 6], [1, 4, 7], [2, 5, 8],
            [0, 4, 8], [2, 4, 6]
        ]
        return any(all(board[i] == player for i in combo) for combo in win_combinations)

    def store_experience(self, state, action, reward, next_state, done):
        self.replay_buffer.append((state, action, reward, next_state, done))

    def train(self):
        if len(self.replay_buffer) < self.batch_size:
            return

        mini_batch = random.sample(self.replay_buffer, self.batch_size)
        states = np.array([self.process_state(exp[0]) for exp in mini_batch])
        next_states = np.array([self.process_state(exp[3]) for exp in mini_batch])
        
        current_q_values = self.model.predict(states, verbose=0)
        next_q_values = self.target_model.predict(next_states, verbose=0)

        x = []
        y = []

        for i, (state, action, reward, next_state, done) in enumerate(mini_batch):
            target = reward if done else reward + self.gamma * max(next_q_values[i])
            current_q = current_q_values[i].copy()
            current_q[action] = target
            x.append(self.process_state(state))
            y.append(current_q)

        self.model.fit(np.array(x), np.array(y), batch_size=self.batch_size, epochs=1, verbose=0)
        
        self.update_target_counter += 1
        if self.update_target_counter >= self.update_target_frequency:
            self.target_model.set_weights(self.model.get_weights())
            self.update_target_counter = 0

def retrain_agent(model_path='model10.h5', episodes=10000):
    agent = SQNAgent(model_path=model_path)
    history = {'wins': 0, 'losses': 0, 'draws': 0}
    best_win_rate = 0
    smartness = 0
    
    for episode in range(episodes):
        if episode % 100 == 0 and episode % 1000 != 0:
            agent.epsilon = max(agent.epsilon_min, agent.epsilon * agent.epsilon_decay)
            
        if episode % 2000 == 0 and episode > 0:
            smartness = min(0.8, smartness + 0.1)
            agent.epsilon = 1

        game = TicTacToe(smartMovePlayer1=smartness)
        state = np.array(game.board)
        
        game.player1_move()
        state = np.array(game.board)
        
        while True:
            valid_actions = game.empty_positions()
            if not valid_actions:
                history['draws'] += 1
                break
                
            action = agent.select_action(state, valid_actions)
            game.make_move(action, player=2)
            
            reward = 1 + (0.1 * smartness) if game.current_winner == 2 else 0
            if game.current_winner == 2:
                history['wins'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            
            if not game.is_full():
                game.player1_move()
                
            if game.current_winner == 1:
                reward = -1
                history['losses'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            elif game.is_full():
                reward = -0.1 * smartness
                history['draws'] += 1
                agent.store_experience(state, action, reward, game.board, True)
                break
            
            agent.store_experience(state, action, reward, game.board, False)
            state = np.array(game.board)
        
        agent.train()
        
        if episode % 100 == 0:
            win_rate = history['wins'] / (episode + 1)
            print(f"Episode: {episode}, Win Rate: {win_rate:.2f}, Epsilon: {agent.epsilon:.3f}, Smartness: {smartness}")
            print(f"Wins: {history['wins']}, Losses: {history['losses']}, Draws: {history['draws']}")
            
            if win_rate > best_win_rate:
                best_win_rate = win_rate

            if episode % 1000 == 0:
                agent.model.save(f'model10_retrain_episode_{episode}.h5')
        
    agent.model.save('model10_retrained.h5')
    return agent

# Retrain the agent using the existing model file 'model10.h5' for 10,000 more episodes
agent = retrain_agent(model_path='model10.h5', episodes=20000)
