In [1]:
import random
from TicTacToe import TicTacToe

def validBoard(game):
    x_count = game.board.count(1)
    o_count = game.board.count(2)
    
    
    if not (x_count == o_count or x_count == o_count + 1):
        return False
    
    
    if game.check_winner(1) and game.check_winner(2):
        return False
    
    # Check if O wins but X has more or equal moves
    if game.check_winner(2) and x_count != o_count:
        return False
    
    # Check if X wins but O has equal moves
    if game.check_winner(1) and x_count != o_count + 1:
        return False
    
    return True

# Example usage
game = TicTacToe()
game.board = [1, 2, 1, 2, 1, 2, 1, 0, 0]  # Example board state
print(validBoard(game))  # Output: True or False based on the board state


True


In [13]:
import numpy as np
import random
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from TicTacToe import TicTacToe

class SQNAgent:
    def __init__(self, state_size=9, action_size=9, gamma=0.95, learning_rate=0.001, batch_size=32, epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.999, replay_buffer_size=10000):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.replay_buffer = deque(maxlen=replay_buffer_size)
        
        self.model = Sequential([
            Dense(64, input_dim=self.state_size, activation='relu'),
            Dense(64, activation='relu'),
            Dense(self.action_size, activation='linear')
        ])
        self.model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))

    def select_action(self, state, valid_actions):
        if np.random.rand() < self.epsilon:
            return random.choice(valid_actions)
        q_values = self.model.predict(state.reshape(1, -1), verbose=0)[0]
        q_values = [q_values[i] if i in valid_actions else float('-inf') for i in range(self.action_size)]
        return np.argmax(q_values)

    def store_experience(self, experience):
        self.replay_buffer.append(experience)

    def train_from_experience(self):
        if len(self.replay_buffer) < self.batch_size:
            return
        mini_batch = random.sample(self.replay_buffer, self.batch_size)
        
        states = np.zeros((self.batch_size, self.state_size))
        q_values_batch = np.zeros((self.batch_size, self.action_size))

        for i, (state, action, reward, next_state, done) in enumerate(mini_batch):
            q_values = self.model.predict(state.reshape(1, -1), verbose=0)[0]
            if done:
                q_target = reward
            else:
                next_q_values = self.model.predict(next_state.reshape(1, -1), verbose=0)[0]
                valid_actions = TicTacToe().empty_positions()
                max_next_q = max(next_q_values[a] for a in valid_actions) if valid_actions else 0
                q_target = reward + self.gamma * max_next_q
            
            q_values[action] = q_target
            states[i] = state
            q_values_batch[i] = q_values

        self.model.fit(states, q_values_batch, epochs=1, verbose=0)
        
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def save_model(self, filepath):
        self.model.save(filepath)

def train_sqn_agent(num_episodes=1000, save_interval=200, model_path="trained_sqn_model.h5"):
    agent = SQNAgent()
    loss_count = 0
    win_count = 0
    draw_count = 0

    for episode in range(num_episodes):
        # Linearly scale smartMovePlayer1_prob and epsilon based on progress in episodes
        smartMovePlayer1_prob = min(1.0, episode / num_episodes)  # Increase from 0 to 1 over the episodes
        agent.epsilon = max(agent.epsilon_min, 1.0 - (episode / num_episodes) * (1.0 - agent.epsilon_min))  # Decay from 1.0 to epsilon_min

        game = TicTacToe(smartMovePlayer1=smartMovePlayer1_prob)
        state = np.array(game.board)
        done = False
        
        if episode % 10 == 0:
            print(f"Episode: {episode}, Wins: {win_count}, Losses: {loss_count}, Draws: {draw_count}, "
                  f"SmartMoveProb: {smartMovePlayer1_prob:.2f}, Epsilon: {agent.epsilon:.4f}")

        while not done:
            valid_actions = game.empty_positions()
            action = agent.select_action(state, valid_actions)
            
            game.player1_move()
            next_state = np.array(game.board)
            done = game.current_winner is not None or game.is_full()
            reward = game.get_reward() if done else 0
            
            agent.store_experience((state, action, reward, next_state, done))
            state = next_state
            
            if done:
                if game.current_winner == 1:
                    loss_count += 1
                elif game.is_full():
                    draw_count += 1
                break

            game.make_move(action, player=2)
            done = game.current_winner is not None or game.is_full()
            state = np.array(game.board)
            
            if game.current_winner == 2:
                win_count += 1
                done = True

        agent.train_from_experience()

        if episode % save_interval == 0:
            agent.save_model(f"{model_path.split('.')[0]}_episode_{episode}.h5")

    agent.save_model(model_path)
    print("Training complete and model saved")
    return agent

train_sqn_agent()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Episode: 0, Wins: 0, Losses: 0, Draws: 0, SmartMoveProb: 0.00, Epsilon: 1.0000
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Ra



Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 6
Player 1 (S



Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 9
Player 1 (S



Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 6
Player 1 (S



Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 7
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 9
Player 1 (S



Training complete and model saved


<__main__.SQNAgent at 0x34e501b80>

In [None]:
import numpy as np
import random
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from TicTacToe import TicTacToe

class SQNAgent:
    def __init__(self, state_size=9, action_size=9, gamma=0.95, learning_rate=0.001, batch_size=32, epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.999, replay_buffer_size=10000):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.replay_buffer = deque(maxlen=replay_buffer_size)
        
        self.model = Sequential([
            Dense(64, input_dim=self.state_size, activation='relu'),
            Dense(64, activation='relu'),
            Dense(self.action_size, activation='linear')
        ])
        self.model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))

    def select_action(self, state, valid_actions):
        if np.random.rand() < self.epsilon:
            return random.choice(valid_actions)
        
        for action in valid_actions:
            temp_state = state.copy()
            temp_state[action] = 2
            if TicTacToe(board=temp_state).check_winner() == 2:
                return action

        for action in valid_actions:
            temp_state = state.copy()
            temp_state[action] = 1
            if TicTacToe(board=temp_state).check_winner() == 1:
                return action

        q_values = self.model.predict(state.reshape(1, -1), verbose=0)[0]
        q_values = [q_values[i] if i in valid_actions else float('-inf') for i in range(self.action_size)]
        return np.argmax(q_values)

    def store_experience(self, experience):
        self.replay_buffer.append(experience)

    def train_from_experience(self):
        if len(self.replay_buffer) < self.batch_size:
            return
        mini_batch = random.sample(self.replay_buffer, self.batch_size)
        
        states = np.zeros((self.batch_size, self.state_size))
        q_values_batch = np.zeros((self.batch_size, self.action_size))

        for i, (state, action, reward, next_state, done) in enumerate(mini_batch):
            q_values = self.model.predict(state.reshape(1, -1), verbose=0)[0]
            if done:
                q_target = reward
            else:
                next_q_values = self.model.predict(next_state.reshape(1, -1), verbose=0)[0]
                valid_actions = TicTacToe().empty_positions()
                max_next_q = max(next_q_values[a] for a in valid_actions) if valid_actions else 0
                q_target = reward + self.gamma * max_next_q
            
            q_values[action] = q_target
            states[i] = state
            q_values_batch[i] = q_values

        self.model.fit(states, q_values_batch, epochs=1, verbose=0)
        
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def save_model(self, filepath):
        self.model.save(filepath)

def train_sqn_agent(num_episodes=1000, save_interval=200, model_path="trained_sqn_model_action.h5"):
    agent = SQNAgent()
    loss_count = 0
    win_count = 0
    draw_count = 0

    for episode in range(num_episodes):
        smartMovePlayer1_prob = min(1.0, episode / num_episodes)
        agent.epsilon = max(agent.epsilon_min, 1.0 - (episode / num_episodes) * (1.0 - agent.epsilon_min))

        game = TicTacToe(smartMovePlayer1=smartMovePlayer1_prob)
        state = np.array(game.board)
        done = False
        
        if episode % 10 == 0:
            print(f"Episode: {episode}, Wins: {win_count}, Losses: {loss_count}, Draws: {draw_count}, "
                  f"SmartMoveProb: {smartMovePlayer1_prob:.2f}, Epsilon: {agent.epsilon:.4f}")

        while not done:
            valid_actions = game.empty_positions()
            action = agent.select_action(state, valid_actions)
            
            game.player1_move()
            next_state = np.array(game.board)
            done = game.current_winner is not None or game.is_full()
            reward = game.get_reward() if done else 0
            
            agent.store_experience((state, action, reward, next_state, done))
            state = next_state
            
            if done:
                if game.current_winner == 1:
                    loss_count += 1
                elif game.is_full():
                    draw_count += 1
                break

            game.make_move(action, player=2)
            done = game.current_winner is not None or game.is_full()
            state = np.array(game.board)
            
            if game.current_winner == 2:
                win_count += 1
                done = True

        agent.train_from_experience()

        if episode % save_interval == 0:
            agent.save_model(f"{model_path.split('.')[0]}_action_episode_{episode}.h5")

    agent.save_model(model_path)
    print("Training complete and model saved")
    return agent

train_sqn_agent()
