In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from collections import deque, namedtuple
import random
from TicTacToe import TicTacToe

gamma = 0.95
learning_rate = 0.001
batch_size = 32
episodes = 5000
epsilon_start = 1.0
epsilon_end = 0.01
epsilon_decay = 0.995

Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state', 'done'))
replay_buffer = deque(maxlen=2000)

model = Sequential([
    Dense(64, input_dim=9, activation='relu'),
    Dense(64, activation='relu'),
    Dense(9, activation='linear')
])
model.compile(loss='mse', optimizer=Adam(learning_rate=learning_rate))

def generate_random_state():
    env = TicTacToe(smartMovePlayer1=0.5)
    board = [0] * 9
    num_moves = random.randint(1, 9)
    moves_made = 0
    player_turn = 1

    while moves_made < num_moves:
        valid_actions = [i for i in range(9) if board[i] == 0]
        pos = random.choice(valid_actions)
        board[pos] = player_turn
        env.board = board
        if env.check_winner(player_turn):
            break
        moves_made += 1
        player_turn = 3 - player_turn

    env.board = board
    return env, player_turn

def epsilon_greedy_action(state, env, epsilon):
    valid_actions = env.empty_positions()
    if not valid_actions:
        return None
    if np.random.rand() <= epsilon:
        return random.choice(valid_actions)
    q_values = model.predict(state, verbose=0)
    q_values_valid = [q_values[0][i] for i in valid_actions]
    return valid_actions[np.argmax(q_values_valid)]

def train_on_replay():
    if len(replay_buffer) < batch_size:
        return
    transitions = random.sample(replay_buffer, batch_size)
    states, actions, rewards, next_states, dones = zip(*transitions)

    states = np.array(states, dtype=float).reshape(batch_size, -1)
    next_states = np.array(next_states, dtype=float).reshape(batch_size, -1)
    target_q_values = rewards + gamma * np.max(model.predict(next_states, verbose=0), axis=1) * (1 - np.array(dones))
    target_f = model.predict(states, verbose=0)
    for i, action in enumerate(actions):
        target_f[i][action] = target_q_values[i]
    model.fit(states, target_f, epochs=1, verbose=0)

valid_episodes = []
for _ in range(5000):
    env, player_turn = generate_random_state()
    valid_episodes.append((env, player_turn))

epsilon = epsilon_start
win, draw, loss = 0, 0, 0
for e, (env, current_player) in enumerate(valid_episodes):
    state = np.reshape(env.board, [1, 9])
    done = False

    while not done:
        if current_player == 2:
            action = epsilon_greedy_action(state, env, epsilon)
            if action is None:
                break
            env.make_move(action, 2)
            reward = env.get_reward()
            done = env.is_full() or env.current_winner is not None
            next_state = np.reshape(env.board, [1, 9])
            replay_buffer.append(Transition(state, action, reward, next_state, done))
            state = next_state
        else:
            if not env.is_full() and env.current_winner is None:
                env.player1_move()
            done = env.is_full() or env.current_winner is not None

        if done:
            if reward == 1:
                loss += 1
            elif reward == -1:
                win += 1
            else:
                draw += 1

        current_player = 3 - current_player
        train_on_replay()
        epsilon = max(epsilon * epsilon_decay, epsilon_end)

    print(f"Episode {e+1}/{episodes}, Win: {win}, Draw: {draw}, Loss: {loss}")

model.save('YourBITSid_MODEL2.h5')


Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 6
Player 1 (Smart/Random) chooses position 1
Episode 1/5000, Win: 0, Draw: 0, Loss: 1
Player 1 (Smart/Random) chooses position 5
Episode 2/5000, Win: 0, Draw: 1, Loss: 1
Player 1 (Smart/Random) chooses position 3
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 7
Episode 3/5000, Win: 0, Draw: 2, Loss: 1
Player 1 (Smart/Random) chooses position 7
Episode 4/5000, Win: 0, Draw: 3, Loss: 1
Episode 5/5000, Win: 0, Draw: 3, Loss: 2
Player 1 (Smart/Random) chooses position 6
Episode 6/5000, Win: 0, Draw: 3, Loss: 3
Player 1 (Smart/Random) chooses position 1
Episode 7/5000, Win: 0, Draw: 3, Loss: 4
Episode 8/5000, Win: 0, Draw: 3, Loss: 5
Player 1 (Smart/Random) chooses position 5
Player 1 (Smart/Random) chooses position 2
Episode 9/5000, Win: 0, Draw: 4, Loss: 5
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) choos



Episode 4999/5000, Win: 0, Draw: 3804, Loss: 1122
Player 1 (Smart/Random) chooses position 6
Episode 5000/5000, Win: 0, Draw: 3805, Loss: 1122


In [23]:
import sys
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import register_keras_serializable
from TicTacToe import TicTacToe

@register_keras_serializable()
def mse(y_true, y_pred):
    return tf.reduce_mean(tf.square(y_true - y_pred))

class PlayerSQN:
    def __init__(self, model_path='YourBITSid_MODEL2.h5'):
        """
        Initializes the PlayerSQN class and loads the pre-trained model.
        """
        try:
            custom_objects = {'mse': mse}
            self.model = load_model(model_path, custom_objects=custom_objects)
            print("Model loaded successfully!")
        except Exception as e:
            print(f"Error: Could not load model. {e}")
            sys.exit(1)
    
    def _preprocess_state(self, state):
        """
        Converts the state from [0,1,2] format to [-1,0,1] format for better learning.
        """
        processed_state = np.array(state).copy()
        processed_state[processed_state == 2] = -1
        return processed_state

    def move(self, state):
        """
        Selects the best move based on the current state.
        """
        processed_state = self._preprocess_state(state)
        valid_moves = [i for i, val in enumerate(state) if val == 0]
        
        q_values = self.model.predict(processed_state.reshape(1, -1), verbose=0)[0]
        # Mask invalid moves
        for i in range(len(q_values)):
            if i not in valid_moves:
                q_values[i] = float('-inf')
        return np.argmax(q_values)

def simulate_games(smartMovePlayer1, num_games=100):
    """
    Simulates multiple TicTacToe games and tracks the results.

    Parameters:
    smartMovePlayer1: Probability that Player 1 will make a smart move at each time step.
    num_games: Number of games to simulate.
    """
    playerSQN = PlayerSQN()
    wins, losses, draws = 0, 0, 0

    for _ in range(num_games):
        game = TicTacToe(smartMovePlayer1, playerSQN)
        game.play_game()
        
        reward = game.get_reward()
        if reward == 1:
            wins += 1
        elif reward == -1:
            losses += 1
        else:
            draws += 1

    print(f"Results after {num_games} games:")
    print(f"Wins: {wins}")
    print(f"Losses: {losses}")
    print(f"Draws: {draws}")

if __name__ == "__main__":
    try:
        smartMovePlayer1 = 0.5
        assert 0 <= smartMovePlayer1 <= 1
    except:
        print("Usage: python YourBITSid.py <smartMovePlayer1Probability>")
        print("Example: python 2021A7PS2627G.py 0.5")
        print("Error: Probability must lie between 0 and 1.")
        sys.exit(1)
    
    simulate_games(smartMovePlayer1)




Model loaded successfully!

Board:
   |   |   
---+---+---
   |   |   
---+---+---
   |   |   

Player 1 (Smart/Random) chooses position 3

Board:
   |   | X 
---+---+---
   |   |   
---+---+---
   |   |   


Board:
   |   | X 
---+---+---
   |   |   
---+---+---
   | O |   

Player 1 (Smart/Random) chooses position 6

Board:
   |   | X 
---+---+---
   |   | X 
---+---+---
   | O |   


Board:
   |   | X 
---+---+---
 O |   | X 
---+---+---
   | O |   

Player 1 (Smart/Random) chooses position 9

Board:
   |   | X 
---+---+---
 O |   | X 
---+---+---
   | O | X 

Player 1 (Smart/Random) wins!

Board:
   |   |   
---+---+---
   |   |   
---+---+---
   |   |   

Player 1 (Smart/Random) chooses position 1

Board:
 X |   |   
---+---+---
   |   |   
---+---+---
   |   |   


Board:
 X |   |   
---+---+---
   |   |   
---+---+---
   | O |   

Player 1 (Smart/Random) chooses position 9

Board:
 X |   |   
---+---+---
   |   |   
---+---+---
   | O | X 


Board:
 X |   |   
---+---+---
 O |  

In [15]:
import random
import numpy as np
from TicTacToe import TicTacToe

def generate_random_state():
    env = TicTacToe(smartMovePlayer1=0.5)  # Player 1 moves smartly, Player 2 is random
    board = [0] * 9
    num_moves = random.randint(1, 9)
    moves_made = 0
    player_turn = 1

    while moves_made < num_moves:
        valid_actions = [i for i in range(9) if board[i] == 0]
        if player_turn == 1:
            # Player 1 makes a smart or random move
            if random.random() < 0.5:  # Smart move or random move
                position = get_smart_move(board)
                if position is None:  # If no smart move, pick a random move
                    position = random.choice(valid_actions)
            else:
                position = random.choice(valid_actions)
        else:
            # Player 2 tries to block or win
            position = get_smart_move_for_p2(board)
            if position is None:
                # If no winning or blocking move, pick a random move
                position = random.choice(valid_actions)

        board[position] = player_turn
        env.board = board
        if env.check_winner(player_turn):
            break
        moves_made += 1
        player_turn = 3 - player_turn

    env.board = board
    return env, env.current_winner if env.current_winner is not None else player_turn

def get_smart_move(board):
    # Check if Player 1 can win in the next move
    for position in range(9):
        if board[position] == 0:
            board[position] = 1  # Try Player 1's move
            if check_winner(board, 1):
                board[position] = 0
                return position
            board[position] = 0
    return None

def get_smart_move_for_p2(board):
    # Check if Player 2 can win in the next move
    for position in range(9):
        if board[position] == 0:
            board[position] = 2  # Try Player 2's move
            if check_winner(board, 2):
                board[position] = 0
                return position
            board[position] = 0

    # Block Player 1 from winning
    for position in range(9):
        if board[position] == 0:
            board[position] = 1  # Pretend it's Player 1's move
            if check_winner(board, 1):
                board[position] = 0
                return position
            board[position] = 0

    return None

def check_winner(board, player):
    # Check all win conditions
    win_conditions = [
        [0, 1, 2], [3, 4, 5], [6, 7, 8],  # rows
        [0, 3, 6], [1, 4, 7], [2, 5, 8],  # columns
        [0, 4, 8], [2, 4, 6]              # diagonals
    ]
    for condition in win_conditions:
        if all(board[i] == player for i in condition):
            return True
    return False

# Updated training loop:
valid_episodes = []
for _ in range(5000):
    env, current_player = generate_random_state()
    if env.current_winner is not None or env.is_full():
        valid_episodes.append((env, env.current_winner))  # Add only valid games that have ended
    else:
        valid_episodes.append((env, current_player))  # Add incomplete games for future play

epsilon = epsilon_start
win, draw, loss = 0, 0, 0
for e, (env, current_player) in enumerate(valid_episodes):
    if current_player is None:  # Ensure current_player is never None
        continue

    state = np.reshape(env.board, [1, 9])
    done = False
    reward = 0

    while not done:
        if current_player == 2:
            action = epsilon_greedy_action(state, env, epsilon)
            if action is None:
                break
            env.make_move(action, 2)
            reward = env.get_reward()
            done = env.is_full() or env.current_winner is not None
            next_state = np.reshape(env.board, [1, 9])
            replay_buffer.append(Transition(state, action, reward, next_state, done))
            state = next_state
        else:
            if not env.is_full() and env.current_winner is None:
                env.player1_move()
            done = env.is_full() or env.current_winner is not None

        if done:
            if reward == 1:
                win += 1
            elif reward == -1:
                loss += 1
            else:
                draw += 1

        current_player = 3 - current_player  # Alternate between Player 1 (1) and Player 2 (2)
        train_on_replay()
        epsilon = max(epsilon * epsilon_decay, epsilon_end)

    print(f"Episode {e+1}/{episodes}, Win: {win}, Draw: {draw}, Loss: {loss}")

model.save('x.h5')


Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 7
Episode 1/5000, Win: 0, Draw: 1, Loss: 0
Episode 2/5000, Win: 1, Draw: 1, Loss: 0
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 3
Episode 3/5000, Win: 2, Draw: 1, Loss: 0
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 4
Episode 4/5000, Win: 2, Draw: 2, Loss: 0
Player 1 (Smart/Random) chooses position 8
Player 1 (Smart/Random) chooses position 4
Player 1 (Smart/Random) chooses position 5
Episode 5/5000, Win: 2, Draw: 3, Loss: 0
Player 1 (Smart/Random) chooses position 9
Player 1 (Smart/Random) chooses position 2
Player 1 (Smart/Random) chooses position 3
Episode 6/5000, Win: 2, Draw: 4, Loss: 0
Episode 7/5000, Win: 3, Draw: 4, Loss: 0
Player 1 (Smart/Random) chooses position 1
Episode 8/5000, Win: 3, Draw: 5, Loss: 0
Player 1 (Smart/Random) chooses position 1
Player 1 (Smart/Random) chooses position 5
Episode 9/5000, Win: 3, Dra



Episode 5000/5000, Win: 1548, Draw: 3320, Loss: 0


In [31]:
import numpy as np
import random
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from TicTacToe import TicTacToe

class TicTacToeTrainer:
    def __init__(self):
        self.model = Sequential()
        self.model.add(Dense(64, input_dim=9, activation='relu'))
        self.model.add(Dense(64, activation='relu'))
        self.model.add(Dense(9, activation='linear'))
        self.model.compile(loss='mse', optimizer=Adam(learning_rate=0.001))
        self.epsilon = 1.0
        self.gamma = 0.95
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.memory = deque(maxlen=2000)

    def generate_random_board(self):
        board = np.zeros(9)
        for i in range(random.randint(0, 5)):
            board[random.choice([j for j in range(9) if board[j] == 0])] = random.choice([1, 2])
        return board

    def choose_action(self, state):
        # Choose action using epsilon-greedy approach
        if np.random.rand() <= self.epsilon:
            return random.choice([i for i in range(9) if state[i] == 0])  # Choose random valid move
        q_values = self.model.predict(state.reshape(1, -1))  # Predict Q-values
        return np.argmax(q_values[0])  # Select the action with the highest Q-value

    def store_experience(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def replay(self, batch_size=32):
        minibatch = random.sample(self.memory, min(len(self.memory), batch_size))
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = reward + self.gamma * np.amax(self.model.predict(next_state.reshape(1, -1))[0])
            target_f = self.model.predict(state.reshape(1, -1))
            target_f[0][action] = target
            self.model.fit(state.reshape(1, -1), target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def evaluate(self, games=20):
        wins, losses, draws = 0, 0, 0
        for _ in range(games):
            env = TicTacToe()
            state = np.array(env.board).reshape(-1)
            done = False
            while not done:
                action = self.choose_action(state)  # Choose the action as a position (0-8)
                reward = env.make_move(action, player=2)  # Make the move for Player 2 (the AI)
                # After the move, update state and check game status
                state = np.array(env.board).reshape(-1)
                if reward is not None:  # Check if the game is over (reward will indicate end of game)
                    done = True
                else:
                    done = False

            # Track game results (you may need to adjust the reward system)
            if reward == 1:  # Assuming reward = 1 for a win
                wins += 1
            elif reward == -1:  # Assuming reward = -1 for a loss
                losses += 1
            else:  # Draw
                draws += 1
        print(f"Stats: Wins: {wins}, Losses: {losses}, Draws: {draws}")

def train_tic_tac_toe():
    trainer = TicTacToeTrainer()
    episodes = 1000
    evaluation_interval = 100

    for episode in range(episodes):
        env = TicTacToe()
        state = trainer.generate_random_board()  # Random board initialization
        env.board = state.reshape(9,1)  # Set the board to the random initial state (3x3)
        state = np.array(env.board).reshape(-1)  # Flatten the board for the neural network input
        done = False
        
        while not done:
            action = trainer.choose_action(state)  # Choose action as a position (0-8)
            
            # Make the move for Player 2 (AI)
            reward = env.make_move(action, player=2)  # Make the move for Player 2
            
            if reward is not None:  # If the game is finished, reward will not be None
                done = True
            else:
                state = np.array(env.board).reshape(-1)  # Update state for the next move
                
            trainer.store_experience(state, action, reward, state, done)  # Store experience
            state = np.array(env.board).reshape(-1)  # Update state after the move

        trainer.replay(batch_size=32)  # Experience replay for model training

        if episode % evaluation_interval == 0:
            print(f"Episode {episode} Evaluation:")
            trainer.evaluate(games=20)

    trainer.model.save('h67.h5')  # Save the trained model after all episodes

train_tic_tac_toe()  # Start the training


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Episode 0 Evaluation:
Stats: Wins: 20, Losses: 0, Draws: 0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m 

KeyboardInterrupt: 