In [None]:
# Import the required libraries
import numpy as np
import tensorflow as tf

# Define the Tic-Tac-Toe environment
class TicTacToeEnvironment:
    """Minimal two-player Tic-Tac-Toe board.

    Cells hold ``0`` (empty), ``1`` (first player) or ``-1`` (second player).
    ``current_player`` is the mark the next legal move will place.
    """

    def __init__(self):
        self.board = np.zeros((3, 3), dtype=np.int32)
        self.current_player = 1

    def reset(self):
        """Clear the board, hand the turn to player 1 and return the state."""
        self.board = np.zeros((3, 3), dtype=np.int32)
        self.current_player = 1
        return self.get_state()

    def get_state(self):
        """Return the board as a flattened length-9 copy."""
        return self.board.flatten()

    def make_move(self, move):
        """Place the current player's mark at ``move`` if the move is legal.

        Args:
            move: ``(row, col)`` pair of board coordinates.

        Returns:
            ``True`` when the mark was placed and the turn passed to the other
            player; ``False`` when the coordinates are off the board or the
            cell is occupied, in which case board and turn are unchanged.
        """
        row, col = move
        n_rows, n_cols = self.board.shape
        # Check bounds explicitly: numpy would wrap a negative index onto the
        # opposite edge and raise IndexError for an index past the end.
        if not (0 <= row < n_rows and 0 <= col < n_cols):
            return False
        if self.board[row, col] != 0:
            return False
        self.board[row, col] = self.current_player
        self.current_player = -self.current_player
        return True

    def is_winner(self):
        """Return ``1`` or ``-1`` for the player owning a full line, else ``0``."""
        for player in [1, -1]:
            owned = self.board == player
            # Check rows and columns
            if np.any(np.all(owned, axis=0)) or np.any(np.all(owned, axis=1)):
                return player
            # Check diagonals
            if np.all(np.diag(owned)) or np.all(np.diag(np.fliplr(owned))):
                return player
        return 0

    def is_draw(self):
        """Return whether every cell is filled (does not test for a winner)."""
        return np.all(self.board != 0)

# Define the Actor-Critic model
class ActorCriticModel(tf.keras.Model):
    """Two-headed network: one shared hidden layer feeding a policy head and a value head."""

    def __init__(self, num_actions):
        super().__init__()
        # Shared 128-unit ReLU layer used by both heads.
        self.dense = tf.keras.layers.Dense(units=128, activation="relu")
        # Actor: softmax distribution over ``num_actions`` outputs.
        self.policy = tf.keras.layers.Dense(units=num_actions, activation="softmax")
        # Critic: a single linear output.
        self.value = tf.keras.layers.Dense(units=1)

    def call(self, state):
        """Return ``(policy, value)`` computed from the shared hidden features."""
        hidden = self.dense(state)
        return self.policy(hidden), self.value(hidden)

# Train the Actor-Critic model
def train_actor_critic(model, optimizer, states, actions, rewards):
    """Run one actor-critic gradient step over a batch of transitions.

    Args:
        model: Callable returning ``(policy, values)`` for a batch of states,
            with actions along the last axis of ``policy`` and a trailing
            unit axis on ``values`` (as ``ActorCriticModel`` produces).
        optimizer: Optimizer whose ``apply_gradients`` updates the model.
        states: Batch of states; converted to a float32 tensor.
        actions: One integer action index per state.
        rewards: One scalar per state, used both as the critic's regression
            target and to form the advantage that weights the policy loss.

    Returns:
        Scalar tensor: summed policy loss plus summed squared-advantage loss.
    """
    states = tf.convert_to_tensor(states, dtype=tf.float32)
    actions = tf.convert_to_tensor(actions, dtype=tf.int32)
    rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)

    with tf.GradientTape() as tape:
        policy, values = model(states)
        # Drop only the trailing unit axis so a one-transition batch stays
        # shape (1,) instead of collapsing to a scalar.
        values = tf.squeeze(values, axis=-1)
        advantages = rewards - values

        # Take the action count from the policy output itself instead of
        # relying on a module-level global.
        action_masks = tf.one_hot(actions, depth=policy.shape[-1])
        selected_probs = tf.reduce_sum(action_masks * policy, axis=1)

        # For the actor the advantage is a fixed weight: stop_gradient keeps
        # the policy loss from pushing gradients into the value head.
        policy_loss = -tf.math.log(selected_probs + 1e-8) * tf.stop_gradient(advantages)
        value_loss = tf.square(advantages)

        total_loss = tf.reduce_sum(policy_loss) + tf.reduce_sum(value_loss)

    gradients = tape.gradient(total_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return total_loss

# Play Tic-Tac-Toe against the trained model
def play_with_trained_model(model):
    """Play an interactive console game: the human is player 1, the model player -1.

    Args:
        model: Callable returning ``(policy, value)`` for a batch of flattened
            boards; only the policy is used.

    The loop prints the board before every move and ends when one side
    completes a line or the board is full.
    """
    env = TicTacToeEnvironment()
    while True:
        print("Current Board:")
        print(env.board)

        if env.current_player == 1:
            # Human player's turn
            move = _read_human_move(env)
        else:
            # Model's turn
            move = _pick_model_move(model, env)

        env.make_move(move)

        winner = env.is_winner()
        if winner != 0:
            print("Game Over!")
            if winner == 1:
                print("You Win!")
            else:
                print("Model Wins!")
            break

        if env.is_draw():
            print("Game Over! It's a Draw!")
            break


def _read_human_move(env):
    """Prompt until the human enters the coordinates of an empty on-board cell."""
    n_rows, n_cols = env.board.shape
    while True:
        try:
            row = int(input("Enter the row (0, 1, 2): "))
            col = int(input("Enter the column (0, 1, 2): "))
        except ValueError:
            print("Please enter whole numbers only.")
            continue
        if not (0 <= row < n_rows and 0 <= col < n_cols):
            print("Row and column must each be 0, 1 or 2.")
            continue
        if env.board[row, col] != 0:
            print("That cell is already taken.")
            continue
        return row, col


def _pick_model_move(model, env):
    """Sample the model's move from its policy restricted to empty cells."""
    state = env.get_state()
    policy, _ = model(tf.convert_to_tensor([state], dtype=tf.float32))
    # float64 keeps the renormalised probabilities summing to 1 closely
    # enough for np.random.choice.
    probs = np.squeeze(policy.numpy()).astype(np.float64)
    legal = state == 0
    probs = probs * legal
    total = probs.sum()
    if total > 0:
        probs = probs / total
    else:
        # The policy put no mass on any empty cell: fall back to uniform.
        probs = legal / legal.sum()
    action = np.random.choice(probs.size, p=probs)
    return divmod(action, env.board.shape[1])

# Create Tic-Tac-Toe environment and Actor-Critic model
env = TicTacToeEnvironment()
num_actions = 9
actor_critic_model = ActorCriticModel(num_actions)
# NOTE(review): 0.1 is far above the 0.001 that Keras uses as Adam's default;
# if the loss keeps swinging between episodes, lower this first.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)

# Training parameters
num_episodes = 200
epsilon = 0.1  # NOTE: not used by the training loop below.

# Training loop (self-play: the same model chooses the moves for both sides)
for episode in range(num_episodes):
    state = env.reset()
    episode_states, episode_actions, episode_movers = [], [], []
    winner, drawn = 0, False

    # No GradientTape is opened here: train_actor_critic records its own
    # forward pass, so taping the rollout would only waste memory.
    for step in range(9):  # Max number of steps in Tic-Tac-Toe
        state = env.get_state()
        mover = env.current_player

        # Forward pass
        policy, _ = actor_critic_model(tf.convert_to_tensor([state], dtype=tf.float32))

        # Sample only among empty cells. The small constant keeps every legal
        # move reachable even if the softmax underflows; float64 keeps the
        # renormalised vector summing to 1 for np.random.choice.
        probs = (np.squeeze(policy.numpy()).astype(np.float64) + 1e-8) * (state == 0)
        probs /= probs.sum()
        action = np.random.choice(num_actions, p=probs)

        episode_states.append(state)
        episode_actions.append(action)
        episode_movers.append(mover)

        # Apply action to the environment
        env.make_move((action // 3, action % 3))

        winner = env.is_winner()
        drawn = winner == 0 and env.is_draw()
        if winner != 0 or drawn:
            break

    # Score every move from the viewpoint of the player who made it: +1 for
    # the eventual winner's moves, -1 for the loser's, and a slightly positive
    # 0.1 for both sides on a draw. Using the raw winner id instead would
    # teach the second player to avoid winning.
    if winner != 0:
        episode_rewards = [1.0 if mover == winner else -1.0 for mover in episode_movers]
    else:
        episode_rewards = [0.1 if drawn else 0.0] * len(episode_movers)

    total_loss = train_actor_critic(actor_critic_model, optimizer, episode_states, episode_actions, episode_rewards)
    print(f"Episode: {episode}, Total Loss: {total_loss.numpy()}")

# After training, play the game against the trained model
play_with_trained_model(actor_critic_model)


Episode: 0, Total Loss: -0.1762932687997818
Episode: 1, Total Loss: 10.267303466796875
Episode: 2, Total Loss: 149.88095092773438
Episode: 3, Total Loss: -2.6932926177978516
Episode: 4, Total Loss: 27.903669357299805
Episode: 5, Total Loss: 6.8168182373046875
Episode: 6, Total Loss: -0.8838443756103516
Episode: 7, Total Loss: 2.848550796508789
Episode: 8, Total Loss: 2.001223087310791
Episode: 9, Total Loss: 20.46839714050293
Episode: 10, Total Loss: -0.8348433971405029
Episode: 11, Total Loss: -2.1835055351257324
Episode: 12, Total Loss: -1.3538122177124023
Episode: 13, Total Loss: 450.0263977050781
Episode: 14, Total Loss: 0.741605281829834
Episode: 15, Total Loss: 4.183908462524414
Episode: 16, Total Loss: -1.9500892162322998
Episode: 17, Total Loss: -4.374029636383057
Episode: 18, Total Loss: -1.9036917686462402
Episode: 19, Total Loss: 13.709814071655273
Episode: 20, Total Loss: 3.1658968925476074
Episode: 21, Total Loss: 2.9143762588500977
Episode: 22, Total Loss: 336.46420288085