In [None]:
import gymnasium
from gymnasium import spaces
import numpy as np
from IPython.display import clear_output

class TicTacToeEnv(gymnasium.Env):
    """Tic-Tac-Toe exposed as a Gymnasium environment.

    A single caller supplies the moves for both sides: ``current_player``
    starts at 1 (X) and flips between 1 and 2 (O) after every legal move
    that does not end the game.

    Actions are integers 0-8 addressing the board in row-major order.
    Observations are 3x3 integer arrays with 0 = empty, 1 = X, 2 = O.

    Rewards returned by ``step`` (for the player that just moved):
        +1   the move completed a row, column or diagonal (episode ends)
         0   the board filled up without a winner, or the game continues
        -10  the chosen cell was already occupied (episode ends)
    """

    def __init__(self):
        super().__init__()
        # There are 9 possible positions to place X or O
        self.action_space = spaces.Discrete(9)

        # The observation is a 3x3 grid whose cells take one of 3 states
        self.observation_space = spaces.Box(low=0, high=2, shape=(3, 3), dtype=int)

        # Start from a ready-to-play state so `board` and `current_player`
        # exist even before the caller's first reset().
        self.reset()

    def reset(self, seed=None, options=None):
        """Clear the board and give the first move to player 1.

        Args:
            seed: Optional seed forwarded to ``gymnasium.Env.reset`` so the
                environment's RNG is initialised the standard way.
            options: Accepted for compatibility with the Gymnasium ``reset``
                signature; not used by this environment.

        Returns:
            ``(observation, info)`` where the observation is a copy of the
            empty board and ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        self.board = np.zeros((3, 3), dtype=int)
        self.current_player = 1
        return self._observation(), {}

    def step(self, action):
        """Place the current player's mark on the cell addressed by ``action``.

        Args:
            action: Cell index 0-8 in row-major order.

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
            ``truncated`` is always False and ``info`` is always empty.
            ``current_player`` is only switched when the game continues, so
            after a terminal step it still names the player who just moved.
        """
        # Get the action coordinates in the board
        row, col = divmod(action, 3)

        # Invalid move: the cell is taken, so the episode ends with a penalty
        if self.board[row, col] != 0:
            return self._observation(), -10, True, False, {}

        # Valid move
        self.board[row, col] = self.current_player

        # The player who just moved completed a line
        if self._check_win(self.current_player):
            return self._observation(), 1, True, False, {}

        # No empty cell left and nobody won: draw
        if np.all(self.board != 0):
            return self._observation(), 0, True, False, {}

        # Game continues: hand the turn to the other player (1 <-> 2)
        self.current_player = 3 - self.current_player
        return self._observation(), 0, False, False, {}

    def _observation(self):
        """Return a snapshot of the board that later moves cannot mutate."""
        return self.board.copy()

    def _check_win(self, player):
        """Return True if ``player`` fills any row, column or diagonal."""
        grid = self.board
        lines = [*grid, *grid.T, grid.diagonal(), np.fliplr(grid).diagonal()]
        return any(bool(np.all(line == player)) for line in lines)

    def render(self, mode='human'):
        """Clear the notebook cell output and print the board as text.

        ``mode`` is accepted for backward compatibility and is not used.
        """
        clear_output(wait=False)
        symbols = {0: ' ', 1: 'X', 2: 'O'}
        rows = [
            " " + " | ".join(symbols[cell] for cell in row) + " "
            for row in self.board
        ]
        print("\n")
        print("\n---|---|---\n".join(rows))
        print("\n")

    def close(self):
        """Nothing to release; present to satisfy the environment interface."""
        pass


In [None]:
import torch
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

# Wrap the environment in the vectorized interface used for training.
# The class itself acts as the zero-argument factory DummyVecEnv calls.
env = DummyVecEnv([TicTacToeEnv])

# Run on the GPU when PyTorch can see one, otherwise on the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Build the PPO agent with an MLP policy on the chosen device
model = PPO(policy="MlpPolicy", env=env, verbose=1, device=device)

# Train for 100k environment steps, showing a progress bar
model.learn(total_timesteps=100000, progress_bar=True)

# Persist the trained agent so later cells can reload it
model.save("ppo_tic_tac_toe")

In [None]:
# Restore the trained agent from disk
model = PPO.load("ppo_tic_tac_toe")

# Play one episode on a plain (non-vectorized) environment.
# The same model picks the move for whichever player is on turn.
env = TicTacToeEnv()
obs, info = env.reset()
done = truncated = False
while not (done or truncated):
    action, _states = model.predict(obs)
    obs, reward, done, truncated, info = env.step(action)
    env.render()  # redraw the board after every move

# Reward of the final step: 1 = a line was completed, 0 = draw, -10 = illegal move
print("Reward:", reward)


In [None]:
# Tally the outcome of repeated self-play episodes using the `env` and
# `model` created in the previous cell.
n_games = 100
wins = draws = losses = 0

for _ in range(n_games):
    obs, info = env.reset()
    done = truncated = False

    # Play until the environment reports the episode is over
    while True:
        action, _states = model.predict(obs)
        obs, reward, done, truncated, info = env.step(action)
        if done or truncated:
            break

    # Classify by the final reward. NOTE: the environment only returns a
    # reward other than 0 or 1 (namely -10) when a move targets an occupied
    # cell, so `losses` counts games that ended on an illegal move.
    if reward == 0:
        draws += 1
    elif reward == 1:
        wins += 1
    else:
        losses += 1

print(f"Wins: {wins}, Draws: {draws}, Losses: {losses}")


In [4]:
import torch
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

# Load the trained PPO agent saved under "ppo_tic_tac_toe" by the training cell
model = PPO.load("ppo_tic_tac_toe")

# Fresh, non-vectorized environment for the interactive game below;
# human_move() reads its board through this module-level `env`.
env = TicTacToeEnv()

def human_move():
    """Prompt until the user enters a legal move and return it.

    A legal move is an integer 0-8 whose cell on the module-level
    ``env.board`` (flattened row-major) is still empty. Every rejected
    entry prints an explanation and the prompt is shown again.

    Returns:
        int: Index of the chosen empty cell.
    """
    while True:
        try:
            cell = int(input("Enter your move (0-8): "))
        except ValueError:
            # Not parseable as an integer
            print("Invalid input. Please enter a number between 0 and 8.")
            continue

        if not 0 <= cell <= 8:
            print("Invalid move. Move must be between 0 and 8.")
            continue

        if env.board.flatten()[cell] != 0:
            print("Invalid move. Cell already taken.")
            continue

        return cell

# Ask which side the human plays; repeat until the answer is '1' or '2'
while True:
    human_player = input("Do you want to be player 1 (X) or player 2 (O)? Enter 1 or 2: ")
    if human_player not in ('1', '2'):
        print("Invalid input. Please enter 1 or 2.")
        continue
    human_player = int(human_player)
    break

print(f"You are player {human_player} ({'X' if human_player == 1 else 'O'}).")

# Human vs. agent: alternate moves until the environment ends the episode
obs, info = env.reset()
done = truncated = False

while not (done or truncated):
    env.render()
    if env.current_player != human_player:
        # Agent's turn: let the trained policy choose from the observation
        action, _states = model.predict(obs)
    else:
        # Human's turn: prompt for a legal cell
        action = human_move()

    obs, reward, done, truncated, info = env.step(action)

# Game over: show the final board and announce the result. The environment
# does not switch players on a terminal step, so `env.current_player` is
# still the player who made the last move.
env.render()
if reward == 0:
    print("It's a draw!")
elif reward == 1:
    # The last mover completed a line
    print("Player 1 (X)" if env.current_player == 1 else "Player 2 (O)", "wins!")
else:
    # The last mover picked an occupied cell, so the opponent is declared winner
    print("Player 1 (X)" if env.current_player == 2 else "Player 2 (O)", "wins!")

env.close()



 O | O | X 
---|---|---
   | O |   
---|---|---
 X | X | X 


Player 1 (X) wins!
