In [22]:
import gymnasium
from gymnasium import spaces
import numpy as np
from IPython.display import clear_output

class TicTacToeEnv(gymnasium.Env):
    """Tic-tac-toe on a 3x3 board exposed through the Gymnasium ``Env`` API.

    Board cells hold 0 (empty), 1 (player X) or 2 (player O). The caller of
    ``step`` supplies the moves for both players: ``current_player`` starts at
    1 and is swapped after every move that does not end the episode.
    """

    def __init__(self):
        super().__init__()
        # There are 9 possible positions to place X or O
        self.action_space = spaces.Discrete(9)

        # The observation is a 3x3 grid whose cells take one of 3 states
        self.observation_space = spaces.Box(low=0, high=2, shape=(3, 3), dtype=int)

        # Build the initial board so the instance is usable right away
        self.reset()

    def reset(self, seed=None, options=None):
        """Start a new game with an empty board and player 1 to move.

        Args:
            seed: Forwarded to ``gymnasium.Env.reset`` so the base class can
                seed its random generator; the game itself uses no randomness.
            options: Accepted for compatibility with the Gymnasium ``reset``
                signature; not used.

        Returns:
            Tuple ``(observation, info)``: a copy of the empty 3x3 board and
            an empty info dict.
        """
        super().reset(seed=seed)

        # The board starts empty
        self.board = np.zeros((3, 3), dtype=int)
        self.current_player = 1
        # Hand out a copy so later moves do not mutate the caller's observation
        return self.board.copy(), {}

    def step(self, action):
        """Place the current player's mark on the cell selected by ``action``.

        Args:
            action: Cell index in ``[0, 9)``; row and column are obtained
                with ``divmod(action, 3)``.

        Returns:
            Tuple ``(observation, reward, terminated, truncated, info)``.
            The observation is a copy of the board. The reward is -10 for a
            move onto an occupied cell (the episode ends), 1 when the move
            completes a line for the player who made it, and 0 otherwise
            (draw or game still running). ``truncated`` is always False and
            ``info`` is always empty.

        Raises:
            IndexError: If ``action`` is outside ``[0, 9)``. Negative values
                are rejected explicitly because NumPy indexing would
                otherwise wrap them onto a different cell.
        """
        if not 0 <= action < self.action_space.n:
            raise IndexError(
                f"action {action} is out of range for a board with "
                f"{self.action_space.n} cells"
            )

        # Get the action coordinates in the board
        row, col = divmod(action, 3)

        # Invalid move case: the cell is already taken
        if self.board[row, col] != 0:
            return self.board.copy(), -10, True, False, {}

        # Valid move case
        self.board[row, col] = self.current_player

        # Check if the player who just moved completed a line
        if self._check_win(self.current_player):
            return self.board.copy(), 1, True, False, {}

        # Check if there is a draw (board full, nobody won)
        if np.all(self.board != 0):
            return self.board.copy(), 0, True, False, {}

        # Select the next player (1 -> 2, 2 -> 1)
        self.current_player = 3 - self.current_player
        return self.board.copy(), 0, False, False, {}

    def _check_win(self, player):
        """Return True if ``player`` owns a full row, column or diagonal."""
        owned = self.board == player

        # Any complete row (axis=1) or complete column (axis=0)
        if owned.all(axis=1).any() or owned.all(axis=0).any():
            return True

        # Main diagonal and anti-diagonal
        return bool(np.diag(owned).all() or np.diag(np.fliplr(owned)).all())

    def render(self, mode='human'):
        """Clear the notebook cell output and print the board as text.

        ``mode`` is accepted but not consulted; the board is always printed.
        """
        clear_output(wait=False)
        symbols = {0: ' ', 1: 'X', 2: 'O'}
        rows = [
            " " + " | ".join(symbols[int(cell)] for cell in row) + " "
            for row in self.board
        ]
        print("\n")
        print("\n---|---|---\n".join(rows))
        print("\n")

    def close(self):
        """Nothing to release; present to complete the ``Env`` interface."""
        pass


In [2]:
import torch
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

# Wrap a single TicTacToeEnv in the vectorized-env interface used by the agent.
# DummyVecEnv takes a list of zero-argument callables; the class itself is one.
env = DummyVecEnv([TicTacToeEnv])

# Pick the compute device: GPU when torch can see one, CPU otherwise
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Build the PPO agent with the MLP policy on the chosen device
model = PPO(policy="MlpPolicy", env=env, verbose=1, device=device)

# Run training for 100k environment steps, showing a progress bar
model.learn(total_timesteps=100_000, progress_bar=True)

# Persist the trained agent to disk
model.save("ppo_tic_tac_toe")

Using cuda device


Output()

-----------------------------
| time/              |      |
|    fps             | 234  |
|    iterations      | 1    |
|    time_elapsed    | 8    |
|    total_timesteps | 2048 |
-----------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 223         |
|    iterations           | 2           |
|    time_elapsed         | 18          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.014555661 |
|    clip_fraction        | 0.0914      |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.19       |
|    explained_variance   | -0.0298     |
|    learning_rate        | 0.0003      |
|    loss                 | 4.82        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.027      |
|    value_loss           | 30.9        |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 210         |
|    iterations           | 3           |
|    time_elapsed         | 29          |
|    total_timesteps      | 6144        |
| train/                  |             |
|    approx_kl            | 0.015415457 |
|    clip_fraction        | 0.135       |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.16       |
|    explained_variance   | -0.386      |
|    learning_rate        | 0.0003      |
|    loss                 | 2.09        |
|    n_updates            | 20          |
|    policy_gradient_loss | -0.0194     |
|    value_loss           | 5.89        |
-----------------------------------------


------------------------------------------
| time/                   |              |
|    fps                  | 203          |
|    iterations           | 4            |
|    time_elapsed         | 40           |
|    total_timesteps      | 8192         |
| train/                  |              |
|    approx_kl            | 0.0115083065 |
|    clip_fraction        | 0.0927       |
|    clip_range           | 0.2          |
|    entropy_loss         | -2.13        |
|    explained_variance   | -0.00296     |
|    learning_rate        | 0.0003       |
|    loss                 | 3.54         |
|    n_updates            | 30           |
|    policy_gradient_loss | -0.0232      |
|    value_loss           | 12.5         |
------------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 199         |
|    iterations           | 5           |
|    time_elapsed         | 51          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.011939121 |
|    clip_fraction        | 0.11        |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.08       |
|    explained_variance   | -0.00637    |
|    learning_rate        | 0.0003      |
|    loss                 | 8.6         |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.0232     |
|    value_loss           | 11.7        |
-----------------------------------------


------------------------------------------
| time/                   |              |
|    fps                  | 196          |
|    iterations           | 6            |
|    time_elapsed         | 62           |
|    total_timesteps      | 12288        |
| train/                  |              |
|    approx_kl            | 0.0099586565 |
|    clip_fraction        | 0.0781       |
|    clip_range           | 0.2          |
|    entropy_loss         | -2.04        |
|    explained_variance   | 0.000779     |
|    learning_rate        | 0.0003       |
|    loss                 | 10.1         |
|    n_updates            | 50           |
|    policy_gradient_loss | -0.0213      |
|    value_loss           | 16.4         |
------------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 194         |
|    iterations           | 7           |
|    time_elapsed         | 73          |
|    total_timesteps      | 14336       |
| train/                  |             |
|    approx_kl            | 0.010821267 |
|    clip_fraction        | 0.102       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.99       |
|    explained_variance   | -3.37e-05   |
|    learning_rate        | 0.0003      |
|    loss                 | 7.44        |
|    n_updates            | 60          |
|    policy_gradient_loss | -0.0236     |
|    value_loss           | 17.9        |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 193         |
|    iterations           | 8           |
|    time_elapsed         | 84          |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.010402136 |
|    clip_fraction        | 0.0797      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.95       |
|    explained_variance   | -0.00174    |
|    learning_rate        | 0.0003      |
|    loss                 | 10.3        |
|    n_updates            | 70          |
|    policy_gradient_loss | -0.0229     |
|    value_loss           | 20.3        |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 192         |
|    iterations           | 9           |
|    time_elapsed         | 95          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.010638958 |
|    clip_fraction        | 0.113       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.9        |
|    explained_variance   | -0.00495    |
|    learning_rate        | 0.0003      |
|    loss                 | 10.1        |
|    n_updates            | 80          |
|    policy_gradient_loss | -0.0272     |
|    value_loss           | 21.3        |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 191         |
|    iterations           | 10          |
|    time_elapsed         | 106         |
|    total_timesteps      | 20480       |
| train/                  |             |
|    approx_kl            | 0.009406155 |
|    clip_fraction        | 0.0817      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.86       |
|    explained_variance   | -0.0189     |
|    learning_rate        | 0.0003      |
|    loss                 | 10.7        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0216     |
|    value_loss           | 21.2        |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 191         |
|    iterations           | 11          |
|    time_elapsed         | 117         |
|    total_timesteps      | 22528       |
| train/                  |             |
|    approx_kl            | 0.010639943 |
|    clip_fraction        | 0.0902      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.82       |
|    explained_variance   | -0.00243    |
|    learning_rate        | 0.0003      |
|    loss                 | 10.5        |
|    n_updates            | 100         |
|    policy_gradient_loss | -0.0252     |
|    value_loss           | 21          |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 190         |
|    iterations           | 12          |
|    time_elapsed         | 128         |
|    total_timesteps      | 24576       |
| train/                  |             |
|    approx_kl            | 0.009949918 |
|    clip_fraction        | 0.0925      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.78       |
|    explained_variance   | 0.00485     |
|    learning_rate        | 0.0003      |
|    loss                 | 8.79        |
|    n_updates            | 110         |
|    policy_gradient_loss | -0.0226     |
|    value_loss           | 19.5        |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 190         |
|    iterations           | 13          |
|    time_elapsed         | 139         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.009596091 |
|    clip_fraction        | 0.0972      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.75       |
|    explained_variance   | -0.0106     |
|    learning_rate        | 0.0003      |
|    loss                 | 8.94        |
|    n_updates            | 120         |
|    policy_gradient_loss | -0.0223     |
|    value_loss           | 18.1        |
-----------------------------------------


------------------------------------------
| time/                   |              |
|    fps                  | 189          |
|    iterations           | 14           |
|    time_elapsed         | 151          |
|    total_timesteps      | 28672        |
| train/                  |              |
|    approx_kl            | 0.0109331105 |
|    clip_fraction        | 0.115        |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.73        |
|    explained_variance   | -0.00483     |
|    learning_rate        | 0.0003       |
|    loss                 | 8.78         |
|    n_updates            | 130          |
|    policy_gradient_loss | -0.0268      |
|    value_loss           | 18.1         |
------------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 188         |
|    iterations           | 15          |
|    time_elapsed         | 162         |
|    total_timesteps      | 30720       |
| train/                  |             |
|    approx_kl            | 0.012288513 |
|    clip_fraction        | 0.146       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.7        |
|    explained_variance   | -0.00168    |
|    learning_rate        | 0.0003      |
|    loss                 | 6.93        |
|    n_updates            | 140         |
|    policy_gradient_loss | -0.0276     |
|    value_loss           | 15.1        |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 188         |
|    iterations           | 16          |
|    time_elapsed         | 173         |
|    total_timesteps      | 32768       |
| train/                  |             |
|    approx_kl            | 0.012661917 |
|    clip_fraction        | 0.14        |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.66       |
|    explained_variance   | 0.00392     |
|    learning_rate        | 0.0003      |
|    loss                 | 6.72        |
|    n_updates            | 150         |
|    policy_gradient_loss | -0.0278     |
|    value_loss           | 15.3        |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 188         |
|    iterations           | 17          |
|    time_elapsed         | 184         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.013166043 |
|    clip_fraction        | 0.146       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.63       |
|    explained_variance   | -0.00427    |
|    learning_rate        | 0.0003      |
|    loss                 | 5.7         |
|    n_updates            | 160         |
|    policy_gradient_loss | -0.0264     |
|    value_loss           | 13.2        |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 188         |
|    iterations           | 18          |
|    time_elapsed         | 195         |
|    total_timesteps      | 36864       |
| train/                  |             |
|    approx_kl            | 0.012916635 |
|    clip_fraction        | 0.162       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.6        |
|    explained_variance   | -0.0175     |
|    learning_rate        | 0.0003      |
|    loss                 | 7.05        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.027      |
|    value_loss           | 13.1        |
-----------------------------------------


------------------------------------------
| time/                   |              |
|    fps                  | 188          |
|    iterations           | 19           |
|    time_elapsed         | 206          |
|    total_timesteps      | 38912        |
| train/                  |              |
|    approx_kl            | 0.0112149045 |
|    clip_fraction        | 0.136        |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.56        |
|    explained_variance   | -0.00785     |
|    learning_rate        | 0.0003       |
|    loss                 | 3.61         |
|    n_updates            | 180          |
|    policy_gradient_loss | -0.0238      |
|    value_loss           | 9.67         |
------------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 188         |
|    iterations           | 20          |
|    time_elapsed         | 217         |
|    total_timesteps      | 40960       |
| train/                  |             |
|    approx_kl            | 0.013286477 |
|    clip_fraction        | 0.136       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.53       |
|    explained_variance   | -0.00551    |
|    learning_rate        | 0.0003      |
|    loss                 | 4.67        |
|    n_updates            | 190         |
|    policy_gradient_loss | -0.0218     |
|    value_loss           | 9.13        |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 188         |
|    iterations           | 21          |
|    time_elapsed         | 228         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.012652867 |
|    clip_fraction        | 0.157       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.52       |
|    explained_variance   | -0.00995    |
|    learning_rate        | 0.0003      |
|    loss                 | 7.59        |
|    n_updates            | 200         |
|    policy_gradient_loss | -0.0244     |
|    value_loss           | 9.15        |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 187         |
|    iterations           | 22          |
|    time_elapsed         | 239         |
|    total_timesteps      | 45056       |
| train/                  |             |
|    approx_kl            | 0.015438723 |
|    clip_fraction        | 0.154       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.49       |
|    explained_variance   | -0.0082     |
|    learning_rate        | 0.0003      |
|    loss                 | 7.06        |
|    n_updates            | 210         |
|    policy_gradient_loss | -0.0233     |
|    value_loss           | 8.93        |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 187         |
|    iterations           | 23          |
|    time_elapsed         | 250         |
|    total_timesteps      | 47104       |
| train/                  |             |
|    approx_kl            | 0.012811492 |
|    clip_fraction        | 0.151       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.44       |
|    explained_variance   | 0.00502     |
|    learning_rate        | 0.0003      |
|    loss                 | 5.08        |
|    n_updates            | 220         |
|    policy_gradient_loss | -0.0248     |
|    value_loss           | 8.15        |
-----------------------------------------


------------------------------------------
| time/                   |              |
|    fps                  | 187          |
|    iterations           | 24           |
|    time_elapsed         | 261          |
|    total_timesteps      | 49152        |
| train/                  |              |
|    approx_kl            | 0.0154401455 |
|    clip_fraction        | 0.165        |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.43        |
|    explained_variance   | 0.0167       |
|    learning_rate        | 0.0003       |
|    loss                 | 3.36         |
|    n_updates            | 230          |
|    policy_gradient_loss | -0.0255      |
|    value_loss           | 7.2          |
------------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 187         |
|    iterations           | 25          |
|    time_elapsed         | 272         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.016557612 |
|    clip_fraction        | 0.188       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.39       |
|    explained_variance   | -0.0419     |
|    learning_rate        | 0.0003      |
|    loss                 | 3.52        |
|    n_updates            | 240         |
|    policy_gradient_loss | -0.0226     |
|    value_loss           | 5.62        |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 187         |
|    iterations           | 26          |
|    time_elapsed         | 283         |
|    total_timesteps      | 53248       |
| train/                  |             |
|    approx_kl            | 0.015564177 |
|    clip_fraction        | 0.177       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.35       |
|    explained_variance   | -0.000152   |
|    learning_rate        | 0.0003      |
|    loss                 | 3.14        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0263     |
|    value_loss           | 7.42        |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 187         |
|    iterations           | 27          |
|    time_elapsed         | 294         |
|    total_timesteps      | 55296       |
| train/                  |             |
|    approx_kl            | 0.012461736 |
|    clip_fraction        | 0.157       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.34       |
|    explained_variance   | -0.00287    |
|    learning_rate        | 0.0003      |
|    loss                 | 2.8         |
|    n_updates            | 260         |
|    policy_gradient_loss | -0.021      |
|    value_loss           | 4.94        |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 187         |
|    iterations           | 28          |
|    time_elapsed         | 305         |
|    total_timesteps      | 57344       |
| train/                  |             |
|    approx_kl            | 0.015397603 |
|    clip_fraction        | 0.205       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.31       |
|    explained_variance   | -0.0122     |
|    learning_rate        | 0.0003      |
|    loss                 | 3           |
|    n_updates            | 270         |
|    policy_gradient_loss | -0.0256     |
|    value_loss           | 5.16        |
-----------------------------------------


------------------------------------------
| time/                   |              |
|    fps                  | 188          |
|    iterations           | 29           |
|    time_elapsed         | 314          |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0138885155 |
|    clip_fraction        | 0.199        |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.29        |
|    explained_variance   | -0.0118      |
|    learning_rate        | 0.0003       |
|    loss                 | 2.86         |
|    n_updates            | 280          |
|    policy_gradient_loss | -0.0209      |
|    value_loss           | 3.95         |
------------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 190         |
|    iterations           | 30          |
|    time_elapsed         | 322         |
|    total_timesteps      | 61440       |
| train/                  |             |
|    approx_kl            | 0.011555605 |
|    clip_fraction        | 0.132       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.28       |
|    explained_variance   | -0.0103     |
|    learning_rate        | 0.0003      |
|    loss                 | 3.63        |
|    n_updates            | 290         |
|    policy_gradient_loss | -0.0206     |
|    value_loss           | 5.79        |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 192         |
|    iterations           | 31          |
|    time_elapsed         | 330         |
|    total_timesteps      | 63488       |
| train/                  |             |
|    approx_kl            | 0.012662846 |
|    clip_fraction        | 0.17        |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.24       |
|    explained_variance   | -0.000476   |
|    learning_rate        | 0.0003      |
|    loss                 | 1.09        |
|    n_updates            | 300         |
|    policy_gradient_loss | -0.0182     |
|    value_loss           | 3.67        |
-----------------------------------------


------------------------------------------
| time/                   |              |
|    fps                  | 193          |
|    iterations           | 32           |
|    time_elapsed         | 338          |
|    total_timesteps      | 65536        |
| train/                  |              |
|    approx_kl            | 0.0133637665 |
|    clip_fraction        | 0.159        |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.24        |
|    explained_variance   | -0.015       |
|    learning_rate        | 0.0003       |
|    loss                 | 1.09         |
|    n_updates            | 310          |
|    policy_gradient_loss | -0.0173      |
|    value_loss           | 3.14         |
------------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 194         |
|    iterations           | 33          |
|    time_elapsed         | 346         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.014159478 |
|    clip_fraction        | 0.155       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.23       |
|    explained_variance   | 0.00376     |
|    learning_rate        | 0.0003      |
|    loss                 | 1.92        |
|    n_updates            | 320         |
|    policy_gradient_loss | -0.0197     |
|    value_loss           | 3.85        |
-----------------------------------------


------------------------------------------
| time/                   |              |
|    fps                  | 196          |
|    iterations           | 34           |
|    time_elapsed         | 354          |
|    total_timesteps      | 69632        |
| train/                  |              |
|    approx_kl            | 0.0155871995 |
|    clip_fraction        | 0.169        |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.2         |
|    explained_variance   | -0.0175      |
|    learning_rate        | 0.0003       |
|    loss                 | 0.784        |
|    n_updates            | 330          |
|    policy_gradient_loss | -0.0216      |
|    value_loss           | 3.92         |
------------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 197         |
|    iterations           | 35          |
|    time_elapsed         | 363         |
|    total_timesteps      | 71680       |
| train/                  |             |
|    approx_kl            | 0.014731762 |
|    clip_fraction        | 0.175       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.19       |
|    explained_variance   | -0.0307     |
|    learning_rate        | 0.0003      |
|    loss                 | 2.38        |
|    n_updates            | 340         |
|    policy_gradient_loss | -0.0188     |
|    value_loss           | 3.02        |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 198         |
|    iterations           | 36          |
|    time_elapsed         | 372         |
|    total_timesteps      | 73728       |
| train/                  |             |
|    approx_kl            | 0.012643097 |
|    clip_fraction        | 0.161       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.17       |
|    explained_variance   | -0.00525    |
|    learning_rate        | 0.0003      |
|    loss                 | 0.9         |
|    n_updates            | 350         |
|    policy_gradient_loss | -0.0186     |
|    value_loss           | 3           |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 198         |
|    iterations           | 37          |
|    time_elapsed         | 381         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.014944415 |
|    clip_fraction        | 0.18        |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.13       |
|    explained_variance   | 0.0283      |
|    learning_rate        | 0.0003      |
|    loss                 | 2.14        |
|    n_updates            | 360         |
|    policy_gradient_loss | -0.0233     |
|    value_loss           | 3.19        |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 199         |
|    iterations           | 38          |
|    time_elapsed         | 389         |
|    total_timesteps      | 77824       |
| train/                  |             |
|    approx_kl            | 0.016302938 |
|    clip_fraction        | 0.165       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.14       |
|    explained_variance   | -0.0293     |
|    learning_rate        | 0.0003      |
|    loss                 | 0.46        |
|    n_updates            | 370         |
|    policy_gradient_loss | -0.0181     |
|    value_loss           | 2.24        |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 200         |
|    iterations           | 39          |
|    time_elapsed         | 398         |
|    total_timesteps      | 79872       |
| train/                  |             |
|    approx_kl            | 0.013867968 |
|    clip_fraction        | 0.172       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.12       |
|    explained_variance   | -0.00354    |
|    learning_rate        | 0.0003      |
|    loss                 | 1.69        |
|    n_updates            | 380         |
|    policy_gradient_loss | -0.0234     |
|    value_loss           | 4.52        |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 201         |
|    iterations           | 40          |
|    time_elapsed         | 407         |
|    total_timesteps      | 81920       |
| train/                  |             |
|    approx_kl            | 0.012517377 |
|    clip_fraction        | 0.126       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.11       |
|    explained_variance   | -0.0132     |
|    learning_rate        | 0.0003      |
|    loss                 | 1.1         |
|    n_updates            | 390         |
|    policy_gradient_loss | -0.0165     |
|    value_loss           | 3.66        |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 201         |
|    iterations           | 41          |
|    time_elapsed         | 415         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.012799039 |
|    clip_fraction        | 0.156       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.1        |
|    explained_variance   | -0.0508     |
|    learning_rate        | 0.0003      |
|    loss                 | 0.479       |
|    n_updates            | 400         |
|    policy_gradient_loss | -0.0144     |
|    value_loss           | 1.53        |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 202         |
|    iterations           | 42          |
|    time_elapsed         | 424         |
|    total_timesteps      | 86016       |
| train/                  |             |
|    approx_kl            | 0.010780998 |
|    clip_fraction        | 0.14        |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | -0.00936    |
|    learning_rate        | 0.0003      |
|    loss                 | 1.66        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0167     |
|    value_loss           | 1.98        |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 203         |
|    iterations           | 43          |
|    time_elapsed         | 433         |
|    total_timesteps      | 88064       |
| train/                  |             |
|    approx_kl            | 0.011981606 |
|    clip_fraction        | 0.142       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.06       |
|    explained_variance   | -0.00256    |
|    learning_rate        | 0.0003      |
|    loss                 | 0.758       |
|    n_updates            | 420         |
|    policy_gradient_loss | -0.0155     |
|    value_loss           | 2.35        |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 203         |
|    iterations           | 44          |
|    time_elapsed         | 442         |
|    total_timesteps      | 90112       |
| train/                  |             |
|    approx_kl            | 0.011193009 |
|    clip_fraction        | 0.13        |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.04       |
|    explained_variance   | 0.0162      |
|    learning_rate        | 0.0003      |
|    loss                 | 1.32        |
|    n_updates            | 430         |
|    policy_gradient_loss | -0.0152     |
|    value_loss           | 1.52        |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 204         |
|    iterations           | 45          |
|    time_elapsed         | 451         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.012956172 |
|    clip_fraction        | 0.177       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.03       |
|    explained_variance   | -0.00772    |
|    learning_rate        | 0.0003      |
|    loss                 | 3.2         |
|    n_updates            | 440         |
|    policy_gradient_loss | -0.0155     |
|    value_loss           | 2.51        |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 204         |
|    iterations           | 46          |
|    time_elapsed         | 459         |
|    total_timesteps      | 94208       |
| train/                  |             |
|    approx_kl            | 0.009403275 |
|    clip_fraction        | 0.129       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1          |
|    explained_variance   | -0.00916    |
|    learning_rate        | 0.0003      |
|    loss                 | 1.84        |
|    n_updates            | 450         |
|    policy_gradient_loss | -0.0173     |
|    value_loss           | 2.27        |
-----------------------------------------


----------------------------------------
| time/                   |            |
|    fps                  | 205        |
|    iterations           | 47         |
|    time_elapsed         | 468        |
|    total_timesteps      | 96256      |
| train/                  |            |
|    approx_kl            | 0.01035283 |
|    clip_fraction        | 0.137      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1         |
|    explained_variance   | -0.0143    |
|    learning_rate        | 0.0003     |
|    loss                 | 1.85       |
|    n_updates            | 460        |
|    policy_gradient_loss | -0.0137    |
|    value_loss           | 2.32       |
----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 205         |
|    iterations           | 48          |
|    time_elapsed         | 477         |
|    total_timesteps      | 98304       |
| train/                  |             |
|    approx_kl            | 0.011154717 |
|    clip_fraction        | 0.148       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.951      |
|    explained_variance   | 0.0232      |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0288     |
|    n_updates            | 470         |
|    policy_gradient_loss | -0.0182     |
|    value_loss           | 0.725       |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 206         |
|    iterations           | 49          |
|    time_elapsed         | 485         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.009605601 |
|    clip_fraction        | 0.147       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.925      |
|    explained_variance   | 0.00686     |
|    learning_rate        | 0.0003      |
|    loss                 | 0.64        |
|    n_updates            | 480         |
|    policy_gradient_loss | -0.0175     |
|    value_loss           | 2.17        |
-----------------------------------------


In [9]:
# Restore the trained policy that was saved under "ppo_tic_tac_toe"
model = PPO.load("ppo_tic_tac_toe")

# Play one full game on a fresh board; every move (for either side) comes
# from the loaded policy, and the board is rendered after each move.
env = TicTacToeEnv()
obs, info = env.reset()
done = truncated = False
while not (done or truncated):
    action, _states = model.predict(obs)
    obs, reward, done, truncated, info = env.step(action)
    env.render()

# Reward returned by the step that ended the game
print("Reward:", reward)


[[0 0 0]
 [0 0 0]
 [0 0 1]]
[[0 2 0]
 [0 0 0]
 [0 0 1]]
[[0 2 1]
 [0 0 0]
 [0 0 1]]
[[0 2 1]
 [2 0 0]
 [0 0 1]]
[[0 2 1]
 [2 0 0]
 [0 1 1]]
[[0 2 1]
 [2 2 0]
 [0 1 1]]
[[0 2 1]
 [2 2 1]
 [0 1 1]]
Reward: 1


In [10]:
# Self-play evaluation: the same policy chooses every move, for X and for O
# alike, so the tallies describe how games end rather than how one side fares.
n_games = 100
wins = 0    # games decided by three in a row (terminal reward 1), either side
draws = 0   # board filled with no winner (terminal reward 0)
losses = 0  # games cut short by a move onto an occupied cell (terminal reward -10)
x_wins = 0  # decisive games taken by player 1 (X)
o_wins = 0  # decisive games taken by player 2 (O)

for _ in range(n_games):
    obs, info = env.reset()  # Unpack the observation and info
    done = False
    truncated = False  # Initialize truncated as False
    while not done and not truncated:
        action, _states = model.predict(obs)
        obs, reward, done, truncated, info = env.step(action)
    if reward == 1:
        wins += 1
        # step() returns before switching sides on a winning move, so
        # env.current_player is still the player who just won.
        if env.current_player == 1:
            x_wins += 1
        else:
            o_wins += 1
    elif reward == 0:
        draws += 1
    else:
        losses += 1

print(f"Wins: {wins}, Draws: {draws}, Losses: {losses}")
# Break the summary down so it is not mistaken for one side's win rate
print(f"Wins by side - X: {x_wins}, O: {o_wins}; losses are games ended by an illegal move")


Wins: 96, Draws: 2, Losses: 2


In [24]:
# Load the trained PPO policy from "ppo_tic_tac_toe"
# (presumably saved by an earlier training cell - verify the file exists)
model = PPO.load("ppo_tic_tac_toe")

# Initialize the environment; the constructor calls reset(), so the board
# starts empty with player 1 to move. The code below reads this global `env`.
env = TicTacToeEnv()

def human_move():
    """Ask the human for a move until a legal one is entered.

    Reads from standard input and checks the answer against the global
    ``env`` board: it must be an integer from 0 to 8 that points at an
    empty cell (cells are numbered over the flattened board). Each rejected
    answer prints an explanation and the prompt is shown again.

    Returns:
        int: index (0-8) of the chosen empty cell.
    """
    while True:
        try:
            choice = int(input("Enter your move (0-8): "))
        except ValueError:
            # Not an integer at all
            print("Invalid input. Please enter a number between 0 and 8.")
            continue

        if not 0 <= choice <= 8:
            print("Invalid move. Move must be between 0 and 8.")
            continue

        if env.board.flatten()[choice] != 0:
            # A non-zero cell already holds a player's mark
            print("Invalid move. Cell already taken.")
            continue

        return choice

# Ask which side the human takes; keep asking until the answer is 1 or 2
while True:
    human_player = input("Do you want to be player 1 (X) or player 2 (O)? Enter 1 or 2: ")
    if human_player not in ('1', '2'):
        print("Invalid input. Please enter 1 or 2.")
        continue
    human_player = int(human_player)
    break

print(f"You are player {human_player} ({'XO'[human_player - 1]}).")

# Start a fresh game in which the human and the trained agent alternate moves
obs, info = env.reset()
done = truncated = False

while not (done or truncated):
    env.render()
    if env.current_player != human_player:
        # Agent's turn: the policy picks a cell from the current observation
        action, _states = model.predict(obs)
    else:
        # Human's turn: keep prompting until a legal cell is entered
        action = human_move()

    obs, reward, done, truncated, info = env.step(action)

# The loop ends right after the terminating step, so show the final board
# and announce the result here.
env.render()
if reward == 0:
    print("It's a draw!")
# env.current_player is still the side that made the last move, because
# step() does not switch sides on a terminal move. A reward of 1 means that
# side won; any other non-zero reward means it made an illegal move and the
# opponent wins. Player 1 wins exactly when "the mover won" and "the mover
# is player 1" are both true or both false.
elif (reward == 1) == (env.current_player == 1):
    print("Player 1 (X)", "wins!")
else:
    print("Player 2 (O)", "wins!")

env.close()



 O | X | O 
---|---|---
 O | X |   
---|---|---
   | X | X 


Player 1 (X) wins!
