<a href="https://colab.research.google.com/github/Keerthana2048/RL/blob/main/RL7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gym
from gym import spaces
import numpy as np

class TicTacToeEnv(gym.Env):
    """Single-agent Tic-Tac-Toe environment with a uniformly random opponent.

    The board is a flat length-9 int32 array in row-major order where
    0 = empty, 1 = X (the agent) and 2 = O (the opponent). Each call to
    ``step`` plays the agent's move and, if the game is still running,
    one random opponent reply.

    Rewards: +1 when the agent wins, -1 when the opponent wins, 0 for a
    draw or a non-terminal step, and -10 for an illegal move (which also
    ends the episode).
    """

    def __init__(self):
        super().__init__()
        self.observation_space = spaces.Box(low=0, high=2, shape=(9,), dtype=np.int32)
        # One discrete action per board cell, indexed 0..8.
        self.action_space = spaces.Discrete(9)
        self.reset()

    def reset(self):
        """Clear the board and return the initial (all-empty) observation."""
        self.board = np.zeros(9, dtype=np.int32)
        # 1 = X (agent), 2 = O (opponent). Not read anywhere in this class.
        self.current_player = 1
        # Hand out a copy so callers that keep the observation do not see it
        # change when the environment mutates its internal board later.
        return self.board.copy()

    def step(self, action):
        """Play the agent's move at ``action``, then a random opponent move.

        Args:
            action: index of the target cell (0..8).

        Returns:
            ``(observation, reward, done, info)`` where ``observation`` is a
            copy of the board after both moves and ``info`` is an empty dict.
        """
        # --- Agent (X) move ---
        # The explicit range check matters: a negative index would otherwise
        # wrap around and silently mark a different cell.
        if not 0 <= action < self.board.size or self.board[action] != 0:
            return self.board.copy(), -10, True, {}  # illegal move

        self.board[action] = 1  # agent always plays "X"
        done, reward = self.check_game_over(player=1)
        if done:
            return self.board.copy(), reward, done, {}

        # --- Opponent (O) move (uniformly random over empty cells) ---
        available = np.where(self.board == 0)[0]
        if len(available) > 0:
            opp_action = np.random.choice(available)
            self.board[opp_action] = 2
            done, reward = self.check_game_over(player=2)
            if done:
                # -1 when the opponent completed a line, 0 for a full board.
                return self.board.copy(), reward, True, {}

        return self.board.copy(), 0, False, {}

    def check_game_over(self, player):
        """Return ``(done, reward)`` for the current board.

        Only ``player``'s lines are checked for a win. The reward is from the
        agent's point of view: +1 if ``player`` is 1, -1 otherwise. A full
        board without a win for ``player`` is reported as a draw (reward 0).
        """
        grid = self.board.reshape(3, 3)
        # Three rows, three columns and both diagonals.
        lines = [*grid, *grid.T, np.diag(grid), np.diag(np.fliplr(grid))]
        if any(np.all(line == player) for line in lines):
            return True, (1 if player == 1 else -1)

        if 0 not in self.board:  # draw
            return True, 0
        return False, 0


Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.
  return datetime.utcnow().replace(tzinfo=utc)


In [None]:
# Roll out one episode in which the agent picks cells uniformly at random.
env = TicTacToeEnv()
obs, done = env.reset(), False

while True:
    # The sample may hit an occupied cell; the environment then ends the
    # episode with the illegal-move penalty.
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)
    print("Board:", obs.reshape(3,3), "Reward:", reward)
    if done:
        break


Board: [[0 0 0]
 [0 2 0]
 [0 0 1]] Reward: 0
Board: [[0 0 0]
 [0 2 0]
 [0 0 1]] Reward: -10


In [None]:
import gym
from gym import spaces
import numpy as np
import itertools

class TicTacToeEnv(gym.Env):
    """Tic-Tac-Toe against a uniformly random opponent, with a planning model.

    States are exposed as length-9 tuples (row-major; 0 = empty, 1 = X /
    agent, 2 = O / opponent) so they can be used as dictionary keys.
    ``step`` samples one opponent reply, while ``get_transitions``
    enumerates every possible reply with its probability.

    Rewards: +1 agent wins, -1 opponent wins, 0 draw or non-terminal step,
    -10 for an illegal move (which also ends the episode).
    """

    def __init__(self):
        super().__init__()
        self.observation_space = spaces.Box(low=0, high=2, shape=(9,), dtype=np.int32)
        self.action_space = spaces.Discrete(9)
        self.reset()

    def reset(self):
        """Clear the board and return the initial state as a tuple."""
        self.board = np.zeros(9, dtype=np.int32)
        self.current_player = 1  # not read anywhere in this class
        return tuple(self.board)

    def step(self, action):
        """Play the agent's move at ``action``, then a random opponent move.

        Returns:
            ``(state, reward, done, info)`` with ``state`` as a tuple and
            ``info`` an empty dict.
        """
        # The explicit range check matters: a negative index would otherwise
        # wrap around and silently mark a different cell.
        if not 0 <= action < self.board.size or self.board[action] != 0:
            return tuple(self.board), -10, True, {}  # illegal move

        self.board[action] = 1
        if self.check_win(1):
            return tuple(self.board), 1, True, {}
        if 0 not in self.board:
            return tuple(self.board), 0, True, {}

        # Opponent random move over the remaining empty cells
        opp_moves = np.where(self.board == 0)[0]
        if len(opp_moves) > 0:
            opp_action = np.random.choice(opp_moves)
            self.board[opp_action] = 2
            if self.check_win(2):
                return tuple(self.board), -1, True, {}

        return tuple(self.board), 0, False, {}

    def check_win(self, player):
        """Return True if ``player`` has three in a row on the live board."""
        return self.check_static(self.board, player)

    # Function for planning (Value Iteration) ---
    def get_transitions(self, state, action):
        """Return list of (prob, next_state, reward, done) for given (s,a).

        The environment's own board is not touched; the outcome is computed
        on a private copy of ``state``. When the agent's move does not end
        the game, there is one entry per empty cell the opponent could take,
        each with equal probability.
        """
        board = np.array(state, dtype=np.int32)
        if not 0 <= action < board.size or board[action] != 0:
            return [(1.0, tuple(board), -10, True)]

        # Apply X move
        board[action] = 1
        if self.check_static(board, 1):
            return [(1.0, tuple(board), 1, True)]
        if 0 not in board:
            return [(1.0, tuple(board), 0, True)]

        # Opponent (random): every empty cell is equally likely
        opp_moves = np.where(board == 0)[0]
        prob = 1 / len(opp_moves)
        transitions = []
        for opp_action in opp_moves:
            new_board = board.copy()
            new_board[opp_action] = 2
            if self.check_static(new_board, 2):
                transitions.append((prob, tuple(new_board), -1, True))
            else:
                transitions.append((prob, tuple(new_board), 0, False))
        return transitions

    def check_static(self, board, player):
        """Return True if ``player`` has three in a row on ``board``.

        ``board`` is any length-9 array-like in row-major order; it is only
        read, never modified.
        """
        grid = np.asarray(board).reshape(3, 3)
        # Three rows, three columns and both diagonals.
        lines = [*grid, *grid.T, np.diag(grid), np.diag(np.fliplr(grid))]
        return any(bool(np.all(line == player)) for line in lines)