In [None]:
!pip install gymnasium
from stable_baselines3.common.env_checker import check_env
import gymnasium as gym
from gym import Env
from gym import spaces
import random
import numpy as np
import os




In [None]:
# Values stored in each cell of the flattened grid.
NOTHING, PLAYER, WIN, LOSE = range(4)

# Action codes understood by the environment's step().
UP, DOWN, LEFT, RIGHT = range(4)

In [None]:
class BasicEnv(gym.Env):
    """Grid-world environment on a flattened 6x6 board.

    The board is a 1-D ``int16`` array of 36 cells laid out row by row
    (six cells per row). Each cell holds NOTHING, PLAYER, WIN or LOSE.
    Moving onto the WIN cell ends the episode with reward +1.0, moving onto
    the LOSE cell ends it with reward -1.0, and every other step yields -0.01.

    Callers should call ``reset()`` once ``step()`` reports ``done``; stepping
    a finished episode is not handled specially here.
    """

    # Board geometry, kept in one place so the movement checks stay consistent.
    GRID_SIZE = 6
    N_CELLS = GRID_SIZE * GRID_SIZE

    def __init__(self):
        """Create the spaces and an initial randomly populated board."""
        self.cumulative_reward = 0
        self._place_pieces()

        # observation space (valid ranges for observations in the state)
        self.observation_space = gym.spaces.Box(0, 3, (self.N_CELLS,), dtype=np.int16)

        # valid actions:
        #   0 = up
        #   1 = down
        #   2 = left
        #   3 = right
        self.action_space = gym.spaces.Discrete(4)

    def _place_pieces(self):
        """Build a fresh board with the player, WIN and LOSE cells on distinct squares."""
        n_cells = self.N_CELLS

        # Draw in the same order as before so a seeded `random` yields the same layout.
        self.player_position = random.randrange(0, n_cells)
        self.win_position = random.randrange(0, n_cells)
        self.lose_position = random.randrange(0, n_cells)

        # Re-draw until the three positions no longer overlap each other.
        while self.win_position == self.player_position:
            self.win_position = random.randrange(0, n_cells)

        while self.lose_position in (self.win_position, self.player_position):
            self.lose_position = random.randrange(0, n_cells)

        # The state is a numpy array whose dtype matches the observation space.
        board = np.full(n_cells, NOTHING, dtype=np.int16)
        board[self.player_position] = PLAYER
        board[self.win_position] = WIN
        board[self.lose_position] = LOSE
        self.state = board

    def step(self, action):
        """Move the player one cell and report the outcome.

        Args:
            action: One of UP, DOWN, LEFT, RIGHT. A move that would leave the
                board keeps the player in place.

        Returns:
            Tuple ``(state, reward, done, truncated, info)``. ``truncated`` is
            always False and ``info`` is an empty dict.

        Raises:
            ValueError: If ``action`` is not one of the four action codes.
        """
        # placeholder for debugging information
        info = {}

        # defaults for a regular, non-terminal move
        done = False
        reward = -0.01
        previous_position = self.player_position
        size = self.GRID_SIZE

        # take the action by moving the player, refusing moves that leave the grid
        if action == UP:
            if (self.player_position - size) >= 0:
                self.player_position -= size

        elif action == DOWN:
            if (self.player_position + size) < self.N_CELLS:
                self.player_position += size

        elif action == LEFT:
            # column 0 has no cell to its left
            if (self.player_position % size) != 0:
                self.player_position -= 1

        elif action == RIGHT:
            # the last column has no cell to its right
            if (self.player_position % size) != size - 1:
                self.player_position += 1
        else:
            raise ValueError("invalid action")

        # check for win/lose conditions and set reward
        landed_on = self.state[self.player_position]
        outcome_message = None
        if landed_on == WIN:
            reward = 1.0
            done = True
            outcome_message = 'YOU WIN!!!!'
        elif landed_on == LOSE:
            reward = -1.0
            done = True
            outcome_message = 'YOU LOSE'

        # Account for this step's reward exactly once (terminal rewards used to
        # be added twice, inflating the running total).
        self.cumulative_reward += reward

        if done:
            clear_screen()
            print(f'Cumulative Reward: {self.cumulative_reward}')
            print(outcome_message)
        else:
            # update the player position on the board
            self.state[previous_position] = NOTHING
            self.state[self.player_position] = PLAYER

        truncated = False
        return self.state, reward, done, truncated, info

    def render(self):
        """Print the running reward and the current board."""
        pretty_print(self.state, self.cumulative_reward)

    def reset(self, seed=None, options=None):
        """Start a new episode on a freshly randomised board.

        Args:
            seed: Optional seed. When given it seeds both the base class RNG
                and the ``random`` module used for piece placement.
            options: Accepted for compatibility with the Gymnasium reset
                signature; not used.

        Returns:
            Tuple ``(observation, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        if seed is not None:
            random.seed(seed)
        self.cumulative_reward = 0
        self._place_pieces()
        observation = self.state
        info = {}
        return observation, info

In [None]:
def pretty_print(state_array, cumulative_reward):
    """Clear the display, then print the running reward and the board as six rows of six cells.

    Args:
        state_array: Indexable sequence holding the 36 cell values, row by row.
        cumulative_reward: Value shown on the 'Cumulative Reward' line.
    """
    clear_screen()
    print(f'Cumulative Reward: {cumulative_reward}')
    print()
    side = 6
    for row_start in range(0, side * side, side):
        # Each cell is right-aligned in a 4-character column.
        row_text = "".join(
            '{:4}'.format(state_array[row_start + offset]) for offset in range(side)
        )
        print(row_text)

def clear_screen():
    """Clear previously shown output so the next frame replaces it.

    Clears the notebook cell output when IPython is importable, and also
    issues the terminal clear command for the current platform.
    """
    try:
        # Imported here so the function does not depend on a later cell
        # having already imported clear_output.
        from IPython.display import clear_output
    except ImportError:
        # Best effort: without IPython only the terminal clear below applies.
        pass
    else:
        clear_output()
    # "cls" exists only on Windows; other platforms use "clear".
    os.system("cls" if os.name == "nt" else "clear")


In [None]:
#from BasicEnvironment import *

# Play one episode by hand: show the board, read an action, step, and repeat
# until the environment reports that the episode is over.
env = BasicEnv()
valid_actions = (UP, DOWN, LEFT, RIGHT)
done = False
truncated = False
while not (done or truncated):
    env.render()

    # Keep prompting until the input is one of the four action codes, so a
    # typo does not abort the episode with an exception.
    action = None
    while action is None:
        raw_action = input("Enter action:")
        try:
            candidate = int(raw_action)
        except ValueError:
            candidate = None
        if candidate in valid_actions:
            action = candidate
        else:
            print("Please enter 0 (up), 1 (down), 2 (left) or 3 (right).")

    state, reward, done, truncated, info = env.step(action)

Cumulative Reward: 0.99
YOU WIN!!!!


In [None]:
from IPython.display import clear_output

In [None]:
!pip install stable_baselines3

Collecting stable_baselines3
  Downloading stable_baselines3-2.3.2-py3-none-any.whl.metadata (5.1 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13->stable_baselines3)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.13->stable_baselines3)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.13->stable_baselines3)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.13->stable_baselines3)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.13->stable_baselines3)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuff

In [None]:
#from BasicEnvironment import *
# Run the Stable-Baselines3 environment checker against a fresh BasicEnv instance.
from stable_baselines3.common.env_checker import check_env
# `env` created here is reused by the cells that follow.
env = BasicEnv()
check_env(env)

Cumulative Reward: 1.0
YOU WIN!!!!


In [None]:
# Roll out episodes with uniformly random actions, rendering every step.
episodes = 50
max_steps = 51  # same per-episode cap as the previous counter loop (i = 0..50)
for episode in range(episodes):
    # reset() returns (observation, info); unpack it instead of keeping the tuple.
    obs, info = env.reset()
    for _ in range(max_steps):
        random_action = env.action_space.sample()
        env.render()
        print("action", random_action)
        obs, reward, done, truncated, info = env.step(random_action)
        print('reward', reward)
        # Stop as soon as the episode ends; stepping a finished episode
        # corrupts the board and keeps accumulating reward.
        if done or truncated:
            break

Cumulative Reward: 1.99

   0   0   0   0   0   0
   0   0   0   0   0   0
   0   0   0   1   2   0
   0   0   0   0   0   0
   3   0   0   0   0   0
   0   0   0   0   0   0
action 3
reward -0.01


KeyboardInterrupt: 