## Creating custom environment

### Gridworld Game with custom size

In [30]:
from typing import Optional
import numpy as np
import gymnasium as gym

#### Create the environment and implement reset() and step() functions

In [31]:
class GridWorldEnv(gym.Env):
    def __init__(self, size: int = 5):
        self.size = size
        self._agent_location = np.array([-1, -1], dtype=np.int32)
        self._target_location = np.array([-1, -1], dtype=np.int32)
        self.observation_space = gym.spaces.Dict(
            {
                "agent": gym.spaces.Box(0, size - 1, shape=(2,), dtype=int),
                "target": gym.spaces.Box(0, size - 1, shape=(2,), dtype=int),
            }
        )
        self.action_space = gym.spaces.Discrete(4)
        self._action_to_direction = {
            0: np.array([1, 0]), # right
            1: np.array([0, 1]), # up
            2: np.array([-1, 0]), # left
            3: np.array([0, -1]), # down
        }
    
    def _get_obs(self):
        return {"agent": self._agent_location, "target": self._target_location}
    
    def _get_info(self):
        return {
            "distance": np.linalg.norm(
                self._agent_location - self._target_location, ord=1
            )
        }
    
    def reset(self, *, seed: Optional[int] = None, options: Optional[int] = None):
        super().reset(seed=seed)
        self._agent_location = self.np_random.integers(0, self.size, size=2, dtype=int)
        self._target_location = self._agent_location
        while np.array_equal(self._target_location, self._agent_location):
            self._target_location = self.np_random.integers(0, self.size, size=2, dtype=int)
        observation = self._get_obs()
        info = self._get_info()
        return observation, info
    
    def step(self, action):
        direction = self._action_to_direction[action]
        self._agent_location = np.clip(self._agent_location + direction, 0, self.size - 1)
        terminated = np.array_equal(self._agent_location, self._target_location)
        truncated = False
        reward = 1 if terminated else 0
        observation = self._get_obs()
        info = self._get_info()

        return observation, reward, terminated, truncated, info


#### Register the new env to call it using gym.make()

In [32]:
gym.register(id="gymnasium_env/GridWorld-v0", entry_point=GridWorldEnv)

  logger.warn(f"Overriding environment {new_spec.id} already in registry.")


#### Now build a simple tabular model based AI to play the game

In [33]:
class Agent:
    def __init__(self, env: gym.Env, gamma=0.95, eps = 0.1):
        self.env = env
        self.gamma = gamma
        self.eps = eps
        self.state_values = np.zeros(
            shape=(self.env.unwrapped.size, self.env.action_space.n),
            dtype=np.float32,
        )
    
    def choose_action(self, obs): # simple Value iteration technoque
        next_state_vals = []
        for action in range(self.env.action_space.n):
            if action == 0:
                next_state_vals.append(self.state_values[np.clip(obs + np.array([1, 0]), 0, self.env.size - 1)])
            elif action == 1:
                next_state_vals.append(self.state_values[np.clip(obs + np.array([0, 1]), 0, self.env.size - 1)])
            elif action == 2:
                next_state_vals.append(self.state_values[np.clip(obs + np.array([-1, 0]), 0, self.env.size - 1)])
            else:
                next_state_vals.append(self.state_values[np.clip(obs + np.array([0, -1]), 0, self.env.size - 1)])
        return np.argmax(next_state_vals) if np.random.random() > self.eps else np.random.choice([0, 1, 2, 3])
    
    def update_value(self, obs, reward, next_obs):
        self.state_values[obs] = reward + self.gamma * self.state_values[next_obs]


In [34]:
env = gym.make("gymnasium_env/GridWorld-v0", size=4)
agent = Agent(env)
NUM_EPISODES = 10

In [38]:
for episode in range(NUM_EPISODES):
    done = False
    obs, _ = env.reset()
    step_counter = 0
    while not done:
        step_counter += 1
        action = agent.choose_action(obs['agent'])
        next_obs, reward, term, trunc, _ = env.step(action)
        agent.update_value(obs['agent'], reward, next_obs['agent'])
        done = term or trunc
    print(f"Episode {episode + 1} finished in {step_counter} steps!")
    env.reset()


  logger.warn(


KeyError: 4