In [1]:
!pip install gymnasium[toy_text]==0.29.1

Collecting gymnasium[toy_text]==0.29.1
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium[toy_text]==0.29.1)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1


In [2]:
# coding: utf-8
"""Defines some frozen lake maps."""
import gymnasium as gym

from gymnasium.envs.registration import register

import numpy as np

np.set_printoptions(precision=3)

# De-register environments if there is a collision
env_dict = gym.envs.registration.registry.copy()
for env in env_dict:
    if "Deterministic-4x4-FrozenLake-v0" in env:
        del gym.envs.registration.registry[env]
    elif "Stochastic-4x4-FrozenLake-v0" in env:
        del gym.envs.registration.registry[env]


register(
    id="Deterministic-4x4-FrozenLake-v0",
    entry_point="gymnasium.envs.toy_text.frozen_lake:FrozenLakeEnv",
    kwargs={"map_name": "4x4", "is_slippery": False},
)

register(
    id="Stochastic-4x4-FrozenLake-v0",
    entry_point="gymnasium.envs.toy_text.frozen_lake:FrozenLakeEnv",
    kwargs={"map_name": "4x4", "is_slippery": True},
)


In [3]:

"""
parameters P, nS, nA, gamma are defined as follows:

	P: nested dictionary of a nested lists
		From gym.core.Environment
		For each pair of states in [1, nS] and actions in [1, nA], P[state][action] is a
		tuple of the form (probability, nextstate, reward, terminal) where
			- probability: float
				the probability of transitioning from "state" to "nextstate" with "action"
			- nextstate: int
				denotes the state we transition to (in range [0, nS - 1])
			- reward: int
				either 0 or 1, the reward for transitioning from "state" to
				"nextstate" with "action"
			- terminal: bool
			  True when "nextstate" is a terminal state (hole or goal), False otherwise
	nS: int
		number of states in the environment
	nA: int
		number of actions in the environment
	gamma: float
		Discount factor. Number in range [0, 1)
"""

def value_iteration(env, gamma=0.9, tol=1e-3):
    """
    Learn value function and policy by using value iteration method for a given
    gamma and environment.

    Parameters:
    ----------
    P, nS, nA, gamma:
      defined at beginning
    tol: float
      Terminate value iteration when
        max |V(s) - pre_V(s)| < tol
    Returns:
    ----------
    V: np.ndarray[nS]
    policy: np.ndarray[nS]
    """

    P, nS, nA = env.P, env.nS, env.nA
    V = np.ones(nS)
    policy = np.zeros(nS, dtype=int)
    iteration = 0

    while True:
        delta = 0
        for s in range(nS):
            v = V[s]
            q = np.zeros(nA)
            for a in range(nA):
                for prob, next_state, reward, done in P[s][a]:
                    q[a] += prob * (reward + gamma * V[next_state])
                V[s] = np.max(q)
                policy[s] = np.argmax(q)
            delta = max(delta, abs(v - V[s]))
        iteration += 1
        print(f"[Iteration {iteration}] Value function:")
        print(V.reshape(4, 4))

        if delta < tol:
            break

    return V, policy

In [4]:
# Make gym environment
env = gym.make("Deterministic-4x4-FrozenLake-v0", render_mode="rgb_array")

env.nS = env.nrow * env.ncol
env.nA = 4

print("\n" + "-" * 25 + "\nBeginning Value Iteration\n" + "-" * 25)

V_vi, p_vi = value_iteration(env, gamma=0.9, tol=1e-2)

print("Policy:")
print(p_vi.reshape(4, 4))


-------------------------
Beginning Value Iteration
-------------------------
[Iteration 1] Value function:
[[0.9 0.9 0.9 0.9]
 [0.9 0.9 0.9 0.9]
 [0.9 0.9 0.9 0.9]
 [0.9 0.9 1.9 0.9]]
[Iteration 2] Value function:
[[0.81 0.81 0.81 0.81]
 [0.81 0.81 0.81 0.81]
 [0.81 0.81 1.71 0.81]
 [0.81 1.71 1.81 0.81]]
[Iteration 3] Value function:
[[0.729 0.729 0.729 0.729]
 [0.729 0.729 1.539 0.729]
 [0.729 1.539 1.629 0.729]
 [0.729 1.629 1.729 0.729]]
[Iteration 4] Value function:
[[0.656 0.656 1.385 1.247]
 [0.656 0.656 1.466 0.656]
 [1.385 1.466 1.556 0.656]
 [0.656 1.556 1.656 0.656]]
[Iteration 5] Value function:
[[0.59  1.247 1.319 1.188]
 [1.247 0.59  1.4   0.59 ]
 [1.319 1.4   1.49  0.59 ]
 [0.59  1.49  1.59  0.59 ]]
[Iteration 6] Value function:
[[1.122 1.188 1.26  1.134]
 [1.188 0.531 1.341 0.531]
 [1.26  1.341 1.431 0.531]
 [0.531 1.431 1.531 0.531]]
[Iteration 7] Value function:
[[1.069 1.134 1.207 1.087]
 [1.134 0.478 1.288 0.478]
 [1.207 1.288 1.378 0.478]
 [0.478 1.378 1.478 0.47

  logger.warn(
  logger.warn(
  logger.warn(


'\nEnd of Implement\n'