In [2]:
import gym
import numpy as np

# Create the FrozenLake environment
env = gym.make('FrozenLake-v1')

# Define the value function as a vector with size (num_states)
num_states = env.observation_space.n
V = np.zeros(num_states)
print("Initial Value Function")
print(V.reshape(4, 4))
print()

# Define the parameters of the value iteration algorithm
gamma = 0.99  # discount factor
epsilon = 1e-8  # convergence threshold
num_iterations = 100000  # maximum number of iterations

# Define a function to update the value of a state based on the Bellman equation
def bellman_update(V, state, gamma):
    action_values = np.zeros(env.action_space.n)
    for action in range(env.action_space.n):
        for prob, next_state, reward, done in env.P[state][action]:
            action_values[action] += prob * (reward + gamma * V[next_state])
    return np.max(action_values)

# Run the value iteration algorithm
for i in range(num_iterations):
    delta = 0
    for state in range(num_states):
        v = V[state]
        V[state] = bellman_update(V, state, gamma)
        delta = max(delta, abs(v - V[state]))
    if delta < epsilon:
        break

# Print the learned value function and optimal policy
print("Final Value function:")
print(V.reshape(4, 4))
optimal_policy = np.zeros(num_states, dtype=np.int)
for state in range(num_states):
    action_values = np.zeros(env.action_space.n)
    for action in range(env.action_space.n):
        for prob, next_state, reward, done in env.P[state][action]:
            action_values[action] += prob * (reward + gamma * V[next_state])
    optimal_policy[state] = np.argmax(action_values)
print("\nFinal Optimal policy:")
print(optimal_policy.reshape((4, 4)))

Initial Value Function
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]

Final Value function:
[[0.54202581 0.49880303 0.47069551 0.4568515 ]
 [0.55845085 0.         0.35834799 0.        ]
 [0.59179866 0.64307976 0.6152075  0.        ]
 [0.         0.7417204  0.86283741 0.        ]]

Final Optimal policy:
[[0 3 3 3]
 [0 0 0 0]
 [3 1 0 0]
 [0 2 1 0]]


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  optimal_policy = np.zeros(num_states, dtype=np.int)
