In [4]:
import numpy as np
import gym

# Build the FrozenLake environment that the rest of the cell works on.
env = gym.make('FrozenLake-v1')

# Cache the sizes of the discrete observation and action spaces.
num_states, num_actions = env.observation_space.n, env.action_space.n

def _action_values(transitions, state, V, gamma, n_actions):
    """Return the one-step lookahead value of every action in ``state``."""
    q = np.zeros(n_actions)
    for action in range(n_actions):
        # Each entry is a (probability, next_state, reward, done) tuple.
        # NOTE(review): the `done` flag is ignored, exactly as before; this is
        # harmless when terminal states only self-loop with zero reward
        # (FrozenLake appears to) -- confirm before reusing on other envs.
        for prob, next_state, reward, _done in transitions[state][action]:
            q[action] += prob * (reward + gamma * V[next_state])
    return q


def value_iteration(env, theta=0.0001, gamma=0.9):
    """Run value iteration on a tabular environment.

    The state and action counts are read from ``env`` itself rather than
    from module-level globals, so the function works for whichever
    environment is passed in.

    Args:
        env: Environment exposing ``observation_space.n``,
            ``action_space.n`` and a transition table ``P`` where
            ``P[s][a]`` is an iterable of
            ``(prob, next_state, reward, done)`` tuples.
        theta: Convergence threshold; sweeps stop once the largest
            per-state change in a sweep is below this value.
        gamma: Discount factor applied to the next state's value.

    Returns:
        A ``(V, policy)`` tuple: ``V`` is a 1-D array with one value per
        state, and ``policy`` is a ``(num_states, num_actions)`` array
        holding a 1 in the column of the greedy action for each state.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    # Prefer the unwrapped env for the transition table: wrappers returned by
    # `gym.make` are not guaranteed to forward `P` (presumably depends on the
    # installed gym version). Plain objects without `unwrapped` are used as-is.
    transitions = getattr(env, "unwrapped", env).P

    V = np.zeros(n_states)

    while True:
        delta = 0

        for s in range(n_states):
            old_value = V[s]
            # In-place update: later states in this sweep already see the
            # refreshed value of earlier ones.
            V[s] = np.max(_action_values(transitions, s, V, gamma, n_actions))
            delta = max(delta, np.abs(old_value - V[s]))

        if delta < theta:
            break

    # Greedy one-hot policy with respect to the converged values; ties go to
    # the lowest action index because that is what argmax returns.
    policy = np.zeros((n_states, n_actions))
    for s in range(n_states):
        greedy = np.argmax(_action_values(transitions, s, V, gamma, n_actions))
        policy[s, greedy] = 1

    return V, policy


# Solve the environment, then show the values on the 4x4 grid followed by
# the one-hot policy (one row per state, one column per action).
V, policy = value_iteration(env)

print("Final Value Function: ", V.reshape(4, 4), sep="\n")
print("\nFinal Policy: ", policy, sep="\n")

Final Value Function: 
[[0.06848032 0.06111567 0.07422254 0.05560469]
 [0.09153995 0.         0.11212558 0.        ]
 [0.14522151 0.24737863 0.29954442 0.        ]
 [0.         0.37986011 0.63898452 0.        ]]

Final Policy: 
[[1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]]


  and should_run_async(code)
  deprecation(
  deprecation(
