In [20]:
import gym
import numpy as np

In [21]:
def max_action_value(env, value_function, state, discount):
    '''
    One-step lookahead (Bellman optimality backup) for a single state.

    For every action, sums prob * (reward + discount * V(next_state)) over
    the transitions listed in env.P[state][action], then picks the best.

    Parameters
    ----------
    env : object exposing ``nA`` (number of actions) and ``P``, where
        ``P[state][action]`` is an iterable of
        ``(probability, next_state, reward, done)`` tuples.
    value_function : array-like indexed by state, current value estimates.
    state : the state to back up.
    discount : discount factor applied to the next state's value.

    Returns
    -------
    (best_value, best_action) : the largest action value and the index of
        the action that attains it (first one on ties, as np.argmax does).
    '''
    action_values = np.zeros(env.nA)
    for action in range(env.nA):
        for prob, next_state, reward, done in env.P[state][action]:
            # A transition flagged `done` ends the episode, so there is no
            # future return to bootstrap from; only the reward counts.
            future_value = 0.0 if done else value_function[next_state]
            action_values[action] += prob * (reward + discount * future_value)
    best_action = np.argmax(action_values)
    return action_values[best_action], best_action

In [22]:
def value_iteration(env, discount=1, tolerance = 0.0001):
    '''
    Solve a tabular MDP with value iteration.

    Repeatedly sweeps over all states, replacing each state's value with the
    best one-step lookahead value, until the largest change in a sweep falls
    below ``tolerance``. Updates are applied in place, so states later in a
    sweep already see the new values of earlier states. A greedy policy is
    then read off the converged values.

    Parameters
    ----------
    env : object exposing ``nS`` (number of states), ``nA`` (number of
        actions) and the transition table ``P`` used by max_action_value.
    discount : discount factor passed through to the lookahead.
    tolerance : stop once no state's value changed by this much in a sweep.

    Returns
    -------
    policy : array of shape (nS, nA); each row is one-hot on the greedy action.
    value : array of shape (nS,) with the converged state values.
    '''
    # Start from an all-zero estimate. Zero is the conventional starting
    # point and keeps absorbing terminal states at value 0.
    value = np.zeros(env.nS)
    while True:
        delta = 0
        for state in range(env.nS):
            action_value, _ = max_action_value(env, value, state, discount)
            # Track the largest change in this sweep for the stopping test.
            delta = max(delta, np.abs(action_value - value[state]))
            value[state] = action_value
        if delta < tolerance:
            break

    # Extract the deterministic greedy policy from the converged values.
    policy = np.zeros((env.nS, env.nA))
    for state in range(env.nS):
        _, optimal_action = max_action_value(env, value, state, discount)
        policy[state, optimal_action] = 1.0

    return policy, value

In [23]:
# Create the FrozenLake environment and solve it with value iteration,
# using the function's default discount and tolerance.
env = gym.make('FrozenLake-v1')
policy, value = value_iteration(env)

In [24]:
# One-hot policy matrix: one row per state, one column per action.
print("The policy distribution: ")
print(policy)
# Reduce each row to its chosen action index and lay it out as a 4x4 grid.
print("The action in each state: ")
print(policy.argmax(axis=1).reshape(4, 4))
# Draw the environment, then close it.
env.render()
env.close()

The policy distribution: 
[[1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]]
The action in each state: 
[[0 3 3 3]
 [0 0 0 0]
 [3 1 0 0]
 [0 2 1 0]]

[41mS[0mFFF
FHFH
FFFH
HFFG


In [25]:
# State values arranged on the same 4x4 grid as the map.
print(value.reshape(4, 4))

[[0.82182145 0.82126109 0.82087163 0.82067347]
 [0.82199325 0.         0.52824715 0.        ]
 [0.82226231 0.82260733 0.76389785 0.        ]
 [0.         0.88171208 0.94085038 0.        ]]
