In [48]:
import gym
import numpy as np

In [49]:
# Build the FrozenLake-v1 environment that every later cell operates on.
env = gym.make('FrozenLake-v1')
# Start a fresh episode. The value displayed under this cell (0) is what
# reset() returned -- presumably the index of the start state.
env.reset()

0

In [50]:
print(env.action_space)
print(env.observation_space)
# Left disabled: both spaces print as Discrete (see the output below), and
# `.high` / `.low` belong to Box spaces in gym's API, not Discrete ones.
# print(env.observation_space.high)
# print(env.observation_space.low)

# env.P holds the transition model as a dictionary of lists, where
#     P[s][a] == [(probability, nextstate, reward, done), ...]
# Uncomment to inspect the transitions for state 0, action 0:
# print(env.P[0][0])

Discrete(4)
Discrete(16)


In [51]:
def policy_eval(policy, env, discount_factor = 1.0, tolerance = 0.00001):
    '''
    Iterative policy evaluation: estimate the state-value function of a
    fixed policy by sweeping the Bellman expectation backup until it settles.

    Input:
        policy: the policy which is evaluated.
                Indexed as policy[state][action] -> probability of taking
                `action` in `state` (e.g. a numpy array with shape
                [env.nS, env.nA]).
        env: the environment model.
             env.P[s][a] is a list of
                 (probability, next_state, reward, done)
             tuples describing the transitions out of state s under action a.
             env.nS is the number of states.
        discount_factor: weight applied to the value of the successor state
                         (1.0 means no discounting).
        tolerance: the sweep loop stops once the largest change of any state
                   value within one sweep is below this threshold.
    Output:
        value: a numpy array with shape [env.nS]

    Notes:
        * Values are updated in place during a sweep, so later states in the
          same sweep already see the refreshed values of earlier states.
        * A transition flagged `done` ends the episode, so nothing is
          bootstrapped from its successor state: only the immediate reward
          counts. Ignoring the flag would let whatever value accumulates on a
          terminal state's own entries in env.P leak back into its
          predecessors.
        * The loop has no iteration cap; it only returns once the tolerance
          criterion is met.
    '''

    numState = env.nS
    value = np.zeros(numState)
    while True:
        # delta records the largest change seen in this sweep
        delta = 0
        for state in range(numState):
            v = 0
            for action, prob_action in enumerate(policy[state]):
                for prob_state, next_state, reward, done in env.P[state][action]:
                    # No future return after a terminating transition.
                    if done:
                        future = 0.0
                    else:
                        future = discount_factor * value[next_state]
                    v += prob_action * prob_state * (reward + future)
            delta = max(delta, np.abs(v - value[state]))
            value[state] = v

        if delta < tolerance:
            break
    return value

In [52]:
# Uniform random policy: each of the env.nA actions is equally likely in every state.
n_states, n_actions = env.nS, env.nA
policy = np.full((n_states, n_actions), 1.0 / n_actions)

# State values of that policy under the default discount factor and tolerance.
value_table = policy_eval(policy, env)

In [53]:
print("The value function: \n")
# Lay the state values out as a 4x4 grid so they line up with the map
# rendered underneath.
print(value_table.reshape(4,4))
# Draw the map, then close the environment.
env.render()
env.close()

The value function: 

[[0.013911   0.01161424 0.02094062 0.01046758]
 [0.01623478 0.         0.04074774 0.        ]
 [0.03479961 0.08816698 0.14205099 0.        ]
 [0.         0.17581855 0.4392897  0.        ]]

[41mS[0mFFF
FHFH
FFFH
HFFG
