In [30]:
import gym
import numpy as np

In [31]:
# Build the environment that every later cell reads through the global `env`.
# Disabled alternative environment id, kept for reference:
# env = gym.make('FrozenLake8x8-v1')
env = gym.make('FrozenLake-v1')

In [32]:
# Show the state/action counts next to the corresponding gym space objects,
# one value per line, in the order: nS, observation_space, nA, action_space.
for attr_name in ("nS", "observation_space", "nA", "action_space"):
    print(getattr(env, attr_name))

16
Discrete(16)
4
Discrete(4)


In [33]:
def policy(q_function, state, epsilon):
    """Choose an action for `state` with an epsilon-greedy rule.

    One uniform sample from [0, 1) is drawn. If it is strictly greater than
    `epsilon`, the greedy action (``np.argmax`` of ``q_function[state]``, so
    ties resolve to the lowest index) is returned. Otherwise an action index
    is drawn uniformly from ``range(len(q_function[state]))``.

    Args:
        q_function: indexable by state; each entry is a sequence of action values.
        state: index into `q_function`.
        epsilon: exploration threshold compared against the uniform sample.

    Returns:
        The selected action index.
    """
    action_values = q_function[state]
    greedy_action = np.argmax(action_values)

    # `not (u > epsilon)` rather than `u <= epsilon` keeps the exact branch
    # taken by the comparison for every epsilon value.
    explore = not (np.random.uniform() > epsilon)
    if explore:
        return np.random.choice(len(action_values))
    return greedy_action

In [52]:
def mc_greedy(env, q_function, policy, episodes = 100000, discount = 1.0, epsilon = 0.1, max_steps = 1000):
    """First-visit Monte Carlo control driven by the supplied behaviour policy.

    For each episode the agent acts with ``policy(q_function, state, epsilon)``,
    then every (state, action) pair seen in the episode has its Q-value reset
    to the running mean of the discounted returns that followed the pair's
    first occurrence in each episode.

    Args:
        env: environment exposing ``nS``, ``nA``, ``reset()`` returning the
            initial state, and ``step(action)`` returning a 4-tuple
            ``(next_state, reward, done, info)``.
        q_function: table indexed as ``q_function[state][action]``. It is
            updated IN PLACE (the policy reads the same table while learning).
        policy: callable ``(q_function, state, epsilon) -> action``.
        episodes: number of episodes to run (exactly this many).
        discount: discount factor applied per time step.
        epsilon: exploration parameter forwarded to `policy`.
        max_steps: cap on the number of steps in a single episode; an episode
            that hits the cap is still used for the update.

    Returns:
        The same `q_function` object that was passed in.
    """
    return_sum = np.zeros((env.nS, env.nA))
    return_count = np.zeros((env.nS, env.nA))

    for _ in range(episodes):
        # --- roll out one episode as (state, action, reward) triples ---
        record = []
        state = env.reset()
        for _step in range(max_steps):
            action = policy(q_function, state, epsilon)
            next_state, reward, done, _info = env.step(action)
            record.append((state, action, reward))
            if done:
                break
            state = next_state

        # --- first-visit returns in a single backward pass ---
        # Walking the episode from the end, G is the discounted return from
        # the current step onward. Each earlier occurrence of a pair
        # overwrites the later one, so the value left in the dict is the
        # return following the pair's FIRST occurrence.
        first_visit_return = {}
        G = 0.0
        for s, a, r in reversed(record):
            G = r + discount * G
            first_visit_return[(s, a)] = G

        # --- running-mean update of the visited pairs ---
        for (s, a), G_first in first_visit_return.items():
            return_sum[s][a] += G_first
            return_count[s][a] += 1
            q_function[s][a] = return_sum[s][a] / return_count[s][a]

    return q_function
        

In [53]:
# Seed NumPy's global RNG (used by the epsilon-greedy `policy`) and the
# environment before training so that re-running the notebook from a fresh
# kernel gives the same Q-table.
# NOTE(review): `env.seed` is the pre-0.26 gym API, matching the 4-tuple
# `env.step` return that `mc_greedy` unpacks — confirm against the installed gym.
SEED = 0
np.random.seed(SEED)
env.seed(SEED)

# Alternative random initialisation, kept for reference:
# q_function = np.random.rand(env.nS, env.nA)
q_function = np.zeros((env.nS, env.nA))

# mc_greedy updates `q_function` in place and returns it, so `q` and
# `q_function` refer to the same array afterwards.
q = mc_greedy(env, q_function, policy)

In [54]:
# Greedy state value: the largest action value in each row of the Q-table.
value = np.max(q, axis = 1)

# Derive the grid side from the number of states instead of hard-coding 4x4,
# so the report also works for other map sizes.
# Assumes the map is square — TODO confirm for custom maps.
grid_side = int(np.sqrt(env.nS))
grid_shape = (grid_side, grid_side)

print("Epsilon-Optimal Value Function: ")
print(value.reshape(grid_shape))

# Reset first so the rendered map shows the agent at the start position.
env.reset()
env.render()
env.close()

print("Chosen actions: ")
print(np.argmax(q, axis = 1).reshape(grid_shape))

print("Q-function: ")
print(q)

Epsilon-Optimal Value Function: 
[[0.27411473 0.24727211 0.21598855 0.20399476]
 [0.3194222  0.         0.23550197 0.        ]
 [0.39613182 0.49807827 0.50006256 0.        ]
 [0.         0.63120644 0.8056401  0.        ]]

[41mS[0mFFF
FHFH
FFFH
HFFG
Chosen actions: 
[[1 3 2 3]
 [0 0 2 0]
 [3 1 0 0]
 [0 2 1 0]]
Q-function: 
[[0.24847637 0.27411473 0.25037465 0.19388944]
 [0.15010267 0.12416851 0.11571906 0.24727211]
 [0.18763021 0.19638518 0.21598855 0.20766296]
 [0.12757202 0.13914623 0.12483574 0.20399476]
 [0.3194222  0.22462562 0.21558245 0.19521122]
 [0.         0.         0.         0.        ]
 [0.22210988 0.15005359 0.23550197 0.06142035]
 [0.         0.         0.         0.        ]
 [0.18863049 0.29013629 0.25761589 0.39613182]
 [0.31852103 0.49807827 0.35012837 0.27301255]
 [0.50006256 0.41800643 0.33029491 0.23419078]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.34244792 0.48062016 0.63120644 0.42169692]
 [0.62380952 0.