In [2]:
import gym
import numpy as np

# Create the FrozenLake environment
env = gym.make('FrozenLake-v1')

# Seed every source of randomness so that Restart & Run All reproduces the same run
SEED = 42
np.random.seed(SEED)          # drives the np.random.rand() exploration draw in select_action
env.action_space.seed(SEED)   # drives env.action_space.sample()
try:
    env.reset(seed=SEED)      # newer gym: the environment RNG is seeded through reset()
except TypeError:
    env.seed(SEED)            # older gym: reset() accepts no seed, use the legacy env.seed()

# Define the parameters of the Monte Carlo Control algorithm
num_episodes = 10000
gamma = 0.9  # discount factor
epsilon = 0.2  # exploration probability (kept constant for the whole run)

# Define the Q-value function as a matrix with size (num_states, num_actions)
num_states = env.observation_space.n
num_actions = env.action_space.n
Q = np.zeros((num_states, num_actions))

# Define a dictionary mapping (state, action) -> list of observed returns
returns = {}

# Define a function to select an action based on the Q-value function and exploration-exploitation strategy
def select_action(state):
    """Pick an action for `state` with an epsilon-greedy rule.

    With probability `epsilon` an action is sampled from the environment's
    action space; otherwise the action with the largest Q-value for `state`
    is returned (on ties `np.argmax` yields the first maximal index).

    Reads the module-level `epsilon`, `env` and `Q`.
    """
    should_explore = np.random.rand() < epsilon
    if should_explore:
        return env.action_space.sample()

    greedy_action = np.argmax(Q[state])
    return greedy_action

# Main loop of first-visit Monte Carlo control with a fixed-epsilon greedy policy
for episode in range(num_episodes):
    # Start a new episode. Newer gym versions return (observation, info) from
    # reset(), older ones return the observation alone; accept both.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False
    episode_history = []

    # Generate an episode by following the current epsilon-greedy policy
    while not done:
        action = select_action(state)  # epsilon-greedy; epsilon is constant, not decayed
        step_result = env.step(action)
        if len(step_result) == 5:
            # Newer gym: (observation, reward, terminated, truncated, info)
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            # Older gym: (observation, reward, done, info)
            next_state, reward, done, _ = step_result
        episode_history.append((state, action, reward))
        state = next_state

    # Record the earliest time step at which each (state, action) pair occurs,
    # so the first-visit test below is a dictionary lookup instead of a rescan
    # of the episode prefix at every step.
    first_visit = {}
    for t, (visited_state, visited_action, _reward) in enumerate(episode_history):
        first_visit.setdefault((visited_state, visited_action), t)

    # Walk the episode backwards, accumulating the discounted return G
    G = 0
    for t in reversed(range(len(episode_history))):
        state, action, reward = episode_history[t]
        G = gamma * G + reward
        state_action = (state, action)
        # Only the first occurrence of a pair within the episode contributes
        if first_visit[state_action] == t:
            returns.setdefault(state_action, []).append(G)
            # Q is the sample mean of all first-visit returns seen so far
            Q[state, action] = np.mean(returns[state_action])

# Print the learned Q-value function and the greedy policy derived from it
print("Final Q-value function:")
print(Q)
optimal_policy = np.argmax(Q, axis=1)
print("\nFinal Optimal policy:")
# Lay the policy out as a square grid. The side length is derived from the
# number of states instead of being fixed at 4, so other square maps also
# display instead of raising on reshape.
grid_side = int(round(np.sqrt(optimal_policy.size)))
if grid_side * grid_side == optimal_policy.size:
    print(optimal_policy.reshape((grid_side, grid_side)))
else:
    print(optimal_policy)  # state count is not a perfect square: show the flat per-state policy

Final Q-value function:
[[3.33282228e-03 8.81938799e-03 5.42125199e-03 7.79768134e-03]
 [5.08353843e-03 1.26459114e-02 1.30918624e-02 1.71345062e-02]
 [3.22612498e-02 2.28074691e-02 2.74372501e-02 1.52626199e-02]
 [3.66241889e-03 1.39209264e-02 0.00000000e+00 0.00000000e+00]
 [2.88856700e-03 9.64342486e-03 3.49453659e-03 4.14207209e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [6.02427211e-02 4.73421498e-02 3.68228053e-02 3.17726057e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [5.28174880e-04 1.21500000e-02 2.29466923e-02 1.19656436e-02]
 [7.57967971e-03 1.13247307e-01 5.33740909e-02 6.21072210e-02]
 [1.87100526e-01 1.35244904e-01 1.75068311e-01 4.41399416e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.44657483e-02 0.00000000e+00 2.18544046e-01 0.00000000e+00]
 [2.22850691e-01 5.13161479e-01 5.44214900e-01 3.04992146e-01]
 [0.00000000e+00 0.00000000e+00