In [3]:
!pip install gymnasium
import numpy as np
import gymnasium as gym
import random
from collections import defaultdict
# Fix np.bool8 deprecation if needed
# Compatibility shim: when the installed NumPy has no `bool8` attribute,
# re-create it as an alias of `np.bool_`. This must run before the
# environment is created below.
# NOTE(review): presumably needed by the installed gym/gymnasium version --
# confirm it is still required.
if not hasattr(np, 'bool8'):
    np.bool8 = np.bool_
# Create environment
# Module-level FrozenLake environment shared by `random_policy` and the
# driver code below. `is_slippery=True` is passed straight to `gym.make`
# (per the Gymnasium docs this selects stochastic transitions).
env = gym.make("FrozenLake-v1", is_slippery=True)
# -------------------------------------
# Monte Carlo Prediction
# -------------------------------------
def mc_prediction(policy, env, episodes=5000, gamma=0.9):
    """Estimate the state-value function of ``policy`` by first-visit Monte Carlo.

    Each episode is generated by calling ``policy(state)`` to pick an action
    and ``env.step(action)`` to advance, until the environment reports
    ``terminated`` or ``truncated``. For every state in the episode, the
    discounted return following its *first* occurrence is averaged into the
    estimate for that state.

    Args:
        policy: Callable mapping a state to an action.
        env: Environment exposing ``reset() -> (state, info)`` and
            ``step(action) -> (state, reward, terminated, truncated, info)``.
            States must be hashable.
        episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards.

    Returns:
        A ``defaultdict(float)`` mapping each visited state to its estimated
        value. States never visited are absent (and read as ``0.0``).
    """
    value_table = defaultdict(float)
    visit_counts = defaultdict(int)
    for _ in range(episodes):
        # env.reset() returns a tuple: (observation, info)
        state, _info = env.reset()
        episode = []
        done = False
        while not done:
            action = policy(state)
            # env.step() returns (observation, reward, terminated, truncated, info)
            next_state, reward, terminated, truncated, _info = env.step(action)
            episode.append((state, reward))
            state = next_state
            # An episode ends on either a terminal state or a time-limit cut-off.
            done = terminated or truncated

        # Time step at which each state first appears in this episode.
        # Tracking "already seen" while scanning backwards would instead
        # select the LAST occurrence, which is not first-visit MC.
        first_visit = {}
        for t, (state_t, _) in enumerate(episode):
            first_visit.setdefault(state_t, t)

        # Walk backwards so the discounted return accumulates in one pass.
        G = 0.0
        for t in range(len(episode) - 1, -1, -1):
            state_t, reward_t = episode[t]
            G = gamma * G + reward_t
            if first_visit[state_t] == t:
                # Incremental mean: avoids storing and re-averaging every return.
                visit_counts[state_t] += 1
                value_table[state_t] += (G - value_table[state_t]) / visit_counts[state_t]
    return value_table
def random_policy(state):
    """Return an action sampled from the module-level ``env``'s action space.

    The ``state`` argument is accepted so the function fits the
    ``policy(state)`` call signature, but it is not consulted.
    """
    action_space = env.action_space
    return action_space.sample()
# Evaluate the random policy on the shared environment using the default
# episode count and discount factor of mc_prediction.
v = mc_prediction(random_policy, env)
print("Value function from Monte Carlo Prediction:")
# Only states that appear as keys in `v` are listed, in ascending order.
for s in sorted(v):
    print(f"State {s}: V = {v[s]:.4f}")
# -------------------------------------
# Monte Carlo Control (epsilon-greedy)
# -------------------------------------
def mc_control_epsilon_greedy(env, episodes=10000, gamma=0.9, epsilon=0.1):
    """Learn action values and a greedy policy by first-visit Monte Carlo control.

    Episodes are generated with an epsilon-greedy behaviour policy: with
    probability ``epsilon`` an action is sampled from ``env.action_space``,
    otherwise the action with the highest current Q-value is taken. After
    each episode, the discounted return following the *first* occurrence of
    every (state, action) pair is averaged into ``Q`` and the policy entry
    for that state is reset to the greedy action.

    Args:
        env: Environment exposing ``reset() -> (state, info)``,
            ``step(action) -> (state, reward, terminated, truncated, info)``
            and ``action_space`` with ``n`` and ``sample()``. States must be
            hashable.
        episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards.
        epsilon: Probability of taking a sampled (exploratory) action.

    Returns:
        A tuple ``(Q, policy)``. ``Q`` is a defaultdict mapping a state to a
        NumPy array of per-action value estimates (zeros by default).
        ``policy`` is a defaultdict mapping a state to its greedy action;
        looking up a state that was never updated yields a random action.
    """
    n_actions = env.action_space.n
    Q = defaultdict(lambda: np.zeros(n_actions))
    visit_counts = defaultdict(int)
    # Policy entries default to a random action until the state is updated.
    policy = defaultdict(lambda: np.random.choice(n_actions))
    for _ in range(episodes):
        # env.reset() returns a tuple: (observation, info)
        state, _info = env.reset()
        episode = []
        done = False
        while not done:
            if random.random() < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(Q[state])
            # env.step() returns (observation, reward, terminated, truncated, info)
            next_state, reward, terminated, truncated, _info = env.step(action)
            episode.append((state, action, reward))
            state = next_state
            # An episode ends on either a terminal state or a time-limit cut-off.
            done = terminated or truncated

        # Time step at which each (state, action) pair first appears.
        # Tracking "already seen" while scanning backwards would instead
        # select the LAST occurrence, which is not first-visit MC.
        first_visit = {}
        for t, (state_t, action_t, _) in enumerate(episode):
            first_visit.setdefault((state_t, action_t), t)

        # Walk backwards so the discounted return accumulates in one pass.
        G = 0.0
        for t in range(len(episode) - 1, -1, -1):
            state_t, action_t, reward_t = episode[t]
            G = gamma * G + reward_t
            pair = (state_t, action_t)
            if first_visit[pair] == t:
                # Incremental mean: avoids storing and re-averaging every return.
                visit_counts[pair] += 1
                q_row = Q[state_t]
                q_row[action_t] += (G - q_row[action_t]) / visit_counts[pair]
                # Keep the policy greedy with respect to the updated estimates.
                policy[state_t] = np.argmax(q_row)
    return Q, policy
# Train on the shared environment with the default hyperparameters.
Q, learned_policy = mc_control_epsilon_greedy(env)
print("\nLearned Policy (0=Left, 1=Down, 2=Right, 3=Up):")
# Print the policy for each state
# NOTE: learned_policy is a defaultdict whose default is a random action, so
# any state that was never updated during training prints a random action
# here rather than a learned one.
for s in range(env.observation_space.n):
    print(f"State {s}: Best Action = {learned_policy[s]}")

Value function from Monte Carlo Prediction:
State 0: V = 0.0044
State 1: V = 0.0035
State 2: V = 0.0077
State 3: V = 0.0006
State 4: V = 0.0061
State 6: V = 0.0205
State 8: V = 0.0166
State 9: V = 0.0477
State 10: V = 0.0789
State 13: V = 0.1359
State 14: V = 0.3506

Learned Policy (0=Left, 1=Down, 2=Right, 3=Up):
State 0: Best Action = 2
State 1: Best Action = 2
State 2: Best Action = 0
State 3: Best Action = 0
State 4: Best Action = 3
State 5: Best Action = 2
State 6: Best Action = 2
State 7: Best Action = 3
State 8: Best Action = 3
State 9: Best Action = 1
State 10: Best Action = 1
State 11: Best Action = 3
State 12: Best Action = 1
State 13: Best Action = 1
State 14: Best Action = 2
State 15: Best Action = 2
