Use the Cliff Walking Environment:
https://www.gymlibrary.dev/environments/toy_text/cliff_walking/
 
Learn the optimal policy using 500 episodes with each of the following methods:
Monte Carlo ES (Exploring Starts)
On-policy first-visit MC control (for Ɛ-soft policies), for Ɛ = 0.1
 
Compare and comment on both methods' performance in terms of the number of steps and the number of episodes needed to learn the optimal policy.


In [2]:
import numpy as np
import gymnasium as gym
from collections import defaultdict

def monte_carlo_es(env, num_episodes=500, gamma=1.0):
    """Estimate action values with first-visit Monte Carlo and act greedily.

    Each episode starts from ``env.reset()`` and chooses every action
    uniformly at random. After the episode ends, the return that follows the
    first occurrence of each (state, action) pair is appended to that pair's
    history, and ``Q`` is set to the mean of the history.

    NOTE(review): despite the name, nothing here randomises the start state
    or follows the greedy policy after the first action, so this is not the
    textbook "exploring starts" scheme -- confirm whether that is intended.

    Args:
        env: Gymnasium-style environment exposing ``action_space.n``,
            ``reset()`` returning ``(state, info)`` and ``step(action)``
            returning ``(next_state, reward, terminated, truncated, info)``.
            States are used as dictionary keys, so they must be hashable.
        num_episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards.

    Returns:
        A ``(policy, Q)`` tuple. ``Q`` maps each visited state to an array of
        action values; ``policy`` maps each of those states to the index of
        its highest-valued action.
    """
    n_actions = env.action_space.n
    Q = defaultdict(lambda: np.zeros(n_actions))
    returns = defaultdict(list)

    for _ in range(num_episodes):
        state, _ = env.reset()
        episode_log = []
        done = False

        while not done:
            action = np.random.choice(n_actions)
            next_state, reward, terminated, truncated, _ = env.step(action)
            episode_log.append((state, action, reward))
            state = next_state
            done = terminated or truncated

        # Earliest time step at which each (state, action) pair occurs.
        # Marking pairs as "seen" while walking the episode backwards would
        # select the LAST occurrence instead, which is not first-visit MC.
        first_visit = {}
        for t, (s, a, _) in enumerate(episode_log):
            first_visit.setdefault((s, a), t)

        # Accumulate the discounted return from the end of the episode so
        # that G always holds the return following time step t.
        G = 0.0
        for t in range(len(episode_log) - 1, -1, -1):
            s, a, r = episode_log[t]
            G = gamma * G + r
            if first_visit[(s, a)] == t:
                returns[(s, a)].append(G)
                Q[s][a] = np.mean(returns[(s, a)])

    policy = {s: np.argmax(q_values) for s, q_values in Q.items()}
    return policy, Q

def mc_control_epsilon_soft(env, num_episodes=500, gamma=1.0, epsilon=0.1):
    """Run on-policy first-visit Monte Carlo control with an epsilon-greedy policy.

    While generating an episode, a uniformly random action is taken with
    probability ``epsilon``; otherwise the action with the highest current
    ``Q`` value for the state is taken (``np.argmax`` resolves ties to the
    lowest action index). After the episode ends, the return that follows the
    first occurrence of each (state, action) pair is appended to that pair's
    history, and ``Q`` is set to the mean of the history.

    Args:
        env: Gymnasium-style environment exposing ``action_space.n``,
            ``reset()`` returning ``(state, info)`` and ``step(action)``
            returning ``(next_state, reward, terminated, truncated, info)``.
            States are used as dictionary keys, so they must be hashable.
        num_episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards.
        epsilon: Probability of taking a uniformly random action at each step.

    Returns:
        A ``(policy, Q)`` tuple. ``Q`` maps each visited state to an array of
        action values; ``policy`` maps each of those states to the index of
        its highest-valued action.
    """
    n_actions = env.action_space.n
    Q = defaultdict(lambda: np.zeros(n_actions))
    returns = defaultdict(list)

    for _ in range(num_episodes):
        state, _ = env.reset()
        episode_log = []
        done = False

        while not done:
            if np.random.rand() < epsilon:
                action = np.random.choice(n_actions)
            else:
                action = np.argmax(Q[state])

            next_state, reward, terminated, truncated, _ = env.step(action)
            episode_log.append((state, action, reward))
            state = next_state
            done = terminated or truncated

        # Earliest time step at which each (state, action) pair occurs.
        # Marking pairs as "seen" while walking the episode backwards would
        # select the LAST occurrence instead, which is not first-visit MC.
        first_visit = {}
        for t, (s, a, _) in enumerate(episode_log):
            first_visit.setdefault((s, a), t)

        # Accumulate the discounted return from the end of the episode so
        # that G always holds the return following time step t.
        G = 0.0
        for t in range(len(episode_log) - 1, -1, -1):
            s, a, r = episode_log[t]
            G = gamma * G + r
            if first_visit[(s, a)] == t:
                returns[(s, a)].append(G)
                Q[s][a] = np.mean(returns[(s, a)])

    policy = {s: np.argmax(q_values) for s, q_values in Q.items()}
    return policy, Q

# Build the Cliff Walking environment that both learners are trained on.
env = gym.make("CliffWalking-v0")

# Train each learner with its default settings, keeping the same order as
# before: the Monte Carlo ES learner first, then the epsilon-soft learner.
policy_es, Q_es = monte_carlo_es(env)
policy_mc, Q_mc = mc_control_epsilon_soft(env)

# Show the greedy policy learned by each method, heading first and the
# state -> action mapping on the following line.
print("Monte Carlo Exploring Starts Policy:", policy_es, sep="\n")
print("\nOn-policy MC Control (Ɛ-soft) Policy:", policy_mc, sep="\n")


Monte Carlo Exploring Starts Policy:
{35: 2, 23: 2, 22: 1, 21: 1, 20: 1, 19: 1, 18: 1, 30: 1, 31: 0, 29: 1, 17: 1, 7: 1, 6: 1, 8: 1, 16: 1, 28: 1, 27: 0, 15: 1, 3: 1, 2: 1, 1: 1, 0: 1, 12: 1, 24: 0, 13: 1, 36: 0, 26: 1, 25: 0, 14: 1, 4: 1, 5: 1, 34: 1, 10: 1, 11: 2, 9: 1, 32: 1, 33: 1}

On-policy MC Control (Ɛ-soft) Policy:
{36: 0, 24: 1, 12: 1, 1: 0, 13: 1, 0: 0, 3: 2, 2: 1, 4: 1, 15: 0, 14: 0, 16: 1, 5: 0, 6: 1, 18: 0, 7: 1, 8: 1, 9: 1, 21: 0, 10: 1, 22: 0, 34: 1, 11: 2, 20: 0, 17: 0, 23: 2, 19: 0, 32: 0, 30: 1, 28: 3, 27: 0, 25: 2, 33: 0, 29: 3, 26: 1, 35: 2, 31: 1}
