In [1]:
import numpy as np
import gymnasium as gym

In [2]:
def monte_carlo_es(env, episodes=500, gamma=1.0):
    """Estimate action values with first-visit Monte Carlo under random behaviour.

    Every episode starts from the state returned by ``env.reset()`` and every
    action, including the first, is drawn uniformly at random. Once the episode
    has ended, the discounted return that follows the *first* occurrence of
    each (state, action) pair is averaged into ``Q``, and the policy entry for
    that state is set to the greedy action under the updated ``Q``.

    NOTE(review): only the action is randomised at the start of an episode; the
    start state is whatever ``reset()`` yields, and later actions do not follow
    the learned policy.

    Args:
        env: Gymnasium-style environment. ``reset()`` must return
            ``(state, info)`` and ``step(action)`` must return
            ``(next_state, reward, terminated, truncated, info)``. The table
            sizes ``nS`` and ``nA`` are read from ``env.unwrapped`` when that
            attribute exists, otherwise from ``env`` itself. States must be
            usable both as dictionary keys and as array indices.
        episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards.

    Returns:
        ``(Q, policy)`` where ``Q`` is a float array of shape ``(nS, nA)``
        holding the averaged first-visit returns, and ``policy`` is an integer
        array of length ``nS`` holding the greedy action per state (0 for
        states that were never updated).
    """
    # Sizes live on the raw environment; wrappers do not always forward them.
    base_env = getattr(env, "unwrapped", env)
    n_states, n_actions = base_env.nS, base_env.nA

    Q = np.zeros((n_states, n_actions))
    visit_counts = np.zeros((n_states, n_actions), dtype=int)
    policy = np.zeros(n_states, dtype=int)

    for _ in range(episodes):
        state, _info = env.reset()
        trajectory = []
        done = False

        while not done:
            action = np.random.choice(n_actions)
            next_state, reward, terminated, truncated, _info = env.step(action)
            trajectory.append((state, action, reward))
            # An episode ends on either signal; ignoring `truncated` would
            # spin forever on a time-limited environment.
            done = terminated or truncated
            state = next_state

        # Time step at which each (state, action) pair first appears.
        first_visit = {}
        for t, (s, a, _reward) in enumerate(trajectory):
            first_visit.setdefault((s, a), t)

        # Walk backwards so G accumulates the return following each step, but
        # only credit the step that is the pair's first occurrence.
        G = 0.0
        for t in range(len(trajectory) - 1, -1, -1):
            s, a, reward = trajectory[t]
            G = gamma * G + reward
            if first_visit[(s, a)] != t:
                continue
            visit_counts[s, a] += 1
            # Incremental mean: same average as storing every return.
            Q[s, a] += (G - Q[s, a]) / visit_counts[s, a]
            policy[s] = np.argmax(Q[s])

    return Q, policy

In [7]:
def on_policy_first_visit_mc(env, episodes=500, gamma=1.0, epsilon=0.1):
    """On-policy first-visit Monte Carlo control with an epsilon-soft policy.

    Episodes are generated by sampling actions from the current stochastic
    policy. After each episode, the discounted return that follows the *first*
    occurrence of every (state, action) pair is averaged into ``Q``, and the
    policy row for that state is reset so that the greedy action has
    probability ``1 - epsilon + epsilon / nA`` and every other action has
    probability ``epsilon / nA``.

    Args:
        env: Gymnasium-style environment. It is replaced by ``env.unwrapped``
            (when that attribute exists) before use, and must then expose
            ``nS`` and ``nA``. ``reset()`` must return ``(state, info)`` and
            ``step(action)`` must return
            ``(next_state, reward, terminated, truncated, info)``. States must
            be usable both as dictionary keys and as array indices.
        episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards.
        epsilon: Total probability mass spread uniformly over all actions.

    Returns:
        ``(Q, policy)`` where ``Q`` is a float array of shape ``(nS, nA)``
        holding the averaged first-visit returns, and ``policy`` is a float
        array of the same shape whose rows are action probabilities.
    """
    env = getattr(env, "unwrapped", env)  # nS / nA live on the raw environment
    n_states, n_actions = env.nS, env.nA

    explore_prob = epsilon / n_actions
    greedy_prob = 1 - epsilon + explore_prob

    Q = np.zeros((n_states, n_actions))
    visit_counts = np.zeros((n_states, n_actions), dtype=int)

    # Start from a valid epsilon-soft policy with a random preferred action
    # in every state.
    policy = np.full((n_states, n_actions), explore_prob)
    for s in range(n_states):
        policy[s, np.random.choice(n_actions)] = greedy_prob

    for _ in range(episodes):
        state, _info = env.reset()
        trajectory = []
        done = False

        while not done:
            action = np.random.choice(n_actions, p=policy[state])
            next_state, reward, terminated, truncated, _info = env.step(action)
            trajectory.append((state, action, reward))
            # Either signal ends the episode.
            done = terminated or truncated
            state = next_state

        # Time step at which each (state, action) pair first appears.
        first_visit = {}
        for t, (s, a, _reward) in enumerate(trajectory):
            first_visit.setdefault((s, a), t)

        # Walk backwards so G accumulates the return following each step, but
        # only credit the step that is the pair's first occurrence.
        G = 0.0
        for t in range(len(trajectory) - 1, -1, -1):
            s, a, reward = trajectory[t]
            G = gamma * G + reward
            if first_visit[(s, a)] != t:
                continue
            visit_counts[s, a] += 1
            # Incremental mean: same average as storing every return.
            Q[s, a] += (G - Q[s, a]) / visit_counts[s, a]

            # Re-derive the epsilon-soft row around the new greedy action.
            best_action = np.argmax(Q[s])
            policy[s] = explore_prob
            policy[s, best_action] = greedy_prob
            # Guard against floating-point drift so the row sums to 1 when it
            # is later passed as `p` to np.random.choice.
            policy[s] /= policy[s].sum()

    return Q, policy

In [4]:
# Build the environment and keep only the raw (unwrapped) instance, which is
# what the training functions read `nS` / `nA` from.
env = gym.make("CliffWalking-v0").unwrapped

In [5]:
# Train with the default episode count and discount factor.
(Q_es, policy_es) = monte_carlo_es(env=env)

In [8]:
# Train the epsilon-soft controller with its default hyperparameters.
(Q_mc, policy_mc) = on_policy_first_visit_mc(env=env)

In [9]:
# Show both learned policies, each under its own heading, one item per line.
print(
    "Monte Carlo ES Policy:",
    policy_es,
    "\nOn-Policy First-Visit MC Policy:",
    policy_mc,
    sep="\n",
)

Monte Carlo ES Policy:
[1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 2 0 0 0 1 1 1 1 2 1 1 1 2 0
 0 0 0 0 0 0 0 0 0 0 0]

On-Policy First-Visit MC Policy:
[[0.025 0.025 0.025 0.925]
 [0.025 0.025 0.925 0.025]
 [0.025 0.025 0.025 0.925]
 [0.025 0.925 0.025 0.025]
 [0.025 0.025 0.925 0.025]
 [0.025 0.025 0.925 0.025]
 [0.025 0.925 0.025 0.025]
 [0.025 0.925 0.025 0.025]
 [0.025 0.925 0.025 0.025]
 [0.025 0.925 0.025 0.025]
 [0.025 0.025 0.925 0.025]
 [0.025 0.025 0.025 0.925]
 [0.025 0.025 0.025 0.925]
 [0.925 0.025 0.025 0.025]
 [0.025 0.025 0.925 0.025]
 [0.025 0.925 0.025 0.025]
 [0.925 0.025 0.025 0.025]
 [0.025 0.925 0.025 0.025]
 [0.025 0.025 0.925 0.025]
 [0.025 0.925 0.025 0.025]
 [0.925 0.025 0.025 0.025]
 [0.925 0.025 0.025 0.025]
 [0.025 0.925 0.025 0.025]
 [0.025 0.025 0.925 0.025]
 [0.025 0.925 0.025 0.025]
 [0.025 0.925 0.025 0.025]
 [0.025 0.925 0.025 0.025]
 [0.925 0.025 0.025 0.025]
 [0.925 0.025 0.025 0.025]
 [0.925 0.025 0.025 0.025]
 [0.025 0.925 0.025 0.025]
 [0.925