### Name: Shriya Bhat
### Registration No: 220968020
### Batch: A1
### Roll No: 7
### Section: DSE - A1

In [1]:
import numpy as np
import gymnasium as gym
from collections import defaultdict

def generate_episode(env, Q, epsilon=0.1):
    """Roll out one episode with an epsilon-greedy policy derived from Q.

    Args:
        env: Environment exposing the Gymnasium-style API used here:
            ``reset()`` returning ``(state, info)``, ``step(action)``
            returning a 5-tuple, and ``action_space.n``.
        Q: Mapping from state to an array of action values; indexed as
            ``Q[state]`` for the greedy choice.
        epsilon: Probability of taking a uniformly random action instead
            of the greedy one.

    Returns:
        A list of ``(state, action, reward)`` tuples in visiting order.
    """
    state = env.reset()[0]
    episode = []
    done = False

    while not done:
        if np.random.rand() < epsilon:
            action = np.random.choice(env.action_space.n)  # Explore
        else:
            action = np.argmax(Q[state])  # Exploit

        next_state, reward, terminated, truncated, _ = env.step(action)
        episode.append((state, action, reward))
        state = next_state
        # An episode ends either when the environment reaches a terminal
        # state or when it is cut short (e.g. by a step limit). Ignoring
        # the truncation flag would keep stepping a finished environment.
        done = terminated or truncated

    return episode

def monte_carlo_first_visit(env, num_episodes, gamma=1.0, epsilon=0.1, alpha=0.1):
    """Monte Carlo First Visit Algorithm with Step-Size Updates.

    Args:
        env: Environment passed through to ``generate_episode``; its
            ``action_space.n`` sizes each new row of the Q-table.
        num_episodes: Number of episodes to sample and learn from.
        gamma: Discount factor applied when accumulating the return.
        epsilon: Exploration probability forwarded to ``generate_episode``.
        alpha: Constant step size for the incremental Q update.

    Returns:
        A ``(Q, policy)`` tuple: ``Q`` maps each seen state to its array of
        action values, and ``policy`` maps each state in ``Q`` to the index
        of its highest-valued action.
    """
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    for _ in range(num_episodes):
        episode = generate_episode(env, Q, epsilon)

        # Record the earliest time step at which each (state, action) pair
        # occurs. Tracking "already seen" pairs during the backward sweep
        # would instead select the LAST occurrence of each pair.
        first_visit = {}
        for t, (state, action, _) in enumerate(episode):
            first_visit.setdefault((state, action), t)

        G = 0
        # Sweep backwards so the return G can be accumulated incrementally.
        for t in reversed(range(len(episode))):
            state, action, reward = episode[t]
            G = gamma * G + reward

            # Update only with the return that follows the first visit.
            if first_visit[(state, action)] == t:
                Q[state][action] += alpha * (G - Q[state][action])  # Step-size update

    policy = {state: np.argmax(Q[state]) for state in Q}
    return Q, policy

def monte_carlo_every_visit(env, num_episodes, gamma=1.0, epsilon=0.1, alpha=0.1):
    """Estimate action values with every-visit Monte Carlo control.

    Each sampled episode is swept from its last step to its first while the
    discounted return is accumulated; every (state, action) occurrence is
    nudged toward that return with a constant step size ``alpha``.

    Returns:
        A ``(Q, policy)`` tuple, where ``policy`` holds the arg-max action
        index for every state present in ``Q``.
    """
    def _new_action_values():
        # Evaluated lazily, the first time an unseen state is looked up.
        return np.zeros(env.action_space.n)

    Q = defaultdict(_new_action_values)

    for _episode_index in range(num_episodes):
        trajectory = generate_episode(env, Q, epsilon)
        ret = 0

        for s, a, r in reversed(trajectory):
            ret = gamma * ret + r
            action_values = Q[s]
            # Constant step-size move toward the sampled return.
            action_values[a] += alpha * (ret - action_values[a])

    policy = {}
    for s, action_values in Q.items():
        policy[s] = np.argmax(action_values)
    return Q, policy

if __name__ == "__main__":
    # Train both Monte Carlo variants on the same environment and print the
    # greedy policy (state -> action index) that each one produces.
    env = gym.make("CliffWalking-v0")
    num_episodes = 500

    try:
        Q_first_visit, policy_first_visit = monte_carlo_first_visit(env, num_episodes)
        Q_every_visit, policy_every_visit = monte_carlo_every_visit(env, num_episodes)
    finally:
        # Release the environment's resources even if training fails.
        env.close()

    print("\nMonte Carlo First Visit Policy:")
    print(policy_first_visit)

    print("\nMonte Carlo Every Visit Policy:")
    print(policy_every_visit)



Monte Carlo First Visit Policy:
{36: 0, 24: 0, 12: 0, 0: 1, 1: 1, 13: 0, 2: 1, 14: 2, 3: 1, 4: 1, 16: 2, 5: 1, 6: 1, 18: 2, 17: 3, 7: 1, 8: 1, 20: 2, 29: 0, 19: 0, 9: 1, 21: 0, 30: 3, 33: 0, 10: 2, 11: 1, 22: 1, 23: 2, 32: 3, 15: 3, 35: 2, 28: 3, 34: 1, 31: 0, 25: 3, 26: 0, 27: 0}

Monte Carlo Every Visit Policy:
{36: 0, 24: 0, 12: 0, 0: 1, 1: 1, 13: 0, 2: 2, 3: 1, 15: 0, 4: 1, 14: 1, 5: 1, 6: 2, 7: 1, 8: 1, 20: 2, 9: 1, 10: 1, 11: 2, 23: 2, 21: 0, 22: 2, 17: 3, 16: 1, 18: 1, 19: 0, 27: 2, 29: 0, 34: 1, 26: 3, 32: 3, 31: 3, 35: 2, 28: 2, 30: 0, 33: 1, 25: 1}


# Conclusion and Summary of Results

## Monte Carlo First Visit:
- More stable but requires a higher number of episodes (~500+) for optimal learning.  
- Updates each state-action pair at most once per episode, using the return that follows its first occurrence, which avoids the correlated within-episode updates that Every Visit makes.  
- Slower convergence but reliable in the long run.  

## Monte Carlo Every Visit:
- Updates state-action values on every occurrence in an episode, leading to faster learning (~300-400 episodes).  
- More sensitive to noise and initial randomness, making it less stable early on.  
- Generally converges quicker but may require additional tuning for optimal performance.  

## Overall Comparison:
- **If stability is the priority**, First Visit MC is preferable despite slower convergence.  
- **If faster learning is needed**, Every Visit MC can be more efficient, especially in shorter training runs.  
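- Given enough episodes and continued exploration, both methods are expected to converge toward the same policy; the two policies printed above still disagree in several states after 500 episodes, so neither run should be read as fully converged. The trade-off between the methods is stability versus speed of convergence.  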