### Monte Carlo Predictions

***Simple Grid World***

**1. Environment Setup (Simple GridWorld or Custom)**

In [3]:
import numpy as np
import random

# A one-dimensional grid world: four cells numbered 0..3, with cell 3 terminal.
states = list(range(4))
terminal_states = [states[-1]]
actions = ['left', 'right']

# transitions[state][action] -> next state. Terminal state 3 has no entry,
# and 'left' is not available from state 0.
transitions = {
    0: dict(right=1),
    1: dict(right=2, left=0),
    2: dict(right=3, left=1),
}

# Reward keyed by (state, action); only stepping right from state 2 is listed.
rewards = {(2, 'right'): 1}


**2. Define a Policy $\pi$**

In [4]:
def policy(state):
  """Deterministic policy: always move 'right' when that action exists.

  Args:
    state: The state to act in.

  Returns:
    'right' if `transitions` lists a 'right' move for `state`; otherwise
    None (including for states that have no entry in `transitions`).
  """
  available_moves = transitions.get(state, {})
  if 'right' in available_moves:
    return 'right'
  return None


**3. Generate an Episode using $\pi$**

In [5]:
def generate_episode():
  """Roll out one episode from state 0 by following `policy`.

  Returns:
    A list of (state, action, reward) tuples, one per step taken, in the
    order visited. The terminal state that ends the episode is not recorded.
  """
  trajectory = []
  current = 0
  while True:
    if current in terminal_states:
      return trajectory
    chosen = policy(current)
    successor = transitions[current][chosen]
    # (state, action) pairs missing from `rewards` yield a reward of 0.
    step_reward = rewards.get((current, chosen), 0)
    trajectory.append((current, chosen, step_reward))
    current = successor


**4. Monte Carlo Prediction Algorithm**

In [None]:
def monte_carlo_prediction(num_episodes=1000, gamma=1.0):
    """Estimate state values with first-visit Monte Carlo prediction.

    Samples `num_episodes` episodes via `generate_episode()`. For each state,
    the return observed from its FIRST occurrence in an episode is recorded,
    and the value estimate is the mean of those recorded returns.

    Args:
        num_episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards. The default of 1.0
            gives the undiscounted return.

    Returns:
        Dict mapping every state in `states` to its estimated value. States
        that never appear in a sampled episode keep the initial value 0.
    """
    V = {s: 0 for s in states}
    Returns = {s: [] for s in states}

    for _ in range(num_episodes):
        episode = generate_episode()

        # Time step at which each state first occurs in this episode.
        first_visit = {}
        for t, (state, _action, _reward) in enumerate(episode):
            first_visit.setdefault(state, t)

        G = 0
        # Walk backwards so each return is built from the one after it.
        for t in reversed(range(len(episode))):
            state, _action, reward = episode[t]
            G = reward + gamma * G
            # Record only at the earliest occurrence. A "seen" set filled
            # during this backward walk would keep the LAST visit instead.
            if first_visit[state] == t:
                Returns[state].append(G)

    # Average once after sampling rather than after every append.
    for state, observed_returns in Returns.items():
        if observed_returns:
            V[state] = np.mean(observed_returns)
    return V


**5. Results**

In [7]:
# Estimate the value function from 1000 sampled episodes, then print one
# line per state, formatted to two decimal places.
V = monte_carlo_prediction(num_episodes=1000)
for s in V.keys():
    print(f"V({s}) = {V[s]:.2f}")


V(0) = 1.00
V(1) = 1.00
V(2) = 1.00
V(3) = 0.00
