# Monte Carlo Methods

In [None]:
!pip install gymnasium
import os

import gymnasium as gym
import numpy as np
import plotly.express as px # Install with: conda install -c plotly plotly_express
from tqdm.notebook import tqdm

print(f"Gym version: {gym.__version__}")

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/953.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.5/953.9 kB[0m [31m8.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━[0m [32m819.2/953.9 kB[0m [31m12.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1
Gym version: 0.29.1


In [None]:
# Video management imports
import cv2

# Helper functions to save videos and images
def save_video(img_array, path='./video/test.mp4'):
  """Encode a sequence of RGB frames into a video file at 15 frames per second.

  Args:
    img_array: Non-empty sequence of RGB frames, each an array of shape
      (height, width, channels). The size of the first frame is used as the
      video size.
    path: Destination file. Missing parent directories are created.

  Raises:
    ValueError: If `img_array` is empty.
    RuntimeError: If OpenCV could not open a writer for `path`.
  """
  if len(img_array) == 0:
    raise ValueError('img_array must contain at least one frame.')

  # OpenCV does not create missing directories; without this the writer
  # would simply fail to open.
  out_dir = os.path.dirname(path)
  if out_dir:
    os.makedirs(out_dir, exist_ok=True)

  height, width, _ = img_array[0].shape
  out = cv2.VideoWriter(path, cv2.VideoWriter_fourcc(*'AVC1'), 15, (width, height))
  if not out.isOpened():
    # Report the failure instead of claiming the video was saved.
    raise RuntimeError(
        'Could not open a video writer for ' + repr(path) +
        " (is the 'AVC1' codec available?).")

  try:
    for frame in img_array:
      # Frames arrive as RGB, while OpenCV writes BGR.
      out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
  finally:
    # Always finalize the file, even if a frame could not be converted.
    out.release()
  print('Video saved.')

def save_images(img_array, path='./images'):
  """Write every RGB frame to `path` as img_0.jpg, img_1.jpg, ...

  Args:
    img_array: Iterable of RGB frames (arrays accepted by `cv2.cvtColor`).
    path: Target directory. It is created if it does not exist yet.

  Raises:
    IOError: If OpenCV reports that a frame could not be written.
  """
  # cv2.imwrite does not create directories, so make sure the target exists.
  if path:
    os.makedirs(path, exist_ok=True)

  for i, image in enumerate(img_array):
    # Frames arrive as RGB, while OpenCV writes BGR.
    bgr_img = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    target = os.path.join(path, 'img_' + str(i) + '.jpg')
    # imwrite signals failure through its return value rather than raising.
    if not cv2.imwrite(target, bgr_img):
      raise IOError('Could not write image ' + repr(target))

## Let it go, the FrozenLake environment

More information: [FrozenLake-v1 environment documentation](https://gymnasium.farama.org/environments/toy_text/frozen_lake/)


## Monte Carlo Agent

The MC update that is implemented in the `step(episode)` method to update the Q-values is:

$$ Q_{t+1}(s, a) = Q_t(s, a) + \frac{1}{N(s, a)}(G_t - Q_t(s, a))$$

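Where $G_t = R_{t+1} + \gamma G_{t+1}$ is the return written in its recursive form, and $N(s, a)$ is the number of times this (state, action) pair has been visited so far, across all episodes. The step size $\frac{1}{N(s, a)}$ therefore makes $Q(s, a)$ the running average of all the returns observed for that pair.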

In [None]:
visits = {}
class MonteCarloAgent:
    """Tabular every-visit Monte Carlo agent with an epsilon-greedy behaviour policy.

    Q-values are running averages of the returns observed for each
    (state, action) pair. The visit counts used for the averaging are stored
    on the agent itself, so every new agent starts from a clean slate.

    Attributes:
      num_states: Number of discrete states.
      num_actions: Number of discrete actions.
      q_values: Array of shape (num_states, num_actions) with the estimates.
      visit_counts: Array of shape (num_states, num_actions) with N(s, a).
      epsilon: Probability of picking a uniformly random action.
      gamma: Discount factor applied to future rewards.
    """

    def __init__(self, /, num_states, num_actions, epsilon=0.1, gamma=1.0):
        self.num_states = num_states
        self.num_actions = num_actions
        self.q_values = np.zeros((self.num_states, self.num_actions))
        # N(s, a): per-agent so that counts never leak between agents or runs.
        self.visit_counts = np.zeros((self.num_states, self.num_actions), dtype=np.int64)
        self.epsilon = epsilon
        self.gamma = gamma

    def choose_policy_action(self, observation):
        """Return the greedy action for `observation`, breaking ties at random."""
        return self.argmax(self.q_values[observation, :])

    def choose_e_greedy_action(self, observation):
        """Return a random action with probability epsilon, else the greedy one."""
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.num_actions)
        return self.choose_policy_action(observation)

    def argmax(self, np_array):
        """argmax method with random tie-breaking.

        Args:
          np_array: A numpy array.
        Returns:
          Index of one of the appearances of the highest value in the array.
        """
        tie_indices = np.flatnonzero(np_array == np_array.max())
        return np.random.choice(tie_indices)

    def step(self, episode):
        """Method to update the Q values.

        The episode is walked backwards so the return can be accumulated as
        G_t = R_{t+1} + gamma * G_{t+1}. Every occurrence of a (state, action)
        pair moves its Q-value towards the observed return with step size
        1 / N(s, a).

        Args:
          episode: A list of tuples of the form (observation, action, reward).
        """
        g = 0.0
        for observation, action, reward in reversed(episode):
            g = reward + self.gamma * g
            self.visit_counts[observation, action] += 1
            step_size = 1.0 / self.visit_counts[observation, action]
            self.q_values[observation, action] += step_size * (g - self.q_values[observation, action])

## Training

In [None]:
EPISODES = 250_000
MAX_STEPS_PER_EPISODE = 100  # safety cap on the length of a single episode
SEED = 42

# Seed NumPy (the agent draws its random numbers from np.random) so that a
# fresh Restart & Run All reproduces the same training run.
np.random.seed(SEED)

# Initialize the environment
env = gym.make('FrozenLake-v1', render_mode='rgb_array')
# Seed the environment's own random generator once; later resets are left
# unseeded so that every episode is different.
env.reset(seed=SEED)

# Initialize the agent
agent = MonteCarloAgent(num_states=env.observation_space.n, num_actions=env.action_space.n, epsilon=0.1, gamma=1.0)

# Number of times each state was reached after a step (the state returned by
# reset is only counted when the agent lands on it again).
visit_count = np.zeros(agent.num_states)

# Training loop
for episode_number in tqdm(range(EPISODES)):
    # Generate an episode
    episode = []
    done = False
    observation, _ = env.reset()
    # The first action of every episode is drawn uniformly at random.
    action = np.random.choice(range(agent.num_actions))
    while not done:
        new_observation, reward, terminated, truncated, _ = env.step(action)
        episode.append((observation, action, reward))
        observation = new_observation
        visit_count[observation] += 1

        done = terminated or truncated or len(episode) > MAX_STEPS_PER_EPISODE
        if not done:
            # Only pick a next action when there is a next step to take.
            action = agent.choose_e_greedy_action(observation)

    # Update the Q values
    agent.step(episode)

print('Done!')

  0%|          | 0/250000 [00:00<?, ?it/s]

Done!


## Testing the Agent

### Performance

In [None]:
# Lay the per-state counts out on the lake's own grid (states are numbered
# row by row), so that each heat-map cell corresponds to one tile.
grid_shape = (env.unwrapped.nrow, env.unwrapped.ncol)
fig = px.imshow(
    visit_count.reshape(grid_shape),
    labels=dict(x='Column', y='Row', color='Visits'),
    title='State visits during training',
)
fig.show()

### The Learned Policy

In [None]:
EVAL_EPISODES = 5
MAX_EVAL_STEPS = 100  # safety cap on the length of a single evaluation episode

# Roll out the greedy policy and collect one rendered frame per state.
images = []
for _ in range(EVAL_EPISODES):
    # reset per episode variables starting with the environment
    observation, _ = env.reset()
    # Record the starting position too, so each episode in the video begins
    # at the start tile instead of after the first move.
    images.append(env.render())
    steps = 0
    done = False
    while not done and steps <= MAX_EVAL_STEPS:
        action = agent.choose_policy_action(observation)
        observation, _, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        images.append(env.render())
        steps += 1

save_video(images, path='video/montecarlo.mp4')

Video saved.


# Exercises

1. When does the MC method do its updates?
2. Which estimate does the MC method compute from experience?
3. What would happen in environments where episodes with an infinite number of steps are possible?
4. How does $\epsilon$ affect the exploration?

## 1. MC methods perform updates at the end of each episode. The updates involve adjusting the value estimates for states or state-action pairs based on the returns observed during that episode.


## 2. The estimate calculated by MC methods is the expected return (or average return) for a particular state or state-action pair. This estimate is obtained by averaging the returns observed across multiple episodes.


## 3. MC methods need complete episodes, because the return $G_t$ is only known once the episode has finished. If an episode can last forever (for example, the agent gets stuck in a loop and never reaches the goal), the update is never performed. In practice, episodes are therefore truncated after a maximum number of steps, so the updates and estimates are based on the returns observed within the finite episodes that are actually experienced, and the estimates converge to more accurate values as more episodes are sampled.


## 4. The parameter ϵ is used in the ϵ-greedy exploration strategy:
- It determines the probability of selecting a random action (exploration) versus selecting the action with the highest estimated value (exploitation).
- A higher ϵ value increases exploration, as there is a greater chance of selecting a random action, allowing the agent to discover new states and potentially better actions.
- A lower ϵ value increases exploitation, as the agent is more likely to choose the action that currently appears to be the best based on its estimates.


# Appendix

## `np_array.max()`
Returns the highest value in the array.

In [None]:
# Example array whose largest value (9) appears twice.
test_array = np.array([0, 7, 3, 8, 4, 3, 9, 9, 2])
# .max() returns the value itself, not its position(s).
test_array.max()

9

## `np.flatnonzero(np_array)`

This method can be used to get the indices of the values in the array that are different from zero or zero-like (e.g. False).

In [None]:
# Integer example: the zeros sit at indices 0, 3 and 6.
test_array = np.array([0, 7, 4, 0, 7, 5, 0, 3])
# Returns the indices of every entry that is not zero.
np.flatnonzero(test_array)

array([1, 2, 4, 5, 7])

In [None]:
# Boolean example: False is treated like zero, so only the True positions are
# returned. The agent's argmax applies this to `np_array == np_array.max()`.
test_array = np.array([False, True, False, True, True, False])
np.flatnonzero(test_array)

array([1, 3, 4])

## `np.random.choice(np_array)`
Returns a random sample from the given array.

In [None]:
# Draws one element of the array at random; no seed is set in this cell, so
# the displayed value can differ between runs.
test_array = np.array([4, 7, 11, 1, 8])
np.random.choice(test_array)

11