#Environment Settings (do not change)

Please do **not** change this part.

In [None]:
# This will clean all variables
!pip install gymnasium
!pip install gymnasium[other]
!pip install gymnasium[toy-text]

%reset -f

import numpy as np

if not hasattr(np, 'bool8'):
    np.bool8 = np.bool_

from typing import List, Tuple
import gym

# Install necessary things to display videos
!apt-get install -y xvfb x11-utils
!pip install pyvirtualdisplay==0.2.*

from gym.wrappers.monitoring.video_recorder import VideoRecorder
from base64 import b64encode
from IPython.display import HTML
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

def render_mp4(videopath: str) -> str:
  """
  Gets a string containing an HTML <video> element that embeds a
  base64-encoded version of the MP4 video at the specified path.

  Args:
    videopath: Path of the MP4 file to embed.

  Returns:
    The HTML snippet, with the video inlined as a base64 data URI.

  Raises:
    OSError: If the file cannot be opened or read.
  """
  # Use a context manager so the file handle is closed even if reading fails
  with open(videopath, 'rb') as video_file:
    mp4 = video_file.read()
  base64_encoded_mp4 = b64encode(mp4).decode()
  return f'<video width=400 controls><source src="data:video/mp4;' \
         f'base64,{base64_encoded_mp4}" type="video/mp4"></video>'

Collecting gymnasium
  Using cached gymnasium-1.1.1-py3-none-any.whl.metadata (9.4 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Using cached Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Using cached gymnasium-1.1.1-py3-none-any.whl (965 kB)
Using cached Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-1.1.1
Collecting moviepy>=1.0.0 (from gymnasium[other])
  Using cached moviepy-2.1.2-py3-none-any.whl.metadata (6.9 kB)
Collecting opencv-python>=3.0 (from gymnasium[other])
  Using cached opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting imageio_ffmpeg>=0.2.0 (from moviepy>=1.0.0->gymnasium[other])
  Using cached imageio_ffmpeg-0.6.0-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting proglog<=1.0.0 (from moviepy>=1.0.0->gymnasium[other])
  Using cached prog

# Basics

Setup environment:

In [None]:
# Discount factor (in [0,1))
gamma = 0.95

# Simulation: number of episodes and maximum number of steps per episode
n_episodes = 200
max_length_episode = 100

# Decide whether to generate a video or not
generate_video = True # leave it to True (or delete the rendering)
video_name = 'FrozenLake' # prefix of the video file names
n_episodes_video = 2 # number of episodes (counted from the first one) that end up in the video

# Environment
env = gym.make('FrozenLake-v1', map_name="4x4", is_slippery=True, render_mode='rgb_array')

# The tutorial will be on FrozenLake, but feel free to play with other environments too
#env = gym.make('Taxi-v3', render_mode='rgb_array')
#env = gym.make('CliffWalking-v0', render_mode='rgb_array')

  deprecation(
  deprecation(


States and actions (for the environment FrozenLake, 4x4):

*   The state is the position on the 4x4 grid (i.e., between 0 and 15);
*   The action is left (0), down (1), right (2), up (3).




In [None]:
env.action_space

Discrete(4)

In [None]:
env.action_space.n

4

In [None]:
env.observation_space

Discrete(16)

In [None]:
env.observation_space.n

16

Transition probabilities: for instance, here are the possible transitions when action 1 is played at state 0.

In [None]:
env.P[0][1] # each entry is (probability of the transition, next state, reward of that transition, done); the last field can be ignored

[(0.3333333333333333, 0, 0.0, False),
 (0.3333333333333333, 4, 0.0, False),
 (0.3333333333333333, 1, 0.0, False)]

We start with the semantics: For us, a policy is a Python list of length $n_\mathrm{states}$, and each entry of this list is a vector of size $n_\mathrm{actions}$. To sample an action at a given state, which will be helpful later, the following function can be used.

*Note*: this is not the most efficient way, just for illustration purposes.

In [None]:
def sample_action(policy: List[np.ndarray], state: int) -> int:
  """Draw one action index at `state` according to the distribution `policy[state]`."""
  action_indices = np.arange(start=0, stop=env.action_space.n)
  drawn = np.random.choice(action_indices, size=1, p=policy[state])
  return drawn[0]

We start with a simple random policy. We will later experiment with other policies.

For us, a policy is a list of arrays. In particular:
- `pi` is a list whose length is the number of states;
- `pi[s]` is a numpy array whose length is the number of actions; it represents a probability distribution over the action space;
- `pi[s][a]` is the probability of playing action `a` when at state `s`.




In [None]:
# Uniform random policy: every state gets the same uniform distribution over the actions
uniform_probability = 1.0/env.action_space.n
pi_random = []
for s in range(env.observation_space.n):
  probability_actions = np.full(env.action_space.n, uniform_probability)
  pi_random.append(probability_actions)

Check your result by inspecting the probability distribution at the first state.  

In [None]:
pi_random[0]

array([0.25, 0.25, 0.25, 0.25])

Finally, we test our sampling method.

In [None]:
sample_action(policy=pi_random,
              state=1)

np.int64(0)

# Simulation environment
We now write a function that simulates a policy.

In [None]:
def simulate_environment(policy:List[np.ndarray], sim_video_name: str) -> float:
  """
  Roll out `policy` for `n_episodes` episodes and return the average
  discounted return per episode.

  Every episode lasts at most `max_length_episode` steps. When
  `generate_video` is set, the first `n_episodes_video` episodes are
  captured into the video file `sim_video_name`.
  """
  if generate_video:
    recorder = VideoRecorder(env, sim_video_name)

  cumulative_return = 0.0

  for episode in range(n_episodes):
    # Only the first n_episodes_video episodes are recorded
    record_episode = generate_video and episode < n_episodes_video

    current_state = env.reset()
    if record_episode:
      recorder.capture_frame()

    episode_return = 0.0
    for step in range(max_length_episode):
      chosen_action = sample_action(policy=policy, state=current_state)
      current_state, step_reward, finished, _ = env.step(chosen_action)
      if record_episode:
        recorder.capture_frame()

      # Discount the reward by the time step at which it was collected
      episode_return += gamma**step * step_reward

      if finished:
        break

    cumulative_return += episode_return

  env.close()
  if generate_video:
    recorder.close()

  return cumulative_return/n_episodes

We can now simulate our random policy. The simulation parameters are listed at the top of the file.

In [None]:
# Simulate the random policy and report the average discounted return per episode
average_reward_random = simulate_environment(policy=pi_random,
                                             sim_video_name=video_name + '_random_policy.mp4')
print('Average reward: ' + str(average_reward_random))

  logger.deprecation(
  from pkg_resources import resource_stream, resource_exists
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)


Average reward: 0.015904804691404775


Display video:

In [None]:
HTML(render_mp4(video_name +  '_random_policy.mp4'))

# Value function of a policy

Now, we can write a function that computes the value/cost-to-go of a given policy. We will do it in three ways:

1.   Solving the (linear) Bellman equation:
$$V^\pi(x)=\sum_{a}\pi(x,a)\left(R_x^a+\gamma\sum_{x'}P_{xx'}^aV^\pi(x')\right);$$
2.   Using the contraction property of the Bellman operator (see more next time): Update $V_t^\pi$ to $V_{t+1}^\pi$ via
$$V_{t+1}^\pi(x)=\sum_a\pi(x,a)\left(R_x^a+\gamma\sum_{x'}P_{xx'}^aV_t^\pi(x')\right);$$
note that you also need to define an initial condition and an adequate stopping criterion;
3.   Via numerical simulations.

*Note*: This is not the optimal value/cost-to-go, but just the reward/cost incurred when using the policy $\pi$. Next time, we will look into methods to compute the optimal policy $\pi^\ast$.

First, though, we write a function that computes the expected reward $R_x^a$ and the probability vector $P_{xx'}^a$ when playing action $a$ at state $x$.






In [None]:
def get_reward_probability_vector_state_action(state: int, action: int) -> Tuple[float, np.ndarray]:
  """
  Return the expected immediate reward and the distribution over next
  states when `action` is played at `state`, as read from the model `env.P`.
  """
  transition_probs = np.zeros(env.observation_space.n)
  mean_reward = 0.0

  # Each transition starts with (probability, next state, reward)
  for transition in env.P[state][action]:
    prob, successor, transition_reward = transition[0], transition[1], transition[2]
    mean_reward += prob*transition_reward
    # Accumulate, since several transitions may lead to the same next state
    transition_probs[successor] += prob

  return mean_reward, transition_probs

We can now start with 1. We split the task in two pieces:


1.   Implement a function `get_reward_probability_matrix` that outputs the reward vector (numpy array whose dimension is the number of states) whose entry $x$ is
$$\sum_{a}\pi(x,a)R_x^a$$
and the probability matrix (two-dimensional numpy array whose dimension is the number of states) whose entry $(x,x')$ is
$$\sum_{a}\pi(x,a)P_{xx'}^a.$$
2.   Use these two quantities to solve the linear equation. The output should be $V(x)$ as a numpy array (whose dimension is the number of states).

*Hint:* Use the function `get_reward_probability_vector_state_action` you wrote above.


In [None]:
def get_reward_probability_matrix(policy: List[np.ndarray]) -> Tuple[np.ndarray, np.ndarray]:
  """
  Average the model over the actions chosen by `policy`.

  Returns the reward vector with entries sum_a pi(x,a) R_x^a and the
  transition matrix with entries sum_a pi(x,a) P_{xx'}^a.
  """
  n_states = env.observation_space.n
  n_actions = env.action_space.n
  reward_vector = np.zeros(n_states)
  transition_matrix = np.zeros((n_states, n_states))

  for x in range(n_states):
    for a in range(n_actions):
      # Weight of this action under the policy
      weight = policy[x][a]
      reward_xa, next_state_probs = get_reward_probability_vector_state_action(state=x, action=a)
      reward_vector[x] += weight*reward_xa
      transition_matrix[x, :] += weight*next_state_probs
  return reward_vector, transition_matrix

def compute_value_policy(policy: List[np.ndarray]) -> np.ndarray:
  """Return the value function of `policy` by solving the linear system (I - gamma*P) V = R."""
  reward_vector, transition_matrix = get_reward_probability_matrix(policy=policy)
  identity = np.eye(env.observation_space.n)
  system_matrix = identity - gamma*transition_matrix
  return np.linalg.solve(system_matrix, reward_vector)

# Value function of the random policy, obtained from the linear Bellman equation
value_random_policy = compute_value_policy(pi_random)
print(value_random_policy)

[ 7.76738424e-03  6.86813641e-03  1.42829484e-02  6.46133382e-03
  1.03018709e-02 -9.88877520e-17  3.25263116e-02 -2.00772939e-16
  2.53070433e-02  7.09470575e-02  1.22669943e-01  6.53039237e-16
  3.05773300e-16  1.50747467e-01  4.13031652e-01  0.00000000e+00]


We now use 2.:  

In [None]:
def compute_value_policy_iterative(policy: List[np.ndarray], max_iterations: int = 1000, tol: float = 1e-6) -> np.ndarray:
  """
  Compute the value function of `policy` by iterating the Bellman update
  V_{t+1}(x) = sum_a pi(x,a) (R_x^a + gamma * sum_x' P_{xx'}^a V_t(x')),
  starting from V_0 = 0.

  Args:
    policy: List (one entry per state) of probability vectors over the actions.
    max_iterations: Maximum number of sweeps over the state space.
    tol: The iteration stops as soon as the sup-norm distance between two
      consecutive iterates is at most `tol`.

  Returns:
    The last computed iterate, as an array with one entry per state.
  """
  n_states = env.observation_space.n
  n_actions = env.action_space.n

  # The model does not change between sweeps, so it is queried only once
  model = []
  for state in range(n_states):
    model.append([get_reward_probability_vector_state_action(state=state, action=action)
                  for action in range(n_actions)])

  value_pi = np.zeros(n_states)
  # Defined up front so that an array is returned even if max_iterations is 0
  value_pi_new = value_pi
  for t in range(max_iterations):
    value_pi_new = np.zeros(n_states)
    for state in range(n_states):
      for action in range(n_actions):
        expected_reward_action, probability_vector = model[state][action]
        value_pi_new[state] += policy[state][action]*(expected_reward_action + gamma*np.dot(probability_vector, value_pi))
    # Check if to stop
    if np.max(np.abs(value_pi - value_pi_new)) <= tol:
      print('The iteration converged at t=' + str(t+1) + '.\n')
      break
    value_pi = value_pi_new.copy()
  return value_pi_new

# Value function of the random policy, obtained with the iterative scheme
value_iterative_random_policy = compute_value_policy_iterative(pi_random)
print(value_iterative_random_policy)

The iteration converged at t=40.

[0.00776406 0.00686578 0.01428087 0.00645973 0.01029992 0.
 0.03252549 0.         0.02530589 0.07094636 0.12266933 0.
 0.         0.15074695 0.41303116 0.        ]


Finally, we look at the value at the first cell and check the result numerically (using the simulation done above):

In [None]:
# Print the simulated average discounted return (every episode starts with env.reset()),
# to be compared with the computed value at the starting cell
# (note: for other environments the starting cell might not be the first cell)
print(average_reward_random)

0.015904804691404775


# Policy Iteration

Write a function that computes the greedy policy. It consists of two ingredients:

*   A function `compute_bellman_operator` that evaluates
$$\max_{u\in U}\left(R_x^u + \gamma\sum_{x'} P_{xx'}^u V(x')\right)$$
for a given state $x$. The expression inside the maximum is the Q function evaluated at $x$ and $u$.

*   A second function `compute_greedy_policy` that computes
$$\arg\max_{\pi}\sum_{u}\pi(x,u)\left(R_x^u + \gamma\sum_{x'} P_{xx'}^u V(x')\right)$$
Here, note that the maximum is always attained by a deterministic policy.

*Hints:*
- You should reuse the functions coded last time (see code above).
- You may find the functions `np.max()` and `np.argmax()` useful.
- Recall that a policy is a list of numpy arrays (see above). Stick to this formalism for this exercise as well.

In [None]:
def compute_bellman_operator(state: int, value_function: np.ndarray) -> Tuple[float, int]:
  """
  Evaluate max_u (R_x^u + gamma * sum_x' P_{xx'}^u V(x')) at `state`.

  Returns the maximal value together with the index of a maximizing action.
  """
  n_actions = env.action_space.n
  q_values = np.zeros(n_actions)
  for u in range(n_actions):
    reward_u, next_state_probs = get_reward_probability_vector_state_action(state=state, action=u)
    q_values[u] = reward_u + gamma*np.dot(next_state_probs, value_function)
  # We deal with rewards, hence the maximum
  best_value = np.max(q_values)
  best_action = np.argmax(q_values)
  return best_value, best_action

def compute_greedy_policy(value_function: np.ndarray) -> List[np.ndarray]:
  """
  Return the policy that is greedy with respect to `value_function`.

  The result follows the usual policy format: one probability vector per
  state, here with all the mass on the maximizing action.
  """
  greedy = []
  for x in range(env.observation_space.n):
    best_action = compute_bellman_operator(state=x, value_function=value_function)[1]
    # Deterministic choice written as a probability vector
    one_hot = np.zeros(env.action_space.n)
    one_hot[best_action] = 1.0
    greedy.append(one_hot)
  return greedy

We can now implement policy iteration. We initialize the algorithm with the random policy and stop at convergence (or when a given number of iterations is reached).

In [None]:
# Policy iteration: alternate between policy improvement (greedy policy with
# respect to the current value) and policy evaluation (linear solve).

# Maximum number of iterations
max_number_iterations = 100 # will converge in finitely many steps anyway
tol = 1e-5 # tolerance on the sup-norm distance between consecutive value functions

# Initialize with random policy
pi = pi_random.copy() # shallow copy of the list
value = compute_value_policy(policy=pi)

for t in range(max_number_iterations):
  # Improvement step followed by evaluation of the improved policy
  pi_new = compute_greedy_policy(value_function=value)
  value_new = compute_value_policy(policy=pi_new)
  # Check if converged (we compare the value since the pi^ast might not be unique)
  if np.max(np.abs(value  - value_new)) <= tol:
    print('Policy iteration converged at t=' + str(t+1) + '.\n')
    # Leaving here keeps pi/value at the previous iterate, whose value is within tol of value_new
    break
  # Update policy
  pi = pi_new.copy()
  value = value_new.copy()

# Final result
pi_policy_iteration = pi
value_policy_iteration = compute_value_policy(policy=pi_policy_iteration)

# Print value at the starting cell
print(value_policy_iteration[0])

Policy iteration converged at t=2.

0.18047157839720157


Simulate policy:

In [None]:
# Simulate the policy found by policy iteration and report the average discounted return per episode
average_reward_policy_iteration = simulate_environment(policy=pi_policy_iteration,
                                                       sim_video_name=video_name + '_policy_iteration.mp4')
print('Average reward: ' + str(average_reward_policy_iteration))

  logger.deprecation(


Average reward: 0.1840808029549534


Display video:

In [None]:
HTML(render_mp4(video_name + '_policy_iteration.mp4'))

# Value Iteration

We now perform value iteration. We run the algorithm for at most a given maximum number of iterations, and we stop when the difference between the value functions of consecutive steps (measured via $\|\cdot\|_\infty$) is smaller than some given tolerance.

*Hints:* You should reuse the functions coded last time (see code above).

In [None]:
# Value iteration: repeatedly apply the Bellman operator to the value function.

# Maximum number of iterations
max_number_iterations = 1000
tol = 1e-5 # tolerance on the sup-norm distance between consecutive value functions

# Initial guess for the value function
value = np.zeros(env.observation_space.n)

for t in range(max_number_iterations):
  # One sweep: apply the Bellman operator at every state, using the previous iterate
  value_new = np.zeros(env.observation_space.n)
  for state in range(env.observation_space.n):
    value_new[state], _ = compute_bellman_operator(state=state,
                                                   value_function=value)
  # Check if to stop
  if np.max(np.abs(value - value_new)) <= tol:
    print('Value iteration converged at t=' + str(t+1) + '.\n')
    break
  value = value_new.copy()

# Final result: the latest iterate and the policy that is greedy with respect to it
value_value_iteration = value_new
pi_value_iteration = compute_greedy_policy(value_function=value_value_iteration)

# Print value at the starting cell
print(value_value_iteration[0])

Value iteration converged at t=100.

0.18035744556439995


Simulate policy

In [None]:
# Simulate the policy found by value iteration and report the average discounted return per episode
average_reward_value_iteration = simulate_environment(policy=pi_value_iteration,
                                                      sim_video_name=video_name + '_value_iteration.mp4')
print('Average reward: ' + str(average_reward_value_iteration))

  logger.deprecation(


Average reward: 0.19477372778548158


Display video

In [None]:
HTML(render_mp4(video_name + '_value_iteration.mp4'))

  and should_run_async(code)


# Monte-Carlo learning

We start with some basic functions. Our first function should output the value function given the Q function and current policy.

*Note:* This is simply consistency between value and Q functions.

In [None]:
def value_function_from_Q_function(Q_function: np.ndarray, policy: List[np.ndarray]) -> np.ndarray:
  """
  Compute the value function that is consistent with `Q_function` under
  `policy`, i.e., V(s) = sum_a pi(s,a) Q(s,a).

  Args:
    Q_function: Array of shape (number of states, number of actions).
    policy: List (one entry per state) of probability vectors over the actions.

  Returns:
    Array with one entry per state.
  """
  # Stack the per-state distributions into a (states x actions) matrix
  policy_matrix = np.asarray(policy, dtype=float)
  q_matrix = np.asarray(Q_function, dtype=float)
  # Expectation of Q over the actions, state by state
  return np.sum(policy_matrix*q_matrix, axis=1)

Second, we write a function that computes the greedy policy given the Q function.

*Note:* Use the function `compute_greedy_policy` from above as well as the consistency between value function and Q function in `value_function_from_Q_function`.

In [None]:
def compute_greedy_policy_from_Q_function(Q_function: np.ndarray, policy: List[np.ndarray]) -> List[np.ndarray]:
  """
  Compute the greedy policy associated with `Q_function` and the current
  `policy`.

  The value function is first recovered from the Q function via
  V(s) = sum_a pi(s,a) Q(s,a); the greedy policy with respect to that
  value function is then obtained with `compute_greedy_policy`.

  Args:
    Q_function: Array of shape (number of states, number of actions).
    policy: List (one entry per state) of probability vectors over the actions.

  Returns:
    The greedy policy, in the usual format (one probability vector per state).
  """
  # Consistency between value function and Q function
  policy_matrix = np.asarray(policy, dtype=float)
  q_matrix = np.asarray(Q_function, dtype=float)
  value_function = np.sum(policy_matrix*q_matrix, axis=1)
  return compute_greedy_policy(value_function=value_function)

Third, we now implement a function that samples an action with some exploration. This is the equivalent of the $\varepsilon$-greedy policy.

In [None]:
def sample_action_eps_greedy(policy: List[np.ndarray], state: int, eps: float) -> int:
  """
  Sample an action at `state` with eps-greedy exploration.

  With probability `eps` the action is drawn uniformly at random among all
  actions (exploration); otherwise it is drawn from `policy[state]`.

  Args:
    policy: List (one entry per state) of probability vectors over the actions.
    state: Index of the current state.
    eps: Exploration probability, in [0, 1]; with 0 the policy is always followed.

  Returns:
    The index of the sampled action.
  """
  n_actions = len(policy[state])
  # np.random.random() lies in [0, 1), so eps=0 never explores and eps=1 always does
  if np.random.random() < eps:
    return int(np.random.randint(n_actions))
  return int(np.random.choice(n_actions, p=policy[state]))

Fourth, we estimate the Q function from experiments.

*Note:* This function will work only if all state-action pairs are visited. In FrozenLake, we will never visit the state-action pairs of the cells corresponding to the water (the holes), since the experiment is terminated as soon as we get there, so be careful there.

In [None]:
def compute_Q_function_from_experiments(n_experiments: int, policy: List[np.ndarray], eps: float) -> np.ndarray:
  """
  Estimate the Q function of `policy` from simulated episodes (every-visit
  Monte-Carlo).

  For each visit of a state-action pair, the discounted sum of the rewards
  collected from that step until the end of the episode is recorded; the
  estimate of Q(s,a) is the average of these sums over all visits of (s,a).

  Args:
    n_experiments: Number of episodes to simulate.
    policy: List (one entry per state) of probability vectors over the actions.
    eps: Exploration parameter passed to `sample_action_eps_greedy`.

  Returns:
    Array of shape (number of states, number of actions). Entries of
    state-action pairs that were never visited are set to 0.
  """
  sum_future_rewards = np.zeros((env.observation_space.n, env.action_space.n)) # sum of all future rewards across all experiments (will then normalize to get the Q function)
  count_visited = np.zeros((env.observation_space.n, env.action_space.n)) # number of times a state-action pair is visited, across all experiments
  for _ in range(n_experiments):
    # Create vectors to save state, action, and reward (this will reset for a new experiment)
    state_vec, action_vec, reward_vec = [], [], []

    # Reset
    observation = env.reset()

    # Simulate an episode
    for t in range(max_length_episode):
      action = sample_action_eps_greedy(policy=policy,
                                        state=observation,
                                        eps=eps)
      observation_new, reward, done, _ = env.step(action)

      # Collect data
      state_vec.append(observation)
      action_vec.append(action)
      reward_vec.append(reward)

      # Update
      observation = observation_new

      if done:
        break

    # Update sum_future_rewards and count_visited.
    # Walking the episode backwards gives the discounted future reward of
    # each step through the recursion G_t = r_t + gamma*G_{t+1}.
    future_reward = 0.0
    for state, action, reward in zip(reversed(state_vec), reversed(action_vec), reversed(reward_vec)):
      future_reward = reward + gamma*future_reward
      sum_future_rewards[state, action] += future_reward
      count_visited[state, action] += 1
  env.close()

  # Get Q_function - we need to be careful with zero elements:
  # pairs that were never visited keep the value 0 instead of producing 0/0
  Q_function = np.zeros_like(sum_future_rewards)
  np.divide(sum_future_rewards, count_visited, out=Q_function, where=count_visited > 0)

  return Q_function

With all these ingredients, we can now run some model-free reinforcement learning.

In [None]:
# Monte-Carlo learning loop: alternate between estimating the Q function from
# simulated episodes and making the policy greedy with respect to that estimate.

# Parameters
number_iterations = 10 # number of iterations of model-free RL
n_experiments = 10 # number of experiments used to estimate the Q function at each iteration
exploration_eps = 0.05 # eps for the greedy strategy

# Initialize with random policy
pi = pi_random.copy() # shallow copy of the list
Q_function = compute_Q_function_from_experiments(n_experiments=n_experiments,
                                                 policy=pi,
                                                 eps=exploration_eps)

for t in range(number_iterations):
  # Greedy policy
  pi_new = compute_greedy_policy_from_Q_function(Q_function=Q_function,
                                                 policy=pi)
  # Estimate Q-function, note that we have some exploration here
  Q_function_new = compute_Q_function_from_experiments(n_experiments=n_experiments,
                                                       policy=pi_new,
                                                       eps=exploration_eps)
  # Update policy
  pi = pi_new.copy()
  Q_function = Q_function_new.copy()

# Final result: re-estimate the Q function of the final policy and derive its value function
pi_rl = pi
Q_function_rl = compute_Q_function_from_experiments(n_experiments=n_experiments,
                                                    policy=pi_rl,
                                                    eps=0.0) # no exploration here
value_rl = value_function_from_Q_function(Q_function=Q_function_rl,
                                          policy=pi_rl)

# Print value at the starting cell
print(value_rl[0])

Simulate policy:

In [None]:
# Simulate the learned policy and report the average discounted return per episode
average_reward_rl = simulate_environment(policy=pi_rl,
                                         sim_video_name=video_name + '_rl.mp4')
print('Average reward: ' + str(average_reward_rl))

Display video:

In [None]:
HTML(render_mp4(video_name + '_rl.mp4'))

Check correctness by comparing to policy iteration (which has access to the exact model).

Note that the error in the holes is irrelevant.  

In [None]:
# State by state, print the largest absolute difference between the learned
# policy/value and the ones obtained with policy iteration
for s in range(env.observation_space.n):
  print('Error in the policy at state ' + str(s) + ': ' + str(np.max(np.abs(pi_rl[s] - pi_policy_iteration[s]))))
  print('Error in the value  at state ' + str(s) + ': ' + str(np.max(np.abs(value_rl[s] - value_policy_iteration[s]))))