# Experience Replay

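Because off-policy methods use one policy to explore the environment (the behavior policy) and a different one in the value-function update (the target policy), they can learn from experiences that were collected earlier in training. This makes it possible to keep a memory buffer of past experiences, i.e., (observation, action, reward, done, next observation) tuples, and to sample from it during the training process.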

In [None]:

!pip install gymnasium
!pip install plotly

import gymnasium as gym
import numpy as np
from collections import deque
import random
import plotly.express as px

# NOTE: `gym` is the alias under which the `gymnasium` package is imported, so
# the version reported here is the Gymnasium version.
print(f"OpenAI Gym version: {gym.__version__}")

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1
OpenAI Gym version: 0.29.1


In [None]:
# Video management imports
import cv2

# Helper functions to save videos and images
def save_video(img_array, path='./video/test.mp4'):
  """Write a sequence of RGB frames to a video file at 15 frames per second.

  Args:
    img_array: Sequence of frames. Each frame is an array whose shape is
      (height, width, layers) with the channels in RGB order. The video frame
      size is taken from the first frame.
    path: Destination file. Its parent directory is created when missing.

  Returns:
    None. A status message is printed: 'Video saved.' on success, or a message
    explaining why nothing was written.
  """
  import os

  # Without at least one frame the frame size cannot be determined.
  if len(img_array) == 0:
    print('No frames to save.')
    return

  # Make sure the destination directory exists; otherwise the writer cannot
  # open the output file.
  folder = os.path.dirname(path)
  if folder:
    os.makedirs(folder, exist_ok=True)

  height, width, _ = img_array[0].shape
  size = (width, height)
  out = cv2.VideoWriter(path, cv2.VideoWriter_fourcc(*'AVC1'), 15, size)
  try:
    # A writer that failed to open silently drops every frame, so report the
    # failure instead of claiming success.
    if not out.isOpened():
      print('Video could not be saved: unable to open ' + str(path))
      return
    for frame in img_array:
      # Frames arrive as RGB, OpenCV writes BGR.
      out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
  finally:
    # Always release the writer, even if a frame conversion fails.
    out.release()
  print('Video saved.')

def save_images(img_array, path='./images'):
  """Save each RGB image of `img_array` as a numbered JPEG file under `path`.

  Args:
    img_array: Iterable of images with the channels in RGB order.
    path: Directory that receives the files `img_0.jpg`, `img_1.jpg`, ...
  """
  for index, rgb_image in enumerate(img_array):
    target_file = f'{path}/img_{index}.jpg'
    # Convert RGB -> BGR before handing the image to OpenCV for writing.
    converted = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR)
    cv2.imwrite(target_file, converted)

In [None]:
# Check whether we are running in Google Colab or in a Jupyter Notebook.
# In both cases a `clear_output()` helper is made available.
import sys
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    print('Running in Google Colab')
    # Connect with Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    # Wrap Colab's own output helper under the same name that the Jupyter
    # branch imports, so the rest of the notebook can call `clear_output()`
    # regardless of where it runs.
    from google.colab import output
    def clear_output():
        """Clear the cell output through `google.colab.output.clear()`."""
        output.clear()
else:
    from IPython.display import clear_output
    print('Running in Jupyter Notebook')

Running in Google Colab
Mounted at /content/drive


## The Basic TD Agent
To implement the experience replay buffer, three attributes were added to the agent:
- experience_replay_size: The maximum number of experiences kept in the memory buffer.
- experience_replay: The memory buffer itself. It is a deque with a maximum length, so once it is full, appending a new experience discards the oldest one and only the most recent experiences are kept.
- minibatch_size: The number of experiences sampled from the buffer for each update of the action-value function.

In [None]:
class TDAgent:
  """Tabular agent holding a Q-table and an experience replay buffer.

  The class provides greedy and e-greedy action selection over the Q-table;
  the learning update itself is left to subclasses.
  """

  def __init__(
      self,
      /,
      num_states,
      num_actions,
      gamma=1.0,
      alpha=0.5,
      epsilon=0.1,
      experience_replay_size=1000,
      minibatch_size=32,
  ):
    """Stores the hyper-parameters and creates an all-zero Q-table.

    Args:
      num_states: Number of states (rows of the Q-table).
      num_actions: Number of actions (columns of the Q-table).
      gamma: Future discount parameter.
      alpha: Learning rate.
      epsilon: Exploration probability of the e-greedy policy.
      experience_replay_size: Maximum length of the experience replay buffer.
      minibatch_size: Size of the sample used to update the value function.
    """
    # Dimensions of the problem
    self.num_states = num_states
    self.num_actions = num_actions
    # Learning hyper-parameters
    self.gamma = gamma
    self.alpha = alpha
    self.epsilon = epsilon
    # Action values, one row per state and one column per action
    self.q_values = np.zeros((num_states, num_actions))
    # Replay memory: a bounded deque drops its oldest entry when full
    self.experience_replay_size = experience_replay_size
    self.minibatch_size = minibatch_size
    self.experience_replay = deque(maxlen=experience_replay_size)

  def choose_policy_action(self, observation):
    """Returns the greedy action for an observation.

    Args:
      observation: An observation from the environment.
    Returns:
      The action with the highest q-value; ties are broken at random.
    """
    action_values = self.q_values[observation, :]
    return self.argmax(action_values)

  def choose_e_greedy_action(self, observation):
    """Returns an action drawn with the e-greedy exploration method.

    Args:
      observation: An observation from the environment.
    Returns:
      With probability e, an action picked uniformly at random among all
      actions (the policy action included); with probability 1 - e, the
      policy action.
    """
    explore = np.random.rand() < self.epsilon
    if not explore:
      return self.choose_policy_action(observation)
    return np.random.choice(range(self.num_actions))

  def argmax(self, np_array):
    """Returns the position of the maximum, breaking ties at random.

    Args:
      np_array: A numpy array.
    Returns:
      Index of one of the appearances of the highest value in the array.
    """
    highest = np_array.max()
    candidates = np.flatnonzero(np_array == highest)
    return np.random.choice(candidates)

## The Q-Learning Agent

When using the experience replay buffer, the update step changes:

- We have to extract a sample of experience: the minibatch.
- For each sample, update the action value function: minibatch update.

Remember the step for the Q-learning without batches:

```
def step(self):
  td_target = reward
  if(not done):
    td_target += self.gamma * np.max(self.q_values[next_observation, :])
  self.q_values[self.observation, self.action] += self.alpha * (td_target - self.q_values[self.observation, self.action])
```

We will do the same but for arrays of observations, actions, rewards, and next observations.

In [None]:
class QLearningAgent(TDAgent):
  """Q-learning agent that updates its Q-table from replayed minibatches."""

  def minibatch_step(self):
    """Performs one Q-learning update on a random minibatch of experiences.

    Every entry of `self.experience_replay` is expected to be a tuple
    (observation, action, reward, done, next_observation). A sample of
    `self.minibatch_size` entries is drawn without replacement and the
    Q-values of the sampled (observation, action) pairs are moved towards
    their TD targets with learning rate `self.alpha`.

    If the buffer holds fewer than `self.minibatch_size` experiences, no
    update is performed.
    """
    # random.sample raises ValueError when asked for more items than the
    # buffer contains, so skip the update until enough experience exists.
    if len(self.experience_replay) < self.minibatch_size:
      return

    # Get from the experience replay buffer a minibatch
    minibatch = random.sample(self.experience_replay, self.minibatch_size)
    # Separate in different arrays observations, actions, rewards, dones, and
    # next_observations.
    observations, actions, rewards, dones, next_observations = zip(*minibatch)
    # Convert them to np.arrays
    observations = np.array(observations)
    actions = np.array(actions)
    rewards = np.array(rewards, dtype=np.float64)
    dones = np.array(dones, dtype=bool)
    next_observations = np.array(next_observations)

    # Instead of a single TD target, an array of TD targets is computed.
    # For the final state, the TD target is the last reward.
    td_targets = rewards.copy()
    # For the rest of the states we need to add the expected return, i.e. the
    # discounted best q-value of the next observation.
    not_done = np.logical_not(dones)
    best_next_values = np.max(self.q_values[next_observations, :], axis=1)
    td_targets[not_done] += self.gamma * best_next_values[not_done]
    # Then do the Bellman update.
    # NOTE: with fancy indexing, an (observation, action) pair that appears
    # several times in the minibatch is updated only once (the last
    # occurrence wins); the updates are not accumulated.
    current_values = self.q_values[observations, actions]
    self.q_values[observations, actions] += self.alpha * (td_targets - current_values)

## Training

In [None]:
# Num of episodes to learn
EPISODES = 100
# Experience replay buffer size
MEMORY_SIZE = 2000
# Size of the batch extracted from the experience replay buffer
MINIBATCH_SIZE = 32

# Initialize the environment
env = gym.make('CliffWalking-v0', render_mode="rgb_array")

# Define agent
agent = QLearningAgent(
    num_states=env.observation_space.n,
    num_actions=env.action_space.n,
    gamma=1.0,
    alpha=0.9,
    epsilon=0.1,
    experience_replay_size=MEMORY_SIZE,
    minibatch_size=MINIBATCH_SIZE,
)
# Tracking the training: number of visits to each state
visit_count = np.zeros(agent.num_states)

total_steps = 0
for episode in range(EPISODES):
    # Reset episode variables and environment
    done = False
    observation, _ = env.reset()
    visit_count[observation] += 1
    while not done:
        total_steps += 1
        # Choose action
        action = agent.choose_e_greedy_action(observation)
        # Take a step in env
        next_observation, reward, terminated, truncated, _ = env.step(action)
        # The episode loop ends on termination as well as on truncation
        done = terminated or truncated

        # Append experience to the experience replay buffer.
        # Only `terminated` is stored as the terminal flag: a truncated
        # episode did not reach a final state, so its TD target must still
        # include the value of the next observation.
        agent.experience_replay.append((observation, action, reward, terminated, next_observation))
        # Check if the buffer has the minimum size for extracting a minibatch
        if len(agent.experience_replay) >= agent.minibatch_size:
            # Have the agent update its Q-Values
            agent.minibatch_step()
        # Update agent observation to new one
        observation = next_observation
        visit_count[observation] += 1

## Training Evolution

In [None]:
# Show the per-state visit counts as a heat map arranged as a 4 x 12 grid.
visit_grid = np.reshape(visit_count, (4, 12))
fig = px.imshow(visit_grid)
fig.show()

## Testing

In [None]:
# Run one episode following the greedy policy and record every rendered frame.
images = []
done = False
observation, _ = env.reset()
while not done:
    action = agent.choose_policy_action(observation)
    observation, reward, terminated, truncated, _ = env.step(action)
    image = env.render()
    images.append(image)
    # Stop when the episode ends, or once more than 100 frames were recorded
    # so that a policy that never reaches the goal cannot loop forever.
    done = terminated or truncated or len(images) > 100

save_video(images, path='video/Q-Learning-with-experience.mp4')

Video saved.


**1.** The use of experience replay is expected to result in a different visit count compared to basic Q-learning. Experience replay lets the agent revisit and learn from past experiences, which reduces the impact of temporal correlations in the training data. As a result, the agent's exploration and learning behavior can be more effective and stable.

