# CS4049 Assessment 2:

This assessment requires the use of a Taxi environment to train a model, using OpenAI. 

In [1]:
import gymnasium as gym  # For the environment.
import tensorflow as tf
import keras
import numpy as np
import random
import math
import matplotlib.pyplot as plt

print(tf.__version__)


2.15.0


__We can break down reinforcement learning into five simple steps:__

1. The agent is at state zero in an environment.
2. It will take an action based on a specific strategy.
3. It will receive a reward or punishment based on that action.
4. By learning from previous moves the the strategy of the agent becomes optimised. 
5. The process will repeat until an optimal strategy is found. 


The epsilon-greedy or $\epsilon$-greedy method balances the exploration of an environment with a probability $\epsilon \approx 10 \% $ and the exploitation of an environment, with probability $1-\epsilon$ at the same time. 

We start with a higher $\epsilon$, which reduces over time due to understanding the environment better.

## Tabular Method for the TaxiAgent:

In [2]:
class TaxiAgent:
    def __init__(self, gamma: float = 0.95, alpha: float = 0.7, currentEpsilon: float = 1.0, decayFactor: float = 0.1):
        """An agent to be used for the taxi. This will keep track of the state of the taxi. This takes in 4 values, the gamma or the discount factor, the alpha or the learning rate, the current epsilon(the factor that controls the rate of exploration), and the decay factor which controls the rate at which the epsilon reduces."""
        self.env = gym.make('Taxi-v3')
        state_space = self.env.observation_space.n
        action_space = self.env.action_space.n
        print(state_space, action_space)
        self.quality_matrix = np.zeros((state_space, action_space))
        self.gamma = gamma
        self.alpha = alpha
        self.currentEpsilon = currentEpsilon
        self.minEpsilon = decayFactor
        self.reset()
        """ print(env.action_space.n) """
        """ print(f'Random action = {env.action_space.sample()} ') """
        """ print(observation) """

        pass

    def chooseAction(self, observation) -> int:
        """Choose the action based on the epsilon greedy principle."""
        greediness = random.uniform(0, 1)
        if greediness > self.currentEpsilon:
            # Agent has chosen to exploit the environment
            action = np.argmax(self.quality_matrix[observation])
        else:
            # Agent has chosen to explore the environment
            action = self.env.action_space.sample()
        return action

    def reset(self) -> None:
        """Resets the environment."""
        self.observation, self.info = self.env.reset()

    def updateQualityMatrix(self, action: int, old_obs: int, new_obs: int, reward) -> None:
        """Internally updates the QMatrix using the Bellman equation."""
        self.quality_matrix[old_obs][action] += self.alpha*(reward+(self.gamma*np.max(
            self.quality_matrix[new_obs]) - self.quality_matrix[old_obs][action]))

    def decayEpsilon(self, episode: int) -> None:
        """A function that changes the epsilon amount to be smaller, reflecting the decrease in exploration."""
        self.currentEpsilon = self.minEpsilon + \
            (1 - self.minEpsilon)*np.exp(-self.gamma*episode)

    def step(self, action) -> bool:
        """New step function using the QMatrix. Will output True if the environment is terminated or finishes."""
        new_obs, reward, terminated, truncated, info = self.env.step(action)
        self.updateQualityMatrix(action, self.observation, new_obs, reward)
        self.observation = new_obs
        return terminated or truncated

In [3]:
def train(episodes: int, max_steps: int = 200):
    """The function to train the TaxiAgent."""
    agent = TaxiAgent()
    for episode in range(episodes):
        agent.reset()
        agent.decayEpsilon(episode)
        curr_step = 1
        done = False
        while curr_step < max_steps:
            action_to_take = agent.chooseAction(agent.observation)
            done = agent.step(action_to_take)
            curr_step += 1
            if done:
                break

    return agent


resulting_agent = train(2000, 200)

500 6


In [4]:
def evaluate_agent(env: gym.Env, max_steps: int, numEvalEpisodes: int, Q: np.array):
    """This function evaluates the agent environment and outputs the mean reward and the standard deviation reward for the environment."""

    episode_rewards = []
    for episode in range(numEvalEpisodes):
        state, _ = env.reset()
        step = 0
        done = False
        total_rewards_ep = 0

        for step in range(max_steps):
            # Take the action (index) that have the maximum reward
            action = np.argmax(Q[state])
            new_state, reward, done1, done2, info = env.step(action)
            total_rewards_ep += reward

            if done1 or done2:
                break
            state = new_state
        episode_rewards.append(total_rewards_ep)
    mean_reward = np.mean(episode_rewards)
    std_reward = np.std(episode_rewards)

    return mean_reward, std_reward


mean_reward, std_reward = evaluate_agent(
    resulting_agent.env, 200, 1000, resulting_agent.quality_matrix)
print(f"Mean reward= {mean_reward:.2f} \n ± std of: {std_reward:.2f}")

Mean reward= 8.01 
 ± std of: 2.62


In [5]:
def watch_agent(env: gym.Env, max_steps: int, Q: np.array) -> None:
    """This is a visualising function for the environment."""
    state, _ = env.reset()
    step = 0
    done = False
    total_rewards_ep = 0
    rewards = []
    for step in range(max_steps):
        # Take the action (index) that have the maximum reward
        action = np.argmax(Q[state])
        new_state, reward, done1, done2, info = env.step(action)
        total_rewards_ep += reward
        rewards.append(total_rewards_ep)

        if done1 or done2:
            break
        state = new_state


new_env = gym.make("Taxi-v3", render_mode="human")
watch_agent(new_env, 200, resulting_agent.quality_matrix)
new_env.close()


## Deep Q Learning Method for Taxi Agent:

A deep learning model uses multiple layers of a neural network to extract the abstract data from an input.

In [29]:
""" Have an agent class, with a policy. """
import tensorflow as tf

from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.agents.dqn import dqn_agent
from tf_agents.specs import tensor_spec
from tf_agents.networks import sequential, q_network
from tf_agents.utils import common
from tf_agents.policies import random_tf_policy, py_epsilon_greedy_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.drivers import dynamic_step_driver


In [7]:
num_iterations = 20000 # @param {type:"integer"}

initial_collect_steps = 100  # @param {type:"integer"}
collect_steps_per_iteration =   1 # @param {type:"integer"}
replay_buffer_max_length = 100000  # @param {type:"integer"}

batch_size = 64  # @param {type:"integer"}
learning_rate = 1e-3  # @param {type:"number"}
log_interval = 200  # @param {type:"integer"}

num_eval_episodes = 10  # @param {type:"integer"}
eval_interval = 1000  # @param {type:"integer"}
fc_layer_params = (60,10)

In [8]:
env_name = 'Taxi-v3'
env = suite_gym.load(env_name)
train_py_env = suite_gym.load(env_name)
eval_py_env = suite_gym.load(env_name)

train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)


# class RLAgent():
#   def __init__(self) -> None:
#     self.env_name = 'Taxi-v3'
#     """ self.env = gym.make('Taxi-v3') """
#     self.env = suite_gym.load(self.env_name)

#     train_py_env = suite_gym.load(self.env_name)
#     eval_py_env = suite_gym.load(self.env_name)
#     train_env = tf_py_environment.TFPyEnvironment(self.env_name)
#     train_env = tf_py_environment.TFPyEnvironment(self.env_name)
#     pass
#   def policy(self):
#     pass
  

In [9]:
action_tensor_spec = tensor_spec.from_spec(env.action_spec())
num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1
# Define a helper function to create Dense layers configured with the right
# activation and kernel initializer.
def dense_layer(num_units):
  return tf.keras.layers.Dense(
      num_units,
      activation=tf.keras.activations.relu,
      kernel_initializer=tf.keras.initializers.VarianceScaling(
          scale=2.0, mode='fan_in', distribution='truncated_normal'))

# QNetwork consists of a sequence of Dense layers followed by a dense layer
# with `num_actions` units to generate one q_value per available action as
# its output.

q_net = q_network.QNetwork(env.observation_spec(), env.action_spec(), fc_layer_params=fc_layer_params)
q_net.create_variables()
# dense_layers = [dense_layer(num_units) for num_units in fc_layer_params]
# q_values_layer = tf.keras.layers.Dense(
#     num_actions,
#     activation=None,
#     kernel_initializer=tf.keras.initializers.RandomUniform(
#         minval=-0.03, maxval=0.03),
#     bias_initializer=tf.keras.initializers.Constant(-0.2))


# q_net = sequential.Sequential(dense_layers + [q_values_layer])




TensorSpec(shape=(6,), dtype=tf.float32, name=None)

In [10]:
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

train_step_counter = tf.Variable(0)

agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)

agent.initialize()

In [14]:
eval_policy = agent.policy
collect_policy = agent.collect_policy
random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(), train_env.action_spec())


In [40]:
def compute_avg_return(environment, policy, num_episodes=10):

  total_return = 0.0
  for _ in range(num_episodes):

    time_step = environment.reset()
    episode_return = 0.0

    while not time_step.is_last():
      action_step = policy.action(time_step)
      time_step = environment.step(action_step.action)
      episode_return += time_step.reward
    total_return += episode_return

  avg_return = total_return / num_episodes
  return avg_return.numpy()[0]

In [17]:
compute_avg_return(eval_env, random_policy, num_eval_episodes)

-798.5

In [41]:
# Import the tf_uniform_replay_buffer module
from tf_agents.replay_buffers import tf_uniform_replay_buffer

# Create a TFUniformReplayBuffer instance
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    agent.collect_data_spec,
    max_length=replay_buffer_max_length,
    batch_size=batch_size)


# Create a ReplayBufferObserver instance
rb_observer = [replay_buffer.add_batch]

driver = dynamic_step_driver.DynamicStepDriver(
    env=env,
    policy=eval_policy,
    observers=[rb_observer],
    num_steps=1
)


In [42]:
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=batch_size,
    num_steps=2).prefetch(3)

print(dataset)


iterator = iter(dataset)
print(iterator)

<_PrefetchDataset element_spec=(Trajectory(
{'action': TensorSpec(shape=(64, 2), dtype=tf.int64, name=None),
 'discount': TensorSpec(shape=(64, 2), dtype=tf.float32, name=None),
 'next_step_type': TensorSpec(shape=(64, 2), dtype=tf.int32, name=None),
 'observation': TensorSpec(shape=(64, 2), dtype=tf.int64, name=None),
 'policy_info': (),
 'reward': TensorSpec(shape=(64, 2), dtype=tf.float32, name=None),
 'step_type': TensorSpec(shape=(64, 2), dtype=tf.int32, name=None)}), BufferInfo(ids=TensorSpec(shape=(64, 2), dtype=tf.int64, name=None), probabilities=TensorSpec(shape=(64,), dtype=tf.float32, name=None)))>
<tensorflow.python.data.ops.iterator_ops.OwnedIterator object at 0x000002403097FE20>


In [39]:
agent.train = common.function(agent.train)

agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns = [avg_return]

# Reset the environment.
time_step = train_py_env.reset()

collect_driver = dynamic_step_driver.DynamicStepDriver(
  env,
  agent.collect_policy,
  [rb_observer],
  num_steps=1
)

for _ in range(num_iterations):

  # Collect a few steps and save to the replay buffer.
  time_step, _ = collect_driver.run(time_step)[0]
  print(time_step)

  # Sample a batch of data from the buffer and update the agent's network.
  experience, unused_info = next(iterator)
  train_loss = agent.train(experience).loss

  step = agent.train_step_counter.numpy()

  # if step % log_interval == 0:
  #   print('step = {0}: loss = {1}'.format(step, train_loss))

  # if step % eval_interval == 0:
  #   avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
  #   print('step = {0}: Average Return = {1}'.format(step, avg_return))
  #   returns.append(avg_return)


AttributeError: 'tuple' object has no attribute 'rank'