# CS4049 Assessment 2:

This assessment trains a reinforcement-learning agent in the Taxi environment (`Taxi-v3`), using Gymnasium, the maintained fork of OpenAI Gym.

In [14]:
import gymnasium as gym # For the environment.
import tensorflow as tf
import keras
import numpy as np
import random
import math
import matplotlib.pyplot as plt

__We can break down reinforcement learning into five simple steps:__

1. The agent is at state zero in an environment.
2. It will take an action based on a specific strategy.
3. It will receive a reward or punishment based on that action.
4. It learns from its previous moves and optimizes its strategy.
5. The process repeats until an optimal strategy is found.


The epsilon-greedy method balances exploration and exploitation: at each step the agent explores the environment (takes a random action) with probability $\epsilon$ (e.g. $\epsilon \approx 10\%$) and exploits its current knowledge (takes the best-known action) with probability $1-\epsilon$.

We start with a high $\epsilon$ and reduce it over time as the agent's understanding of the environment improves.

In [15]:
class TaxiAgent:
  """Tabular Q-learning agent that owns a ``Taxi-v3`` environment.

  The agent keeps the environment, the most recent observation and a
  Q-table (``quality_matrix``) with one row per state and one column per
  action. Actions are chosen epsilon-greedily and the table is updated
  after every environment step.
  """

  def __init__(self):
    """Create the environment, a zero-filled Q-table and the hyperparameters."""
    self.env = gym.make('Taxi-v3')
    state_space = self.env.observation_space.n
    action_space = self.env.action_space.n
    self.quality_matrix = np.zeros((state_space, action_space))
    # Exploration schedule: start fully random and decay towards a floor.
    # The floor must be below the starting value, otherwise epsilon never
    # decays and the greedy branch of choose_action is never taken.
    self.max_epsilon = 1.0
    self.min_epsilon = 0.1
    self.curr_epsilon = self.max_epsilon
    # Decay rate of the exploration schedule; kept separate from the
    # discount factor so tuning one does not silently change the other.
    self.epsilon_decay = 0.005
    self.alpha = 0.7   # learning rate
    self.gamma = 0.95  # discount factor
    self.reset()

  def choose_action(self, obs):
    """Choose an action for state ``obs`` using the epsilon-greedy rule.

    With probability ``curr_epsilon`` a random action is sampled from the
    environment; otherwise the action with the highest Q-value is returned.
    """
    if random.random() > self.curr_epsilon:
      # Exploit: best known action for this state.
      return np.argmax(self.quality_matrix[obs])
    # Explore: uniformly sampled action.
    return self.env.action_space.sample()

  def reset(self):
    """Reset the environment and store the initial observation and info."""
    self.observation, self.info = self.env.reset()

  def update_quality(self, action, old_obs, new_obs, reward, terminated=False):
    """Apply the Q-learning update for one transition.

    Args:
      action: Action taken in ``old_obs``.
      old_obs: State index before the action.
      new_obs: State index after the action.
      reward: Reward received for the transition.
      terminated: True when ``new_obs`` is terminal; the update then does
        not bootstrap from the next state's Q-values.
    """
    future = 0.0 if terminated else np.max(self.quality_matrix[new_obs])
    td_target = reward + self.gamma * future
    td_error = td_target - self.quality_matrix[old_obs, action]
    self.quality_matrix[old_obs, action] += self.alpha * td_error

  def decay_epsilon(self, episode):
    """Set ``curr_epsilon`` for the given episode index.

    Epsilon decays exponentially from ``max_epsilon`` (at episode 0)
    towards ``min_epsilon`` at rate ``epsilon_decay``.
    """
    span = self.max_epsilon - self.min_epsilon
    self.curr_epsilon = self.min_epsilon + span * np.exp(-self.epsilon_decay * episode)

  def step(self, action):
    """Take ``action`` in the environment, learn from it and advance.

    Returns:
      True when the episode ended (terminated or truncated).
    """
    new_obs, reward, terminated, truncated, info = self.env.step(action)
    self.update_quality(action, self.observation, new_obs, reward, terminated)
    self.observation = new_obs
    return terminated or truncated
    

In [16]:

def train(episodes, max_steps=200, agent=None):
    """Train a Q-learning taxi agent and return it.

    Args:
        episodes: Number of training episodes to run.
        max_steps: Maximum number of environment steps per episode.
        agent: Optional existing agent to (continue to) train. It must
            provide ``reset``, ``decay_epsilon``, ``choose_action``,
            ``step`` and an ``observation`` attribute. When omitted, a
            new ``TaxiAgent`` is created.

    Returns:
        The trained agent.
    """
    if agent is None:
        agent = TaxiAgent()

    for episode in range(episodes):
        agent.reset()
        agent.decay_epsilon(episode)
        # A bounded loop enforces the per-episode step cap here rather than
        # relying on the environment to end the episode.
        for _ in range(max_steps):
            action_to_take = agent.choose_action(agent.observation)
            if agent.step(action_to_take):
                break

    return agent


# Train for 1000 episodes, each capped at 200 steps.
resulting_agent = train(episodes=1000, max_steps=200)

Other methods for balancing exploration and exploitation include:
- Upper confidence bound (UCB).
- Thompson sampling.


In [17]:
def evaluate_agent(env, max_steps, n_eval_episodes, Q):
    """Run greedy episodes from a Q-table and summarise their returns.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and
            whose ``step(action)`` returns a 5-tuple
            ``(state, reward, terminated, truncated, info)``.
        max_steps: Maximum number of steps per evaluation episode.
        n_eval_episodes: Number of episodes to run.
        Q: Q-table indexed by state; the arg-max of a row is the action.

    Returns:
        ``(mean_reward, std_reward)`` over the per-episode total rewards.
    """
    returns = []
    for _ in range(n_eval_episodes):
        obs, _info = env.reset()
        episode_return = 0
        steps_taken = 0

        while steps_taken < max_steps:
            # Always act greedily: no exploration during evaluation.
            greedy_action = np.argmax(Q[obs])
            next_obs, reward, terminated, truncated, _info = env.step(greedy_action)
            episode_return += reward
            steps_taken += 1

            if terminated or truncated:
                break
            obs = next_obs

        returns.append(episode_return)

    return np.mean(returns), np.std(returns)

# Evaluate the learned Q-table greedily over 1000 episodes of at most 200 steps.
mean_reward, std_reward = evaluate_agent(
    env=resulting_agent.env,
    max_steps=200,
    n_eval_episodes=1000,
    Q=resulting_agent.quality_matrix,
)
print(f"Mean_reward= {mean_reward:.2f} +/- {std_reward:.2f}")

: 

In [None]:
def watch_agent(env, max_steps, Q):
    """Play one greedy episode in ``env`` and return its total reward.

    Any rendering depends on how ``env`` was created; this function only
    resets and steps the environment.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and
            whose ``step(action)`` returns a 5-tuple
            ``(state, reward, terminated, truncated, info)``.
        max_steps: Maximum number of steps to play.
        Q: Q-table indexed by state; the arg-max of a row is the action.

    Returns:
        The sum of rewards collected during the episode.
    """
    state, _ = env.reset()
    total_reward = 0

    for _ in range(max_steps):
        # Take the action with the highest Q-value for the current state.
        action = np.argmax(Q[state])
        state, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward

        if terminated or truncated:
            break

    return total_reward

# Replay the learned policy in a rendering environment.
new_env = gym.make("Taxi-v3", render_mode="human")
try:
    watch_agent(new_env, 200, resulting_agent.quality_matrix)
finally:
    # Always close the environment, even if the rollout is interrupted
    # (e.g. by a KeyboardInterrupt), so it is not left open.
    new_env.close()

KeyboardInterrupt: 