# CS4049 Assessment 2:

This assessment trains a reinforcement-learning agent in the Taxi environment (`Taxi-v3`) using Gymnasium, the maintained successor to OpenAI Gym.

In [28]:
import gymnasium as gym # For the environment.
import tensorflow as tf
import keras
import numpy as np
import random
import math

__We can break down reinforcement learning into five simple steps:__

1. The agent is at state zero in an environment.
2. It will take an action based on a specific strategy.
3. It will receive a reward or punishment based on that action.
4. It learns from previous moves and optimizes its strategy.
5. The process will repeat until an optimal strategy is found. 


The epsilon-greedy method balances exploration and exploitation: with probability $\epsilon$ (for example, $\epsilon \approx 10\%$) the agent explores the environment by taking a random action, and with probability $1-\epsilon$ it exploits what it has already learned by taking the best-known action.

We start with a high $\epsilon$ and reduce it over time as the agent comes to understand the environment better.

In [29]:
class TaxiAgent:
  """Tabular Q-learning agent for the Gymnasium ``Taxi-v3`` environment.

  The agent owns the environment, a Q-table (``quality_matrix``) indexed as
  ``[state][action]``, and the epsilon-greedy exploration schedule.
  """

  def __init__(self, env=None, alpha=0.7, gamma=0.95, max_epsilon=1.0,
               min_epsilon=0.05, epsilon_decay=0.005):
    """Create the agent and reset its environment.

    Args:
      env: Environment to act in. Must expose ``observation_space.n``,
        ``action_space.n``, ``action_space.sample()`` and ``reset()``.
        Defaults to a fresh ``gym.make('Taxi-v3')``.
      alpha: Learning rate of the Q-update.
      gamma: Discount factor applied to the best next-state value.
      max_epsilon: Exploration probability at episode 0.
      min_epsilon: Floor that the exploration probability decays towards.
      epsilon_decay: Exponential decay rate of epsilon per episode.

    Raises:
      ValueError: If the epsilon bounds are not ordered
        ``0 <= min_epsilon <= max_epsilon <= 1``.
    """
    if not 0 <= min_epsilon <= max_epsilon <= 1:
      raise ValueError("expected 0 <= min_epsilon <= max_epsilon <= 1")

    self.env = gym.make('Taxi-v3') if env is None else env
    state_space = self.env.observation_space.n
    action_space = self.env.action_space.n
    # One row per state, one column per action; all values start at zero.
    self.quality_matrix = np.zeros((state_space, action_space))

    # The floor must be below the starting value, otherwise epsilon never
    # decays and the agent explores at random forever.
    self.max_epsilon = max_epsilon
    self.min_epsilon = min_epsilon
    # Kept separate from gamma: the discount factor is not a decay rate.
    self.epsilon_decay = epsilon_decay
    self.curr_epsilon = max_epsilon

    self.alpha = alpha
    self.gamma = gamma
    self.observation, self.info = self.env.reset()

  def choose_action(self, obs):
    """Pick an action for state ``obs`` using the epsilon-greedy rule.

    With probability ``curr_epsilon`` a random action is sampled from the
    environment (explore); otherwise the action with the highest Q-value
    for ``obs`` is returned (exploit).
    """
    if random.uniform(0, 1) > self.curr_epsilon:
      # Exploit: best known action for this state.
      return int(np.argmax(self.quality_matrix[obs]))
    # Explore: uniformly random action.
    return self.env.action_space.sample()

  def update_quality(self, action, old_obs, new_obs, reward):
    """Apply one Q-learning (Bellman) update for an observed transition.

    Q[s, a] += alpha * (reward + gamma * max_a' Q[s', a'] - Q[s, a])
    """
    best_next_value = np.max(self.quality_matrix[new_obs])
    td_target = reward + self.gamma * best_next_value
    td_error = td_target - self.quality_matrix[old_obs][action]
    self.quality_matrix[old_obs][action] += self.alpha * td_error

  def decay_epsilon(self, episode):
    """Set ``curr_epsilon`` for the given episode index.

    Epsilon decays exponentially from ``max_epsilon`` (episode 0) towards
    ``min_epsilon`` at rate ``epsilon_decay``.
    """
    span = self.max_epsilon - self.min_epsilon
    self.curr_epsilon = self.min_epsilon + span * np.exp(-self.epsilon_decay * episode)

In [None]:

def train(episodes, max_steps=200):
    """Create a TaxiAgent and loop over ``episodes`` episodes.

    Within each episode the visible code loops while ``curr_step < max_steps``,
    asking the agent for an action and stepping the environment.

    NOTE(review): this function looks unfinished in the visible cell; see the
    inline notes below before running it.
    """
    
    agent = TaxiAgent()
    for episode in range(episodes):
        
        curr_step = 1
        
        # NOTE(review): curr_step is not advanced anywhere in the visible body,
        # so this loop would not terminate as written — confirm.
        while curr_step < max_steps:
            # NOTE(review): TaxiAgent stores the latest observation on
            # `agent.observation`; nothing visible defines `agent.env.obs`.
            action_to_take = agent.choose_action(agent.env.obs)
            # NOTE(review): Gymnasium's `Env.step` takes the action as an
            # argument; `action_to_take` is not passed here, and the returned
            # transition is discarded, so the Q-table is never updated.
            agent.env.step()