# Q-Learning Grid Search

## Import

In [1]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
import gymnasium as gym

import numpy as np
from random import randint, uniform
from IPython.display import clear_output

from time import sleep

## Charging environment

In [3]:
env = gym.make("Taxi-v3", render_mode="ansi")
# We can test render_mode to 'human' in the future

env.reset()
print(env.render())

+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m:[43m [0m|
+---------+




The filled square represents the taxi, which is yellow without a passenger and green with a passenger.  
The pipe ("|") represents a wall which the taxi cannot cross.  
R, G, Y, B are the possible pickup and destination locations. The blue letter represents the current passenger pick-up location, and the purple letter is the current destination.  

In [4]:
print(f"Action Space {env.action_space}")
print(f"State Space {env.observation_space}")

Action Space Discrete(6)
State Space Discrete(500)


We got 6 actions inside the environment:
- **0**: south
- **1**: north
- **2**: east
- **3**: west
- **4**: pickup
- **5**: dropoff

And 500 possible states:
- **5x5** grid
- **4** destinations
- **5** passenger locations

In [5]:
print(f"Current state: {env.s}")

Current state: 493


In [6]:
env.P[env.s]

{0: [(1.0, 493, -1, False)],
 1: [(1.0, 393, -1, False)],
 2: [(1.0, 493, -1, False)],
 3: [(1.0, 473, -1, False)],
 4: [(1.0, 493, -10, False)],
 5: [(1.0, 493, -10, False)]}

For each action in this state, we have:
- **probability**: always at 1.0 in this env
- **nextstate**: the next state if the agent takes this action
- **reward**: the reward (positive or negative) gained after performing this action
- **done**: boolean at True when a passenger is correctly dropof

## Q-Learning

In order to implementing the Q-Learning algorithm, we will be through those steps:

- Initialize the Q-table by all zeros.
- Start exploring actions: For each state, select any one among all possible actions for the current state (S).
- Travel to the next state (S') as a result of that action (a).
- For all possible actions from the state (S') select the one with the highest Q-value.
- Update Q-table values using the equation.
- Set the next state as the current state.
- If goal state is reached, then end and repeat the process.

In [7]:
# Hyperparameters - Firsts tests
# EPSILON_LIST = np.arange(0.1, 1, 0.1)
# ALPHA_LIST = np.arange(0.1, 1, 0.1)
# GAMMA_LIST = np.arange(0.1, 1, 0.1)

# Hyperparameters - Second tests
EPSILON_LIST = np.arange(0.15, 0.25, 0.01)
ALPHA_LIST = np.arange(0.05, 0.15, 0.01)
GAMMA_LIST = np.arange(0.35, 0.45, 0.01)

For grid search pruposes, we scan all values of hyperparameters from 0 to 1, with steps of 0.1  
With a second test we can search with better values 

In [8]:
# with open('results_grid-search_v2.csv', 'a') as results:
#     results.write("Epsilon;Alpha;Gamma;Epochs_learn;Penalties_learn;Epochs_eval;Penalties_eval;Worked")

In [9]:
# Grid search
for EPSILON in EPSILON_LIST:
    for ALPHA in ALPHA_LIST:
        for GAMMA in GAMMA_LIST:
            # Q-table
            q_table = np.zeros([env.observation_space.n, env.action_space.n])
            
            # Fields
            penalties_learn = epochs_learn = 0
            penalties_eval = epochs_eval = 0
            worked = True
            
            # Learning
            for i in range(100_000):
                # Reset the environment
                state = env.reset()[0]
                
                # initialize fields
                done = False
                
                # Start the episode process
                while not done:
                    # Deciding which action to perform
                    if uniform(0, 1) < EPSILON:
                        action = env.action_space.sample() # Exploration
                    else:
                        action = np.argmax(q_table[state]) # Exploitation
                    
                    # Performing action inside the environment
                    next_state, reward, done, info, _ = env.step(action)
                    
                    # Getting usefull fields to calculate the new value of Q-Table
                    old_value = q_table[state, action]
                    next_max = np.max(q_table[next_state])
                    
                    # Calculate new value
                    new_value = (1 - ALPHA) * old_value + ALPHA * (reward + GAMMA * next_max)
                    
                    q_table[state, action] = new_value
                    
                    # Getting stats when the agent performed illegal action (pickup or dropoff)
                    penalties_learn += 1 if reward == -10 else 0
                    
                    # Updating state
                    state = next_state
                    
                    epochs_learn += 1
                    
                    if epochs_learn > 10_000_000:
                        worked = False
                        done = True
                
                # Display the number of episode
                if i % 100 == 0:
                    clear_output(wait=True)
                    print(f"Learning Phase\n\rEpsilon: {EPSILON} | Alpha: {ALPHA} | Gamma: {GAMMA} => Episode: {i}")
            
            # Evaluate
            if worked:
                for i in range(100):
                    # Print
                    clear_output(wait=True)
                    print(f"Evaluation Phase\n\rEpsilon: {EPSILON} | Alpha: {ALPHA} | Gamma: {GAMMA} => Episode: {i}")
                    
                    # Reset the environment
                    state = env.reset()[0]
                    
                    # Initialize fields
                    done = False
                    
                    # Start the episode process
                    while not done:
                        # Only Exploitation during the evaluation phase
                        action = np.argmax(q_table[state])
                        
                        # Performing action inside the environment
                        state, reward, done, info, _ = env.step(action)
                        
                        # Getting stats when the agent performed illegal action (pickup or dropoff)
                        penalties_eval += 1 if reward == -10 else 0
                        
                        epochs_eval += 1
                        
                        if epochs_eval > 100_000:
                            worked = False
                            done = True
            
            # Adding stats inside the csv file
            with open('results_grid-search_v2.csv', 'a') as results:
                results.write(f"\r{EPSILON};{ALPHA};{GAMMA};{epochs_learn};{penalties_learn};{epochs_eval};{penalties_eval};{worked}")

print("Finished")

Evaluation Phase
Epsilon: 0.25 | Alpha: 0.13 | Gamma: 0.36 => Episode: 99
