# **Taxi-v3: Q-learning agent**

In [1]:
# Imports

import numpy as np
import gym
import random

In [2]:
# Create environment: Taxi-v3 from OpenAI Gym
env = gym.make("Taxi-v3")

# Put the environment into a valid starting state before drawing it.
# Rendering an environment that has never been reset depends on the
# constructor having initialised a state, and Gym releases whose
# order-enforcing wrapper also guards render() raise an error instead.
env.reset()

# Render the current board
env.render()

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| :[43m [0m|[35mB[0m: |
+---------+



In [4]:
# Read the Q-table dimensions from the environment:
# one column per discrete action, one row per discrete state.
action_size = env.action_space.n
state_size = env.observation_space.n

print("Action size:", action_size)
print("State size:", state_size)

Action size: 6
State size: 500


In [5]:
# Start with an all-zero Q-table: rows index states, columns index actions
q_table = np.zeros(shape=(state_size, action_size))
print(q_table)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


**3. Hyperparameters**

In [40]:
# --- Training schedule ---
num_episodes = 50_000   # number of training episodes
num_steps = 99          # cap on steps per episode

# --- Q-learning update ---
learning_rate = 0.06    # step size applied to the temporal-difference error
gamma = 0.86            # discount rate on the next state's best Q-value

# --- Epsilon-greedy exploration ---
epsilon_max = 0.95      # exploration rate at the start of training
epsilon_min = 0.01      # minimum exploration probability
epsilon = epsilon_max   # current exploration rate; starts at its maximum
decay_rate = 0.0099     # exponential decay constant for epsilon

In [41]:
# Q-learning: fill in q_table by acting epsilon-greedily in the environment

for episode in range(num_episodes):
    # Every episode starts from a freshly reset environment
    state = env.reset()
    step, done = 0, False

    for step in range(num_steps):
        # Epsilon-greedy choice: a uniform draw at or below epsilon means
        # explore (random action); otherwise exploit the best known action.
        if random.uniform(0, 1) <= epsilon:
            action = env.action_space.sample()
        else:
            action = q_table[state].argmax()

        # Apply the action and observe the outcome
        new_state, reward, done, info = env.step(action)

        # Move Q(state, action) towards the one-step target:
        # reward plus the discounted best value of the next state
        td_target = reward + gamma * q_table[new_state].max()
        q_table[state, action] += learning_rate * (td_target - q_table[state, action])

        # Continue from the state we just reached
        state = new_state

        # The environment signalled the end of the episode
        if done:
            break

    # Decay epsilon exponentially with the episode index, from epsilon_max
    # towards epsilon_min
    epsilon = epsilon_min + (epsilon_max - epsilon_min) * np.exp(-decay_rate * episode)

In [42]:
# Test: run the greedy policy read from q_table and report the mean return

rewards = []
num_test_episodes = 99

for episode in range(num_test_episodes):
    state = env.reset()
    total_rewards = 0

    for step in range(num_steps):
        # Uncomment to watch the agent play
        # env.render()

        # Take the action with the highest Q-value in the current state
        action = np.argmax(q_table[state, :])

        new_state, reward, done, info = env.step(action)
        total_rewards += reward

        if done:
            break
        state = new_state

    # Record the return of every episode, including those that hit the
    # step cap without finishing. Recording only finished episodes while
    # dividing by num_test_episodes would count each unfinished episode
    # as a return of 0 and overstate the average.
    rewards.append(total_rewards)
    #print ("Score", total_rewards)

env.close()
print ("Score: " +  str(sum(rewards) / num_test_episodes))

Score: 8.242424242424242
