In [10]:
import numpy as np
import random
import gym

from time import sleep
from IPython.display import clear_output

## Configuring the Environment

In [11]:
# Build the Taxi-v3 environment and keep the object exposed by its `.env` attribute.
# NOTE(review): `.env` presumably unwraps the wrapper returned by gym.make (dropping
# any per-episode step limit) — confirm against the installed gym version.
env = gym.make('Taxi-v3').env

* Blue = Passenger
* Pink = Destination
* Yellow highlight = Taxi

In [12]:
# Draw the environment's current state as a text grid.
env.render()

+---------+
|[35mR[0m: | : :[34;1m[43mG[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+



In [13]:
# Start a new episode, then draw the state it begins in.
env.reset()
env.render()

+---------+
|R: | : :G|
| : |[43m [0m: : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+



In [14]:
# Each state index encodes (taxi row, taxi col, passenger location, destination):
# 5 rows x 5 cols x 5 passenger locations x 4 destinations = 500 states.
env.observation_space

Discrete(500)

In [15]:
# Number of discrete actions available to the agent at every step.
# Per the Taxi-v3 docs: 0=south, 1=north, 2=east, 3=west, 4=pickup, 5=dropoff.
env.action_space

Discrete(6)

## Training

In [20]:
# Tabular action-value estimates: one row per state, one column per action,
# all initialised to zero.
q_table = np.zeros(shape=(env.observation_space.n, env.action_space.n))
q_table.shape

(500, 6)

In [21]:
%%time

# Tabular Q-learning with an epsilon-greedy behaviour policy.
# Updates `q_table` in place, so re-running this cell continues training
# from the current table instead of starting over.
alpha = 0.1    # learning rate: weight given to each new TD target
gamma = 0.6    # discount factor applied to the best next-state value
epsilon = 0.1  # probability of taking a random (exploratory) action

NUM_TRAIN_EPISODES = 100000
PROGRESS_EVERY = 100  # episodes between refreshes of the progress line

for i in range(NUM_TRAIN_EPISODES):
  state = env.reset()
  done = False

  while not done:
    if random.random() < epsilon:
      # Exploration: sample a random action.
      action = env.action_space.sample()
    else:
      # Exploitation: take the highest-valued action for this state.
      action = np.argmax(q_table[state])

    next_state, reward, done, info = env.step(action)

    # Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a'))
    q_old = q_table[state, action]
    next_max_value = np.max(q_table[next_state])
    q_table[state, action] = (1 - alpha) * q_old + alpha * (reward + gamma * next_max_value)

    state = next_state

  if i % PROGRESS_EVERY == 0:
    # Overwrite the previous progress line instead of flooding the output.
    clear_output(wait=True)
    print('Episode: ', i)

print('Training Completed')

Episode:  99900
Training Completed
Wall time: 57.7 s


In [26]:
# Start a new episode, then draw the state it begins in.
env.reset()
env.render()

+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m|[43m [0m: |B: |
+---------+



In [35]:
# encode(taxi row, taxi col, passenger location, destination location)
# returns the integer state index used to address rows of the Q-table.
env.encode(3, 2, 4, 2)

358

In [36]:
# Learned action values for the state built from
# (taxi row 3, taxi col 2, passenger location 4, destination 2).
q_table[env.encode(3, 2, 4, 2)]

array([-1.51009661, -0.7504    , -1.35142091, -1.15075502, -6.77331254,
       -4.87979358])

In [37]:
# Draw the current state, then apply action 1 by hand.
# The frame is rendered BEFORE the step, so it shows the state the action is taken from;
# the cell's value is the (next_state, reward, done, info) tuple returned by step().
# NOTE(review): action 1 is presumably "move north" in Taxi-v3 — confirm against the env docs.
env.render()
env.step(1)

+---------+
|R: | : :[34;1mG[0m|
| :[43m [0m| : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)


(26, -1, False, {'prob': 1.0})

## Evaluation

In [40]:
# Greedy evaluation of the learned policy: always take the arg-max action,
# never explore, never update `q_table`. Every step is recorded in `frames`
# so the episodes can be replayed afterwards.
MAX_STEPS_PER_EPISODE = 200  # safety cap on steps per evaluation episode

total_penalties = 0
episode = 100    # number of evaluation episodes
frames = []      # one record per step, across all episodes
capped_episodes = 0

for _ in range(episode):
  state = env.reset()
  penalties = 0
  steps = 0
  done = False

  # A purely greedy policy can cycle forever on an under-trained table.
  # NOTE(review): `env` was created via `gym.make(...).env`, which presumably has
  # no built-in step limit — hence the explicit cap here.
  while not done and steps < MAX_STEPS_PER_EPISODE:
    action = np.argmax(q_table[state])
    state, reward, done, info = env.step(action)
    steps += 1

    # The loop treats a reward of exactly -10 as a penalty.
    if reward == -10:
      penalties += 1

    frames.append({
        'Frame': env.render(mode='ansi'),
        'State': state,
        'Action': action,
        'Reward': reward
    })

  if not done:
    capped_episodes += 1

  total_penalties += penalties

print('Episode', episode)
print('Penalties', total_penalties)
# Only reported when the cap was actually hit, so the usual output is unchanged.
if capped_episodes:
  print('Episodes stopped at the step cap', capped_episodes)

Episode 100
Penalties 0


In [41]:
# Replay every recorded evaluation step as a simple in-place animation.
for snapshot in frames:
  clear_output(wait=True)  # replace the previous frame rather than appending
  print(snapshot['Frame'])
  for label in ('State', 'Action', 'Reward'):
    print(label, snapshot[label])
  sleep(.1)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)

State 410
Action 5
Reward 20
