Para maiores testes vamos mudar alguns argumentos no nosso treinamento

In [56]:
import gym
import random

random.seed(1234)

streets = gym.make("Taxi-v3", render_mode='ansi').env #New versions keep getting released; if -v3 doesn't work, try -v2 or -v4
streets.reset()

initial_state = streets.encode(2, 3, 2, 0)

streets.s = initial_state

print("\n" + streets.render())


+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|Y| : |[35mB[0m: |
+---------+




Primeiramente mantemos os mesmos como base

In [79]:
import numpy as np

q_table1 = np.zeros([streets.observation_space.n, streets.action_space.n])

learning_rate = 0.1 # porcentagem de aprendizado por ciclo
discount_factor = 0.6
exploration = 0.1 # chance de tomar uma decisão aleatória
epochs = 10000 # quantidade de ciclos

for taxi_run in range(epochs):
    state = streets.reset()
    done = False
    state = state[0]
    while not done:
        
        random_value = random.uniform(0, 1)
        if (random_value < exploration):
            action = streets.action_space.sample() # Toma uma decisão aleatória
        else:
            action = np.argmax(q_table1[state]) # Pega a posição do maior valor de q
        
        
        next_state, reward, done, info, x = streets.step(action)
        
        prev_q = q_table1[state, action]
        next_max_q = np.max(q_table1[next_state])
        new_q = (1 - learning_rate) * prev_q + learning_rate * (reward + discount_factor * next_max_q)
        q_table1[state, action] = new_q
        
        state = next_state

Nesse aumentamos a aprendizagem por ciclo, mas diminuimos a quantidade de ciclos

In [80]:
q_table2 = np.zeros([streets.observation_space.n, streets.action_space.n])

learning_rate = 0.5 # porcentagem de aprendizado por ciclo
discount_factor = 0.6
exploration = 0.1 # chance de tomar uma decisão aleatória
epochs = 5000 # quantidade de ciclos

for taxi_run in range(epochs):
    state = streets.reset()
    done = False
    state = state[0]
    while not done:
        
        random_value = random.uniform(0, 1)
        if (random_value < exploration):
            action = streets.action_space.sample() # Toma uma decisão aleatória
        else:
            action = np.argmax(q_table2[state]) # Pega a posição do maior valor de q
        
        
        next_state, reward, done, info, x = streets.step(action)
        
        prev_q = q_table2[state, action]
        next_max_q = np.max(q_table2[next_state])
        new_q = (1 - learning_rate) * prev_q + learning_rate * (reward + discount_factor * next_max_q)
        q_table2[state, action] = new_q
        
        state = next_state

Nesse aumentamos a quantidade de ciclos e a chance de exploração

In [81]:
q_table3 = np.zeros([streets.observation_space.n, streets.action_space.n])

learning_rate = 0.1 # porcentagem de aprendizado por ciclo
discount_factor = 0.6
exploration = 0.5 # chance de tomar uma decisão aleatória
epochs = 20000 # quantidade de ciclos

for taxi_run in range(epochs):
    state = streets.reset()
    done = False
    state = state[0]
    while not done:
        
        random_value = random.uniform(0, 1)
        if (random_value < exploration):
            action = streets.action_space.sample() # Toma uma decisão aleatória
        else:
            action = np.argmax(q_table3[state]) # Pega a posição do maior valor de q
        
        
        next_state, reward, done, info, x = streets.step(action)
        
        prev_q = q_table3[state, action]
        next_max_q = np.max(q_table3[next_state])
        new_q = (1 - learning_rate) * prev_q + learning_rate * (reward + discount_factor * next_max_q)
        q_table3[state, action] = new_q
        
        state = next_state

Nesse aumentamos a taxa de aprendizagem e a chance de exploração

In [82]:
q_table4 = np.zeros([streets.observation_space.n, streets.action_space.n])

learning_rate = 0.5 # porcentagem de aprendizado por ciclo
discount_factor = 0.6
exploration = 0.5 # chance de tomar uma decisão aleatória
epochs = 10000 # quantidade de ciclos

for taxi_run in range(epochs):
    state = streets.reset()
    done = False
    state = state[0]
    while not done:
        
        random_value = random.uniform(0, 1)
        if (random_value < exploration):
            action = streets.action_space.sample() # Toma uma decisão aleatória
        else:
            action = np.argmax(q_table4[state]) # Pega a posição do maior valor de q
        
        
        next_state, reward, done, info, x = streets.step(action)
        
        prev_q = q_table4[state, action]
        next_max_q = np.max(q_table4[next_state])
        new_q = (1 - learning_rate) * prev_q + learning_rate * (reward + discount_factor * next_max_q)
        q_table4[state, action] = new_q
        
        state = next_state

Por fim, testamos todos juntos

In [89]:
from IPython.display import clear_output
from time import sleep
trips = [0,0,0,0]
for tripnum in range(1, 101):
    state = streets.reset()
    state = state[0]
    state_save = state
    done = False
    trip_length = 0
    
    
    while not done and trip_length < 25:
        action = np.argmax(q_table1[state])
        next_state, reward, done, info, x = streets.step(action)
        clear_output(wait=True)
        print("Trip number " + str(tripnum) + " Step " + str(trip_length))
        print(streets.render())
        
        state = next_state
        trip_length += 1
        
    trips[0] += trip_length - 1
    trip_length = 0
    streets.reset()
    streets.s = state_save
    state = state_save
    done = False
    
    while not done and trip_length < 25:
        action = np.argmax(q_table2[state])
        next_state, reward, done, info, x = streets.step(action)
        clear_output(wait=True)
        print("Trip number " + str(tripnum) + " Step " + str(trip_length))
        print(streets.render())
        
        state = next_state
        trip_length += 1
    trips[1] += trip_length - 1
    trip_length = 0
    streets.reset()
    streets.s = state_save
    state = state_save
    done = False
    
    while not done and trip_length < 25:
        action = np.argmax(q_table3[state])
        next_state, reward, done, info, x = streets.step(action)
        clear_output(wait=True)
        print("Trip number " + str(tripnum) + " Step " + str(trip_length))
        print(streets.render())
        
        state = next_state
        trip_length += 1
    trips[2] += trip_length - 1
    trip_length = 0
    streets.reset()
    streets.s = state_save
    state = state_save
    done = False
    
    while not done and trip_length < 25:
        action = np.argmax(q_table4[state])
        next_state, reward, done, info, x = streets.step(action)
        clear_output(wait=True)
        print("Trip number " + str(tripnum) + " Step " + str(trip_length))
        print(streets.render())
        
        state = next_state
        trip_length += 1
    trips[3] += trip_length - 1
    
    

Trip number 100 Step 14
+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)



Não há grande diferença nas médias apresentadas, podendo mudar a melhor em diferentes execuções

In [90]:
print(f'Média 1: {trips[0]/100}')
print(f'Média 2: {trips[1]/100}')
print(f'Média 3: {trips[2]/100}')
print(f'Média 4: {trips[3]/100}')

Média 1: 12.66
Média 2: 13.15
Média 3: 12.72
Média 4: 13.43
