Esta notebook contiene bloques de código útiles para realizar Q-learning en el entorno "Taxi"

In [180]:
import numpy as np
import random
from taxi_env_extended import TaxiEnvExtended

In [181]:
env = TaxiEnvExtended()

In [182]:
env.max_steps

200

Obtener la cantidad de estados y acciones

In [183]:
actions = env.action_space.n
states = env.observation_space.n

Inicialización de la tabla Q

In [184]:
Q = np.zeros((states, actions))
Q

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

Obtención de la acción a partir de la tabla Q

In [185]:
def optimal_policy(state, Q):
    action = np.argmax(Q[state])
    return action

Epsilon-Greedy Policy

In [186]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    explore = np.random.binomial(1, epsilon)
    if explore:
        action = env.action_space.sample()\
    
    else: # exploit
        action = np.argmax(Q[state])\
        
    return action

Ejemplo de episodio 

In [187]:
obs,_ = env.reset()
print(obs)
done = False
total_reward = 0
step_count = 0
while not done:
    state = obs
    action = epsilon_greedy_policy(state, Q, 0.5)
    obs, reward, done, _, _ = env.step(action)
    total_reward += reward
    step_count += 1
    print('->', state, action, reward, obs, done)
    env.render()
print('total_reward', total_reward)
print('total_steps', step_count)

66
-> 66 2 -1 86 False
-> 86 0 -1 186 False
-> 186 4 -10 186 False
-> 186 0 -1 286 False
-> 286 1 -1 186 False
-> 186 0 -1 286 False
-> 286 1 -1 186 False
-> 186 0 -1 286 False
-> 286 0 -1 386 False
-> 386 0 -1 486 False
-> 486 5 -10 486 False
-> 486 2 -1 486 False
-> 486 1 -1 386 False
-> 386 0 -1 486 False
-> 486 0 -1 486 False
-> 486 0 -1 486 False
-> 486 0 -1 486 False
-> 486 0 -1 486 False
-> 486 0 -1 486 False
-> 486 2 -1 486 False
-> 486 0 -1 486 False
-> 486 0 -1 486 False
-> 486 0 -1 486 False
-> 486 1 -1 386 False
-> 386 0 -1 486 False
-> 486 0 -1 486 False
-> 486 0 -1 486 False
-> 486 0 -1 486 False
-> 486 0 -1 486 False
-> 486 1 -1 386 False
-> 386 3 -1 366 False
-> 366 1 -1 266 False
-> 266 5 -10 266 False
-> 266 4 -10 266 False
-> 266 2 -1 286 False
-> 286 4 -10 286 False
-> 286 5 -10 286 False
-> 286 4 -10 286 False
-> 286 5 -10 286 False
-> 286 5 -10 286 False
-> 286 0 -1 386 False
-> 386 0 -1 486 False
-> 486 0 -1 486 False
-> 486 0 -1 486 False
-> 486 0 -1 486 False
-

Generar muchos episodios

In [197]:
episodes = 10000

In [189]:
alpha=1 / (episodes+1)
alpha

0.000999000999000999

In [190]:
gamma = 0.9

In [191]:
total_reward = 0
step_count = 0

In [201]:
i = 0
success = 0
while i< episodes:
    obs, _ = env.reset()
    done=False
    while not done:
        state = obs
        action = epsilon_greedy_policy(state, Q, 0.5)
        obs, reward, done, _, _ = env.step(action)
        
        total_reward += reward
        if (reward ==20):
            print("lo dejo")
            success+=1
        
        step_count += 1          
        value = Q[state][action] + alpha*(reward + gamma*np.argmax(Q[obs]) - Q[state][action])
        Q[state][action] = value
    
    
    if(i%10 == 0):
        print(i, " of ", episodes, " Episodes")
    i=i+1
print(success)

0  of  10000  Episodes
10  of  10000  Episodes
20  of  10000  Episodes
30  of  10000  Episodes
40  of  10000  Episodes
50  of  10000  Episodes
60  of  10000  Episodes
70  of  10000  Episodes
80  of  10000  Episodes
lo dejo
90  of  10000  Episodes
100  of  10000  Episodes
110  of  10000  Episodes
120  of  10000  Episodes
130  of  10000  Episodes
lo dejo
140  of  10000  Episodes
150  of  10000  Episodes
160  of  10000  Episodes
170  of  10000  Episodes
180  of  10000  Episodes
190  of  10000  Episodes
200  of  10000  Episodes
210  of  10000  Episodes
lo dejo
220  of  10000  Episodes
230  of  10000  Episodes
240  of  10000  Episodes
250  of  10000  Episodes
260  of  10000  Episodes
270  of  10000  Episodes
280  of  10000  Episodes
290  of  10000  Episodes
300  of  10000  Episodes
310  of  10000  Episodes
320  of  10000  Episodes
330  of  10000  Episodes
340  of  10000  Episodes
350  of  10000  Episodes
360  of  10000  Episodes
370  of  10000  Episodes
380  of  10000  Episodes
390  of  100

In [193]:
print(success/episodes*100, '%')

0.4 %


In [194]:
total_reward

-494715

In [195]:
step_count

199896

In [202]:
Q

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 1.53215003, -0.28167604, -0.26128907, -0.25610256,  0.47078209,
        -2.65705866],
       [ 0.33262636,  0.69804676, -0.40174798,  1.65847734,  0.71828213,
        -2.7829118 ],
       ...,
       [ 0.38564685, -0.19417773,  0.37835953,  1.40814286, -1.43583278,
        -1.50503306],
       [ 0.49934701, -0.51935856,  0.84208084,  1.10121346, -4.15377888,
        -4.26366511],
       [ 0.0085508 ,  0.01180613,  0.07826128,  0.04863445, -0.08966114,
        -0.12202155]])