Esta notebook contiene bloques de código útiles para realizar Q-learning en el entorno "Taxi"

In [6]:
import numpy as np
import random
from taxi_env_extended import TaxiEnvExtended

In [7]:
env = TaxiEnvExtended()

In [8]:
env.max_steps

200

Obtener la cantidad de estados y acciones

In [9]:
actions = env.action_space.n
states = env.observation_space.n

Inicialización de la tabla Q

In [10]:
Q = np.zeros((states, actions))
Q

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

Obtención de la acción a partir de la tabla Q

In [11]:
def optimal_policy(state, Q):
    action = np.argmax(Q[state])
    return action

Epsilon-Greedy Policy

In [12]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    explore = np.random.binomial(1, epsilon)
    if explore:
        action = env.action_space.sample()\
    
    else: # exploit
        action = np.argmax(Q[state])\
        
    return action

Ejemplo de episodio 

In [13]:
obs,_ = env.reset()
print(obs)
done = False
total_reward = 0
step_count = 0
while not done:
    state = obs
    action = epsilon_greedy_policy(state, Q, 0.5)
    obs, reward, done, _, _ = env.step(action)
    total_reward += reward
    step_count += 1
    print('->', state, action, reward, obs, done)
    env.render()
print('total_reward', total_reward)
print('total_steps', step_count)

48
-> 48 4 -10 48 False
-> 48 0 -1 148 False
-> 148 0 -1 248 False
-> 248 1 -1 148 False
-> 148 0 -1 248 False
-> 248 0 -1 348 False
-> 348 5 -10 348 False
-> 348 0 -1 448 False
-> 448 3 -1 428 False
-> 428 2 -1 448 False
-> 448 5 -10 448 False
-> 448 1 -1 348 False
-> 348 1 -1 248 False
-> 248 0 -1 348 False
-> 348 0 -1 448 False
-> 448 4 -10 448 False
-> 448 2 -1 448 False
-> 448 3 -1 428 False
-> 428 0 -1 428 False
-> 428 5 -10 428 False
-> 428 0 -1 428 False
-> 428 4 -10 428 False
-> 428 0 -1 428 False
-> 428 0 -1 428 False
-> 428 0 -1 428 False
-> 428 0 -1 428 False
-> 428 0 -1 428 False
-> 428 4 -10 428 False
-> 428 1 -1 328 False
-> 328 0 -1 428 False
-> 428 0 -1 428 False
-> 428 0 -1 428 False
-> 428 0 -1 428 False
-> 428 0 -1 428 False
-> 428 0 -1 428 False
-> 428 2 -1 448 False
-> 448 3 -1 428 False
-> 428 0 -1 428 False
-> 428 2 -1 448 False
-> 448 4 -10 448 False
-> 448 0 -1 448 False
-> 448 0 -1 448 False
-> 448 2 -1 448 False
-> 448 0 -1 448 False
-> 448 0 -1 448 False
->

Generar muchos episodios

In [31]:
episodes = 100000

In [32]:
alpha=1 / (episodes+1)
alpha

9.99990000099999e-06

In [33]:
gamma = 0.9

In [34]:
total_reward = 0
step_count = 0

In [35]:
i = 0
success = 0
while i< episodes:
    obs, _ = env.reset()
    done=False
    while not done:
        state = obs
        action = epsilon_greedy_policy(state, Q, 0.7)
        obs, reward, done, _, _ = env.step(action)
        
        total_reward += reward
        if (reward ==20):
            print("lo dejo")
            success+=1
        
        step_count += 1          
        value = Q[state][action] + alpha*(reward + gamma*np.argmax(Q[obs]) - Q[state][action])
        Q[state][action] = value
    
    
    if(i%10 == 0):
        print(i, " of ", episodes, " Episodes")
    i=i+1
print(success)

0  of  100000  Episodes
lo dejo
10  of  100000  Episodes
lo dejo
lo dejo
lo dejo
20  of  100000  Episodes
30  of  100000  Episodes
40  of  100000  Episodes
50  of  100000  Episodes
60  of  100000  Episodes
lo dejo
70  of  100000  Episodes
80  of  100000  Episodes
90  of  100000  Episodes
lo dejo
lo dejo
100  of  100000  Episodes
110  of  100000  Episodes
120  of  100000  Episodes
130  of  100000  Episodes
140  of  100000  Episodes
lo dejo
150  of  100000  Episodes
160  of  100000  Episodes
170  of  100000  Episodes
180  of  100000  Episodes
lo dejo
190  of  100000  Episodes
lo dejo
200  of  100000  Episodes
lo dejo
210  of  100000  Episodes
220  of  100000  Episodes
230  of  100000  Episodes
lo dejo
240  of  100000  Episodes
250  of  100000  Episodes
260  of  100000  Episodes
270  of  100000  Episodes
280  of  100000  Episodes
290  of  100000  Episodes
300  of  100000  Episodes
lo dejo
lo dejo
310  of  100000  Episodes
lo dejo
320  of  100000  Episodes
330  of  100000  Episodes
lo dejo

In [36]:
print(success/episodes*100, '%')

5.1450000000000005 %


In [37]:
total_reward

-59691185

In [21]:
step_count

1961402

In [38]:
Q

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [-0.19700037,  0.33011789,  0.15422711,  0.95761613,  0.319165  ,
        -1.39473942],
       [-0.20340659,  0.51369606,  0.33304333,  0.50829388,  2.01833302,
        -1.32245811],
       ...,
       [ 0.24074913, -0.15044562,  0.24586614,  0.35835083, -1.09949355,
        -1.1006712 ],
       [ 0.32197743,  0.32366772,  0.68515191,  0.31949255, -3.26937891,
        -3.19946739],
       [ 0.00955551, -0.00566354,  0.01216256,  0.0538003 , -0.04379357,
        -0.04326843]])