Esta notebook contiene bloques de código útiles para realizar Q-learning en el entorno "Pendulum"

In [172]:
import numpy as np
from pendulum_env_extended import PendulumEnvExtended
import random 

In [173]:
env = PendulumEnvExtended(render_mode='rgb_array')

Discretización de los estados

In [174]:
x_space_partitions = 11
y_space_partitions = 11
vel_space_partitions = 33

In [175]:
x_space = np.linspace(-1, 1, x_space_partitions)
x_space

array([-1. , -0.8, -0.6, -0.4, -0.2,  0. ,  0.2,  0.4,  0.6,  0.8,  1. ])

In [176]:
y_space = np.linspace(-1, 1, y_space_partitions)
y_space

array([-1. , -0.8, -0.6, -0.4, -0.2,  0. ,  0.2,  0.4,  0.6,  0.8,  1. ])

In [177]:

vel_space = np.linspace(-8, 8, vel_space_partitions)
vel_space

array([-8. , -7.5, -7. , -6.5, -6. , -5.5, -5. , -4.5, -4. , -3.5, -3. ,
       -2.5, -2. , -1.5, -1. , -0.5,  0. ,  0.5,  1. ,  1.5,  2. ,  2.5,
        3. ,  3.5,  4. ,  4.5,  5. ,  5.5,  6. ,  6.5,  7. ,  7.5,  8. ])

Obtener el estado a partir de la observación

In [178]:
def get_state(obs):
    x, y, vel = obs
    x_bin = np.digitize(x, x_space)
    y_bin = np.digitize(y, y_space)
    vel_bin = np.digitize(vel, vel_space)
    return x_bin, y_bin, vel_bin

In [179]:
state = get_state(np.array([-0.4, 0.2, 0.3]))
state

(3, 6, 17)

Discretización de las acciones

In [180]:
actions_partitions = 21

In [181]:
actions = list(np.linspace(-2, 2, actions_partitions))
actions

[-2.0,
 -1.8,
 -1.6,
 -1.4,
 -1.2,
 -1.0,
 -0.7999999999999998,
 -0.5999999999999999,
 -0.3999999999999999,
 -0.19999999999999996,
 0.0,
 0.20000000000000018,
 0.40000000000000036,
 0.6000000000000001,
 0.8000000000000003,
 1.0,
 1.2000000000000002,
 1.4000000000000004,
 1.6,
 1.8000000000000003,
 2.0]

In [182]:
def get_sample_action():
    return random.choice(actions)

Inicilización de la tabla Q

In [195]:
Q = np.zeros((len(x_space) + 1, len(y_space) + 1, len(vel_space) + 1, len(actions)))
Q

array([[[[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
    

Obtención de la acción a partir de la tabla Q

In [184]:
def optimal_policy(state, Q):
    action = actions[np.argmax(Q[state])]
    return action

Epsilon-Greedy Policy

In [185]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    explore = np.random.binomial(1, epsilon)
    if explore:
        action = get_sample_action()
    # exploit
    else:
        action = optimal_policy(state, Q)
        
    return action

Ejemplo de episodio 

In [186]:
obs,_ = env.reset()
print(obs)
done = False
total_reward = 0
state = get_state(obs)
while not done:
    # Acción del modelo
    action = epsilon_greedy_policy(state, Q, 0.5)
    
    # Indice de la accion en Q
    action_idx = actions.index(action)
    
    # Acción del ambiente
    real_action = np.array([action_idx])
     
    obs, reward, done, _, _ = env.step(real_action)
    next_state = get_state(obs)
    
   # Usar action_idx para actualizar Q
   
    total_reward += reward
    print('->', state, action, reward, obs, done)
    env.render()
print('total_reward', total_reward)

[-0.93560565  0.35304683 -0.33803278]
-> (1, 7, 16) -2.0 -7.744091748590472 [-0.9343064   0.356471   -0.07324766] False
-> (1, 7, 16) -1.6 -7.716846599689232 [-0.9428271   0.33328226  0.4941056 ] False
-> (1, 7, 16) -2.0 -7.874552755763511 [-0.95457107  0.2979834   0.7440673 ] False
-> (1, 7, 16) -0.5999999999999999 -8.1193599398345 [-0.97152746  0.23692708  1.2675549 ] False
-> (1, 7, 16) 1.2000000000000002 -8.58854309831146 [-0.98847944  0.1513552   1.7452501 ] False
-> (1, 7, 16) 0.0 -9.246617963491099 [-0.99903214  0.04398661  2.1587665 ] False
-> (1, 7, 16) 2.0 -10.065102551623427 [-0.99675465 -0.08049982  2.4917564 ] False
-> (1, 7, 16) -2.0 -9.990640559613121 [-0.97963595 -0.20078199  2.4313817 ] False
-> (1, 7, 16) -2.0 -9.231448923335059 [-0.9504252  -0.31095326  2.280795  ] False
-> (1, 7, 16) 0.8000000000000003 -8.507069702149225 [-0.9074697  -0.42011756  2.3475802 ] False
-> (1, 7, 16) -2.0 -7.884473599303694 [-0.8601669 -0.5100126  2.0324922] False
-> (1, 7, 16) -2.0 -7.20

In [187]:
iterations = 10

In [188]:
alpha = 0.01
alpha

0.01

In [189]:
gamma = 0.9
gamma

0.9

In [190]:
episodes=100

In [197]:
i=0
while i<episodes:
    obs,_ = env.reset()
    print(obs)
    done = False
    total_reward = 0
    state = get_state(obs)
    total_steps = 0 
    while not done:
        x, y, vel = state
        
        action = epsilon_greedy_policy(state, Q, 0.5)
        action_idx = actions.index(action)    
        real_action = np.array([action_idx])

        obs, reward, done, _, _ = env.step(real_action)
        total_steps += 1 
        next_state = get_state(obs)
        next_x, next_y, next_vel = next_state
        value = Q[x][y][vel][action_idx] + alpha * (reward + (gamma* (np.max(Q[next_x][next_y][next_vel]))) - Q[x][y][vel][action_idx] )
        Q[x][y][vel][action_idx] = value
        state = next_state
        total_reward += reward
        print('->', state, action_idx, reward, obs, done)
        env.render()
    print("relation", total_reward/total_steps)
    print('total_reward', total_reward)
    i+=1

[ 0.9882732   0.15269609 -0.8801177 ]
-> (10, 6, 16) 15 -0.10496031274141597 [ 0.9915598   0.12965001 -0.46559563] False
-> (10, 6, 16) 0 -0.0385820894817657 [ 0.9937794   0.1113666  -0.36835814] False
-> (10, 6, 16) 1 -0.027022905419622205 [ 0.9945076   0.10466439 -0.13483317] False
-> (10, 6, 17) 2 -0.016812870768233144 [0.9931587  0.11677267 0.24366513] False
-> (10, 6, 17) 0 -0.019635558845274993 [0.9910885  0.13320482 0.33124462] False
-> (10, 6, 18) 20 -0.03282177412361966 [0.9855578  0.16933939 0.73114824] False
-> (10, 7, 19) 10 -0.08641197477342702 [0.97410524 0.22609498 1.1581528 ] False
-> (10, 7, 20) 7 -0.19014633160061548 [0.9525002  0.30453783 1.627724  ] False
-> (10, 8, 21) 19 -0.3647098146755998 [0.914203   0.40525666 2.1561275 ] False
-> (10, 8, 21) 0 -0.6389940594357439 [0.8575734  0.51436156 2.46007   ] False
-> (9, 9, 22) 1 -0.8980785289448995 [0.77121097 0.6365797  2.995841  ] False
-> (9, 9, 23) 0 -1.3736823961175766 [0.64961463 0.7602637  3.473276  ] False
-> (8

In [196]:
Q

array([[[[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
    