Esta notebook contiene bloques de código útiles para realizar Q-learning en el entorno "Pendulum"

In [37]:
import numpy as np
from pendulum_env_extended import PendulumEnvExtended
import random 

In [38]:
env = PendulumEnvExtended()

Discretización de los estados

In [39]:
x_space_partitions = 11
y_space_partitions = 11
vel_space_partitions = 33

In [40]:
x_space = np.linspace(-1, 1, x_space_partitions)
x_space

array([-1. , -0.8, -0.6, -0.4, -0.2,  0. ,  0.2,  0.4,  0.6,  0.8,  1. ])

In [41]:
y_space = np.linspace(-1, 1, y_space_partitions)
y_space

array([-1. , -0.8, -0.6, -0.4, -0.2,  0. ,  0.2,  0.4,  0.6,  0.8,  1. ])

In [42]:

vel_space = np.linspace(-8, 8, vel_space_partitions)
vel_space

array([-8. , -7.5, -7. , -6.5, -6. , -5.5, -5. , -4.5, -4. , -3.5, -3. ,
       -2.5, -2. , -1.5, -1. , -0.5,  0. ,  0.5,  1. ,  1.5,  2. ,  2.5,
        3. ,  3.5,  4. ,  4.5,  5. ,  5.5,  6. ,  6.5,  7. ,  7.5,  8. ])

Obtener el estado a partir de la observación

In [43]:
def get_state(obs):
    x, y, vel = obs
    x_bin = np.digitize(x, x_space)
    y_bin = np.digitize(y, y_space)
    vel_bin = np.digitize(vel, vel_space)
    return x_bin, y_bin, vel_bin

In [44]:
state = get_state(np.array([-0.4, 0.2, 0.3]))
state

(3, 6, 17)

Discretización de las acciones

In [45]:
actions_partitions = 21

In [46]:
actions = list(np.linspace(-2, 2, actions_partitions))
actions

[-2.0,
 -1.8,
 -1.6,
 -1.4,
 -1.2,
 -1.0,
 -0.7999999999999998,
 -0.5999999999999999,
 -0.3999999999999999,
 -0.19999999999999996,
 0.0,
 0.20000000000000018,
 0.40000000000000036,
 0.6000000000000001,
 0.8000000000000003,
 1.0,
 1.2000000000000002,
 1.4000000000000004,
 1.6,
 1.8000000000000003,
 2.0]

In [47]:
def get_sample_action():
    return random.choice(actions)

Inicilización de la tabla Q

In [48]:
Q = np.zeros((len(x_space) + 1, len(y_space) + 1, len(vel_space) + 1, len(actions)))
Q

array([[[[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
    

Obtención de la acción a partir de la tabla Q

In [49]:
def optimal_policy(state, Q):
    action = actions[np.argmax(Q[state])]
    return action

Epsilon-Greedy Policy

In [50]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    explore = np.random.binomial(1, epsilon)
    if explore:
        action = get_sample_action()
    # exploit
    else:
        action = optimal_policy(state, Q)
        
    return action

Ejemplo de episodio 

In [51]:
obs,_ = env.reset()
print(obs)
done = False
total_reward = 0
state = get_state(obs)
while not done:
    # Acción del modelo
    action = epsilon_greedy_policy(state, Q, 0.5)
    
    # Indice de la accion en Q
    action_idx = actions.index(action)
    
    # Acción del ambiente
    real_action = np.array([action_idx])
     
    obs, reward, done, _, _ = env.step(real_action)
    next_state = get_state(obs)
    
   # Usar action_idx para actualizar Q
   
    total_reward += reward
    print('->', state, action, reward, obs, done)
print('total_reward', total_reward)

[-0.56429476  0.8255734   0.46558338]
-> (3, 10, 17) -1.8 -4.733204942850369 [-0.6141567  0.7891841  1.2347634] False
-> (3, 10, 17) 2.0 -5.138793799398298 [-0.6944459   0.71954495  2.1266515 ] False
-> (3, 10, 17) -2.0 -5.9205937001620725 [-0.7839264  0.6208537  2.66631  ] False
-> (3, 10, 17) -1.0 -6.824525265295296 [-0.878428    0.47787476  3.4319506 ] False
-> (3, 10, 17) -1.6 -8.169174645520929 [-0.95717466  0.28951102  4.0903563 ] False
-> (3, 10, 17) -0.5999999999999999 -9.787503488161532 [-0.9979946  0.0632991  4.6074896] False
-> (3, 10, 17) -2.0 -11.598526752768493 [-0.98568493 -0.16859764  4.654964  ] False
-> (3, 10, 17) -2.0 -11.000758060916864 [-0.92267585 -0.38557652  4.528516  ] False
-> (3, 10, 17) -2.0 -9.589944119814072 [-0.82090676 -0.5710622   4.2393336 ] False
-> (3, 10, 17) 1.4000000000000004 -8.221303955811461 [-0.68706745 -0.7265937   4.111037  ] False
-> (3, 10, 17) -1.6 -7.114771235285082 [-0.53468984 -0.84504837  3.8660915 ] False
-> (3, 10, 17) -2.0 -6.0526

In [52]:
iterations = 10

In [53]:
alpha = 0.01
alpha

0.01

In [54]:
gamma = 0.9
gamma

0.9

In [55]:
episodes=1000

In [56]:
i=0
while i<episodes:
    obs,_ = env.reset()
    done = False
    total_reward = 0
    state = get_state(obs)
    total_steps = 0 
    while not done:
        x, y, vel = state
        
        action = epsilon_greedy_policy(state, Q, 0.5)
        action_idx = actions.index(action)    
        real_action = np.array([action_idx])

        obs, reward, done, _, _ = env.step(real_action)
        total_steps += 1 
        next_state = get_state(obs)
        next_x, next_y, next_vel = next_state
        value = Q[x][y][vel][action_idx] + alpha * (reward + (gamma* (np.max(Q[next_x][next_y][next_vel]))) - Q[x][y][vel][action_idx] )
        Q[x][y][vel][action_idx] = value
        state = next_state
        total_reward += reward
    print("relation", total_reward/total_steps)
    print('total_reward', total_reward)
    print("total_steps", total_steps)
    i+=1

relation -6.927479008251885
total_reward -6927.479008251886
total_steps 1000
relation -7.685851977014525
total_reward -7685.851977014525
total_steps 1000
relation -8.055428364781582
total_reward -8055.428364781582
total_steps 1000
relation -7.6186019141788055
total_reward -7618.601914178806
total_steps 1000
relation -7.950964735317082
total_reward -7950.964735317082
total_steps 1000
relation -8.24085520339379
total_reward -8240.85520339379
total_steps 1000
relation -8.23903580220642
total_reward -8239.03580220642
total_steps 1000
relation -8.243380859666216
total_reward -8243.380859666217
total_steps 1000
relation -8.238779687476374
total_reward -8238.779687476374
total_steps 1000
relation -5.957817512558841
total_reward -5957.817512558841
total_steps 1000
relation -7.0258598499254115
total_reward -7025.859849925411
total_steps 1000
relation -7.4062656306060335
total_reward -7406.2656306060335
total_steps 1000
relation -8.100714530876193
total_reward -8100.714530876192
total_steps 1000

In [57]:
Q

array([[[[  0.        ,   0.        ,   0.        , ...,   0.        ,
            0.        ,   0.        ],
         [  0.        ,   0.        ,   0.        , ...,   0.        ,
            0.        ,   0.        ],
         [  0.        ,   0.        ,   0.        , ...,   0.        ,
            0.        ,   0.        ],
         ...,
         [  0.        ,   0.        ,   0.        , ...,   0.        ,
            0.        ,   0.        ],
         [  0.        ,   0.        ,   0.        , ...,   0.        ,
            0.        ,   0.        ],
         [  0.        ,   0.        ,   0.        , ...,   0.        ,
            0.        ,   0.        ]],

        [[  0.        ,   0.        ,   0.        , ...,   0.        ,
            0.        ,   0.        ],
         [  0.        ,   0.        ,   0.        , ...,   0.        ,
            0.        ,   0.        ],
         [  0.        ,   0.        ,   0.        , ...,   0.        ,
            0.        ,   0.     