Esta notebook contiene bloques de código útiles para realizar Q-learning en el entorno "Pendulum"

In [1]:
import numpy as np
from pendulum_env_extended import PendulumEnvExtended
import random 

In [2]:
env = PendulumEnvExtended(render_mode='rgb_array')

Discretización de los estados

In [34]:
x_space_partitions = 21
y_space_partitions = 21
vel_space_partitions = 65

In [35]:
x_space = np.linspace(-1, 1, x_space_partitions)
x_space

array([-1. , -0.9, -0.8, -0.7, -0.6, -0.5, -0.4, -0.3, -0.2, -0.1,  0. ,
        0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ])

In [36]:
y_space = np.linspace(-1, 1, y_space_partitions)
y_space

array([-1. , -0.9, -0.8, -0.7, -0.6, -0.5, -0.4, -0.3, -0.2, -0.1,  0. ,
        0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ])

In [37]:

vel_space = np.linspace(-8, 8, vel_space_partitions)
vel_space

array([-8.  , -7.75, -7.5 , -7.25, -7.  , -6.75, -6.5 , -6.25, -6.  ,
       -5.75, -5.5 , -5.25, -5.  , -4.75, -4.5 , -4.25, -4.  , -3.75,
       -3.5 , -3.25, -3.  , -2.75, -2.5 , -2.25, -2.  , -1.75, -1.5 ,
       -1.25, -1.  , -0.75, -0.5 , -0.25,  0.  ,  0.25,  0.5 ,  0.75,
        1.  ,  1.25,  1.5 ,  1.75,  2.  ,  2.25,  2.5 ,  2.75,  3.  ,
        3.25,  3.5 ,  3.75,  4.  ,  4.25,  4.5 ,  4.75,  5.  ,  5.25,
        5.5 ,  5.75,  6.  ,  6.25,  6.5 ,  6.75,  7.  ,  7.25,  7.5 ,
        7.75,  8.  ])

Obtener el estado a partir de la observación

In [4]:
def get_state(obs):
    x, y, vel = obs
    x_bin = np.digitize(x, x_space)
    y_bin = np.digitize(y, y_space)
    vel_bin = np.digitize(vel, vel_space)
    return x_bin, y_bin, vel_bin

In [5]:
state = get_state(np.array([-0.4, 0.2, 0.3]))
state

(3, 6, 52)

Discretización de las acciones

In [38]:
actions_partitions = 41

In [42]:
actions = list(np.linspace(-2, 2, actions_partitions))
actions

[-2.0,
 -1.9,
 -1.8,
 -1.7,
 -1.6,
 -1.5,
 -1.4,
 -1.2999999999999998,
 -1.2,
 -1.1,
 -1.0,
 -0.8999999999999999,
 -0.7999999999999998,
 -0.7,
 -0.5999999999999999,
 -0.5,
 -0.3999999999999999,
 -0.2999999999999998,
 -0.19999999999999996,
 -0.09999999999999987,
 0.0,
 0.10000000000000009,
 0.20000000000000018,
 0.30000000000000027,
 0.40000000000000036,
 0.5,
 0.6000000000000001,
 0.7000000000000002,
 0.8000000000000003,
 0.9000000000000004,
 1.0,
 1.1,
 1.2000000000000002,
 1.3000000000000003,
 1.4000000000000004,
 1.5,
 1.6,
 1.7000000000000002,
 1.8000000000000003,
 1.9000000000000004,
 2.0]

In [30]:
def get_sample_action():
    return random.choice(actions)

Inicilización de la tabla Q

In [31]:
Q = np.zeros((len(x_space) + 1, len(y_space) + 1, len(vel_space) + 1, len(actions)))
Q

array([[[[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
    

Obtención de la acción a partir de la tabla Q

In [32]:
def optimal_policy(state, Q):
    action = actions[np.argmax(Q[state])]
    return action

Epsilon-Greedy Policy

In [33]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    explore = np.random.binomial(1, epsilon)
    if explore:
        action = get_sample_action()
    # exploit
    else:
        action = optimal_policy(state, Q)
        
    return action

Ejemplo de episodio 

In [54]:
obs,_ = env.reset()
print(obs)
done = False
total_reward = 0
state = get_state(obs)
while not done:
    # Acción del modelo
    action = epsilon_greedy_policy(state, Q, 0.5)
    
    # Indice de la accion en Q
    action_idx = actions.index(action)
    
    # Acción del ambiente
    real_action = np.array([action_idx])
     
    obs, reward, done, _, _ = env.step(real_action)
    next_state = get_state(obs)
    
   # Usar action_idx para actualizar Q
   
    total_reward += reward
    print('->', state, action, reward, obs, done)
    env.render()
print('total_reward', total_reward)

[-0.20442708 -0.9788818   0.53905565]
-> (8, 1, 35) 1.2000000000000002 -3.189631157658911 [-0.19929034 -0.9799405   0.1048943 ] False
-> (8, 1, 35) -2.0 -3.139064543089353 [-0.23005746 -0.9731771  -0.6300611 ] False
-> (8, 1, 35) -2.0 -3.290265307904992 [-0.29564816 -0.9552969  -1.3599439 ] False
-> (8, 1, 35) -0.5 -3.6893247330808867 [-0.37922144 -0.92530596 -1.7764165 ] False
-> (8, 1, 35) -2.0 -4.156189855997651 [-0.4903354 -0.8715338 -2.470396 ] False
-> (8, 1, 35) -2.0 -4.950303183716236 [-0.6199483  -0.78464264 -3.1240463 ] False
-> (8, 1, 35) 0.40000000000000036 -5.995206754078639 [-0.7441779  -0.66798145 -3.4125283 ] False
-> (8, 1, 35) -2.0 -6.973115092650078 [-0.85985166 -0.510544   -3.9135144 ] False
-> (8, 1, 35) -1.0 -8.325624599611539 [-0.9440823  -0.32970983 -3.9964225 ] False
-> (8, 1, 35) 1.8000000000000003 -9.472510669739426 [-0.99038124 -0.13836561 -3.9437048 ] False
-> (8, 1, 35) -0.5 -10.575978636531056 [-0.998821   0.0485439 -3.747479 ] False
-> (8, 1, 35) 1.5 -10

In [None]:
iterations = 10

In [55]:
alpha = 0.01
alpha

0.01

In [57]:
gamma = 0.9
gamma

0.9

In [63]:
obs,_ = env.reset()
print(obs)
done = False
total_reward = 0
state = get_state(obs)
while not done:
    x, y, vel = state
    
    action = epsilon_greedy_policy(state, Q, 0.5)
    action_idx = actions.index(action)    
    real_action = np.array([action_idx])

    obs, reward, done, _, _ = env.step(real_action)
    next_state = get_state(obs)
    
    value = Q[x][y][vel][real_action] + alpha * (reward + (gamma* (np.max(Q[next_state]))) - Q[x][y][vel][real_action] )
    Q[x][y][vel][real_action] = value
    print(value)

    total_reward += reward
    print('->', state, action, reward, obs, done)
    env.render()
print('total_reward', total_reward)

[ 0.14728566  0.989094   -0.66243833]
[-0.02072734]
-> (12, 20, 30) -0.5999999999999999 -2.072734318092416 [0.12849805 0.99170977 0.37938216] False
[-0.0209359]
-> (12, 20, 30) -2.0 -2.0935898305670975 [0.07263208 0.9973588  1.1231645 ] False
[-0.02371454]
-> (12, 20, 30) -1.9 -2.371454148517622 [-0.02835928  0.9995978   2.0211835 ] False
[-0.02969829]
-> (12, 20, 30) 0.20000000000000018 -2.9698291328087185 [-0.18090561  0.9835005   3.0708818 ] False
[-0.04019001]
-> (12, 20, 30) -1.8 -4.019001176149432 [-0.3777199  0.9259199  4.108507 ] False
[-0.05526252]
-> (12, 20, 30) -1.7 -5.5262518945480625 [-0.5991827   0.80061233  5.102947  ] False
[-0.07506599]
-> (12, 20, 30) 0.40000000000000036 -7.5065987138531405 [-0.80911833  0.58764577  6.0034065 ] False
[-0.09925502]
-> (12, 20, 30) -1.6 -9.925502481677025 [-0.9579749   0.28685194  6.7441406 ] False
[-0.12678569]
-> (12, 20, 30) -1.5 -12.678569356786003 [-0.9974073  -0.07196267  7.2592797 ] False
[-0.1469596]
-> (12, 20, 30) -0.29999999