Esta notebook contiene bloques de código útiles para realizar Q-learning en el entorno "Pendulum"

In [24]:
import numpy as np
from tqdm import tqdm
from pendulum_env_extended import PendulumEnvExtended
import random 

In [25]:
env = PendulumEnvExtended()

In [26]:
max_steps= env.max_steps
max_steps

1000

Discretización de los estados

In [27]:
x_space_partitions = 11
y_space_partitions = 11
vel_space_partitions = 33

In [28]:
x_space = np.linspace(-1, 1, x_space_partitions)
x_space

array([-1. , -0.8, -0.6, -0.4, -0.2,  0. ,  0.2,  0.4,  0.6,  0.8,  1. ])

In [29]:
y_space = np.linspace(-1, 1, y_space_partitions)
y_space

array([-1. , -0.8, -0.6, -0.4, -0.2,  0. ,  0.2,  0.4,  0.6,  0.8,  1. ])

In [30]:

vel_space = np.linspace(-8, 8, vel_space_partitions)
vel_space

array([-8. , -7.5, -7. , -6.5, -6. , -5.5, -5. , -4.5, -4. , -3.5, -3. ,
       -2.5, -2. , -1.5, -1. , -0.5,  0. ,  0.5,  1. ,  1.5,  2. ,  2.5,
        3. ,  3.5,  4. ,  4.5,  5. ,  5.5,  6. ,  6.5,  7. ,  7.5,  8. ])

Obtener el estado a partir de la observación

In [31]:
def get_state(obs):
    x, y, vel = obs
    x_bin = np.digitize(x, x_space)
    y_bin = np.digitize(y, y_space)
    vel_bin = np.digitize(vel, vel_space)
    return x_bin, y_bin, vel_bin

In [32]:
state = get_state(np.array([-0.4, 0.2, 0.3]))
state

(3, 6, 17)

Discretización de las acciones

In [33]:
actions_partitions = 21

In [34]:
actions = list(np.linspace(-2, 2, actions_partitions))
actions

[-2.0,
 -1.8,
 -1.6,
 -1.4,
 -1.2,
 -1.0,
 -0.7999999999999998,
 -0.5999999999999999,
 -0.3999999999999999,
 -0.19999999999999996,
 0.0,
 0.20000000000000018,
 0.40000000000000036,
 0.6000000000000001,
 0.8000000000000003,
 1.0,
 1.2000000000000002,
 1.4000000000000004,
 1.6,
 1.8000000000000003,
 2.0]

In [35]:
def get_sample_action():
    return random.choice(actions)

Inicilización de la tabla Q

In [36]:
Q = np.zeros((len(x_space) + 1, len(y_space) + 1, len(vel_space) + 1, len(actions)))
Q

array([[[[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
    

Obtención de la acción a partir de la tabla Q

In [37]:
def optimal_policy(state, Q):
    action = actions[np.argmax(Q[state])]
    return action

Epsilon-Greedy Policy

In [38]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    explore = np.random.binomial(1, epsilon)
    if explore:
        action = get_sample_action()
    # exploit
    else:
        action = optimal_policy(state, Q)
        
    return action

Ejemplo de episodio 

In [39]:
obs,_ = env.reset()
print(obs)
done = False
total_reward = 0
state = get_state(obs)
while not done:
    # Acción del modelo
    action = epsilon_greedy_policy(state, Q, 0.5)
    
    # Indice de la accion en Q
    action_idx = actions.index(action)
    
    # Acción del ambiente
    real_action = np.array([action_idx])
     
    obs, reward, done, _, _ = env.step(real_action)
    next_state = get_state(obs)
    
   # Usar action_idx para actualizar Q
   
    total_reward += reward
    print('->', state, action, reward, obs, done)
print('total_reward', total_reward)

[-0.04584055 -0.99894875 -0.20420326]
-> (5, 1, 16) -2.0 -2.61773666451505 [-0.09339108 -0.9956295  -0.95341486] False
-> (5, 1, 16) -2.0 -2.86087334586817 [-0.17758729 -0.98410505 -1.700137  ] False
-> (5, 1, 16) -2.0 -3.349203920730886 [-0.29594532 -0.9552049  -2.4382157 ] False
-> (5, 1, 16) -2.0 -4.096033729889532 [-0.44231296 -0.8968608  -3.1546195 ] False
-> (5, 1, 16) -2.0 -5.1118909780832835 [-0.60481954 -0.7963626  -3.827265  ] False
-> (5, 1, 16) -2.0 -6.394685715957852 [-0.7648227 -0.6442408 -4.4245367] False
-> (5, 1, 16) -1.4 -7.9228796652775975 [-0.8917293 -0.4525692 -4.6077175] False
-> (5, 1, 16) -2.0 -9.262413709545935 [-0.9753958  -0.22046107 -4.9471445 ] False
-> (5, 1, 16) -2.0 -10.969768429892227 [-0.99944437  0.03333119 -5.11249   ] False
-> (5, 1, 16) -1.2 -12.279006636767392 [-0.963044    0.26934415 -4.787492  ] False
-> (5, 1, 16) 1.8000000000000003 -10.526484312978784 [-0.8837472   0.46796453 -4.285484  ] False
-> (5, 1, 16) -2.0 -8.883472836618456 [-0.7752335

In [40]:
max_steps = env.max_steps

In [41]:
iterations = 10

In [42]:
alpha = 0.01
alpha

0.01

In [43]:
gamma = 0.9
gamma

0.9

In [44]:
episodes=1000

In [45]:
i=0
for i in tqdm(range(episodes)):
    obs,_ = env.reset()
    done = False
    total_reward = 0    
    total_steps = 0 
    state = get_state(obs)
    while not done:        
        action = epsilon_greedy_policy(state, Q, 0.2)
        action_idx = actions.index(action)    
        real_action = np.array([action_idx])

        obs, reward, done, _, _ = env.step(real_action)

        next_state = get_state(obs)

    
        total_steps += 1 
        total_reward += reward

        value = Q[state][action_idx] + alpha * (reward + (gamma* (np.max(Q[next_state]))) - Q[state][action_idx] )
        Q[state][action_idx] = value

        state = next_state
    i+=1

100%|██████████| 1000/1000 [00:41<00:00, 24.25it/s]


In [46]:
total_steps

1000

In [47]:
total_reward

-7052.894079227838

In [48]:
Q

array([[[[  0.        ,   0.        ,   0.        , ...,   0.        ,
            0.        ,   0.        ],
         [  0.        ,   0.        ,   0.        , ...,   0.        ,
            0.        ,   0.        ],
         [  0.        ,   0.        ,   0.        , ...,   0.        ,
            0.        ,   0.        ],
         ...,
         [  0.        ,   0.        ,   0.        , ...,   0.        ,
            0.        ,   0.        ],
         [  0.        ,   0.        ,   0.        , ...,   0.        ,
            0.        ,   0.        ],
         [  0.        ,   0.        ,   0.        , ...,   0.        ,
            0.        ,   0.        ]],

        [[  0.        ,   0.        ,   0.        , ...,   0.        ,
            0.        ,   0.        ],
         [  0.        ,   0.        ,   0.        , ...,   0.        ,
            0.        ,   0.        ],
         [  0.        ,   0.        ,   0.        , ...,   0.        ,
            0.        ,   0.     