Esta notebook contiene bloques de código útiles para realizar Q-learning en el entorno "Pendulum"

In [6]:
import numpy as np
from pendulum_env_extended import PendulumEnvExtended
import random 

In [7]:
env = PendulumEnvExtended(render_mode='rgb_array')

In [8]:
max_steps= env.max_steps
max_steps

700

Discretización de los estados

In [9]:
x_space_partitions = 11
y_space_partitions = 11
vel_space_partitions = 33

In [10]:
x_space = np.linspace(-1, 1, x_space_partitions)
x_space

array([-1. , -0.8, -0.6, -0.4, -0.2,  0. ,  0.2,  0.4,  0.6,  0.8,  1. ])

In [11]:
y_space = np.linspace(-1, 1, y_space_partitions)
y_space

array([-1. , -0.8, -0.6, -0.4, -0.2,  0. ,  0.2,  0.4,  0.6,  0.8,  1. ])

In [12]:
vel_space = np.linspace(-8, 8, vel_space_partitions)
vel_space

array([-8. , -7.5, -7. , -6.5, -6. , -5.5, -5. , -4.5, -4. , -3.5, -3. ,
       -2.5, -2. , -1.5, -1. , -0.5,  0. ,  0.5,  1. ,  1.5,  2. ,  2.5,
        3. ,  3.5,  4. ,  4.5,  5. ,  5.5,  6. ,  6.5,  7. ,  7.5,  8. ])

Obtener el estado a partir de la observación

In [13]:
def get_state(obs):
    x, y, vel = obs
    x_bin = np.digitize(x, x_space)
    y_bin = np.digitize(y, y_space)
    vel_bin = np.digitize(vel, vel_space)
    return x_bin, y_bin, vel_bin

In [14]:
state = get_state(np.array([-0.4, 0.2, 0.3]))
state

(3, 6, 17)

Discretización de las acciones

In [15]:
actions_partitions = 21

In [16]:
actions = list(np.linspace(-2, 2, actions_partitions))
actions

[-2.0,
 -1.8,
 -1.6,
 -1.4,
 -1.2,
 -1.0,
 -0.7999999999999998,
 -0.5999999999999999,
 -0.3999999999999999,
 -0.19999999999999996,
 0.0,
 0.20000000000000018,
 0.40000000000000036,
 0.6000000000000001,
 0.8000000000000003,
 1.0,
 1.2000000000000002,
 1.4000000000000004,
 1.6,
 1.8000000000000003,
 2.0]

In [17]:
def get_sample_action():
    return random.choice(actions)

Inicilización de la tabla Q

In [18]:
Q = np.zeros((len(x_space) + 1, len(y_space) + 1, len(vel_space) + 1, len(actions)))
Q

array([[[[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
    

Obtención de la acción a partir de la tabla Q

In [19]:
def optimal_policy(state, Q):
    action = actions[np.argmax(Q[state])]
    return action

Epsilon-Greedy Policy

In [20]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    explore = np.random.binomial(1, epsilon)
    if explore:
        action = get_sample_action()
    # exploit
    else:
        action = optimal_policy(state, Q)
        
    return action

Ejemplo de episodio 

In [21]:
obs,_ = env.reset()
print(obs)
done = False
total_reward = 0
state = get_state(obs)
while not done:
    # Acción del modelo
    action = epsilon_greedy_policy(state, Q, 0.5)
    
    # Indice de la accion en Q
    action_idx = actions.index(action)
    
    # Acción del ambiente
    real_action = np.array([action])
     
    obs, reward, done, _, _ = env.step(real_action)
    next_state = get_state(obs)
    
   # Usar action_idx para actualizar Q
   
    total_reward += reward
    print('->', state, action, reward, obs, done)
    env.render()
print('total_reward', total_reward)

[ 0.05426924 -0.99852633 -0.9910597 ]
-> (6, 1, 15) 0.20000000000000018 -2.398033435437321 [-0.03119675 -0.99951327 -1.7099545 ] False
-> (6, 1, 15) -2.0 -2.8627924491942123 [-0.16837537 -0.98572296 -2.7595894 ] False
-> (6, 1, 15) 0.8000000000000003 -3.7896954325342196 [-0.3317192  -0.94337815 -3.3788817 ] False
-> (6, 1, 15) 2.0 -4.789666097303909 [-0.5033282  -0.86409533 -3.7864153 ] False
-> (6, 1, 15) -2.0 -5.8403154109372855 [-0.69193834 -0.7219566  -4.7344866 ] False
-> (6, 1, 15) -1.2 -7.695046354523516 [-0.8608655 -0.5088326 -5.455954 ] False
-> (6, 1, 15) -2.0 -9.781179265934762 [-0.97435737 -0.22500603 -6.1375785 ] False
-> (6, 1, 15) -2.0 -12.266133443271542 [-0.99466234  0.10318345 -6.6063333 ] False
-> (6, 1, 15) -2.0 -13.59917628325549 [-0.9026904   0.43029064 -6.8289456 ] False
-> (6, 1, 15) -2.0 -11.940061039994324 [-0.7072991   0.70691437 -6.8062277 ] False
-> (6, 1, 15) -2.0 -10.189408082711248 [-0.44113964  0.89743847 -6.5760417 ] False
-> (6, 1, 15) -2.0 -8.4398558

In [22]:
max_steps = env.max_steps

In [23]:
alpha = 1/(max_steps+1)
alpha

0.0014265335235378032

In [24]:
gamma = 0.9
gamma

0.9

In [35]:
episodes=20000

In [26]:
epsilon = 0.99

In [27]:
from tqdm import tqdm

In [38]:
i=0
below1=0
for i in tqdm(range(episodes)):
    obs,_ = env.reset()
    done = False
    total_reward = 0    
    total_steps = 0 
    state = get_state(obs)
    
    while not done:        
        x, y, vel = state
        action = epsilon_greedy_policy(state, Q, 0.1)
        action_idx = actions.index(action)    
        real_action = np.array([action_idx])

        obs, reward, done, _, _ = env.step(real_action)

        next_state = get_state(obs)
    
        total_steps += 1 
        total_reward += reward
        if(reward>-1):
            below1+=1

        value = Q[state][action_idx] + alpha * (reward + (gamma* (np.max(Q[next_state]))) - Q[state][action_idx] )
        Q[state][action_idx] = value

        state = next_state
    i+=1
    if (i%10==0 & epsilon>0.1):
        epsilon *= 0.999

epsilon

100%|██████████| 20000/20000 [10:55<00:00, 30.53it/s]


0.0024466088958458427

In [40]:
below1/episodes/max_steps*100

3.0177714285714288

In [31]:
total_reward

-5004.44294716926

In [32]:
total_reward/episodes

-0.250222147358463

In [33]:
Q

array([[[[  0.        ,   0.        ,   0.        , ...,   0.        ,
            0.        ,   0.        ],
         [  0.        ,   0.        ,   0.        , ...,   0.        ,
            0.        ,   0.        ],
         [  0.        ,   0.        ,   0.        , ...,   0.        ,
            0.        ,   0.        ],
         ...,
         [  0.        ,   0.        ,   0.        , ...,   0.        ,
            0.        ,   0.        ],
         [  0.        ,   0.        ,   0.        , ...,   0.        ,
            0.        ,   0.        ],
         [  0.        ,   0.        ,   0.        , ...,   0.        ,
            0.        ,   0.        ]],

        [[  0.        ,   0.        ,   0.        , ...,   0.        ,
            0.        ,   0.        ],
         [  0.        ,   0.        ,   0.        , ...,   0.        ,
            0.        ,   0.        ],
         [  0.        ,   0.        ,   0.        , ...,   0.        ,
            0.        ,   0.     