Esta notebook contiene bloques de código útiles para realizar Q-learning en el entorno "Descent Env"

In [1]:
import numpy as np
from descent_env import DescentEnv
import random 

pygame 2.5.2 (SDL 2.28.3, Python 3.10.8)
Hello from the pygame community. https://www.pygame.org/contribute.html
Using Python-based geo functions


In [3]:
# Switch render_mode to rgb_array for training/testing.
# env = DescentEnv(render_mode='human')
env = DescentEnv()

Reading config from C:\Users\agusp\bluesky\settings.cfg
Reading magnetic variation data
Loading global navigation database...
Reading cache: C:\Users\agusp\bluesky\cache\navdata.p
Successfully loaded OpenAP performance model
Failed to load BADA performance model
Successfully loaded legacy performance model
Successfully loaded plugin AREA
Successfully loaded plugin DATAFEED


Observation Space

In [4]:
# Display the environment's observation space (the cell's last expression is rendered).
env.observation_space

Dict('altitude': Box(-inf, inf, (1,), float64), 'runway_distance': Box(-inf, inf, (1,), float64), 'target_altitude': Box(-inf, inf, (1,), float64), 'vz': Box(-inf, inf, (1,), float64))

Action Space

In [5]:
# Display the environment's action space (the cell's last expression is rendered).
env.action_space

Box(-1.0, 1.0, (1,), float64)

Discretización de los estados

**Nota:** es importante que chequeen el espacio de observación y el espacio de acción del entorno. Los números usados son ejemplos y pueden no ser correctos

**Discretización actualizada**

In [14]:
# Reference constants; they are not used by the bin definitions below.
# NOTE(review): presumably these mirror the scaling the environment applies to
# its observations -- confirm against descent_env.
ALT_MIN = 2000
ALT_MAX = 4000
ALT_MEAN = 1500
ALT_STD = 3000
VZ_MEAN = 0
VZ_STD = 5
RWY_DIS_MEAN = 100
RWY_DIS_STD = 200

# Bin edges for each observation component. The number of edges per component
# fixes the size of the corresponding Q-table axis.
altitude_space = np.linspace(0, 1, 30)
# NOTE(review): the reset observation printed in this notebook has vz ~= 0.28,
# so [-10, 10] may be far wider than the values actually seen -- verify.
vertical_velocity_space = np.linspace(-10, 10, 30)
target_altitude_space = np.linspace(0, 1, 30)
# The sample episode in this notebook recorded runway_distance between about
# -0.52 and 0.48. With a lower edge of 0 every negative distance collapsed into
# bin 0, so the edges now start slightly below the observed minimum.
runway_distance_space = np.linspace(-0.55, 0.5, 10)
altitude_space

array([0.        , 0.03448276, 0.06896552, 0.10344828, 0.13793103,
       0.17241379, 0.20689655, 0.24137931, 0.27586207, 0.31034483,
       0.34482759, 0.37931034, 0.4137931 , 0.44827586, 0.48275862,
       0.51724138, 0.55172414, 0.5862069 , 0.62068966, 0.65517241,
       0.68965517, 0.72413793, 0.75862069, 0.79310345, 0.82758621,
       0.86206897, 0.89655172, 0.93103448, 0.96551724, 1.        ])

Obtener el estado a partir de la observación

In [7]:
def get_state(obs):
    """Convert an observation dict into a tuple of discrete bin indices.

    Args:
        obs: mapping with keys 'altitude', 'vz', 'target_altitude' and
            'runway_distance'; element 0 of each value is read.

    Returns:
        Tuple (alt_idx, vz_idx, target_alt_idx, runway_dist_idx). Each index is
        clipped to the valid range of its module-level bin array, so values
        outside the edges land in the first or last bin.
    """
    def _bin_index(value, edges):
        # digitize is 1-based for values at/above the first edge; shift to
        # 0-based and clamp anything outside the edges.
        return np.clip(np.digitize(value, edges) - 1, 0, len(edges) - 1)

    components = (
        ('altitude', altitude_space),
        ('vz', vertical_velocity_space),
        ('target_altitude', target_altitude_space),
        ('runway_distance', runway_distance_space),
    )
    return tuple(_bin_index(obs[key][0], edges) for key, edges in components)

In [8]:
# Draw a random observation from the observation space, print it, and show
# the discrete state that get_state produces for it.
obs = env.observation_space.sample()
print(obs)
state = get_state(obs) # Example observation
state

OrderedDict([('altitude', array([0.34711037])), ('runway_distance', array([0.99038471])), ('target_altitude', array([0.56185805])), ('vz', array([-1.23348683]))])


(0, 0, 0, 4)

Discretización de las acciones

In [9]:
# Discrete action set: 10 evenly spaced values covering [-1, 1], kept as a
# plain list so that positions can be looked up with actions.index(...).
actions = [value for value in np.linspace(-1, 1, 10)]
actions

[-1.0,
 -0.7777777777777778,
 -0.5555555555555556,
 -0.33333333333333337,
 -0.11111111111111116,
 0.11111111111111116,
 0.33333333333333326,
 0.5555555555555554,
 0.7777777777777777,
 1.0]

In [10]:
def get_sample_action():
    """Return one element of the module-level `actions` list chosen at random."""
    sampled_action = random.choice(actions)
    return sampled_action

Inicialización de la tabla Q

In [18]:
# Zero-initialised Q-table: one axis per discretised observation component
# (in the order returned by get_state) followed by one axis for the actions.
Q = np.zeros(tuple(
    len(axis)
    for axis in (
        altitude_space,
        vertical_velocity_space,
        target_altitude_space,
        runway_distance_space,
        actions,
    )
))
Q.shape

(30, 30, 30, 10, 10)

Obtención de la acción a partir de la tabla Q

In [12]:
def optimal_policy(state, Q):
    """Return the greedy action for `state`.

    Args:
        state: index (tuple) selecting the row of action values in `Q`.
        Q: Q-table whose last axis is indexed by position in `actions`.

    Returns:
        The element of `actions` at the position of the largest Q value;
        np.argmax picks the first position when several values tie.
    """
    action_values = Q[state]
    best_idx = np.argmax(action_values)
    return actions[best_idx]

Epsilon-Greedy Policy

In [112]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    """Choose an action with an epsilon-greedy rule.

    Args:
        state: discrete state index passed on to optimal_policy.
        Q: Q-table passed on to optimal_policy.
        epsilon: probability of taking a random action instead of the greedy one.

    Returns:
        A random action from get_sample_action() with probability `epsilon`,
        otherwise the greedy action from optimal_policy(state, Q).
    """
    # A single Bernoulli(epsilon) draw decides between exploring and exploiting.
    if np.random.binomial(1, epsilon):
        return get_sample_action()
    return optimal_policy(state, Q)

Ejemplo de episodio 

In [135]:
# Roll out 10000 environment steps with an epsilon-greedy policy (epsilon=0.5),
# resetting whenever an episode ends. Tracks the accumulated reward across all
# episodes and the range of runway_distance values observed.
obs, _ = env.reset()
print(obs)
done = False
total_reward = 0
state = get_state(obs)
steps = 0

min_runway_distance = float('inf')
max_runway_distance = float('-inf')

for _step in range(10000):
    # Action chosen by the current Q-table (random half of the time).
    action = epsilon_greedy_policy(state, Q, 0.5)
    # env.step returns five values; the episode is over when it is either
    # terminated or truncated, so both flags are checked.
    obs, reward, terminated, truncated, _info = env.step(np.array([action]))
    done = terminated or truncated

    # Track the smallest and largest runway_distance seen so far.
    runway_distance = obs['runway_distance'][0]
    min_runway_distance = min(min_runway_distance, runway_distance)
    max_runway_distance = max(max_runway_distance, runway_distance)

    total_reward += reward
    steps += 1
    if done:
        obs, _ = env.reset()
        done = False
    state = get_state(obs)

# The environment is intentionally left open: the training cell further down
# keeps using this same `env` instance.
print('total_reward', total_reward)
print('steps', steps)
print('min_runway_distance:', min_runway_distance)
print('max_runway_distance:', max_runway_distance)


{'altitude': array([0.28284088]), 'vz': array([0.27777778]), 'target_altitude': array([0.38833333]), 'runway_distance': array([0.37619696])}
total_reward -82423.0027900281
steps 10000
min_runway_distance: -0.5212062905283475
max_runway_distance: 0.47556561005149406


# Q-learning


In [20]:
# Tabular Q-learning hyper-parameters.
ALPHA = 0.9        # learning rate: step size of each Q update
GAMMA = 0.9        # discount factor applied to the bootstrapped value
N_EPISODES = 4000  # number of training episodes

# Not read by this cell; kept in case other cells reference them.
total_reward = 0
max_steps = 1

rewards = []


def get_explore_prob(i):
    """Return the exploration probability (epsilon) for episode index `i`.

    Schedule: 0.9 for episodes 0-1000, 0.2 for 1001-2000, 0.9 for 2001-3000,
    0.2 for 3001-4000, and 0.1 for any other index.
    """
    if 0 <= i <= 1000:
        return 0.9
    elif 1001 <= i <= 2000:
        return 0.2
    elif 2001 <= i <= 3000:
        return 0.9
    elif 3001 <= i <= 4000:
        return 0.2
    else:
        return 0.1  # default


for i in range(N_EPISODES):
    obs, _ = env.reset()
    state = get_state(obs)
    prob = get_explore_prob(i)  # constant within an episode
    episode_reward = 0
    done = False
    while not done:
        action = epsilon_greedy_policy(state, Q, prob)
        action_idx = actions.index(action)
        next_obs, reward, terminated, truncated, _ = env.step(np.array([action]))
        next_state = get_state(next_obs)
        # Stop on truncation as well as termination so the loop always ends.
        done = terminated or truncated

        # A terminal state has no future return, so do not bootstrap from it.
        # On truncation the episode was merely cut short, so bootstrapping stays.
        future_value = 0.0 if terminated else np.max(Q[next_state])
        td_target = reward + GAMMA * future_value
        q_index = state + (action_idx,)
        Q[q_index] += ALPHA * (td_target - Q[q_index])

        obs = next_obs
        state = next_state
        episode_reward += reward
    rewards.append(episode_reward)
    print(f"Episodio {i+1}: reward = {episode_reward}, exploración = {prob}")
    if (i + 1) % 100 == 0:
        # Running average over the last 100 episodes.
        mean_reward = np.mean(rewards[-100:])
        print(f"Media de reward en episodios {i-98} a {i+1}: {mean_reward}")

Episodio 1: reward = -89.16632848333334, exploración = 0.9
Episodio 2: reward = -100.519939815, exploración = 0.9
Episodio 3: reward = -203.70997538, exploración = 0.9
Episodio 4: reward = -217.73698022000002, exploración = 0.9
Episodio 5: reward = -87.87749048333335, exploración = 0.9
Episodio 6: reward = -70.47574244666667, exploración = 0.9
Episodio 7: reward = -103.54508782166667, exploración = 0.9
Episodio 8: reward = -121.14217666666667, exploración = 0.9
Episodio 9: reward = -79.280641, exploración = 0.9
Episodio 10: reward = -62.34428961166667, exploración = 0.9
Episodio 11: reward = -142.98870174166666, exploración = 0.9
Episodio 12: reward = -75.49618119666667, exploración = 0.9
Episodio 13: reward = -93.066142765, exploración = 0.9
Episodio 14: reward = -66.78909018, exploración = 0.9
Episodio 15: reward = -76.55805046166667, exploración = 0.9
Episodio 16: reward = -83.22737474166667, exploración = 0.9
Episodio 17: reward = -118.28935846499999, exploración = 0.9
Episodio 18:

KeyboardInterrupt: 

In [145]:
# Run one episode with the greedy policy from the current Q-table while
# rendering, then report the accumulated reward and the number of steps.
env = DescentEnv(render_mode='human')
total_reward = 0
steps = 0

try:
    obs, _ = env.reset()
    done = False
    while not done:
        state = get_state(obs)
        action = optimal_policy(state, Q)
        obs, reward, terminated, truncated, _ = env.step(np.array([action]))
        # The episode ends on termination or truncation; checking only the
        # first flag could loop forever on a truncated episode.
        done = terminated or truncated
        total_reward += reward
        steps += 1
        env.render()
finally:
    # Always release the rendering environment, even if a step fails.
    env.close()

print(f"Total reward (Q final): {total_reward}")
print(f"Steps: {steps}")

Reading config from C:\Users\npere\bluesky\settings.cfg
Reading magnetic variation data
Loading global navigation database...
Reading cache: C:\Users\npere\bluesky\cache\navdata.p
Attempt to reimplement AREA from <bound method Area.set_area of <bluesky.plugins.area.Area object at 0x0000028E21095E40>> to <bound method Area.set_area of <bluesky.plugins.area.Area object at 0x0000028E21095E40>>
Attempt to reimplement EXP from <function init_plugin.<locals>.<lambda> at 0x0000028E21005630> to <function init_plugin.<locals>.<lambda> at 0x0000028E45E94820>
Attempt to reimplement TAXI from <bound method Area.set_taxi of <bluesky.plugins.area.Area object at 0x0000028E21095E40>> to <bound method Area.set_taxi of <bluesky.plugins.area.Area object at 0x0000028E21095E40>>
Successfully loaded plugin AREA
Attempt to reimplement DATAFEED from <bound method Modesbeast.toggle of <bluesky.plugins.adsbfeed.Modesbeast object at 0x0000028E21097790>> to <bound method Modesbeast.toggle of <bluesky.plugins.adsb