Esta notebook contiene bloques de código útiles para realizar Q-learning en el entorno "Descent Env"

In [1]:
import numpy as np
from descent_env import DescentEnv
import random 

pygame 2.5.2 (SDL 2.28.3, Python 3.10.8)
Hello from the pygame community. https://www.pygame.org/contribute.html
Using Python-based geo functions


In [2]:
# Switch render_mode to 'rgb_array' for training/testing
# env = DescentEnv(render_mode='human')
# Environment built with the constructor's default arguments.
env = DescentEnv()

Reading config from C:\Users\npere\bluesky\settings.cfg
Reading magnetic variation data
Loading global navigation database...
Reading cache: C:\Users\npere\bluesky\cache\navdata.p
Successfully loaded OpenAP performance model
Failed to load BADA performance model
Successfully loaded legacy performance model
Successfully loaded plugin AREA
Successfully loaded plugin DATAFEED


Observation Space

In [3]:
env.observation_space

Dict('altitude': Box(-inf, inf, (1,), float64), 'runway_distance': Box(-inf, inf, (1,), float64), 'target_altitude': Box(-inf, inf, (1,), float64), 'vz': Box(-inf, inf, (1,), float64))

Action Space

In [4]:
env.action_space

Box(-1.0, 1.0, (1,), float64)

Discretización de los estados

**Nota:** es importante que chequeen el espacio de observación y el espacio de acción del entorno. Los números usados son ejemplos y pueden no ser correctos

**Discretización actualizada**

In [5]:
# Reference constants. Presumably altitude bounds and normalization statistics;
# none of them is referenced by the other visible cells -- TODO confirm usage.
ALT_MIN, ALT_MAX = 2000, 4000
ALT_MEAN, ALT_STD = 1500, 3000
VZ_MEAN, VZ_STD = 0, 5
RWY_DIS_MEAN, RWY_DIS_STD = 100, 200

# Bin grids used to discretize each observation component.
# The number of points in each grid fixes the size of the matching Q-table axis.
altitude_space = np.linspace(0, 1, num=20)
vertical_velocity_space = np.linspace(-10, 10, num=20)
target_altitude_space = np.linspace(0, 1, num=20)
runway_distance_space = np.linspace(0, 0.5, num=10)
altitude_space

array([0.        , 0.05263158, 0.10526316, 0.15789474, 0.21052632,
       0.26315789, 0.31578947, 0.36842105, 0.42105263, 0.47368421,
       0.52631579, 0.57894737, 0.63157895, 0.68421053, 0.73684211,
       0.78947368, 0.84210526, 0.89473684, 0.94736842, 1.        ])

Obtener el estado a partir de la observación

In [6]:
def get_state(obs):
    """Convert a raw observation into a tuple of discrete bin indices.

    For each component, element 0 of the observation array is located in its
    grid with ``np.digitize``, shifted to a zero-based index and clipped into
    ``[0, len(grid) - 1]`` so out-of-range values fall into the edge bins.

    Args:
        obs: Mapping with the keys 'altitude', 'vz', 'target_altitude' and
            'runway_distance'; each value must be indexable, and its first
            element is the scalar that gets discretized.

    Returns:
        Tuple ``(alt_idx, vz_idx, target_alt_idx, runway_dist_idx)``.
    """
    # Order matters: it defines the axis order used to index the Q-table.
    components = (
        ('altitude', altitude_space),
        ('vz', vertical_velocity_space),
        ('target_altitude', target_altitude_space),
        ('runway_distance', runway_distance_space),
    )
    return tuple(
        np.clip(np.digitize(obs[key][0], grid) - 1, 0, len(grid) - 1)
        for key, grid in components
    )

In [7]:
# Draw a random observation from the observation space and discretize it.
obs = env.observation_space.sample()
print(obs)
state = get_state(obs) # Example obs
state

OrderedDict([('altitude', array([0.98839454])), ('runway_distance', array([-0.5512305])), ('target_altitude', array([1.95661496])), ('vz', array([0.18128256]))])


(18, 9, 19, 0)

Discretización de las acciones

In [8]:
# Discrete action set: 10 evenly spaced values covering [-1, 1].
# Kept as a Python list so that `actions.index(...)` is available.
actions = [value for value in np.linspace(-1, 1, num=10)]
actions

[-1.0,
 -0.7777777777777778,
 -0.5555555555555556,
 -0.33333333333333337,
 -0.11111111111111116,
 0.11111111111111116,
 0.33333333333333326,
 0.5555555555555554,
 0.7777777777777777,
 1.0]

In [9]:
def get_sample_action():
    """Return one element of the module-level ``actions`` list picked by ``random.choice``."""
    sampled = random.choice(actions)
    return sampled

Inicialización de la tabla Q

In [10]:
# Zero-initialised Q-table: one axis per discretized state component
# (same order as the tuple returned by get_state), plus a final action axis.
Q = np.zeros(tuple(
    len(axis)
    for axis in (
        altitude_space,
        vertical_velocity_space,
        target_altitude_space,
        runway_distance_space,
        actions,
    )
))
Q.shape

(20, 20, 20, 10, 10)

Obtención de la acción a partir de la tabla Q

In [11]:
def optimal_policy(state, Q):
    """Return the greedy action for ``state`` according to the table ``Q``.

    Args:
        state: Index (tuple of bin indices) selecting a row of action values
            in ``Q``.
        Q: Array whose last axis is aligned with the module-level ``actions``
            list.

    Returns:
        The element of ``actions`` at the position of the largest value in
        ``Q[state]``; ``np.argmax`` returns the first such position on ties.
    """
    action_values = Q[state]
    best_idx = np.argmax(action_values)
    return actions[best_idx]

Epsilon-Greedy Policy

In [12]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    """Choose an action with an epsilon-greedy rule.

    Args:
        state: Discrete state index passed on to ``optimal_policy``.
        Q: Q-table passed on to ``optimal_policy``.
        epsilon: Probability of taking a random action instead of the greedy
            one.

    Returns:
        A random action from ``get_sample_action()`` when the Bernoulli draw
        succeeds, otherwise the greedy action from ``optimal_policy``.
    """
    # A single Bernoulli(epsilon) draw decides between exploring and exploiting.
    if np.random.binomial(1, epsilon):
        return get_sample_action()
    return optimal_policy(state, Q)

Ejemplo de episodio 

In [13]:
# Example rollout: take a single epsilon-greedy step and report the reward
# together with the range of 'runway_distance' values that were observed.
obs, _ = env.reset()
print(obs)
done = False
total_reward = 0
state = get_state(obs)
steps = 0

# Running extremes of the observed runway distance, useful for checking the
# bounds chosen for the runway-distance discretization grid.
min_runway_distance = float('inf')
max_runway_distance = float('-inf')

for _step in range(1):
    # Model action
    action = epsilon_greedy_policy(state, Q, 0.5)
    real_action = np.array([action])
    # step() returns (obs, reward, terminated, truncated, info); an episode is
    # over when either flag is set, so both must be taken into account.
    obs, reward, terminated, truncated, _ = env.step(real_action)
    done = terminated or truncated
    next_state = get_state(obs)

    # Store the min and max runway_distance
    runway_distance = obs['runway_distance'][0]
    min_runway_distance = min(min_runway_distance, runway_distance)
    max_runway_distance = max(max_runway_distance, runway_distance)

    state = next_state
    total_reward += reward
    steps += 1
    if done:
        # Start a fresh episode so the loop can keep stepping.
        obs, _ = env.reset()
        state = get_state(obs)
        done = False

# NOTE(review): `env` is closed here, yet the training cell further down calls
# env.reset() on this same object -- confirm that reusing it after close() is safe.
env.close()
print('total_reward', total_reward)
print('steps', steps)
print('min_runway_distance:', min_runway_distance)
print('max_runway_distance:', max_runway_distance)


{'altitude': array([0.28333333]), 'vz': array([0.]), 'target_altitude': array([0.28766667]), 'runway_distance': array([0.5])}
total_reward -0.5281066666666667
steps 1
min_runway_distance: 0.4747557822672158
max_runway_distance: 0.4747557822672158


# Q-learning


In [None]:
# Training hyper-parameters.
N_EPISODES = 5000
ALPHA = 0.9               # learning rate of the Q-learning update
GAMMA = 0.9               # discount factor applied to the next-state value
EXPLORE_BLOCK = 500       # number of episodes per exploration phase
HIGH_EXPLORE_PROB = 0.7
LOW_EXPLORE_PROB = 0.3

total_reward = 0
rewards = []  # total reward of every finished episode
max_steps = 1  # not used by the training loop below


def get_explore_prob(i):
    """Return the exploration probability for episode index ``i``.

    The schedule alternates in blocks of ``EXPLORE_BLOCK`` episodes:
    episodes 0..500 use ``HIGH_EXPLORE_PROB``, 501..1000 use
    ``LOW_EXPLORE_PROB``, 1001..1500 use ``HIGH_EXPLORE_PROB`` again, and so
    on. The same alternation continues for any larger ``i``, so the function
    never returns ``None``.

    Args:
        i: Zero-based episode index.

    Returns:
        Probability of picking a random action during that episode.
    """
    # Shifting by one keeps the block boundaries at 500/501, 1000/1001, ...
    block = max(i - 1, 0) // EXPLORE_BLOCK
    return HIGH_EXPLORE_PROB if block % 2 == 0 else LOW_EXPLORE_PROB


for i in range(N_EPISODES):
    obs, _ = env.reset()
    done = False
    episode_reward = 0
    # The schedule depends only on the episode index: compute it once per episode.
    prob = get_explore_prob(i)
    while not done:
        state = get_state(obs)
        if random.uniform(0, 1) < prob:
            action = get_sample_action()
        else:
            action = optimal_policy(state, Q)
        # step() returns (obs, reward, terminated, truncated, info); the
        # episode ends when either flag is set.
        next_obs, reward, terminated, truncated, _ = env.step(np.array([action]))
        done = terminated or truncated
        next_state = get_state(next_obs)
        action_idx = actions.index(action)
        # A terminal state has no future return, so do not bootstrap from it.
        # A merely truncated episode still bootstraps from the next state.
        future_value = 0.0 if terminated else np.max(Q[next_state])
        td_target = reward + GAMMA * future_value
        q_index = state + (action_idx,)
        Q[q_index] += ALPHA * (td_target - Q[q_index])
        obs = next_obs
        episode_reward += reward
    rewards.append(episode_reward)
    # Report the mean reward of the last 100 episodes.
    if (i + 1) % 100 == 0:
        mean_reward = np.mean(rewards[-100:])
        print(f"Media de reward en episodios {i-98} a {i+1}: {mean_reward} con exploración {prob}")

Media de reward en episodios 1 a 100: -140.51660472803334 con exploración 0.7
Media de reward en episodios 101 a 200: -127.92853751565002 con exploración 0.7
Media de reward en episodios 201 a 300: -119.98375159543333 con exploración 0.7
Media de reward en episodios 301 a 400: -108.68940492473334 con exploración 0.7
Media de reward en episodios 401 a 500: -107.22313709083335 con exploración 0.7
Media de reward en episodios 501 a 600: -117.8310943839 con exploración 0.3
Media de reward en episodios 601 a 700: -107.22835869780002 con exploración 0.3
Media de reward en episodios 701 a 800: -107.66582846231668 con exploración 0.3
Media de reward en episodios 801 a 900: -98.61195811773332 con exploración 0.3
Media de reward en episodios 901 a 1000: -95.30076135106665 con exploración 0.3
Media de reward en episodios 1001 a 1100: -95.8419418746 con exploración 0.7
Media de reward en episodios 1101 a 1200: -97.34409445776666 con exploración 0.7
Media de reward en episodios 1201 a 1300: -93.626

In [None]:
# Roll out one episode with the greedy policy derived from the learned
# Q-table, rendering every step.
env = DescentEnv(render_mode='human')
total_reward = 0
steps = 0

try:
    obs, _ = env.reset()
    done = False
    while not done:
        state = get_state(obs)
        action = optimal_policy(state, Q)
        # step() returns (obs, reward, terminated, truncated, info); stop on
        # either flag so a truncated episode cannot loop forever.
        obs, reward, terminated, truncated, _ = env.step(np.array([action]))
        done = terminated or truncated
        total_reward += reward
        steps += 1
        env.render()
finally:
    # Always release the environment, even if the rollout raises.
    env.close()

print(f"Total reward (Q final): {total_reward}")
print(f"Steps: {steps}")

Reading config from C:\Users\npere\bluesky\settings.cfg
Reading magnetic variation data
Loading global navigation database...
Reading cache: C:\Users\npere\bluesky\cache\navdata.p
Attempt to reimplement AREA from <bound method Area.set_area of <bluesky.plugins.area.Area object at 0x00000278435B1A50>> to <bound method Area.set_area of <bluesky.plugins.area.Area object at 0x00000278435B1A50>>
Attempt to reimplement EXP from <function init_plugin.<locals>.<lambda> at 0x000002781D7D3EB0> to <function init_plugin.<locals>.<lambda> at 0x000002784355FE20>
Attempt to reimplement TAXI from <bound method Area.set_taxi of <bluesky.plugins.area.Area object at 0x00000278435B1A50>> to <bound method Area.set_taxi of <bluesky.plugins.area.Area object at 0x00000278435B1A50>>
Successfully loaded plugin AREA
Attempt to reimplement DATAFEED from <bound method Modesbeast.toggle of <bluesky.plugins.adsbfeed.Modesbeast object at 0x00000278435B1AB0>> to <bound method Modesbeast.toggle of <bluesky.plugins.adsb