Esta notebook contiene bloques de código útiles para realizar Q-learning en el entorno "Descent Env"

In [4]:
import numpy as np
from descent_env import DescentEnv
import random 

In [3]:
# Switch render_mode to 'rgb_array' for training/testing
env = DescentEnv(render_mode='human')

Reading config from C:\Users\npere\bluesky\settings.cfg
Reading magnetic variation data
Loading global navigation database...
Reading cache: C:\Users\npere\bluesky\cache\navdata.p
Successfully loaded OpenAP performance model
Failed to load BADA performance model
Successfully loaded legacy performance model
Successfully loaded plugin AREA
Successfully loaded plugin DATAFEED


Observation Space

In [3]:
# Display the environment's observation space (keys and bounds of each component).
env.observation_space

Dict('altitude': Box(-inf, inf, (1,), float64), 'runway_distance': Box(-inf, inf, (1,), float64), 'target_altitude': Box(-inf, inf, (1,), float64), 'vz': Box(-inf, inf, (1,), float64))

Action Space

In [4]:
# Display the environment's action space.
env.action_space

Box(-1.0, 1.0, (1,), float64)

In [None]:
# Reset the environment and print the 'runway_distance' entry of the first observation.
obs, _ = env.reset()
print(obs['runway_distance'])

[0.5]


: 

Discretización de los estados

**Nota:** es importante que chequeen el espacio de observación y el espacio de acción del entorno. Los números usados son ejemplos y pueden no ser correctos

**Discretización actualizada**

In [None]:
# Raw altitude range and the constants used to normalize it.
# NOTE(review): these are example values; confirm them against the environment.
ALT_MIN, ALT_MAX = 2000, 4000
ALT_MEAN, ALT_STD = 1500, 3000
# The four constants below are not referenced by any grid in this cell.
VZ_MEAN, VZ_STD = 0, 5
RWY_DIS_MEAN, RWY_DIS_STD = 100, 200

# Number of grid points used for every observation component.
N_BINS = 100

# Altitude bounds in normalized units: (value - mean) / std.
alt_low = (ALT_MIN - ALT_MEAN) / ALT_STD
alt_high = (ALT_MAX - ALT_MEAN) / ALT_STD

altitude_space = np.linspace(alt_low, alt_high, N_BINS)
vertical_velocity_space = np.linspace(-10, 10, N_BINS)
target_altitude_space = np.linspace(alt_low, alt_high, N_BINS)
runway_distance_space = np.linspace(-2, 2, N_BINS)
altitude_space

array([0.16666667, 0.17340067, 0.18013468, 0.18686869, 0.19360269,
       0.2003367 , 0.20707071, 0.21380471, 0.22053872, 0.22727273,
       0.23400673, 0.24074074, 0.24747475, 0.25420875, 0.26094276,
       0.26767677, 0.27441077, 0.28114478, 0.28787879, 0.29461279,
       0.3013468 , 0.30808081, 0.31481481, 0.32154882, 0.32828283,
       0.33501684, 0.34175084, 0.34848485, 0.35521886, 0.36195286,
       0.36868687, 0.37542088, 0.38215488, 0.38888889, 0.3956229 ,
       0.4023569 , 0.40909091, 0.41582492, 0.42255892, 0.42929293,
       0.43602694, 0.44276094, 0.44949495, 0.45622896, 0.46296296,
       0.46969697, 0.47643098, 0.48316498, 0.48989899, 0.496633  ,
       0.503367  , 0.51010101, 0.51683502, 0.52356902, 0.53030303,
       0.53703704, 0.54377104, 0.55050505, 0.55723906, 0.56397306,
       0.57070707, 0.57744108, 0.58417508, 0.59090909, 0.5976431 ,
       0.6043771 , 0.61111111, 0.61784512, 0.62457912, 0.63131313,
       0.63804714, 0.64478114, 0.65151515, 0.65824916, 0.66498

Obtener el estado a partir de la observación

In [38]:
def get_state(obs):
    """Convert a raw observation into a tuple of discrete bin indices.

    Args:
        obs: Mapping with the keys 'altitude', 'vz', 'target_altitude' and
            'runway_distance'; the first element of each value is used.

    Returns:
        A 4-tuple ``(alt_idx, vz_idx, target_alt_idx, runway_dist_idx)`` of
        Python ints. Each index lies in ``[0, len(bins) - 1]`` for its grid
        (``altitude_space``, ``vertical_velocity_space``,
        ``target_altitude_space``, ``runway_distance_space``), so the tuple
        can index the leading axes of the Q-table directly.
    """
    def _bin_index(value, bins):
        # np.digitize returns 0 for values below bins[0]; after the -1 shift
        # that becomes -1, which NumPy indexing would silently wrap to the
        # LAST cell of the axis. Clamp so out-of-range values map to the
        # nearest edge bin instead.
        raw_idx = np.digitize(value, bins) - 1
        return int(np.clip(raw_idx, 0, len(bins) - 1))

    alt_idx = _bin_index(obs['altitude'][0], altitude_space)
    vz_idx = _bin_index(obs['vz'][0], vertical_velocity_space)
    target_alt_idx = _bin_index(obs['target_altitude'][0], target_altitude_space)
    runway_dist_idx = _bin_index(obs['runway_distance'][0], runway_distance_space)
    return alt_idx, vz_idx, target_alt_idx, runway_dist_idx

In [28]:
# Draw a random observation from the observation space and discretize it.
obs = env.observation_space.sample()
print(obs)
state = get_state(obs) # Example obs
state

OrderedDict([('altitude', array([0.22241416])), ('runway_distance', array([1.65613662])), ('target_altitude', array([-0.03552804])), ('vz', array([-2.21991442]))])


(9, 39, 0, 91)

Discretización de las acciones

In [8]:
# Number of discrete actions, evenly spaced over [-1, 1].
N_ACTIONS = 10
actions = [value for value in np.linspace(-1, 1, N_ACTIONS)]
actions

[-1.0,
 -0.7777777777777778,
 -0.5555555555555556,
 -0.33333333333333337,
 -0.11111111111111116,
 0.11111111111111116,
 0.33333333333333326,
 0.5555555555555554,
 0.7777777777777777,
 1.0]

In [29]:
def get_sample_action():
    """Return one element of the module-level ``actions`` list, picked at random."""
    sampled = random.choice(actions)
    return sampled

Inicialización de la tabla Q

In [31]:
# One axis per discretized observation component (in the order returned by
# get_state), followed by one axis over the discrete actions.
state_axes = (
    altitude_space,
    vertical_velocity_space,
    target_altitude_space,
    runway_distance_space,
)
q_shape = tuple(len(axis) for axis in state_axes) + (len(actions),)

# NOTE(review): with 100 points per component and 10 actions this is 10**9
# float64 cells (about 8 GB); consider coarser grids if memory is a concern.
Q = np.zeros(q_shape)
Q.shape

(100, 100, 100, 100, 10)

Obtención de la acción a partir de la tabla Q

In [32]:
def optimal_policy(state, Q):
    """Return the greedy action for ``state``.

    Args:
        state: Tuple of discrete indices used to index the leading axes of ``Q``.
        Q: Q-table whose last axis runs over the module-level ``actions`` list.

    Returns:
        The entry of ``actions`` with the highest Q-value in ``state``
        (the first such entry when several tie, per ``np.argmax``).
    """
    best_idx = np.argmax(Q[state])
    return actions[best_idx]

Epsilon-Greedy Policy

In [13]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    """Choose an action using an epsilon-greedy rule.

    Args:
        state: Tuple of discrete state indices.
        Q: Q-table passed through to ``optimal_policy``.
        epsilon: Probability of taking a random action instead of the greedy one.

    Returns:
        A random action from ``get_sample_action()`` with probability
        ``epsilon``, otherwise ``optimal_policy(state, Q)``.
    """
    # A single Bernoulli(epsilon) draw decides between exploring and exploiting.
    if np.random.binomial(1, epsilon):
        return get_sample_action()
    return optimal_policy(state, Q)

Ejemplo de episodio 

In [None]:
# Run one demonstration episode with an epsilon-greedy policy (epsilon = 0.5).
obs, _ = env.reset()
print(obs)
done = False
total_reward = 0
state = get_state(obs)
steps = 0
while not done:
    steps += 1
    # Action chosen by the model
    action = epsilon_greedy_policy(state, Q, 0.5)

    # Index of the action along the last axis of Q
    action_idx = actions.index(action)

    # Action in the array form passed to the environment
    real_action = np.array([action])

    # step() returns five values. Assuming the Gymnasium convention
    # (obs, reward, terminated, truncated, info) -- confirm against DescentEnv --
    # the episode is over when EITHER flag is set; looking only at the third
    # value would keep looping after a truncation.
    obs, reward, terminated, truncated, _ = env.step(real_action)
    done = terminated or truncated
    next_state = get_state(obs)

    # Use action_idx to update Q
    # Q[state][action_idx] = ... # To be completed

    # Advance to the next state
    state = next_state

    total_reward += reward

    env.render()

# NOTE(review): the environment is closed here; cells that use `env` afterwards
# may need to create it again -- confirm how DescentEnv behaves after close().
env.close()
print('total_reward', total_reward)
print('steps', steps)

# Q-learning


In [None]:
# Tabular Q-learning over a fixed budget of environment steps
# (episodes are restarted as needed inside the loop).
ALPHA = 0.9  # learning rate applied to the TD error
GAMMA = 0.9  # discount factor applied to the next state's best Q-value

i = 0  # not used in this cell; kept in case another cell reads it
total_reward = 0
step_reward = 0
step_count = 0
max_steps = 1000

obs, _ = env.reset()
done = False

while step_count < max_steps:
    p = random.uniform(0, 1)
    # Exploration probability decays linearly from 1.0 and is floored at 0.1
    exploration_threshold = max(0.1, 1.0 - (step_count / max_steps))
    state = get_state(obs)
    if p < exploration_threshold:
        action = get_sample_action()
    else:
        action = optimal_policy(state, Q)

    # Assuming the Gymnasium 5-tuple (obs, reward, terminated, truncated, info)
    # -- confirm against DescentEnv. Both flags end the episode.
    next_obs, reward, terminated, truncated, _ = env.step(np.array([action]))
    done = terminated or truncated
    next_state = get_state(next_obs)
    action_idx = actions.index(action)

    # A terminated transition has no future return, so do not bootstrap from
    # Q[next_state] in that case. A truncated episode still bootstraps.
    future_value = 0.0 if terminated else np.max(Q[next_state])
    td_target = reward + GAMMA * future_value
    Q[state][action_idx] = Q[state][action_idx] + ALPHA * (td_target - Q[state][action_idx])

    obs = next_obs
    total_reward += reward
    step_reward += reward
    step_count += 1
    if step_count % 10 == 0:
        mean_reward = step_reward / 10
        print(f"Step {step_count}: Mean Reward (last 10 steps): {mean_reward}")
        step_reward = 0
    if step_count % 100 == 0:
        env.render()
    if done:
        # Start a new episode and keep training until the step budget is spent
        obs, _ = env.reset()
        done = False
print(f"Total Reward after {max_steps} steps: {total_reward}")
print("Training complete.")

Step 10: Mean Reward (last 10 steps): -2.9774563533333334
Step 20: Mean Reward (last 10 steps): -13.472223342499998
Step 30: Mean Reward (last 10 steps): -1.1708529288333334
Step 40: Mean Reward (last 10 steps): -1.4093149175000002
Step 50: Mean Reward (last 10 steps): -11.811536489000002
Step 60: Mean Reward (last 10 steps): -0.5766127571666668
Step 70: Mean Reward (last 10 steps): -12.017627941499999
Step 80: Mean Reward (last 10 steps): -13.598227178000002
Step 90: Mean Reward (last 10 steps): -2.3394006890000005
Step 100: Mean Reward (last 10 steps): -12.499439333166668
Total Reward after 100 steps: -718.7269193000004
Training complete.


: 