Esta notebook contiene bloques de código útiles para realizar Q-learning en el entorno "Descent Env"

In [None]:
import numpy as np
from descent_env import DescentEnv
import random 

In [None]:
# Cambiar render_mode a rgb_array para entrenar/testear
# env = DescentEnv(render_mode='human')
env = DescentEnv()

Observation Space

In [None]:
env.observation_space

Action Space

In [None]:
env.action_space

Discretización de los estados

**Nota:** es importante que chequeen el espacio de observación y el espacio de acción del entorno. Los números usados son ejemplos y pueden no ser correctos

**Discretizacion actualizada**

In [None]:
ALT_MIN = 2000
ALT_MAX = 4000
ALT_MEAN = 1500
ALT_STD = 3000
VZ_MEAN = 0
VZ_STD = 5
RWY_DIS_MEAN = 100
RWY_DIS_STD = 200
altitude_space = np.linspace(0, 1, 70)
vertical_velocity_space = np.linspace(-10, 10, 70) 
target_altitude_space = np.linspace(0, 1, 70)
runway_distance_space = np.linspace(0, 0.5, 70)
altitude_space

Obtener el estado a partir de la observación

In [None]:
def get_state(obs):
    alt = obs['altitude'][0]
    vz = obs['vz'][0]
    target_alt = obs['target_altitude'][0]
    runway_dist = obs['runway_distance'][0]
    alt_idx = np.clip(np.digitize(alt, altitude_space) - 1, 0, len(altitude_space) - 1)
    vz_idx = np.clip(np.digitize(vz, vertical_velocity_space) - 1, 0, len(vertical_velocity_space) - 1)
    target_alt_idx = np.clip(np.digitize(target_alt, target_altitude_space) - 1, 0, len(target_altitude_space) - 1)
    runway_dist_idx = np.clip(np.digitize(runway_dist, runway_distance_space) - 1, 0, len(runway_distance_space) - 1)
    return alt_idx, vz_idx, target_alt_idx, runway_dist_idx

In [None]:
obs = env.observation_space.sample()
print(obs)
state = get_state(obs) # Ejemplo de obs
state

Discretización de las acciones

In [None]:
actions = list(np.linspace(-1, 1, 30))
actions

In [None]:
def get_sample_action():
    return random.choice(actions)

Inicilización de la tabla Q

In [None]:
Q = np.zeros((len(altitude_space), len(vertical_velocity_space), len(target_altitude_space), len(runway_distance_space), len(actions)))
Q.shape

Obtención de la acción a partir de la tabla Q

In [None]:
def optimal_policy(state, Q):
    action = actions[np.argmax(Q[state])]
    return action

Epsilon-Greedy Policy

In [None]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    explore = np.random.binomial(1, epsilon)
    if explore:
        action = get_sample_action()
    else:
        action = optimal_policy(state, Q)
        
    return action

Ejemplo de episodio 

In [None]:
obs, _ = env.reset()
print(obs)
done = False
total_reward = 0
state = get_state(obs)
steps = 0

min_runway_distance = float('inf')
max_runway_distance = float('-inf')

for _ in range(1):
    # Acción del modelo
    action = epsilon_greedy_policy(state, Q, 0.5)
    action_idx = actions.index(action)
    real_action = np.array([action])
    obs, reward, done, _, _ = env.step(real_action)
    next_state = get_state(obs)
    
    # Guardar min y max runway_distance
    runway_distance = obs['runway_distance'][0]
    if runway_distance < min_runway_distance:
        min_runway_distance = runway_distance
    if runway_distance > max_runway_distance:
        max_runway_distance = runway_distance

    state = next_state
    total_reward += reward
    steps += 1
    if done:
        obs, _ = env.reset()
        state = get_state(obs)
        done = False

env.close()
print('total_reward', total_reward)
print('steps', steps)
print('min_runway_distance:', min_runway_distance)
print('max_runway_distance:', max_runway_distance)


# Q-learning


In [21]:
import pickle
# import sys
# sys.stdout = open('output.txt', 'w')

i = 0
total_reward = 0
rewards = []
max_steps = 1

obs, _ = env.reset()
done = False

def get_explore_prob(i):
    initial_epsilon = 0.9
    min_epsilon = 0.05
    decay_steps = 3530  # Redondeado hacia arriba
    epsilon = max(initial_epsilon - (i // decay_steps) * 0.1, min_epsilon)
    return epsilon

while True:
    obs, _ = env.reset()
    done = False
    episode_reward = 0
    while not done:
        p = random.uniform(0, 1)
        state = get_state(obs)
        prob = get_explore_prob(i)
        if p < prob:
            action = get_sample_action()
        else:
            action = optimal_policy(state, Q)
        next_obs, reward, done, _, _ = env.step(np.array([action]))
        next_state = get_state(next_obs)
        action_idx = actions.index(action)
        Q[state][action_idx] = Q[state][action_idx] + 0.9 * (reward + 0.9 * np.max(Q[next_state]) - Q[state][action_idx])
        obs = next_obs
        episode_reward += reward
    rewards.append(episode_reward)
    print(f"Episode {i+1}, Reward: {episode_reward}, Epsilon: {prob}")
    if (i + 1) % 100 == 0:
        mean_reward = np.mean(rewards[-100:])
        print(f"Media de reward en episodios {i-98} a {i+1}: {mean_reward} con exploración {prob}")
        with open('Q.pkl', 'wb') as f:
            pickle.dump(Q, f)
        
    i += 1

KeyboardInterrupt: 

In [None]:
import pickle

# Supongamos que Q es tu diccionario o matriz Q
# Q = {...}

# Guardar Q en un archivo .pkl
with open('Q.pkl', 'wb') as f:
    pickle.dump(Q, f)
    
import pprint

pprint.pprint(Q)

In [None]:
env = DescentEnv(render_mode='human')     
obs, _ = env.reset()
done = False
total_reward = 0
steps = 0

while not done:
    state = get_state(obs)
    action = optimal_policy(state, Q)
    obs, reward, done, _, _ = env.step(np.array([action]))
    total_reward += reward
    steps += 1
    env.render()

env.close()
print(f"Total reward (Q final): {total_reward}")
print(f"Steps: {steps}")   

# Stochastic Q Learning

In [None]:
import math


def stoch_argmax(Q_values, k=None):
    n = len(Q_values)
    if k is None:
        k = max(1, int(math.log2(n)))  # O(log(n))
    subset = random.sample(range(n), k)
    best_action = subset[0]
    best_value = Q_values[best_action]
    for i in subset[1:]:
        if Q_values[i] > best_value:
            best_action = i
            best_value = Q_values[i]
    return best_action

In [None]:
k_stochmax = 3  
rewards = []
i=0
def get_explore_prob(i):
    """
    Devuelve la probabilidad de exploración (epsilon) según el número de episodio i.
    Alterna entre 0.7 y 0.3 cada 500 episodios.
    """
    if 0 <= i <= 500:
        return 0.7
    elif 501 <= i <= 1000:
        return 0.3
    elif 1001 <= i <= 1500:
        return 0.7
    elif 1501 <= i <= 2000:
        return 0.3
    elif 2001 <= i <= 2500:
        return 0.7
    elif 2501 <= i <= 3000:
        return 0.3
    elif 3001 <= i <= 3500:
        return 0.7
    elif 3501 <= i <= 4000:
        return 0.3
    elif 4001 <= i <= 4500:
        return 0.7
    elif 4501 <= i <= 5000:
        return 0.3
    else:
        return 0.1  # valor por defecto fuera de rango

while i < 5000:
    obs, _ = env.reset()
    done = False
    episode_reward = 0
    while not done:
        p = random.uniform(0, 1)
        state = get_state(obs)
        prob = get_explore_prob(i)
        if p < prob:
            action = get_sample_action()
        else:
            action_idx = stoch_argmax(Q[state], k=k_stochmax)
            action = actions[action_idx]
        next_obs, reward, done, _, _ = env.step(np.array([action]))
        next_state = get_state(next_obs)
        best_next_action_idx = stoch_argmax(Q[next_state], k=k_stochmax)
        Q[state][action_idx] += 0.9 * (reward + 0.9 * Q[next_state][best_next_action_idx] - Q[state][action_idx])
        obs = next_obs
        episode_reward += reward
    rewards.append(episode_reward)
    if (i + 1) % 100 == 0:
        mean_reward = np.mean(rewards[-100:])
        print(f"Media de reward en episodios {i-98} a {i+1}: {mean_reward} con exploración {prob}")
    i += 1