In [11]:
import numpy as np
import gym

# Crear el entorno
env = gym.make("CartPole-v1")

# Discretizar el espacio de observación
num_buckets = (1, 1, 6, 12)  # Número de buckets para cada dimensión
state_bounds = list(zip(env.observation_space.low, env.observation_space.high))

# Ajustar los límites de observación
state_bounds[1] = [-0.5, 0.5]
state_bounds[3] = [-np.radians(50), np.radians(50)]

# Parámetros de Q-Learning
alpha = 0.1  # Tasa de aprendizaje
gamma = 0.99  # Factor de descuento
epsilon = 1.0  # Tasa de exploración
epsilon_decay = 0.995
epsilon_min = 0.01
num_episodes = 1000

# Inicializar la tabla Q
q_table = np.zeros(num_buckets + (env.action_space.n,))

def discretize_state(state):
    ratios = [(state[i] - state_bounds[i][0]) / (state_bounds[i][1] - state_bounds[i][0]) for i in range(len(state))]
    new_state = [int(round((num_buckets[i] - 1) * ratios[i])) for i in range(len(state))]
    new_state = [min(num_buckets[i] - 1, max(0, new_state[i])) for i in range(len(state))]
    return tuple(new_state)

def choose_action(state):
    if np.random.rand() < epsilon:
        return env.action_space.sample()
    else:
        return np.argmax(q_table[state])

# Entrenamiento del agente
for episode in range(num_episodes):
    state = env.reset()
    if isinstance(state, tuple) and len(state) == 2:
        state = state[0]  # Si state es un tuple, extraer el estado real
    state = discretize_state(state)
    done = False
    while not done:
        action = choose_action(state)
        result = env.step(action)
        
        # Manejar diferentes retornos de env.step(action)
        if len(result) == 4:
            next_state, reward, done, _ = result
        else:
            next_state = result[0]
            reward = result[1]
            done = result[2]
            _ = result[3] if len(result) > 3 else None
        
        if isinstance(next_state, tuple) and len(next_state) == 2:
            next_state = next_state[0]  # Si next_state es un tuple, extraer el estado real
        next_state = discretize_state(next_state)
        best_next_action = np.argmax(q_table[next_state])
        td_target = reward + gamma * q_table[next_state][best_next_action]
        td_error = td_target - q_table[state][action]
        q_table[state][action] += alpha * td_error
        state = next_state

    epsilon = max(epsilon_min, epsilon * epsilon_decay)

print("Entrenamiento completado.")
print(q_table)


Entrenamiento completado.
[[[[[ 0.          0.        ]
    [ 0.          0.        ]
    [ 0.          0.        ]
    [ 0.          0.        ]
    [ 0.          0.        ]
    [ 0.          0.        ]
    [ 0.          0.        ]
    [ 0.          0.        ]
    [ 0.          0.        ]
    [ 0.          0.        ]
    [ 0.          0.        ]
    [ 0.          0.        ]]

   [[ 9.11256501 18.58127565]
    [ 4.43169791 16.44690834]
    [ 1.37895739 16.02329728]
    [ 1.1490466  14.66299231]
    [ 0.57992171 10.55516611]
    [ 0.1        10.76887492]
    [ 0.64602759 19.1718508 ]
    [13.32372184  0.        ]
    [ 0.          0.        ]
    [ 0.1         0.        ]
    [ 0.          1.58154876]
    [ 0.          0.        ]]

   [[84.27775877 67.0151474 ]
    [99.99993067 77.41911966]
    [93.1725438  76.74277701]
    [99.98674383 99.62278216]
    [99.9864701  90.08735566]
    [99.98635329 93.82145893]
    [99.96716711 99.98649181]
    [98.55598324 99.98628202]
    [99.98