# In [11]:
import numpy as np
import gym
import matplotlib.pyplot as plt
from matplotlib import animation

# Create the environment.
# Newer Gym releases select the render mode at construction time, so request
# RGB frames here; they are needed later to build the animation.
try:
    env = gym.make("CartPole-v1", render_mode="rgb_array")
except TypeError:
    # NOTE(review): older Gym releases are expected to reject the unknown
    # `render_mode` keyword with a TypeError; fall back to the legacy call.
    env = gym.make("CartPole-v1")

# Discretize the observation space
num_buckets = (1, 1, 6, 12)  # Number of buckets for each observation dimension
state_bounds = list(zip(env.observation_space.low, env.observation_space.high))

# Override the bounds of dimensions 1 and 3 with finite ranges so that the
# bucket ratio computed from them is well defined.
state_bounds[1] = [-0.5, 0.5]
state_bounds[3] = [-np.radians(50), np.radians(50)]

# Q-learning hyperparameters
alpha = 0.1  # Learning rate
gamma = 0.99  # Discount factor
epsilon = 1.0  # Exploration rate (decayed after every episode)
epsilon_decay = 0.995
epsilon_min = 0.01
num_episodes = 1000

# Initialize the Q-table: one axis per observation dimension plus one for actions
q_table = np.zeros(num_buckets + (env.action_space.n,))


def discretize_state(state, bounds=None, buckets=None):
    """Map a continuous observation to a tuple of bucket indices.

    Args:
        state: The observation as a NumPy array, list or tuple of numbers, a
            single scalar, or an ``(observation, info)`` pair as returned by
            ``env.reset()`` in newer Gym releases (the ``info`` dict is
            ignored).
        bounds: Sequence of ``(low, high)`` pairs, one per dimension.
            Defaults to the module-level ``state_bounds``.
        buckets: Number of buckets per dimension. Defaults to the
            module-level ``num_buckets``.

    Returns:
        A tuple of ints, one per dimension, each in ``[0, buckets[i] - 1]``.
    """
    if bounds is None:
        bounds = state_bounds
    if buckets is None:
        buckets = num_buckets

    # An (observation, info) pair must be unpacked first; indexing it directly
    # would feed the info dict into the arithmetic below.
    if isinstance(state, tuple) and len(state) == 2 and isinstance(state[1], dict):
        state = state[0]

    # Normalize arrays, lists, tuples and scalars to a flat float vector.
    values = np.atleast_1d(np.asarray(state, dtype=float)).ravel()

    indices = []
    for value, (low, high), n_buckets in zip(values, bounds, buckets):
        if n_buckets <= 1:
            # A single bucket always maps to index 0; skipping the division
            # also avoids NaN/inf when the bounds of this dimension are infinite.
            indices.append(0)
            continue
        ratio = (float(value) - low) / (high - low)
        # Clamp before scaling so out-of-range values land in the edge buckets.
        ratio = min(1.0, max(0.0, ratio))
        indices.append(int(round((n_buckets - 1) * ratio)))
    return tuple(indices)




def choose_action(state):
    """Pick an action for ``state`` with an epsilon-greedy policy.

    With probability ``epsilon`` (module-level) a random action is sampled
    from the environment's action space; otherwise the action with the
    highest Q-value for ``state`` in ``q_table`` is returned.
    """
    explore = np.random.rand() < epsilon
    if explore:
        return env.action_space.sample()
    action_values = q_table[state]
    return np.argmax(action_values)

# Train the agent with tabular Q-learning
for episode in range(num_episodes):
    # Newer Gym releases return (observation, info) from reset(); older ones
    # return the observation alone.
    reset_result = env.reset()
    observation = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    # Pass a plain list so the discretizer indexes the values element-wise.
    state = discretize_state(list(observation))
    done = False
    while not done:
        action = choose_action(state)
        result = env.step(action)

        # Handle both step() signatures:
        #   5 values: (obs, reward, terminated, truncated, info)
        #   4 values: (obs, reward, done, info)
        if len(result) == 5:
            observation, reward, terminated, truncated, _ = result
        else:
            observation, reward, done_flag, info = result
            # NOTE(review): legacy Gym's TimeLimit wrapper is expected to flag
            # time-outs through this info key — confirm for the installed version.
            truncated = bool(info.get("TimeLimit.truncated", False))
            terminated = bool(done_flag) and not truncated
        # The episode must stop on truncation too, not only on termination.
        done = terminated or truncated

        next_state = discretize_state(list(observation))
        # A terminal state has no future return, so do not bootstrap from it.
        future_value = 0.0 if terminated else np.max(q_table[next_state])
        td_target = reward + gamma * future_value
        td_error = td_target - q_table[state][action]
        q_table[state][action] += alpha * td_error
        state = next_state

    # Decay exploration once per episode, never below epsilon_min.
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

print("Entrenamiento completado.")

# Visualize the agent in action
def _render_frame(env):
    """Return one RGB frame from ``env`` using whichever render API it supports."""
    try:
        return env.render(mode="rgb_array")
    except TypeError:
        # Newer Gym releases take no `mode` argument; the render mode is
        # chosen when the environment is created.
        return env.render()


def run_episode(env, q_table):
    """Run one greedy episode and collect the rendered frames.

    Args:
        env: The Gym environment to roll out. It is closed before returning.
        q_table: Q-value array indexed by discretized state, then action.

    Returns:
        A list of rendered frames, one per step. Frames for which the
        environment's render call returned ``None`` are skipped.
    """
    frames = []
    try:
        # reset() returns (observation, info) in newer Gym releases and the
        # bare observation in older ones.
        reset_result = env.reset()
        observation = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        # Pass a plain list so the discretizer indexes the values element-wise.
        state = discretize_state(list(observation))
        done = False
        while not done:
            frame = _render_frame(env)
            if frame is not None:
                frames.append(frame)
            action = int(np.argmax(q_table[state]))
            result = env.step(action)
            if len(result) == 5:
                # (obs, reward, terminated, truncated, info)
                observation, _, terminated, truncated, _ = result
                done = terminated or truncated
            else:
                # (obs, reward, done, info)
                observation, _, done, _ = result
            state = discretize_state(list(observation))
    finally:
        # Release the environment's resources even if the rollout fails.
        env.close()
    return frames

def save_animation(frames, path='./cartpole.gif', fps=30):
    """Write a sequence of image frames to an animated file.

    Args:
        frames: Sequence of images accepted by ``plt.imshow``.
        path: Output file path.
        fps: Frames per second of the saved animation.

    Raises:
        ValueError: If ``frames`` is empty.
    """
    if len(frames) == 0:
        raise ValueError("save_animation needs at least one frame")

    fig = plt.figure()
    try:
        patch = plt.imshow(frames[0])
        plt.axis('off')

        def animate(i):
            # Swap the image data in place instead of redrawing the axes.
            patch.set_data(frames[i])
            return (patch,)

        anim = animation.FuncAnimation(fig, animate, frames=len(frames), interval=1000 / fps)
        # Prefer ImageMagick as before, but fall back to the Pillow writer
        # when the external ImageMagick tool is not available.
        writer = 'imagemagick' if animation.writers.is_available('imagemagick') else 'pillow'
        anim.save(path, writer=writer, fps=fps)
    finally:
        # Close the figure so repeated calls do not accumulate open figures.
        plt.close(fig)

# Roll out one greedy episode with the trained Q-table and save it as a GIF
frames = run_episode(env=env, q_table=q_table)
save_animation(frames=frames)

print("Animación guardada como cartpole.gif")


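# Output of the original run:
# TypeError: unsupported operand type(s) for -: 'dict' and 'float'
# (raised in discretize_state: env.reset() returned an (observation, info)
# tuple, so state[1] was the info dict rather than a number)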