In [1]:
import time
import numpy as np
import gymnasium as gym
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate
from tensorflow.keras.optimizers import Adam
import random
from collections import deque
from ParabolicShot import ParabolicShotEnv

# Hiperparámetros
GAMMA = 0.99
LEARNING_RATE_ACTOR = 0.0001
LEARNING_RATE_CRITIC = 0.001
TAU = 0.001  # Para actualización suave de las redes objetivo
MEMORY_CAPACITY = 100000
BATCH_SIZE = 64
EXPLORATION_NOISE = 0.1

# Definición del Actor
def create_actor(state_dim, action_dim, action_bound):
    inputs = Input(shape=(state_dim,))
    out = Dense(400, activation="relu")(inputs)
    out = Dense(300, activation="relu")(out)
    outputs = Dense(action_dim, activation="tanh")(out)
    scaled_outputs = outputs * action_bound  # Asume que el límite de acción es simétrico
    model = Model(inputs, scaled_outputs)
    model.compile(optimizer=Adam(learning_rate=LEARNING_RATE_ACTOR))
    return model

# Definición del Crítico
def create_critic(state_dim, action_dim):
    state_inputs = Input(shape=(state_dim,))
    action_inputs = Input(shape=(action_dim,))
    concatenated = Concatenate()([state_inputs, action_inputs])
    out = Dense(400, activation="relu")(concatenated)
    out = Dense(300, activation="relu")(out)
    outputs = Dense(1, activation="linear")(out)
    model = Model([state_inputs, action_inputs], outputs)
    model.compile(optimizer=Adam(learning_rate=LEARNING_RATE_CRITIC), loss='mse')
    return model

# Memoria de repetición de experiencia
class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)
    
    def add(self, experience):
        self.buffer.append(experience)
    
    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)
    
    def __len__(self):
        return len(self.buffer)

# Ruido de exploración
class OUNoise:
    def __init__(self, action_dimension, scale=0.1, mu=0, theta=0.15, sigma=0.2):
        self.action_dimension = action_dimension
        self.scale = scale
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dimension) * self.mu
    
    def reset(self):
        self.state = np.ones(self.action_dimension) * self.mu
    
    def noise(self):
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state * self.scale

# Agente DDPG
class DDPGAgent:
    def __init__(self, state_dim, action_dim, action_bound):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_bound = action_bound
        
        self.actor = create_actor(self.state_dim, self.action_dim, self.action_bound)
        self.critic = create_critic(self.state_dim, self.action_dim)
        self.target_actor = create_actor(self.state_dim, self.action_dim, self.action_bound)
        self.target_critic = create_critic(self.state_dim, self.action_dim)
        
        # Inicializar los modelos objetivo
        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())
        
        self.buffer = ReplayBuffer(MEMORY_CAPACITY)
        self.noise = OUNoise(self.action_dim)

    def policy(self, state):
        state = np.reshape(state, [1, -1])  # Asegúrate de que state es un array 2D con la forma correcta
        action = self.actor.predict(state)[0]
        return action + self.noise.noise()

    def train(self):
        if len(self.buffer) < BATCH_SIZE:
            return
        
        samples = self.buffer.sample(BATCH_SIZE)
        states, actions, rewards, next_states, dones = map(np.array, zip(*samples))
        
        # Preparación de la actualización del crítico
        target_actions = self.target_actor.predict(next_states)
        future_rewards = self.target_critic.predict([next_states, target_actions]).flatten()
        q_values = rewards + GAMMA * future_rewards * (1 - dones)
        self.critic.train_on_batch([states, actions], q_values.reshape(-1, 1))
        
        # Preparación de la actualización del actor usando GradientTape para calcular los gradientes
        with tf.GradientTape() as tape:
            tape.watch(self.actor.trainable_variables)
            actions_pred = self.actor(states)
            q_values_pred = self.critic([states, actions_pred])
            actor_loss = -tf.reduce_mean(q_values_pred)

        actor_grads = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor.optimizer.apply_gradients(zip(actor_grads, self.actor.trainable_variables))
        
        # Actualización de las redes objetivo
        self.update_target(self.target_actor.variables, self.actor.variables, TAU)
        self.update_target(self.target_critic.variables, self.critic.variables, TAU)


    @staticmethod
    def update_target(target_weights, weights, tau):
        for (a, b) in zip(target_weights, weights):
            a.assign(b * tau + a * (1 - tau))


2024-04-12 01:48:22.696127: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-12 01:48:22.739897: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-12 01:48:22.740607: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Crea el entorno
env = ParabolicShotEnv()
episodes = 100  # Define el número de episodios de entrenamiento

# Instancia el agente
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_bound = env.action_space.high[0]  # Asume acción simétrica alrededor de cero

agent = DDPGAgent(state_dim, action_dim, action_bound)

# Proceso de entrenamiento
for episode in range(episodes):
    state = env.reset()
    episode_reward = 0
    done = False

    while not done:
        action = agent.policy(state)
        next_state, reward, done, info = env.step(action)
        agent.remember(state, action, reward, next_state, done)  # Guardar en memoria de repetición
        agent.train()  # Entrenamiento del agente
        state = next_state
        episode_reward += reward

        env.render()  # Visualiza el entorno
        time.sleep(1)  # Pausa entre pasos para visualización

    print(f'Episode: {episode+1}, Reward: {episode_reward}')

    if (episode + 1) % 10 == 0:  # Actualizar el modelo objetivo cada 10 episodios
        agent.update_target_model()

env.close()

  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


ValueError: in user code:

    File "/home/jd/.local/lib/python3.11/site-packages/keras/engine/training.py", line 2169, in predict_function  *
        return step_function(self, iterator)
    File "/home/jd/.local/lib/python3.11/site-packages/keras/engine/training.py", line 2155, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/jd/.local/lib/python3.11/site-packages/keras/engine/training.py", line 2143, in run_step  **
        outputs = model.predict_step(data)
    File "/home/jd/.local/lib/python3.11/site-packages/keras/engine/training.py", line 2111, in predict_step
        return self(x, training=False)
    File "/home/jd/.local/lib/python3.11/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/home/jd/.local/lib/python3.11/site-packages/keras/engine/input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "model" is incompatible with the layer: expected shape=(None, 7), found shape=(None, 5)


: 