In [None]:
import time 
import random
import numpy as np

from tqdm import tqdm
from collections import deque

from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Input, Lambda
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError

import tensorflow as tf


from ParabolicShot import ParabolicShotEnv
import gymnasium as gym

import matplotlib.pyplot as plt

In [None]:
from tensorflow.keras.layers import Layer
import tensorflow_probability as tfp
tfd = tfp.distributions


class ScaleLayer(Layer):
    """Keras layer that multiplies its input by a fixed factor.

    Args:
        scale_factor: Multiplier applied to the layer input.
        **kwargs: Forwarded unchanged to the base ``Layer`` constructor.
    """

    def __init__(self, scale_factor, **kwargs):
        super().__init__(**kwargs)
        self.scale_factor = scale_factor

    def call(self, inputs):
        """Return ``inputs * scale_factor``."""
        return inputs * self.scale_factor

    def get_config(self):
        """Return the layer configuration, including ``scale_factor``.

        Without this override the constructor argument is missing from the
        config, so the layer cannot be re-created via ``from_config`` (for
        example when a model containing it is saved and reloaded).
        NOTE(review): ``scale_factor`` is emitted as-is; it must be a
        JSON-serializable value for whole-model saving to succeed.
        """
        config = super().get_config()
        config.update({'scale_factor': self.scale_factor})
        return config

class SoftplusLayer(Layer):
    """Keras layer computing ``softplus(inputs) + epsilon``.

    Args:
        epsilon: Constant added to the softplus output.
        **kwargs: Forwarded unchanged to the base ``Layer`` constructor.
    """

    def __init__(self, epsilon, **kwargs):
        super().__init__(**kwargs)
        self.epsilon = epsilon

    def call(self, inputs):
        """Return ``tf.nn.softplus(inputs) + epsilon``."""
        return tf.nn.softplus(inputs) + self.epsilon

    def get_config(self):
        """Return the layer configuration, including ``epsilon``.

        Without this override the constructor argument is missing from the
        config, so the layer cannot be re-created via ``from_config`` (for
        example when a model containing it is saved and reloaded).
        """
        config = super().get_config()
        config.update({'epsilon': self.epsilon})
        return config
    
    
class DistributionLayer(tf.keras.layers.Layer):
    """Output head producing the mean and scale of an action distribution.

    Two dense sublayers are applied to the same input:

    * ``mu_layer``: ``tanh`` activation, multiplied by ``action_bound``.
    * ``sigma_layer``: ``softplus`` activation, clipped to ``[1e-6, 1e+6]``.

    Args:
        action_dim: Number of units in each of the two dense sublayers.
        action_bound: Factor the ``tanh`` output is multiplied by.
        **kwargs: Forwarded unchanged to the base ``Layer`` constructor.
    """

    def __init__(self, action_dim, action_bound, **kwargs):
        super().__init__(**kwargs)
        self.action_dim = action_dim
        self.action_bound = action_bound
        # Sublayers are created here rather than in ``build`` so they exist as
        # soon as the layer is constructed and are never re-created; their
        # weights are still built lazily on the first call.
        self.mu_layer = Dense(self.action_dim, activation='tanh')
        self.sigma_layer = Dense(self.action_dim, activation='softplus')

    def call(self, inputs):
        """Return the ``(mu, sigma)`` pair computed from ``inputs``."""
        mu = self.mu_layer(inputs) * self.action_bound
        # Clipping keeps the scale strictly positive and finite.
        sigma = tf.clip_by_value(self.sigma_layer(inputs), 1e-6, 1e+6)
        return mu, sigma

    def get_config(self):
        """Return the layer configuration, including constructor arguments.

        ``action_bound`` is converted to plain Python numbers/lists so the
        config stays JSON-serializable when an array is passed in.
        """
        config = super().get_config()
        config.update({
            'action_dim': self.action_dim,
            'action_bound': np.asarray(self.action_bound).tolist(),
        })
        return config

In [None]:
class PPOAgent:
    """Actor-critic agent for a bounded continuous action space.

    The actor maps a state to the ``(mu, sigma)`` parameters of a truncated
    normal distribution over actions; the critic maps a state to a scalar value.

    NOTE(review): despite the class name, ``train`` performs a single
    advantage-weighted log-likelihood gradient step. No clipped surrogate
    objective is implemented and ``clip_param`` is stored but never read.

    Args:
        state_dim: Size of the state vector fed to both networks.
        continuous_action_dim: Number of continuous action components.
        continuous_action_bound: Bound used to scale ``mu`` and as the
            symmetric ``[-bound, bound]`` truncation limits of the policy.
        actor_lr: Learning rate of the actor's Adam optimizer.
        critic_lr: Learning rate of the critic's Adam optimizer.
        network_size: Hidden layer widths shared by actor and critic.
        clip_param: Stored on the instance; currently unused.
    """

    def __init__(self, state_dim, continuous_action_dim, continuous_action_bound, actor_lr=0.001, critic_lr=0.01, network_size=(64, 64), clip_param=0.2):
        self.state_dim = state_dim
        self.continuous_action_dim = continuous_action_dim
        self.continuous_action_bound = continuous_action_bound
        self.actor_optimizer = Adam(learning_rate=actor_lr)
        self.critic_optimizer = Adam(learning_rate=critic_lr)
        # Copy into a fresh list so instances never share (or mutate) the
        # caller's sequence or the default value.
        self.network_size = list(network_size)
        self.clip_param = clip_param
        self.actor = self.build_actor()
        self.critic = self.build_critic()

    def build_actor(self):
        """Build the policy network: state -> ``[mu, sigma]``."""
        inputs = Input(shape=(self.state_dim,))
        x = self.build_network(inputs, self.network_size)
        mu, sigma = DistributionLayer(self.continuous_action_dim, self.continuous_action_bound)(x)
        return Model(inputs=inputs, outputs=[mu, sigma])

    def build_critic(self):
        """Build the value network: state -> single linear output."""
        inputs = Input(shape=(self.state_dim,))
        x = self.build_network(inputs, self.network_size)
        outputs = Dense(1)(x)
        return Model(inputs=inputs, outputs=outputs)

    def build_network(self, inputs, layers_sizes):
        """Stack one ReLU ``Dense`` layer per entry of ``layers_sizes`` on ``inputs``."""
        x = inputs
        for size in layers_sizes:
            x = Dense(size, activation='relu', kernel_initializer='he_uniform')(x)
        return x

    def policy(self, state):
        """Sample one action for a single (unbatched) state.

        Returns:
            The sampled action as a NumPy array.
        """
        # The actor expects a batch, so wrap the state in a batch of one.
        mu, sigma = self.actor(np.array([state]))
        dist = tfd.TruncatedNormal(loc=mu[0], scale=sigma[0], low=-self.continuous_action_bound, high=self.continuous_action_bound)
        action = dist.sample()
        return action.numpy()

    def train(self, replay_buffer):
        """Run one gradient step on actor and critic using every stored transition.

        Args:
            replay_buffer: Sequence of
                ``(state, action, reward, next_state, done)`` tuples. Only
                states, actions and rewards are used.

        Returns:
            ``(actor_loss, critic_loss)`` tensors, or ``(None, None)`` when the
            buffer holds fewer than 64 transitions. The returned actor loss
            excludes the entropy term.
        """
        min_transitions = 64  # threshold only; the whole buffer is one batch
        if len(replay_buffer) < min_transitions:
            return None, None

        # Unpack column-wise directly; no intermediate object array is needed.
        states, actions, rewards, _next_states, _dones = zip(*replay_buffer)
        states = np.stack(states).astype(np.float32)
        actions = np.stack(actions).astype(np.float32)

        # Standardize the rewards, then cast once to the networks' dtype so
        # float64 NumPy data is never mixed implicitly with float32 tensors.
        rewards = np.stack(rewards)
        rewards = (rewards - np.mean(rewards)) / (np.std(rewards) + 1e-8)
        rewards = rewards.astype(np.float32)

        with tf.GradientTape(persistent=True) as tape:
            mu, sigma = self.actor(states, training=True)
            dist = tfd.TruncatedNormal(loc=mu, scale=sigma, low=-self.continuous_action_bound, high=self.continuous_action_bound)
            # Joint log-probability: sum over the action components.
            log_probs = tf.reduce_sum(dist.log_prob(actions), axis=-1)

            # Squeeze only the trailing unit axis so a batch of one is not
            # collapsed to a scalar.
            values = tf.squeeze(self.critic(states, training=True), axis=-1)
            residual = rewards - values

            # The advantage is a fixed weight for the actor update; the actor
            # gradient is unaffected because it is taken w.r.t. actor variables.
            advantage = tf.stop_gradient(
                (residual - tf.reduce_mean(residual)) / (tf.math.reduce_std(residual) + 1e-8)
            )

            actor_loss = -tf.reduce_mean(log_probs * advantage)
            entropy_bonus = -0.01 * tf.reduce_mean(dist.entropy())
            total_actor_loss = actor_loss + entropy_bonus

            critic_loss = tf.reduce_mean(tf.square(residual))

        critic_grads = tape.gradient(critic_loss, self.critic.trainable_variables)
        actor_grads = tape.gradient(total_actor_loss, self.actor.trainable_variables)
        # A persistent tape holds its resources until it is dropped.
        del tape

        self.critic_optimizer.apply_gradients(zip(critic_grads, self.critic.trainable_variables))
        self.actor_optimizer.apply_gradients(zip(actor_grads, self.actor.trainable_variables))

        return actor_loss, critic_loss


In [None]:
def train_agent(episodes=1000, save_dir='/home/jd/Documentos/CODIGO/OpenAIGym/trained', train_every=10, buffer_size=1000):
    """Collect episodes from ``ParabolicShotEnv`` and train a ``PPOAgent`` on them.

    Every transition is appended to a bounded replay buffer; the agent is
    trained on the whole buffer each time ``episode % train_every == 0``. The
    discrete part of the hybrid action is chosen at random. After the last
    episode the actor and critic weights are written to ``save_dir``.

    Args:
        episodes: Number of episodes to run.
        save_dir: Directory receiving ``parabolic_actor.weights.h5`` and
            ``parabolic_critic.weights.h5``. It is created if missing, before
            training starts, so an unwritable location fails immediately.
        train_every: Train once every this many episodes.
        buffer_size: Maximum number of transitions kept in the replay buffer.
    """
    import os  # local import keeps this cell independent of the import cell

    os.makedirs(save_dir, exist_ok=True)

    env = ParabolicShotEnv(mode=None)  # Assuming `None` means no real-time visualization
    try:
        agent = PPOAgent(env.observation_space.shape[0], 3, env.action_space.spaces['continuous'].high)
        replay_buffer = deque(maxlen=buffer_size)

        for episode in tqdm(range(episodes)):
            # NOTE(review): assumes reset() returns only the observation and
            # step() returns a 4-tuple — confirm against ParabolicShotEnv.
            state = env.reset()
            done = False
            while not done:
                action = agent.policy(state)
                hybrid_action = {'continuous': action, 'discrete': 1 if np.random.rand() > 0.5 else 0}
                next_state, reward, done, _info = env.step(hybrid_action)
                replay_buffer.append((state.copy(), hybrid_action['continuous'], reward, next_state.copy(), done))
                state = next_state

            if episode % train_every == 0:
                agent.train(list(replay_buffer))

        agent.actor.save_weights(os.path.join(save_dir, 'parabolic_actor.weights.h5'))
        agent.critic.save_weights(os.path.join(save_dir, 'parabolic_critic.weights.h5'))
    finally:
        # Close the environment even if collection or training raised.
        env.close()

In [None]:
# Run training with the default arguments; the actor and critic weights are saved when it finishes.
train_agent()

In [None]:
def simulate_agent(weights_dir='/home/jd/Documentos/CODIGO/OpenAIGym/trained', episodes=10, step_delay=1):
    """Replay a trained agent in ``ParabolicShotEnv`` with rendering.

    Loads the actor and critic weights from ``weights_dir``, then runs
    ``episodes`` episodes. Each step prints the action, renders the environment
    and sleeps; the total reward is printed at the end of each episode. The
    discrete action is random, but forced to 0 during the first steps (while
    ``steps <= 10``).

    Args:
        weights_dir: Directory containing ``parabolic_actor.weights.h5`` and
            ``parabolic_critic.weights.h5``.
        episodes: Number of episodes to simulate.
        step_delay: Seconds to sleep after rendering each step.
    """
    import os  # local import keeps this cell independent of the import cell

    env = ParabolicShotEnv()
    try:
        agent = PPOAgent(env.observation_space.shape[0], 3, env.action_space.spaces['continuous'].high)
        agent.actor.load_weights(os.path.join(weights_dir, 'parabolic_actor.weights.h5'))
        # The critic is not used below; it is loaded alongside the actor.
        agent.critic.load_weights(os.path.join(weights_dir, 'parabolic_critic.weights.h5'))

        for _ in tqdm(range(episodes)):
            state = env.reset()
            done = False
            total_reward = 0
            steps = 0
            while not done:
                action = agent.policy(state)
                hybrid_action = {'continuous': action, 'discrete': 1 if np.random.rand() > 0.5 and steps > 10 else 0}
                print(hybrid_action)  # Debugging
                state, reward, done, _info = env.step(hybrid_action)
                total_reward += reward
                steps += 1
                env.render()
                time.sleep(step_delay)

            print(f'Total Reward: {total_reward}')
    finally:
        # Close the environment (and any render window) even on error.
        env.close()

In [None]:
# simulate_agent()