In [None]:
import tensorflow as tf
import numpy as np
import gym

In [None]:
import tensorflow as tf

class PPOPolicy:

    def __init__(self, obs_size, act_size):
        self.model = tf.keras.Sequential([
            tf.keras.layers.Input(obs_size),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(act_size, activation="softmax")
        ])

    def predict(self, states):
        probs = self.model(states)
        return probs

    def update(self, states, advantages):
        loss = tf.keras.losses.mse(advantages, self.model(states))
        self.model.fit(states, advantages, epochs=1, verbose=0)


In [None]:
import tensorflow as tf

class CriticNetwork:

    def __init__(self):
        self.model = tf.keras.Sequential([
            tf.keras.layers.Input(obs_size + act_size),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(1, activation="linear")
        ])

    def predict(self, states, actions):
        state_action = tf.concat([states, actions], axis=1)
        value = self.model(state_action)
        return value



In [None]:

class PPOAgent:

    def __init__(self, env, policy, critic):
        self.env = env
        self.policy = policy
        self.critic = critic

    def train(self, num_epochs, batch_size):
        for epoch in range(num_epochs):
            # Collect a batch of data from the environment.
            states, actions, rewards, next_states = self.env.sample(batch_size)

            # Calculate the advantage function.
            advantages = self.critic.predict(states, next_states) - self.critic.predict(states)

            # Update the policy.
            old_policy_probs = self.policy.predict(states)
            new_policy_probs = self.policy.predict(states, actions)
            ratios = new_policy_probs / old_policy_probs
            advantages = advantages * tf.clip_by_value(ratios, 1.0 - 0.05, 1.0 + 0.05)
            self.policy.update(states, advantages)

    def evaluate(self):
        # Evaluate the policy on a set of test episodes.
        rewards = []
        for _ in range(10):
            state = self.env.reset()
            episode_reward = 0
            while not self.env.done:
                action = self.policy.predict(state)
                next_state, reward, done, _ = self.env.step(action)
                episode_reward += reward
                state = next_state
            rewards.append(episode_reward)
        return np.mean(rewards)

In [16]:
import gym

env = gym.make("Pendulum-v0",  # use 'Pendulum-v0' for discrete action space
                action_space=gym.spaces.Discrete(3))
obs_size = env.observation_space.shape[0]
act_size = env.action_space.n
policy = PPOPolicy(obs_size, act_size)
critic = CriticNetwork()
agent = PPOAgent(env, policy, critic)
agent.train(100, 100)
reward = agent.evaluate()
print("The average reward is:", reward)


DeprecatedEnv: Environment version v0 for `Pendulum` is deprecated. Please use `Pendulum-v1` instead.