In [1]:
import os
from collections import defaultdict, deque

import gym
import imageio
import ray
from ray import tune
from ray.rllib.agents.a3c import A3CTrainer, A2CTrainer
from ray.rllib.agents.dqn import DQNTrainer
from ray.rllib.agents.ppo import PPOTrainer
from ray.tune.registry import register_env
import tensorflow as tf

  from .autonotebook import tqdm as notebook_tqdm


To monitor training, run TensorBoard from a terminal: `tensorboard --logdir results`

In [None]:
class IterationOrRewardStopper(tune.Stopper):
    """Stop a trial after a maximum number of results or once it is good enough.

    A trial is stopped as soon as either condition holds:

    * it has reported ``max_iterations`` results to this stopper, or
    * its last ``reward_window`` values of ``result["episode_reward_mean"]``
      sum to at least ``target_reward * reward_window`` (i.e. their average
      reaches ``target_reward``).

    The reward condition is only evaluated once a trial has filled its whole
    window; otherwise a short window would be compared against a threshold
    scaled for a full one (which fires immediately for targets <= 0).

    All bookkeeping is kept per ``trial_id``.

    Args:
        max_iterations: Number of reported results after which a trial stops.
        target_reward: Average ``episode_reward_mean`` to reach over the window.
        reward_window: Number of most recent results averaged (integer >= 1).

    Raises:
        ValueError: If ``reward_window`` is smaller than 1.
    """

    def __init__(self, max_iterations=10000, target_reward=100, reward_window=100):
        if reward_window < 1:
            raise ValueError(f"reward_window must be >= 1, got {reward_window!r}")

        self._max_iterations = max_iterations
        self._target_reward = target_reward
        self._reward_window = reward_window

        # Number of results seen so far, per trial.
        self._iter = defaultdict(int)
        # Most recent rewards per trial; the deque discards the oldest entry
        # on its own once `reward_window` values are stored.
        self._rewards = defaultdict(lambda: deque(maxlen=reward_window))

    def __call__(self, trial_id, result):
        """Return True if the trial identified by ``trial_id`` should stop.

        Args:
            trial_id: Key used to separate the state of different trials.
            result: Mapping that provides ``"episode_reward_mean"``.

        Returns:
            bool: True when the iteration limit or the reward target is reached.
        """
        self._iter[trial_id] += 1
        if self._iter[trial_id] >= self._max_iterations:
            return True

        window = self._rewards[trial_id]
        window.append(result["episode_reward_mean"])

        # Not enough history yet to judge the windowed average.
        if len(window) < self._reward_window:
            return False

        return bool(sum(window) >= self._target_reward * self._reward_window)

    def stop_all(self):
        """Never request stopping the whole experiment; trials stop individually."""
        return False

In [None]:
# Experiments queued for `tune.run_experiments`; later cells append to this list.
experiments = []

In [None]:
# Queue a PPO run on CartPole-v1 (torch framework). The run stops through
# IterationOrRewardStopper (target 475 over a window of 100 results), writes
# its results under ./results and requests a checkpoint at the end.
experiments.append(
    tune.Experiment(
        name="ppo_cartpole_restore_test",
        run=PPOTrainer,
        stop=IterationOrRewardStopper(target_reward=475, reward_window=100),
        config=dict(env="CartPole-v1", framework="torch"),
        local_dir="./results",
        checkpoint_at_end=True,
    )
)

In [None]:
# Run every queued experiment and keep what Tune returns for later inspection.
experiment_results = tune.run_experiments(experiments)

In [None]:
# Checkpoint to restore. The default is the path recorded on the original
# author's machine; set CARTPOLE_CHECKPOINT_PATH to point at a checkpoint
# produced on another machine instead of editing this cell.
# NOTE(review): the original cell hinted at `experiment_results[0].checkpoint.value`
# as the in-kernel alternative — confirm that attribute for the installed Ray version.
checkpoint_path = os.environ.get(
    "CARTPOLE_CHECKPOINT_PATH",
    "/home/ibraheem/Desktop/tf-angents-tests/results/ppo_cartpole_restore_test/PPOTrainer_CartPole-v1_3ff75_00000_0_2022-07-30_15-52-36/checkpoint_000166/checkpoint-166",
)

# Load the model as an RLlib Trainer

In [None]:
# Stand-alone environment that the rollout cell steps and renders.
eval_env = gym.make("CartPole-v1")

# Build a PPO trainer for the same environment and framework, then load the
# state stored at `checkpoint_path` into it.
eval_trainer = PPOTrainer(config=dict(framework="torch"), env="CartPole-v1")
eval_trainer.restore(checkpoint_path)


In [None]:
# Record one episode (at most 500 steps) driven by the restored trainer.
# The writer cannot be opened inside a missing directory, so create it first.
os.makedirs("videos", exist_ok=True)

with imageio.get_writer("videos/cartpole_restore_test.mp4", fps=30) as video:
    observation = eval_env.reset()
    video.append_data(eval_env.render("rgb_array"))  # frame of the initial state
    for _ in range(500):
        action = eval_trainer.compute_single_action(observation)
        observation, reward, done, info = eval_env.step(action)
        video.append_data(eval_env.render("rgb_array"))
        if done:
            break

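# Export the policy model and load it as a TensorFlow SavedModel

Note: the cells below read the export with `tf.saved_model.load`, while the trainer above is configured with `"framework": "torch"`. TODO: confirm that the export format matches the loader.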

In [2]:
# Directory the policy model is exported to and later loaded from.
EXPORT_DIR = "tf_export/ppo_cartpole_restore_test"

In [3]:

# Export the restored trainer's policy model into EXPORT_DIR.
# Requires the cell that builds `eval_trainer` to have run in the current kernel.
eval_trainer.export_policy_model(EXPORT_DIR)

NameError: name 'eval_trainer' is not defined

In [59]:
# Load the export from EXPORT_DIR and keep its "serving_default" signature.
eval_model_tf = tf.saved_model.load(EXPORT_DIR).signatures["serving_default"]


def eval_model_tf_with_sess(obs, sess):
    """Evaluate the loaded signature on ``obs`` and return its ``"actions_0"`` output.

    Args:
        obs: Observation data; converted to a float32 tensor.
            NOTE(review): the smoke-test cell passes a nested list ([[...]]),
            so a leading batch axis is presumably expected — confirm.
        sess: Session used to run the resulting output tensor.

    Returns:
        Whatever ``sess.run`` yields for the ``"actions_0"`` entry.
    """
    signature_inputs = dict(
        is_training=tf.convert_to_tensor(False),
        observations=tf.convert_to_tensor(obs, dtype=tf.float32),
        timestep=tf.convert_to_tensor(0, dtype=tf.int32),
    )
    outputs = eval_model_tf(**signature_inputs)
    return sess.run(outputs["actions_0"])


In [63]:
# Smoke test: run the loaded signature once on a hand-written observation.
with tf.compat.v1.Session() as sess:
    # `initialize_all_variables` is a deprecated alias of this initializer.
    sess.run(tf.compat.v1.global_variables_initializer())

    # Reuse the helper instead of repeating the signature call inline.
    print(eval_model_tf_with_sess([[0, 0, 0.1, 0]], sess))

NameError: name 'obs' is not defined

In [61]:
# Separate CartPole-v1 environment for the rollout driven by the loaded SavedModel.
eval_env_tf = gym.make("CartPole-v1")

In [64]:
# Record one episode (at most 500 steps) driven by the loaded SavedModel signature.
# The writer cannot be opened inside a missing directory, so create it first.
os.makedirs("videos", exist_ok=True)

with imageio.get_writer("videos/cartpole_restore_test_tf.mp4", fps=30) as video, \
    tf.compat.v1.Session() as sess:

    # `initialize_all_variables` is a deprecated alias of this initializer.
    sess.run(tf.compat.v1.global_variables_initializer())

    observation = eval_env_tf.reset()
    video.append_data(eval_env_tf.render("rgb_array"))  # frame of the initial state
    for _ in range(500):
        # NOTE(review): the observation is passed without an explicit batch axis
        # and the helper's result goes straight into `step` — confirm both
        # shapes match what the signature and the environment expect.
        action = eval_model_tf_with_sess(observation, sess)
        observation, reward, done, info = eval_env_tf.step(action)
        video.append_data(eval_env_tf.render("rgb_array"))
        if done:
            break

: 

: 