In [None]:
import time

import ray
from ray import tune
from ray.rllib.algorithms.ppo import PPOConfig
from os.path import join
from os.path import dirname
from src.components.eval import eval_env
from src.components.train import train_env
from src.components.tune import tune_hyper_param
from src.envs.creator import env_creator
from src.envs.creator import register_platform_env
from src.files.utills import pickle_obj

# Define Configs

In [None]:
# Rollout sizing; both values are passed to `.rollouts()` below.
total_workers = 12
num_envs_per_worker = 24

# Algorithm and environment identifiers; the environment is registered under
# `env_name` before the config refers to it.
algo = "PPO"
env_name = "Platform-v0"
register_platform_env(env_name)

# PPO optimisation hyper-parameters.
ppo_hyperparams = dict(
    gamma=0.995,
    num_sgd_iter=10,
    sgd_minibatch_size=1000,
    clip_param=0.1,
    lr=1e-4,
    train_batch_size=2000,
    entropy_coeff=1e-4,
)

# Network settings. `use_lstm` is declared as a Tune grid-search axis, so the
# tuning step tries both True and False.
model_config = {
    "fcnet_hiddens": [64, 64, 64],
    "vf_share_layers": False,
    "use_lstm": tune.grid_search([True, False]),
    "lstm_cell_size": 32,
    "max_seq_len": 5,
}

config = (
    PPOConfig()
    .environment(env_name)
    .training(**ppo_hyperparams)
    .resources(num_gpus=1, num_cpus_per_worker=1)
    .rollouts(
        num_rollout_workers=total_workers,
        num_envs_per_worker=num_envs_per_worker,
    )
    .framework("torch")
    .training(model=model_config)
    .evaluation(evaluation_num_workers=1)
)

# Directory handed to the tune and train steps as `log_dir`.
log_dir = "tmp/pipeline_logs"

# Tune Hyper-parameters

The hyper-parameters to be optimized can be defined in the config using the Tune API. For example, in this case *use_lstm* is a hyper-parameter with values *[True, False]*. The *tune_hyper_param* function searches over the parameter values and returns the tuning results, from which the best config is selected.

In [None]:
# Run the hyper-parameter search over the grid declared in `config`.
tune_results = tune_hyper_param(
    algo=algo, config=config, log_dir=log_dir, iterations=30, name="platform_solver_tuning"
)

# Keep the configuration of the trial with the highest mean episode reward.
best_tune_result = tune_results.get_best_result(metric="episode_reward_mean", mode="max")
best_config = best_tune_result.config

# Train RL Agent

The model is trained here using the best config from the tune step. The best training checkpoint is then chosen for evaluation.

In [None]:
# Train with the configuration selected by the tuning step.
train_results = train_env(
    algo=algo,
    config=best_config,
    log_dir=log_dir,
    iterations=200,
    stop_reward_mean=1,
    name="platform_solver"
)
best_train_result = train_results.get_best_result(metric="episode_reward_mean", mode="max")


def _checkpoint_reward(checkpoint_and_metrics):
    """Return the mean episode reward recorded with a (checkpoint, metrics) pair.

    A missing or NaN reward ranks lowest so such a checkpoint is never chosen
    over one with a real score.
    """
    reward = checkpoint_and_metrics[1].get("episode_reward_mean")
    # NaN is the only value that compares unequal to itself.
    if reward is None or reward != reward:
        return float("-inf")
    return reward


# `best_checkpoints` holds (checkpoint, metrics) tuples for the retained
# checkpoints; the first entry is not guaranteed to be the best one, so pick
# the tuple with the highest mean episode reward explicitly.
best_checkpoint = max(best_train_result.best_checkpoints, key=_checkpoint_reward)
# NOTE(review): `_local_path` is a private attribute of the checkpoint object;
# confirm it is still available when upgrading Ray.
best_checkpoint_path = best_checkpoint[0]._local_path

# Evaluate Trained Agent

The best checkpoint is evaluated and the results are returned as a dataframe for further interpretation.

In [None]:
# Show which checkpoint is being evaluated.
print(best_checkpoint_path)
# Use the monotonic high-resolution clock for the elapsed-time measurement;
# the wall clock can jump (e.g. NTP adjustments) during a long evaluation.
start = time.perf_counter()
results = eval_env(ray=ray, epochs=1000, workers=8, env_creator=env_creator, env_kwargs=dict(env_config=''),
                   checkpoint_path=best_checkpoint_path)
# Elapsed evaluation time in seconds.
print(time.perf_counter() - start)

In [None]:
# Persist the evaluation results for later analysis.
evaluation_results_path = join('tmp', 'evaluation_results.pkl')
pickle_obj(results, evaluation_results_path)