In [None]:
import ray

GIB = 1024 ** 3  # number of bytes in one gibibyte

# Start the local Ray runtime with a fixed resource budget for this notebook.
ray.init(
    num_cpus=12,
    num_gpus=1,
    memory=10 * GIB,
    object_store_memory=30 * GIB,
    # Notebook cells get re-run; without this flag a second call raises
    # instead of reusing the runtime that is already up.
    ignore_reinit_error=True,
    # use_pickle=True,
    # temp_dir='/home/projects/satellite_rl',
)

In [None]:
from ray.tune.registry import register_env
import gym

def env_creator(env_config):
    """Build the satellite environment for an RLlib worker.

    Args:
        env_config: Mapping of environment options (may be empty or None).
            The optional ``sat_id`` key selects the satellite; it defaults
            to 41, the value that used to be hard-coded here.

    Returns:
        Whatever ``gym.make("satellite_gym:SatelliteEnv-v1", ...)`` returns.
    """
    # Imported inside the function so the modules are loaded in whichever
    # process ends up calling this creator, not only in the notebook kernel.
    import gym, satellite_gym
    options = env_config or {}
    return gym.make("satellite_gym:SatelliteEnv-v1",
                    sat_id=options.get("sat_id", 41))

# Earlier experiments, kept for reference:
# env = gym.make("satellite_gym:SatelliteEnv-v1", sat_id=41)
# env_creator = lambda x: env
# env = SatelliteEnv(df, sat_id=41)

# Expose the creator to RLlib/Tune under the name "SatelliteEnv-v2".
# env_creator already has the (env_config) signature, so no wrapper is needed.
register_env("SatelliteEnv-v2", env_creator)

In [None]:
import ray.rllib.agents.ars as ars
from ray.tune.logger import pretty_print

def on_train_result(info):
    """Training-result callback that sets a reward-dependent phase on every env.

    Args:
        info: Mapping with a ``"result"`` entry (providing
            ``episode_reward_mean``) and a ``"trainer"`` entry whose
            ``workers`` are updated.

    The phase is 2 when the mean episode reward is above 45, 1 when it is
    above 22, and 0 otherwise. It is passed to ``set_phase`` on every env of
    every worker.
    """
    mean_reward = info["result"]["episode_reward_mean"]
    if mean_reward > 45:
        phase = 2
    elif mean_reward > 22:
        phase = 1
    else:
        phase = 0

    def _set_phase_on_worker(worker):
        # Apply the chosen phase to each env held by this worker.
        return worker.foreach_env(lambda env: env.set_phase(phase))

    info["trainer"].workers.foreach_worker(_set_phase_on_worker)
    
    
# Start from RLlib's ARS defaults and override the settings for this run.
config = ars.DEFAULT_CONFIG.copy()
config.update({
    "num_workers": 10,
    "seed": 0,
    "eager": False,
    "noise_stdev": .01,
    "num_rollouts": 1,
    "rollouts_used": 1,
    "sgd_stepsize": .02,
    "noise_size": 2500000000,
    "eval_prob": .02,
    "observation_filter": "NoFilter",
})
# Disabled options:
# config['model']['use_lstm'] = True
# config["model"]["vf_share_layers"] = True
# config["callbacks"] = { "on_train_result": on_train_result }

# NOTE(review): this env id is not the "SatelliteEnv-v2" name registered with
# env_creator earlier in the notebook - confirm that is intentional.
trainer = ars.ARSTrainer(
    config=config,
    env="satellite_gym:SatelliteEnv-v1",
)



In [None]:


NUM_ITERATIONS = 4010    # total calls to trainer.train()
CHECKPOINT_EVERY = 50    # iterations between periodic checkpoints

for i in range(NUM_ITERATIONS):
    # Perform one iteration of training the policy with ARS
    result = trainer.train()
    print(pretty_print(result))

    # Checkpoint periodically, and always after the final iteration so the
    # tail of the run (past the last multiple of CHECKPOINT_EVERY) is saved.
    if i % CHECKPOINT_EVERY == 0 or i == NUM_ITERATIONS - 1:
        checkpoint = trainer.save()
        print("checkpoint saved at", checkpoint)

In [None]:
from ray import tune
import ray.rllib.agents.ars as ars
from ray.tune.schedulers import AsyncHyperBandScheduler
from ray.tune.suggest.bayesopt import BayesOptSearch
import numpy as np

# ARS defaults with observation filtering switched off.
config = ars.DEFAULT_CONFIG.copy()
config.update(observation_filter="NoFilter")
# config['model']['use_lstm'] = True
# config["model"]["vf_share_layers"] = True

# Asynchronous HyperBand scheduler: compares trials on episode_reward_mean
# (higher is better), with progress measured in training_iteration.
async_hb_scheduler = AsyncHyperBandScheduler(
    metric="episode_reward_mean",
    mode="max",
    time_attr="training_iteration",
    grace_period=10,
    max_t=200,
    reduction_factor=3,
    brackets=3,
)


# (low, high) bounds for each ARS hyperparameter handed to the optimizer.
space = dict(
    noise_stdev=(0.01, 0.1),
    sgd_stepsize=(0.01, 0.1),
    eval_prob=(0.01, 0.1),
    # Dimensions currently excluded from the search:
    # num_rollouts=(1, 32),
    # rollouts_used=(1, 32),
    # noise_size=(2500000, 250000000),
)

# Bayesian-optimization searcher maximizing episode_reward_mean.
bayes_search = BayesOptSearch(
    space,
    max_concurrent=4,
    metric="episode_reward_mean",
    mode="max",
    utility_kwargs={"kind": "ucb", "kappa": 2.5, "xi": 0.0},
    use_early_stopped_trials=True,
)

# def train(config, reporter):
#     trainer = ppo.PPOTrainer(config=config, env="SatelliteEnv-v2")
#     while True:
#         result = trainer.train()
#         reporter(**result)
#         if result["episode_reward_mean"] > 44:
#             phase = 2
#         elif result["episode_reward_mean"] > 22:
#             phase = 1
#         else:
#             phase = 0
#         trainer.workers.foreach_worker(
#             lambda ev: ev.foreach_env(
#                 lambda env: env.set_phase(phase)))


# Launch the Bayesian-optimization sweep over ARS hyperparameters.
tune.run(
    "ARS",
    stop={"training_iteration": 200},
    config={
        "env": "satellite_gym:SatelliteEnv-v1",
        "num_workers": 10,
        "eager": False,
        "seed": 0,
        "observation_filter": "NoFilter",
    },
    # Without num_samples Tune runs a single trial, which gives the searcher
    # and the scheduler nothing to compare. 10 matches the HyperOpt sweep in
    # this notebook.
    num_samples=10,
    scheduler=async_hb_scheduler,
    search_alg=bayes_search,
#     resources_per_trial={
#         "cpu": 1,
#         "gpu": .2,
#         "extra_cpu": 1,
#     },
)

In [None]:
from ray import tune
import ray.rllib.agents.ars as ars
from ray.tune.schedulers import AsyncHyperBandScheduler
from ray.tune.suggest.hyperopt import HyperOptSearch
from hyperopt import hp
import numpy as np

# Fresh copy of the ARS defaults (no overrides applied in this cell).
config = ars.DEFAULT_CONFIG.copy()
# config['model']['use_lstm'] = True
# config["model"]["vf_share_layers"] = True

# Asynchronous HyperBand scheduler: compares trials on episode_reward_mean
# (higher is better), with progress measured in training_iteration.
async_hb_scheduler = AsyncHyperBandScheduler(
    metric="episode_reward_mean",
    mode="max",
    time_attr="training_iteration",
    grace_period=10,
    max_t=200,
    reduction_factor=3,
    brackets=3,
)


def train(config, reporter):
    """Train an ARS policy on "SatelliteEnv-v2" forever, reporting each result.

    Args:
        config: Configuration passed straight to ``ars.ARSTrainer``.
        reporter: Callable invoked with every training result expanded into
            keyword arguments.

    The loop has no exit condition of its own; it ends only when something
    raises or the caller stops the function externally.
    """
    ars_trainer = ars.ARSTrainer(env="SatelliteEnv-v2", config=config)
    while True:
        result = ars_trainer.train()
        reporter(**result)
        # Disabled phase curriculum, kept for reference:
        # if result["episode_reward_mean"] > 44:
        #     phase = 2
        # elif result["episode_reward_mean"] > 22:
        #     phase = 1
        # else:
        #     phase = 0
        # ars_trainer.workers.foreach_worker(
        #     lambda ev: ev.foreach_env(
        #         lambda env: env.set_phase(phase)))

# Search space for the HyperOpt sweep.
# hp.loguniform(label, low, high) draws exp(uniform(low, high)), so the bounds
# must be given in log space. Passing the raw values 1e-2 and 0.1 would sample
# from roughly [1.01, 1.105] instead of [0.01, 0.1].
space = {
    "noise_stdev": hp.loguniform("noise_stdev", np.log(1e-2), np.log(0.1)),  # std deviation of parameter noise
#     "num_rollouts": hp.qlognormal('num_rollouts', 32, 1, 8),
#     "rollouts_used":hp.qlognormal('rollouts_used', 32, 1, 8),  # number of perturbs to keep in gradient estimate
    "sgd_stepsize": hp.loguniform("sgd_stepsize", np.log(1e-2), np.log(0.1)),  # sgd step-size
#     "noise_size":hp.choice("noise_size", [25000000, 250000000, 2500000000]),  # size of the shared noise table
    "eval_prob": hp.loguniform("eval_prob", np.log(1e-2), np.log(0.1)),  # evaluation probability
}

# HyperOpt-backed searcher maximizing episode_reward_mean.
hyperopt_search = HyperOptSearch(space, max_concurrent=4, metric="episode_reward_mean", mode="max")


# Launch the HyperOpt sweep: 10 sampled ARS configurations on the registered
# "SatelliteEnv-v2" environment, each stopped at 200 training iterations.
tune.run(
    "ARS",
    num_samples=10,
    stop={"training_iteration": 200},
    search_alg=hyperopt_search,
    scheduler=async_hb_scheduler,
    config={
        "env": "SatelliteEnv-v2",
        # "num_workers": 1,
        "eager": False,
        "seed": 0,
        # "iterations": 200,
        # "noise_stdev": tune.uniform(0.01, 0.1),
    },
    # resources_per_trial={
    #     "cpu": 1,
    #     # "gpu": .2,
    #     # "extra_cpu": 1,
    # },
)