In [None]:
import math
import warnings

import ray
from ray.rllib.utils import PolynomialSchedule

from src.components.train import train_env
from src.simglucose.env import register_simglucose_env


# Suppress every Python warning for the remainder of the kernel session.
warnings.filterwarnings('ignore')
# ignore_reinit_error=True keeps this cell re-runnable: without it a second
# ray.init() call in a live kernel raises instead of reusing the running
# Ray instance. log_to_driver=False keeps worker logs out of the cell output.
ray.init(log_to_driver=False, ignore_reinit_error=True)

# Define Configs

In [None]:
from src.simglucose.rewards import tan_reward

# Horizon (in timesteps) over which both schedules decay, and the spacing
# between the sampled [timestep, value] points. Both schedules share these so
# the sampling range can never drift away from the schedule horizon.
schedule_timesteps = 2000000
schedule_step = 2000

# Learning-rate schedule: power-3 polynomial from 1e-3 down to 1e-5.
pl_sch = PolynomialSchedule(schedule_timesteps=schedule_timesteps, initial_p=1e-3, final_p=1e-5,
                            framework="torch", power=3)
lr_schedule = [[t, pl_sch.value(t)] for t in range(0, schedule_timesteps, schedule_step)]

# Entropy-coefficient schedule: power-3 polynomial from 1e-3 down to 1e-6.
entropy_pl_sch = PolynomialSchedule(schedule_timesteps=schedule_timesteps, initial_p=1e-3, final_p=1e-6,
                                    framework="torch", power=3)
ent_schedule = [[t, entropy_pl_sch.value(t)] for t in range(0, schedule_timesteps, schedule_step)]

# Show only both ends of the schedule; the full list has
# schedule_timesteps / schedule_step entries and would flood the cell output.
print(lr_schedule[:3], "...", lr_schedule[-3:])

# Rollout parallelism, consumed by the PPO config's .rollouts(...) call.
total_workers = 10
num_envs_per_worker = 10

# Register the environment under env_name so the config can refer to it by name.
env_name = "Simglucose-v0"
register_simglucose_env(env_name)
# Passed to the config as env_config; NOTE(review): the accepted keys are
# defined by the project's simglucose environment — confirm there.
env_configs = dict(reward_fun=tan_reward, patient_type='adult')

In [None]:
from ray.rllib.algorithms.ppo import PPOConfig

# Model options handed to PPO through .training(model=...).
lstm_model = dict(
    fcnet_hiddens=[32, 32],
    vf_share_layers=False,
    use_lstm=True,
    lstm_cell_size=32,
    max_seq_len=200,
)
algo = "PPO"

# Single builder chain; every training-related option (optimisation settings,
# schedules and the model dict) is set in one .training(...) call.
config = (
    PPOConfig()
    .environment(env_name, env_config=env_configs)
    .training(
        gamma=0.996,
        num_sgd_iter=3,
        sgd_minibatch_size=400,
        clip_param=0.1,
        lr=1e-3,
        train_batch_size=4000,
        entropy_coeff=1e-3,
        entropy_coeff_schedule=ent_schedule,
        lr_schedule=lr_schedule,
        model=lstm_model,
    )
    .resources(num_gpus=1, num_cpus_per_worker=1)
    .rollouts(
        num_rollout_workers=total_workers,
        num_envs_per_worker=num_envs_per_worker,
    )
    .framework("torch")
    .evaluation(evaluation_num_workers=1)
)


# Tune Hyper-parameters

The hyper-parameters to be optimized can be defined in the config using the Tune API. For example, in this case *use_lstm* is a hyper-parameter with values *[True, False]*. *tune_hyper_param* searches for the optimal parameter values and returns the best config.

In [None]:
# Log directory handed to train_env (as log_dir=log_dir) in the training cell.
# NOTE(review): this is a relative path, so it presumably resolves against the
# notebook's working directory — confirm how train_env uses it.
log_dir = "tmp/pipeline_logs"

# Train RL Agent

The model is trained here using the best config from the tune step. The best training checkpoint is then chosen for evaluation.

In [None]:
# Launch training with the config built above. train_env is a project helper;
# NOTE(review): its return value is used below as a Ray Tune result grid
# (it must provide get_best_result) — confirm in src.components.train.
train_results = train_env(
    algo=algo,
    config=config,
    log_dir=log_dir,
    iterations=20000,
    stop_reward_mean=1000,
    name="simglucose_solver",
    checkpoint_frequency=5,
)

# Metric used both to pick the best trial and to pick its best checkpoint.
selection_metric = "episode_reward_mean"
best_result = train_results.get_best_result(metric=selection_metric, mode="max")

# best_checkpoints is a list of (checkpoint, metrics) pairs. Its order is not
# guaranteed to follow the metric, so indexing [0] may return an early
# checkpoint rather than the best one. Rank the entries explicitly, skipping
# any whose score is missing or NaN (NaN would make max() order-dependent).
scored_checkpoints = [
    entry
    for entry in (best_result.best_checkpoints or [])
    if entry[1].get(selection_metric) is not None
    and not math.isnan(entry[1][selection_metric])
]
if not scored_checkpoints:
    raise RuntimeError(
        f"No checkpoint with a valid '{selection_metric}' was produced by training"
    )

# Same shape as before: a (checkpoint, metrics) tuple.
best_checkpoint = max(scored_checkpoints, key=lambda entry: entry[1][selection_metric])
# NOTE(review): _local_path is a private attribute of the checkpoint object;
# switch to the public accessor of the installed Ray version once confirmed.
best_checkpoint_path = best_checkpoint[0]._local_path