In [1]:
import time
import ray
from ray.rllib.algorithms.ppo import PPOConfig
from os.path import join
from src.components.eval import eval_env
from src.components.train import train_env
from src.components.tune import tune_hyper_param
from src.simglucose.env import env_creator
from src.simglucose.env import register_simglucose_env
from src.files.utills import pickle_obj
from src.simglucose.rewards import no_negativityV2

  if (distutils.version.LooseVersion(tf.__version__) <


# Define Configs

In [2]:


# Rollout parallelism: how many rollout workers to start and how many
# environment copies each of them steps.
total_workers = 10
num_envs_per_worker = 1

# Algorithm name handed to the tune/train helpers, and the environment id
# passed to register_simglucose_env before the config refers to it.
algo = "PPO"
env_name = "Simglucose-v0"
register_simglucose_env(env_name)

# Keyword arguments forwarded to the environment through `env_config`.
env_configs = {"reward_fun": no_negativityV2, "patient_type": "adult"}

# Network settings: three 64-unit dense layers, separate value-function
# layers, wrapped in an LSTM with 32 cells.
lstm_model = {
    "fcnet_hiddens": [64, 64, 64],
    "vf_share_layers": False,
    "use_lstm": True,
    "lstm_cell_size": 32,
    "max_seq_len": 100,
    "lstm_use_prev_action": False,
}

# Build the PPO config one builder call at a time, in the same order as a
# single fluent chain would apply them.
config = PPOConfig()
config = config.environment(env_name, env_config=env_configs)
config = config.training(
    gamma=0.995,
    num_sgd_iter=3,
    sgd_minibatch_size=50,
    clip_param=0.1,
    lr=1e-4,
    train_batch_size=1000,
    entropy_coeff=1e-5,
)
config = config.resources(num_gpus=1, num_cpus_per_worker=1)
config = config.rollouts(
    num_rollout_workers=total_workers,
    num_envs_per_worker=num_envs_per_worker,
)
config = config.framework("torch")
config = config.training(model=lstm_model)
config = config.evaluation(evaluation_num_workers=1)

# Directory passed as `log_dir` to the tuning and training helpers.
log_dir = "tmp/pipeline_logs"

# Tune Hyper-parameters

The hyper-parameters to be optimized can be defined in the config using the Ray Tune search-space API. For example, *use_lstm* can be turned into a hyper-parameter by giving it the candidate values *[True, False]* instead of a fixed value. *tune_hyper_param* searches over the candidate values and returns the tuning results, from which the best config is selected.

In [None]:
# Arguments for the hyper-parameter search, collected in one place.
tune_settings = dict(
    algo=algo,
    config=config,
    log_dir=log_dir,
    iterations=1,
    name="simglucose_tuning",
)
tune_results = tune_hyper_param(**tune_settings)

# Keep the config of the trial with the highest mean episode reward.
best_tune_result = tune_results.get_best_result(metric="episode_reward_mean", mode="max")
best_config = best_tune_result.config

2023-03-30 11:57:48,291	INFO worker.py:1538 -- Started a local Ray instance.


[2m[36m(pid=1247977)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(PPO pid=1247977)[0m 2023-03-30 11:57:53,021	INFO algorithm.py:501 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=1248061)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=1248064)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=1248058)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=1248057)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=1248063)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=1248059)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=1248062)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=1248066)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[2m[36m(pid=1248060)[0m   if (distutils.version.LooseVersion(tf.__v

# Train RL Agent

The model is trained here using the best config from the tuning step. The best training checkpoint is then chosen for evaluation.

In [None]:
# Train with the config selected by the tuning step.
train_results = train_env(
    algo=algo,
    config=best_config,
    log_dir=log_dir,
    iterations=20000,
    stop_reward_mean=100,
    name="simglucose_solver",
    checkpoint_frequency=5
)

# Trial with the highest mean episode reward.
best_trial_result = train_results.get_best_result(metric="episode_reward_mean", mode="max")


def _checkpoint_reward(entry):
    """Return the `episode_reward_mean` recorded with a (checkpoint, metrics) pair.

    Entries whose metric is missing or NaN are ranked lowest (-inf) so they
    are only selected when nothing better is available.
    """
    _, metrics = entry
    reward = (metrics or {}).get("episode_reward_mean")
    # NaN is the only value that is not equal to itself.
    if reward is None or reward != reward:
        return float("-inf")
    return reward


if not best_trial_result.best_checkpoints:
    raise RuntimeError("Training produced no checkpoints; nothing to evaluate.")

# `best_checkpoints` holds (checkpoint, metrics) pairs and is not documented as
# sorted by reward, so pick the best entry explicitly instead of taking index 0.
best_checkpoint = max(best_trial_result.best_checkpoints, key=_checkpoint_reward)

# NOTE(review): `_local_path` is a private attribute of the checkpoint object;
# confirm it is still available when upgrading Ray.
best_checkpoint_path = best_checkpoint[0]._local_path

# Evaluate Trained Agent

The best checkpoint is evaluated over multiple epochs, and the results are returned as a dataframe for further interpretation.

In [None]:
# Show which checkpoint is about to be evaluated.
print(best_checkpoint_path)

# Time the evaluation run with wall-clock timestamps.
start = time.time()
results = eval_env(
    ray=ray,
    epochs=1000,
    workers=8,
    env_creator=env_creator,
    env_kwargs=env_configs,
    checkpoint_path=best_checkpoint_path,
)
elapsed_seconds = time.time() - start
print(elapsed_seconds)

In [None]:
# Persist the evaluation results under the tmp directory.
evaluation_results_path = join('tmp', 'evaluation_results.pkl')
pickle_obj(results, evaluation_results_path)