In [None]:
%pip install -r requirements.txt

In [3]:
import gym
import shutil
from stable_baselines3.common.callbacks import CheckpointCallback
from model_utils import createOrLoadModel

def train_ppo(env: gym.Env, subfolder, learning_rate, clip_range,
              training_steps=int(1e6), n_eval_episodes=1000):
    """Train (or resume) a PPO agent on ``env`` and return its mean evaluation reward.

    The model is obtained from ``createOrLoadModel``, trained until
    ``training_steps`` total timesteps, saved as
    ``./models/<subfolder>/ppo_<training_steps>_steps.zip`` (intermediate
    checkpoints are removed), and then evaluated deterministically.

    Args:
        env: Gym environment used for both training and evaluation. It must
            follow the 4-tuple ``step`` API (``obs, reward, done, info``).
        subfolder: Directory name under ``./models`` where checkpoints and the
            final model are stored.
        learning_rate: Forwarded to ``createOrLoadModel``.
        clip_range: Forwarded to ``createOrLoadModel``.
        training_steps: Total number of timesteps the model should reach.
        n_eval_episodes: Number of evaluation episodes to average over.

    Returns:
        The sum of all rewards collected during evaluation divided by
        ``n_eval_episodes``.

    Raises:
        ValueError: If ``n_eval_episodes`` is smaller than 1.
    """
    # Local import: the notebook's import cell does not bring ``os`` into scope.
    import os

    training_steps = int(training_steps)
    if n_eval_episodes < 1:
        raise ValueError("n_eval_episodes must be at least 1")

    model_dir = f'./models/{subfolder}'

    # NOTE(review): steps_done is presumably the timestep count of the loaded
    # checkpoint (0 for a fresh model) -- confirm against model_utils.
    model, steps_done = createOrLoadModel('ppo', env, 'ppo_', '_steps.zip', subfolder, learning_rate=learning_rate, clip_range=clip_range)

    checkpoint_callback = CheckpointCallback(save_freq=50000, save_path=model_dir, name_prefix='ppo')
    callbacks = [checkpoint_callback]

    # Never request a negative amount of training when the checkpoint is
    # already at (or past) the target.
    remaining_steps = max(training_steps - steps_done, 0)

    # Keep the model's timestep counter when resuming: CheckpointCallback names
    # files after that counter, so resetting it would restart the checkpoint
    # numbering at 50000 and break the step count encoded in the file names.
    model = model.learn(total_timesteps=remaining_steps, log_interval=4,
                        callback=callbacks, reset_num_timesteps=False)

    # Persist the final model BEFORE deleting the intermediate checkpoints, so
    # a failed save cannot wipe out the whole training run.
    final_name = f'ppo_{training_steps}_steps.zip'
    os.makedirs(model_dir, exist_ok=True)
    model.save(f'{model_dir}/{final_name}')

    for entry in os.listdir(model_dir):
        if entry == final_name:
            continue
        entry_path = os.path.join(model_dir, entry)
        if os.path.isdir(entry_path):
            shutil.rmtree(entry_path)
        else:
            os.remove(entry_path)

    # Deterministic evaluation of the trained policy.
    sum_rewards = 0
    for _episode in range(n_eval_episodes):
        obs = env.reset()
        done = False

        while not done:
            action, _state = model.predict(obs, deterministic=True)
            obs, reward, done, _info = env.step(int(action))
            sum_rewards += reward

    return sum_rewards / n_eval_episodes

  if not hasattr(tensorboard, "__version__") or LooseVersion(
2023-09-03 12:17:04.297334: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-03 12:17:04.706007: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  import sre_constants


In [4]:
import optuna
import gym    
import slimevolleygym
from slimevolleygym import SurvivalRewardEnv
import numpy as np
from gym.wrappers.gray_scale_observation import GrayScaleObservation
from gym.wrappers.resize_observation import ResizeObservation
from atari_wrappers import RenderWrapper, BufferWrapper, ImageToPyTorch
from IPython.display import clear_output

def optimize_ppo(trial: optuna.Trial):
    """Optuna objective: train PPO with sampled hyperparameters and score it.

    Builds a fresh, wrapped SlimeVolley environment, samples ``learning_rate``
    and ``clip_range`` from the trial, trains through ``train_ppo`` into a
    folder named after the trial start time, and returns the mean reward that
    ``train_ppo`` reports.

    Args:
        trial: The Optuna trial supplying hyperparameter suggestions.

    Returns:
        The value returned by ``train_ppo`` for this trial.
    """
    env_name = 'SlimeVolleyNoFrameskip-v0'
    env = gym.make(env_name)
    env = SurvivalRewardEnv(env)
    env = RenderWrapper(env)
    env = ResizeObservation(env, (84, 84))
    env = GrayScaleObservation(env, True)
    env = ImageToPyTorch(env)
    env = BufferWrapper(env, 4, np.uint8)

    try:
        lr = trial.suggest_float('learning_rate', 0.0001, 0.001)
        clip_range = trial.suggest_float('clip_range', 0.1, 0.5)

        # The trial start time (second resolution) names the model folder.
        trial_date = trial.datetime_start.isoformat(timespec='seconds')
        mean_reward = train_ppo(env, f'ppo_train_{trial_date}', lr, clip_range)
    finally:
        # A new environment is created on every trial; release it so that
        # resources do not accumulate across trials.
        env.close()

    # Clear the notebook cell output.
    clear_output()

    # Mean reward over the evaluation episodes run by train_ppo.
    return mean_reward

In [4]:
# Create the study, or resume it from the SQLite storage if it already exists.
study = optuna.create_study(direction='maximize',
                        storage='sqlite:///resultado_optuna.db',
                        study_name='ppo_slime_volley',
                        load_if_exists=True)

# Maximize the value returned by optimize_ppo, running "n_trials" trials.
study.optimize(optimize_ppo, n_trials=10)

# Format the start time exactly as optimize_ppo does when naming the model
# folder, so the printed name matches the directory on disk.
best_trial_date = study.best_trial.datetime_start.isoformat(timespec='seconds')

print("BEST PARAMETERS:", study.best_params)
print("BEST MODEL:", f'ppo_train_{best_trial_date}')

[I 2023-09-01 00:07:34,642] Trial 19 finished with value: -0.0002299999999999961 and parameters: {'learning_rate': 0.00042264165874822634, 'clip_range': 0.4084153184980618}. Best is trial 12 with value: 0.003289999999999954.


BEST PARAMETERS: {'learning_rate': 0.00038629218695293544, 'clip_range': 0.48591957734571395}
BEST MODEL: ppo_train_2023-08-31 13:20:23.720199


In [7]:
# Reopen the persisted Optuna study to look up the best trial.
study = optuna.create_study(direction='maximize',
                        storage='sqlite:///resultado_optuna.db',
                        study_name='ppo_slime_volley',
                        load_if_exists=True)

# Rebuild the observation pipeline. SurvivalRewardEnv is not applied here,
# unlike in optimize_ppo.
env_name = 'SlimeVolleyNoFrameskip-v0'
env = gym.make(env_name)
env = RenderWrapper(env)
env = ResizeObservation(env, (84, 84))
env = GrayScaleObservation(env, True)
env = ImageToPyTorch(env)
env = BufferWrapper(env, 4, np.uint8)

best_trial_date = study.best_trial.datetime_start.isoformat(timespec='seconds')

# createOrLoadModel is unpacked as a (model, steps_done) pair in train_ppo;
# unpack it the same way here so that `model` is the model itself and not a tuple.
model, _steps_done = createOrLoadModel('ppo', env, 'ppo_', '_steps.zip', f'ppo_train_{best_trial_date}/')

# Deterministic evaluation of the best model.
N_EVAL_EPISODES = 1000
sum_rewards = 0
for _episode in range(N_EVAL_EPISODES):
    obs = env.reset()
    done = False
    while not done:
        action, _state = model.predict(obs, deterministic=True)
        obs, reward, done, _info = env.step(int(action))
        sum_rewards += reward

# Report the result; without this the accumulated reward is never shown.
print("MEAN REWARD:", sum_rewards / N_EVAL_EPISODES)

[I 2023-09-03 12:20:52,534] Using an existing study with name 'ppo_slime_volley' instead of creating a new one.


Loading ./models/ppo_train_2023-08-31T13:20:23/ppo_1000000_steps.zip


  deprecation(
  deprecation(
