# Environment Setup

In [1]:
#!pip install gym gym-retro
#!pip install opencv-python

In [2]:
# Running Environment_Setup.ipynb to download the ROM and set up the custom Sonic environment.
# NOTE(review): `Sonic` and `ActionDiscretizer`, used further down, are not imported anywhere
# in this notebook, so they are presumably defined by this %run - confirm.
%run Environment_Setup.ipynb

Importing SonicTheHedgehog2-Genesis
Imported 1 games


# Hyperparameter Search

In [3]:
#!pip install stable-baselines3[extra] optuna

In [4]:
# Searching for best parameters
import optuna
# PPO algorithm
from stable_baselines3 import PPO
# Eval policy method for metric calculation
from stable_baselines3.common.evaluation import evaluate_policy
# Monitor for logging 
from stable_baselines3.common.monitor import Monitor
# Vec wrappers to vectorize and frame stack
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
# To deal with filepaths
import os

In [5]:
# Directory passed to Monitor and used as the TensorBoard log location for PPO
LOG_DIR = './logs/'
# Directory where the model trained in each Optuna trial is saved
OPT_DIR = './opt/'

In [6]:
# Function to return test hyperparameters - defines the search space sampled for each trial
def optimize_ppo(trial):
    """Sample one set of PPO hyperparameters from the search space.

    Args:
        trial: Optuna trial, or any object exposing compatible
            ``suggest_float`` and ``suggest_int`` methods.

    Returns:
        dict: Sampled values keyed by ``gamma``, ``n_steps``,
        ``learning_rate``, ``clip_range`` and ``gae_lambda``. The dict is
        unpacked as keyword arguments into the ``PPO`` constructor by
        ``optimize_agent``.
    """
    return {
        'gamma': trial.suggest_float('gamma', 0.98, 0.999, log=True),
        # `step` must be passed by keyword: newer Optuna releases declare it
        # keyword-only and emit a deprecation warning for positional use.
        'n_steps': trial.suggest_int('n_steps', 4096, 16_384, step=2048),
        'learning_rate': trial.suggest_float('learning_rate', 5e-8, 9e-6),
        'clip_range': trial.suggest_float('clip_range', 0.10, 0.24),
        'gae_lambda': trial.suggest_float('gae_lambda', 0.98, 0.99)
    }

In [8]:
# Run a training loop and return mean reward
def optimize_agent(trial):
    """Optuna objective: train PPO with sampled hyperparameters and score it.

    Samples hyperparameters via ``optimize_ppo``, builds the wrapped Sonic
    environment, loads the parameters stored at
    ``train/old_gen/PPO-Sonic-Agent1`` into a fresh PPO model, trains it,
    evaluates it for one episode, and saves the model under ``OPT_DIR``.

    Args:
        trial: Optuna trial used to sample hyperparameters and to name the
            saved model file.

    Returns:
        Mean episode reward reported by ``evaluate_policy``.

    Raises:
        Any exception raised while building the environment, loading the
        checkpoint, training or evaluating propagates to the caller; the
        environment is closed first.
    """
    model_params = optimize_ppo(trial)

    # Create environment
    env = Sonic()
    try:
        env = ActionDiscretizer(env)
        env = Monitor(env, LOG_DIR)
        # Give the monitored env its own name so the factory below does not
        # close over `env`, which is rebound to the vectorized wrapper.
        monitored_env = env
        env = DummyVecEnv([lambda: monitored_env])
        env = VecFrameStack(env, 4, channels_order='last')

        # Create model
        model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
        # Start from the parameters of a previously saved agent
        model.set_parameters("train/old_gen/PPO-Sonic-Agent1")
        model.learn(total_timesteps=1_400_000)

        # Evaluate model
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=1)
    finally:
        # Always release the environment, even when a step above fails.
        # NOTE(review): Sonic is presumably backed by gym-retro, which allows
        # only one emulator instance per process - a leaked env would then
        # break every later trial in this kernel. Confirm.
        env.close()

    SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
    model.save(SAVE_PATH)

    return mean_reward

In [None]:
# Creating the experiment: maximize the mean reward returned by optimize_agent
N_TRIALS = 40
TIMEOUT_SECONDS = 60 * 60 * 42  # 42 hours

study = optuna.create_study(direction='maximize')
study.optimize(
    optimize_agent,
    n_trials=N_TRIALS,
    n_jobs=1,
    timeout=TIMEOUT_SECONDS,
)

[I 2022-11-21 07:45:23,768] A new study created in memory with name: no-name-77355664-01f4-449c-b409-5fb8ca38ddd3
[I 2022-11-21 10:29:17,144] Trial 0 finished with value: -1089.2 and parameters: {'gamma': 0.9956193908499814, 'n_steps': 10240, 'learning_rate': 1.3846472113026198e-06, 'clip_range': 0.11181817995205309, 'gae_lambda': 0.9837323249216188}. Best is trial 0 with value: -1089.2.
[I 2022-11-21 14:00:15,255] Trial 1 finished with value: -1146.21 and parameters: {'gamma': 0.9837511303348656, 'n_steps': 6144, 'learning_rate': 6.880575471085832e-06, 'clip_range': 0.15161587643785795, 'gae_lambda': 0.9837665274982914}. Best is trial 0 with value: -1089.2.


In [None]:
# Plotting the history of trials
optuna.visualization.plot_optimization_history(study)

In [None]:
# Plotting hyperparameter importances: which parameters most influence the score of our agent
# (target_name only sets the label used for the objective value in the figure)
optuna.visualization.plot_param_importances(study, target_name="score")

In [None]:
# Plotting a scatter (slice) plot of the score against every tuned parameter.
# `params` is documented as a list of parameter names, so the dict keys of
# best_params are materialized into a list instead of passing a dict view.
optuna.visualization.plot_slice(study, params=list(study.best_params), target_name="score")

In [None]:
# Plotting a contour plot that shows how pairs of parameters relate to the score
optuna.visualization.plot_contour(study, target_name="score")

In [None]:
# Getting the best trial of the study; its parameters are the ones to train our agent with.
# The bare expression on the last line makes the notebook display the trial.
best_trial = study.best_trial
best_trial