In [9]:
from stable_baselines3 import DQN
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack, VecVideoRecorder
from stable_baselines3.common.evaluation import evaluate_policy
import gymnasium
import ale_py
import os
import optuna 

In [19]:
def obj(trial):
    # Environment creation
    env = make_atari_env("SpaceInvadersNoFrameskip-v4",n_envs=4,seed=0)
    # Stack 4 frames
    env = VecFrameStack(env, n_stack=4)

    learning_starts = trial.suggest_int("learning_starts", 1000, 100_000)
    target_update_interval = trial.suggest_int("target_update_interval", 1, 1000)
    exploration_fraction = trial.suggest_float("exploration_fraction", 0.1, 0.5)
    exploration_final_eps = trial.suggest_float("exploration_final_eps", 0.01, 0.1)
    batch_size = trial.suggest_int("batch_size", 32, 128)
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-3)
    gradient_steps = trial.suggest_int("gradient_steps", 1, 10)
    buffer_size = trial.suggest_int("buffer_size", 1000, 100_000)
    train_freq = trial.suggest_int("train_freq", 1, 10)

    # Create the agent and train it
    agent = DQN("CnnPolicy", env, verbose=1,buffer_size=buffer_size, learning_starts=learning_starts, train_freq=train_freq, 
                target_update_interval=target_update_interval, exploration_fraction=exploration_fraction, 
                exploration_final_eps=exploration_final_eps, batch_size=batch_size, learning_rate=learning_rate, 
                gradient_steps=gradient_steps, optimize_memory_usage=False)
    agent.learn(total_timesteps=10_000)

    mean_reward, _= evaluate_policy(agent, env, n_eval_episodes=10, deterministic=True)
    env.close()
    return mean_reward

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(obj, n_trials=100, n_jobs=1)

print("Best hyperparameters:", study.best_params)

[I 2024-11-26 20:46:36,657] A new study created in memory with name: no-name-378962ba-ce17-4e33-9179-9044dd9e7b37
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-3)


Using cpu device
Wrapping the env in a VecTransposeImage.
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.376    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 845      |
|    time_elapsed     | 1        |
|    total_timesteps  | 1512     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.92e+03 |
|    ep_rew_mean      | 128      |
|    exploration_rate | 0.216    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 868      |
|    time_elapsed     | 2        |
|    total_timesteps  | 1900     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.16e+03 |
|    ep_rew_mean      | 212      |
|    exploration_rate | 0.0926   |
| time/               |          |
|    episodes         | 12      

[I 2024-11-26 20:46:55,108] Trial 0 finished with value: 18.5 and parameters: {'learning_starts': 16798, 'target_update_interval': 17, 'exploration_fraction': 0.22003061018182227, 'exploration_final_eps': 0.09260000030386376, 'batch_size': 126, 'learning_rate': 2.1659979960734803e-05, 'gradient_steps': 6, 'buffer_size': 2429, 'train_freq': 1}. Best is trial 0 with value: 18.5.


Using cpu device
Wrapping the env in a VecTransposeImage.
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.614    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 1139     |
|    time_elapsed     | 1        |
|    total_timesteps  | 1512     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.92e+03 |
|    ep_rew_mean      | 128      |
|    exploration_rate | 0.515    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 1174     |
|    time_elapsed     | 1        |
|    total_timesteps  | 1900     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.16e+03 |
|    ep_rew_mean      | 212      |
|    exploration_rate | 0.314    |
| time/               |          |
|    episodes         | 12      

[I 2024-11-26 20:47:11,257] Trial 1 finished with value: 18.5 and parameters: {'learning_starts': 90416, 'target_update_interval': 907, 'exploration_fraction': 0.36055464260842873, 'exploration_final_eps': 0.07934436918078966, 'batch_size': 115, 'learning_rate': 3.078659648710743e-05, 'gradient_steps': 7, 'buffer_size': 50197, 'train_freq': 3}. Best is trial 0 with value: 18.5.


Using cpu device
Wrapping the env in a VecTransposeImage.
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.372    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 1089     |
|    time_elapsed     | 1        |
|    total_timesteps  | 1512     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.92e+03 |
|    ep_rew_mean      | 128      |
|    exploration_rate | 0.21     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 1139     |
|    time_elapsed     | 1        |
|    total_timesteps  | 1900     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.16e+03 |
|    ep_rew_mean      | 212      |
|    exploration_rate | 0.0697   |
| time/               |          |
|    episodes         | 12      

In [None]:
# Create a folder to save videos
video_folder = "videos/"
os.makedirs(video_folder, exist_ok=True)
# Wrap environment for video recording
recording_env = VecVideoRecorder(env, video_folder,
                    record_video_trigger=lambda x: True,
                    video_length=1000) # Record every 5000 steps
obs = recording_env.reset()

# Record video
for _ in range(1000):
    action, _ = agent.predict(obs, deterministic=True)
    obs, reward, done, info = recording_env.step(action)
    if done.all():
        obs = recording_env.reset()
recording_env.close() 
print(f"Video saved in {video_folder}")