In [1]:
import retro
import time
from gym import Env
from gym.spaces import MultiBinary, Box
import numpy as np
import cv2
import optuna
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from matplotlib import pyplot as plt
import os

#Troubleshooting
#!python --version
#%pip list
#%pip install gym
#%pip install gym-retro
#!python -m retro.import StreetFighterIISpecialChampionEdition-Genesis.md

In [2]:
class StreetFighter(Env):
    def __init__(self):
        super().__init__()
        self.observation_space = Box(low=0, high=255, shape=(84,84,1), dtype=np.uint8)
        self.action_space = MultiBinary(12)
        self.game = env = retro.make(game='StreetFighterIISpecialChampionEdition-Genesis', use_restricted_actions=retro.Actions.FILTERED)
        
    def step(self, action):
        obs, reward, done, info = self.game.step(action)
        obs = self.preprocess(obs)
        
        frame_delta = obs - self.previous_frame
        self.previous_frame = obs
        
        reward = info['score'] - self.score
        self.score = info['score']
        
        return frame_delta, reward, done, info
    
    def render(self):
        self.game.render()
    
    def reset(self):
        obs = self.game.reset()
        obs = self.preprocess(obs)
        self.previous_frame = obs
        self.score = 0
        return obs
    
    def preprocess(self, observation):
        gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        resize = cv2.resize(gray, (84,84), interpolation = cv2.INTER_CUBIC)
        channels = np.reshape(resize, (84,84,1))
        return channels
    
    def close(self):
        self.game.close()

In [None]:
#test render
frames = 0
env = StreetFighter()
obs = env.reset()
while True:
    if frames > 10000:
        break
    obs, rew, done, info = env.step(env.action_space.sample())
    env.render()
    frames += 1
    if done:
        obs = env.reset()

env.close()

In [3]:
LOG_DIR = './logs/'
OPT_DIR = './opt/'

In [4]:
def optimize_ppo(trial):
    return {
        'n_steps': trial.suggest_int('n_steps', 2048, 8192),
        'gamma': trial.suggest_loguniform('gamma', 0.8, 0.9999),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-4),
        'clip_range': trial.suggest_uniform('clip_range', 0.1, 0.4),
        'gae_lambda': trial.suggest_uniform('gae_lambda', 0.8, 0.99)
    } 

In [5]:
def optimize_agent(trial):
    try:
        model_params = optimize_ppo(trial)
        
        env = StreetFighter()
        env = Monitor(env, LOG_DIR)
        env = DummyVecEnv([lambda: env])
        env = VecFrameStack(env, 4, channels_order='last')
        
        model = PPO('CnnPolicy', env, tensorboard_log = LOG_DIR, verbose = 0, **model_params)
        model.learn(total_timesteps = 100000)
        
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)
        env.close()
        
        SAVE_PATH = os.path.join(OPT_DIR, 'tiral_{}_best_mmodel'.format(trial.number))
        model.save(SAVE_PATH)
        
        return mean_reward
    
    except Exception as e:
        return -1000
    
    
# def optimize_agent(trial):
#     try:
#         model_params = optimize_ppo(trial) 

#         # Create environment 
#         env = StreetFighter()
#         env = Monitor(env, LOG_DIR)
#         env = DummyVecEnv([lambda: env])
#         env = VecFrameStack(env, 4, channels_order='last')

#         # Create algo 
#         model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
#         model.learn(total_timesteps=30000)
#         #model.learn(total_timesteps=100000)

#         # Evaluate model 
#         mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)
#         env.close()

#         SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
#         model.save(SAVE_PATH)

#         return mean_reward

#     except Exception as e:
        
#         return -1000

In [6]:
study = optuna.create_study(direction = 'maximize')
study.optimize(optimize_agent, n_trials = 10, n_jobs = 1)

[32m[I 2023-05-06 16:45:08,424][0m A new study created in memory with name: no-name-ea5744ee-6443-492b-a632-fb2b363af41c[0m
  'gamma': trial.suggest_loguniform('gamma', 0.8, 0.9999),
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-4),
  'clip_range': trial.suggest_uniform('clip_range', 0.1, 0.4),
  'gae_lambda': trial.suggest_uniform('gae_lambda', 0.8, 0.99)
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=3263 and n_envs=1)
[32m[I 2023-05-06 16:48:41,702][0m Trial 0 finished with value: 2000.0 and parameters: {'n_steps': 3263, 'gamma': 0.9307606182488234, 'learning_rate': 3.7920860371647525e-05, 'clip_range': 0.13892361577235163, 'gae_lambda': 0.9288354759011572}. Best is trial 0 with value: 2000.0.[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=2558 and n_envs=1)
[32m[I 2023-05-06 16:51:33,618][0m Trial 1 finished with value: 0.0 and parameters: {'n_steps': 2558, 'gamma'

We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=6749 and n_envs=1)
[32m[I 2023-05-06 17:18:42,773][0m Trial 9 finished with value: 4200.0 and parameters: {'n_steps': 6749, 'gamma': 0.8530391376461304, 'learning_rate': 4.538756147210725e-05, 'clip_range': 0.11635498231932431, 'gae_lambda': 0.9795911164556375}. Best is trial 8 with value: 20400.0.[0m
