In [1]:
import retro

# Setup Environment

Preprocess - grayscale, frame delta, resize the frame so we have less pixels
Filter the actions - parameter 
reward function 

In [2]:
# Import environment base class for the wrapper
from gym import Env
# import the space shapes for the environment
from gym.spaces import MultiBinary, Box
# import numpy for calculations of frame delta
import numpy as np 
#import open cv for grayscaling 
import cv2
# import matplotlib - image plotting 
from matplotlib import pyplot as plt
import time

In [3]:
# create custom environment 
class StreetFighter(Env):
    def __init__(self):
        super().__init__()
        self.observation_space = Box(low=0, high=255, shape=(84,84,1), dtype=np.uint8)
        self.action_space = MultiBinary(12)
        # start game retro.Actions.FILTERED filters it to only valid button combinations for input
        self.game = retro.make(game ='StreetFighterIISpecialChampionEdition-Genesis', 
                               use_restricted_actions = retro.Actions.FILTERED)
    
    def step(self, action):
        obs, reward, done, info = self.game.step(action)
        obs = self.preprocess(obs)
        # frame delta 
        frame_delta = obs - self.previous_frame
        self.previous_frame = obs 
        # reshape the reward function 
        reward = info['score'] - self.score 
        self.score = info['score']

        return frame_delta, reward, done, info
    
    def reset(self):
        # return first frame
        obs = self.game.reset()
        obs = self.preprocess(obs)
        # pre-processing ToDo
        self.previous_frame = obs
        # create score delta variable 
        self.score = 0
        return obs
    
    def preprocess(self, observation):
        gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        # resize 
        resize = cv2.resize(gray, (84,84), cv2.INTER_CUBIC)
        resize = np.reshape(resize,(84,84,1))
        return resize

    def render(self, *args, **kwargs):
        self.game.render()

    def close(self): 
        self.game.close()

# DO NOT RUN JUST USED FOR TESTING 

In [4]:
# this is just for running the inital environment to see if the game is loading correctly
env = StreetFighter()

In [5]:
env.close()

In [5]:
# rest to starting state 
obs = env.reset()
# set flag to false 
done = False 
# play game once 
for game in range(1): 
    # play until not dead
    while not done: 
        if done:
            obs = env.reset()
        env.render()
        obs, reward, done, info = env.step(env.action_space.sample())
        plt.imshow(cv2.cvtColor(obs, cv2.COLOR_BGR2RGBA)) # show changes in frames 
        time.sleep(0.01)
        if reward > 0:
            print(reward)



500
500
100
100
1000
500
500
1000
500
500
1500
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
1000
1000
1000
1000
1000
1000
1000
1000
10000
500
1000
500
500
300
100


: 

# Start Running HERE

In [4]:
import optuna
# eval policy for metric calculation
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
# import the sb3 monitor for logging 
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
import os 

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# set directories for logs and optimization functions 
LOG_DIR = './logs/'
OPT_DIR = './opt/'

In [8]:
#Function to return test hyperparameters - define the object function 
def optimize_ppo(trial): # optimize will return this dictionary with values between the provided values
    return{
        'n_steps': trial.suggest_int('n_steps', 2048, 8192),
        'gamma': trial.suggest_float('gamma', 0.8, 0.9999, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-4, log=True),
        'clip_range': trial.suggest_float('clip_range', 0.1, 0.4),
        'gae_lambda': trial.suggest_float('gae_lambda', 0.8, 0.99)
    }

In [9]:
# run a training loop and return a mean reward 
def optimize_agent(trial):
    try:
        model_params = optimize_ppo(trial)

        # create environment 
        env = StreetFighter()
        env = Monitor(env, LOG_DIR)
        env = DummyVecEnv([lambda: env])
        env = VecFrameStack(env, 4, channels_order='last')

        # create algo 
        model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
        # 30k is under shooting - just for speed and example - timesteps are frames in game
        model.learn(total_timesteps=30000)

        # evaluate model n_eval should be higher, is low for testing 
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=300)
        env.close()

        # save copies of optimized models 
        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)

        return mean_reward
    except Exception as e: 
        return -1000

In [9]:
# creates a study that wants to maximize the return 
study = optuna.create_study(direction='maximize')
# n_trails is 1 for example, 100 is better for actual training 
study.optimize(optimize_agent, n_trials=30, n_jobs=1)

[I 2023-10-01 16:51:19,717] A new study created in memory with name: no-name-64c78297-7762-4ac0-9c2b-3380fafb85e6
[I 2023-10-01 16:51:20,382] Trial 1 finished with value: -1000.0 and parameters: {'n_steps': 5304, 'gamma': 0.8721358974710086, 'learning_rate': 5.1288944547369e-05, 'clip_range': 0.32179574293687196, 'gae_lambda': 0.8895567432134891}. Best is trial 1 with value: -1000.0.
[I 2023-10-01 16:51:20,444] Trial 2 finished with value: -1000.0 and parameters: {'n_steps': 7089, 'gamma': 0.8444697160941264, 'learning_rate': 2.2133328450608565e-05, 'clip_range': 0.1986997786948363, 'gae_lambda': 0.912620980319527}. Best is trial 1 with value: -1000.0.
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=4996 and n_envs=1)
[I 2023-10-01 16:51:20,538] Trial 3 finished with value: -1000.0 and parameters: {'n_steps': 3971, 'gamma': 0.8866416986830854, 'learning_rate': 1.3159131842757735e-05, 'clip_range': 0.18012774837568582, 'gae_lambda': 0.98234416536

In [None]:
# show the best parameters returned from the study
study.best_params

# Load finished hyperparameters 

In [10]:
# this will load a model from the saved models 
study = PPO.load(os.path.join(OPT_DIR, 'trial_7_best_model.zip'))

# Build Model

In [11]:
from stable_baselines3.common.callbacks import BaseCallback

In [12]:
class TrainAndLoggingCallback(BaseCallback): 

    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq 
        self.save_path = save_path 

    def _init_callback(self):
        if self.save_path is not None: 
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0: 
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)
        return True

In [13]:
CHECKPOINT_DIR = './train/'

In [14]:
callback = TrainAndLoggingCallback(check_freq=10000, save_path=CHECKPOINT_DIR)

# Train Model

In [6]:
# Create environment 
env = StreetFighter()
env = Monitor(env, LOG_DIR)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, 4, channels_order='last')

In [18]:
model_params = {
    'n_steps': 7516,
    'gamma': 0.9085173842732223,
    'learning_rate': 5.02771591344835e-05,
    'clip_range': 0.39105070719865653
}
# make sure to overide the params step values in the params to the nearest num divisible by 64

In [15]:
model_params['n_steps'] = 7488

In [16]:
model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params)

In [17]:
model.load(os.path.join(OPT_DIR, 'trial_7_best_model.zip'))
model.learn(total_timesteps=30000, callback=callback)
# again 30k is pretty big undershoot 
# if poor perfomance maybe put model_params['learning_rate'] = 5e-7

<stable_baselines3.ppo.ppo.PPO at 0x20921265bb0>

# Evaluate Model

In [7]:
model = PPO.load('./train/best_model_30000')
mean_reward, _ = evaluate_policy(model, env, render=True, n_eval_episodes=5)



KeyboardInterrupt: 

: 

# Test Model

In [None]:
obs = env.reset()
obs.shape

In [None]:
# rest to starting state 
obs = env.reset()
# set flag to false 
done = False 
# play game once 
for game in range(1): 
    # play until not dead
    while not done: 
        if done:
            obs = env.reset()
        env.render()
        action = model.predict(obs)[0] # get an action prediction from model
        obs, reward, done, info = env.step(action) # pass into the game
        time.sleep(0.01)
        if reward > 0:
            print(reward)