## Setting up Street Fighter

- Importing gym and retro.
- Loading the ROM file.
- Analysing the game space.
- Testing the game.

In [None]:
%pip install opencv-python matplotlib
%pip install torch==2.1.2+cu121 torchvision==0.16.2+cu121 torchaudio==2.1.2+cu121 --extra-index-url https://download.pytorch.org/whl/cu121
%pip install stable-baselines3==1.7.0
%pip install optuna
%pip install "shimmy>=2.0"

In [None]:
%pip install gym==0.21.0 gym-retro==0.8.0

In [None]:
import gym, retro

import os
import sys

import time

# Libraries necessary for data preprocessing.

from gym import Env  # Base environment class for a wrapper
from gym.spaces import MultiBinary, Box  # Ensure we pick the correct action space type. (Space shapes for the environment)

import numpy as np  # To calculate frame delta
import cv2  # For grayscaling

from matplotlib import pyplot as plt  # For plotting observation images


# Libraries for training
import optuna  # Optimization framework that allows to both train and tune at the same time
from stable_baselines3 import PPO  # PPO algorithm for RL
from stable_baselines3.common.evaluation import evaluate_policy  # Metric calculation of agent performance
from stable_baselines3.common.monitor import Monitor  # SB3 Monitor for logging
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack  # Vec wrappers to vectorize and frame stack

from stable_baselines3.common.callbacks import BaseCallback


In [None]:
# Optional: Check versions

print(f"Python version: {sys.version}")
print(f"Retro version: {retro.__version__}")
print(f"Gym version: {gym.__version__}")

In [None]:
retro.data.list_games()

In [None]:
# Start game environment
env = retro.make(game='StreetFighterIISpecialChampionEdition-Genesis')

In [None]:
# Sample the actions available - MultiBinary
env.action_space.sample()

In [None]:
# Sample the observation space
env.observation_space.sample()

In [None]:
# Smoke test: play the raw game with random actions to check that everything works.

# We only play one game
for game in range(1):

    # Every game starts from a fresh state with the episode flag cleared,
    # so the loop still works if the number of games is raised.
    obs = env.reset()
    done = False

    # Keep stepping until the emulator reports the episode is over.
    while not done:
        # Render environment
        env.render()

        # We take random actions inside the environment
        obs, reward, done, info = env.step(env.action_space.sample())

        # Delay between frames; raise it above 0 to slow the render down.
        time.sleep(0)

        # We print the reward
        print(reward)

In [None]:
# Once the testing is finished we close the environment and see what happened.

env.close()
info

## Preprocessing the Environment

Observation preprocessing:
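1. Capture movement between frames (frame delta). Note: the wrapper below currently returns each preprocessed frame as-is; motion information comes from stacking 4 frames with `VecFrameStack` during training.
2. Speed up training by grayscaling and resizing frames from 200x256x3 to 84x84x1 (153,600 values vs 7,056).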

Action preprocessing:
1. Filtering actions (parameters).
2. Redefine reward functions.

In [None]:
# Create custom environment
class StreetFighter(Env):
    """Gym environment wrapping the Street Fighter II retro game.

    Observations are single grayscale frames of shape (84, 84, 1) and dtype
    uint8. The emulator reward is replaced by one computed from the health
    values in ``info``: health lost by the enemy counts double, health lost
    by the player counts once (as a negative term).
    """

    def __init__(self):
        # Initialise the base environment
        super().__init__()

        # Spaces exposed to the agent: the resized grayscale frame and the
        # 12-entry MultiBinary action vector of the base environment.
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
        self.action_space = MultiBinary(12)

        # Start the game. FILTERED limits actions to only valid ones.
        self.game = retro.make(
            game='StreetFighterIISpecialChampionEdition-Genesis',
            use_restricted_actions=retro.Actions.FILTERED,
        )

    def reset(self):
        """Restart the game and return the first preprocessed frame."""
        obs = self.preprocess(self.game.reset())

        # Placeholder for a frame delta (not applied in step); it has the same
        # shape and dtype as an observation so it could be subtracted directly.
        self.previous_frame = np.zeros(self.observation_space.shape, dtype=np.uint8)

        # Health values the reward deltas are measured against; both bars are
        # assumed to start at 176.
        self.health = 176
        self.enemy_health = 176

        # Attribute to hold delta score.
        self.score = 0

        return obs

    def preprocess(self, observation):
        """Return ``observation`` as an (84, 84, 1) grayscale frame."""
        # Retro frames are RGB, so use the RGB conversion; the BGR variant
        # would swap the red and blue channel weights.
        gray = cv2.cvtColor(observation, cv2.COLOR_RGB2GRAY)

        resized = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_CUBIC)

        # Add the trailing channel axis that the observation space declares.
        return np.reshape(resized, (84, 84, 1))

    def step(self, action):
        """Advance the game one step.

        Returns the preprocessed frame, the reshaped reward, the done flag
        and the info dict reported by the emulator.
        """
        # The emulator's own reward is discarded in favour of the health-based one.
        obs, _, done, info = self.game.step(action)
        obs = self.preprocess(obs)

        # Positive when the enemy lost health / negative when the player did.
        delta_enemy = self.enemy_health - info['enemy_health']
        delta_self = info['health'] - self.health
        reward = delta_enemy * 2 + delta_self

        # Remember the current values for the next step's deltas.
        self.health = info['health']
        self.enemy_health = info['enemy_health']

        return obs, reward, done, info

    def render(self, *args, **kwargs):
        """Render the game, forwarding arguments (e.g. ``mode``) and the result."""
        return self.game.render(*args, **kwargs)

    def close(self):
        """Shut the emulator down."""
        self.game.close()

In [None]:
# We close any environment that could be open
env.close()

In [None]:
env = StreetFighter()  # We instance the created class

In [None]:
env.observation_space.shape

In [None]:
env.action_space

In [None]:
# Smoke test: play the wrapped environment with random actions.

# We only play one game
for game in range(1):

    # Every game starts from a fresh state with the episode flag cleared,
    # so the loop still works if the number of games is raised.
    obs = env.reset()
    done = False

    # Keep stepping until the environment reports the episode is over.
    while not done:
        # Render environment
        env.render()

        # We take random actions inside the environment
        obs, reward, done, info = env.step(env.action_space.sample())

        # We slow down the renders so they are watchable
        time.sleep(0.01)

        # We only print positive rewards
        if reward > 0:
            print(reward)

## Hyperparameter tuning

We will use PyTorch, Stable Baselines3 and Optuna to get the model's best training parameters.

For PPO (Proximal Policy Optimization) we will tune the following hyperparameters:
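- n_steps: number of steps collected per environment before each policy update (size of the rollout buffer)
- gamma: discount rate for calculating returns
- learning_rate: learning coefficient for the optimizer
- clip_range: clipping range of the policy probability ratio in the PPO objective
- gae_lambda: smoothing parameter of the generalized advantage estimator (bias/variance trade-off)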

PyTorch: https://pytorch.org/get-started/locally/

Stable Baselines3: https://stable-baselines3.readthedocs.io/en/master/guide/install.html

Optuna: https://optuna.org/#installation

In [None]:
# Directories where logs and the models saved during optimization are stored

LOG_DIR = './logs/'  # SB3 has the ability to log out to a support log
OPT_DIR = './opt/'  # Location to save every single model after every try

# Create both folders up front so later cells never depend on a library
# creating a missing directory for them.
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(OPT_DIR, exist_ok=True)

In [None]:
# Hyperparameter function to return test hyperparameters - define the objective function

def optimize_ppo(trial):  # Hyperparameter sampler used by the objective
    """Sample one set of PPO hyperparameters from an Optuna trial.

    Args:
        trial: Optuna trial providing ``suggest_int`` and ``suggest_float``.

    Returns:
        dict keyed by PPO keyword-argument name, ready to be unpacked into
        the ``PPO`` constructor.
    """
    return {
        # Ranges of possible values that will be optimized.
        # n_steps is sampled in steps of 64 so it stays a multiple of 64.
        'n_steps': trial.suggest_int('n_steps', 2048, 8192, step=64),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform;
        # plain suggest_float replaces the deprecated suggest_uniform.
        'gamma': trial.suggest_float('gamma', 0.8, 0.999, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-4, log=True),
        'clip_range': trial.suggest_float('clip_range', 0.1, 0.4),
        'gae_lambda': trial.suggest_float('gae_lambda', 0.8, 0.99),
    }

# When we train we will get a set of best parameters

In [None]:
# Objective function: train one model per trial and return its mean evaluation reward

eval_episodes = 5  # Number of games each trained model is evaluated on. More = better.
# Total timesteps each trial trains for (not PPO's `n_steps` hyperparameter).
# More = better but also a longer training time. 100k is good, 30k is quick but inaccurate.
n_steps = 30000

def optimize_agent(trial):
    """Train and evaluate one PPO model for an Optuna trial.

    Args:
        trial: Optuna trial used to sample hyperparameters and name the saved model.

    Returns:
        Mean reward over ``eval_episodes`` evaluation games, or -1000 if the
        trial raised an exception.
    """
    env = None
    try:
        model_params = optimize_ppo(trial)  # Hyperparameters sampled for this trial

        # Create environment
        env = StreetFighter()
        env = Monitor(env, LOG_DIR)  # We specify the location where monitor values will be exported to
        env = DummyVecEnv([lambda: env])  # The lambda is called inside the constructor, while env is still the Monitor
        env = VecFrameStack(env, 4, channels_order='last')  # We will stack 4 different frames

        # Create training algorithm, unpacking the sampled hyperparameters into PPO
        model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
        model.learn(total_timesteps=n_steps)

        # evaluate_policy returns (mean reward, std of reward); only the mean is the objective value
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=eval_episodes)

        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)  # We save all models to get the best one

        return mean_reward

    except Exception as e:
        # Best effort: a failed trial gets a very low score so the study keeps
        # going, but the cause is reported instead of being silently dropped.
        print('Trial {} failed: {!r}'.format(trial.number, e))
        return -1000

    finally:
        # Always release the emulator, also when the trial failed half-way,
        # so an open instance cannot break the trials that follow.
        if env is not None:
            env.close()


In [None]:
# Tuning

study = optuna.create_study(direction='maximize')  # We create the experiment / study that seeks to maximize the mean reward
study.optimize(optimize_agent, n_trials=10, n_jobs=1)  # We optimize the study based on the agent created, and how many sets we will set. 10 is good for testing, 100+ is recommended for a good model

# NOTE: Using 100k timesteps on the model and 100 trials can take a long time to train (depending on the strength of the gpu from a few hours to a couple of days)

# If we wanted to speed things up while keeping accuracy, we could raise n_jobs; however, retro does not support more than one environment at once. We can work
# around this by using retrowrapper: https://github.com/MaxStrange/retrowrapper. It allows multiple instances at once, which can speed training up considerably.

In [None]:
study.best_params

In [None]:
# To pass it through a model we use
# model = PPO.load(os.path.join(OPT_DIR, 'trial_0_best_model.zip'))

## Fine Tuning

In [None]:
# Setup Callback

class TrainAndLoggingCallback(BaseCallback):
    """Callback that saves the model every ``check_freq`` calls.

    Checkpoints are written to ``save_path`` as ``best_model_<n_calls>``.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        super().__init__(verbose)
        self.save_path = save_path
        self.check_freq = check_freq

    def _init_callback(self):
        # Nothing to prepare when no checkpoint folder was given.
        if self.save_path is None:
            return
        os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        is_checkpoint_step = self.n_calls % self.check_freq == 0
        if is_checkpoint_step:
            checkpoint_name = 'best_model_{}'.format(self.n_calls)
            self.model.save(os.path.join(self.save_path, checkpoint_name))

        # Always report success so training carries on.
        return True

In [None]:
CHECKPOINT_DIR = './train/'

In [None]:
callback = TrainAndLoggingCallback(check_freq=10000, save_path=CHECKPOINT_DIR)  # We will save the model every 10k steps

In [None]:
model_params = study.best_params

In [None]:
env.close()

In [None]:
# Model definition: build the environment the final model trains on

# Game wrapper with monitor logging to LOG_DIR
env = Monitor(StreetFighter(), LOG_DIR)
# Vectorize it and stack 4 frames; the lambda is called inside the DummyVecEnv
# constructor, while `env` still refers to the Monitor-wrapped game.
env = VecFrameStack(DummyVecEnv([lambda: env]), 4, channels_order='last')

In [None]:
model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, **model_params)

# Reload the weights saved by the best HPO trial, the same trial model_params comes from.
# PPO.load is a classmethod that returns a new model, so calling it on this instance and
# discarding the result would load nothing; set_parameters updates this model in place.
best_model_path = os.path.join(OPT_DIR, 'trial_{}_best_model.zip'.format(study.best_trial.number))
model.set_parameters(best_model_path)

In [None]:
# Training
model.learn(total_timesteps=100000, callback=callback)  # Bigger is better, for example 5-20 million.
# model.learn(total_timesteps=5000000)

## Model Testing and Evaluating

We can load and visualize the training result.

In [None]:
model = PPO.load('./train/best_model_10000.zip')

In [None]:
mean_reward, _ = evaluate_policy(model, env, render=False, n_eval_episodes=5)
mean_reward

In [None]:
# Watch the trained agent play

# We play two games
for game in range(2):

    # Every game starts from a fresh state with the episode flag cleared;
    # without this the second game would be skipped because done is still set.
    obs = env.reset()
    done = False

    # Keep stepping until the environment reports the episode is over.
    while not done:
        # Render environment
        env.render()

        # The model picks the action; the second value returned by predict is not needed.
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)

        # We slow down the renders so they are watchable
        time.sleep(0.01)

        # We only print positive rewards
        if reward > 0:
            print(reward)