In [None]:
%pip install gym==0.21.0 gym-retro==0.8.0
%pip install opencv-python matplotlib
%pip install torch==2.1.2+cu121 torchvision==0.16.2+cu121 torchaudio==2.1.2+cu121 --extra-index-url https://download.pytorch.org/whl/cu121
%pip install stable-baselines3==1.7.0
%pip install stable-baselines3[extra] optuna
%pip install optuna
%pip install "shimmy>=2.0"
%pip install tensorboard tensorboardX

%pip install git+https://github.com/MaxStrange/retrowrapper.git
%pip install --upgrade protobuf==3.20.3

In [2]:
import gym, retro
import retrowrapper

import time
import os
import sys

from gym import Env
from gym.spaces import MultiBinary, Box

import numpy as np
import cv2

from matplotlib import pyplot as plt

import optuna
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3.common.callbacks import BaseCallback
import tensorboard
from tensorboard import program

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(f"Python version: {sys.version}")
print(f"Retro version: {retro.__version__}")
print(f"Gym version: {gym.__version__}")

Python version: 3.8.10 (tags/v3.8.10:3d8993a, May  3 2021, 11:48:03) [MSC v.1928 64 bit (AMD64)]
Retro version: 0.8.0
Gym version: 0.21.0


In [3]:
# Create custom environment
class StreetFighter(Env):
    def __init__(self):

        # Inherit from our base environment
        super().__init__()

        # Specify action and observation spaces
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)  # We create our observation space based on the new size and colors
        self.action_space = MultiBinary(12)  # We replicate the base action environment

        # Startup and instance the game
        # The second parameter will limit actions to only valid ones.
        self.game = retrowrapper.RetroWrapper(game='StreetFighterIISpecialChampionEdition-Genesis', use_restricted_actions = retro.Actions.FILTERED)

        self.w_enemy_damage = 1.5
        self.w_self_damage = -1.0
        self.w_score = 0.001
        self.w_repeat_action = -0.01
        self.w_match_won = 50

    def reset(self):
        # Return first frame, preprocess the frame, and define score back to 0.

        self.previous_frame = np.zeros(self.game.observation_space.shape)

        obs = self.game.reset()  # Will return our observation
        obs = self.preprocess(obs)  # We preprocess the observation

        self.health = 176
        self.enemy_health = 176
        self.matches_won = 0
        self.enemy_matches_won = 0
        self.score = 0
        
        # Game delta = Current_frame - Previous_frame
        # Preprocess
        self.previous_frame = obs


        return obs
    
    def preprocess(self, observation):
        # Grayscale, and resize frame
        
        # Grayscaling
        gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)

        # Resizing
        resize = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_CUBIC)
        
        channel = np.reshape(resize, (84, 84, 1))  # We add the grayscale layer since its what gym expects

        return channel

    def step(self, action):
        # We take a step, preprocess the observation, calculate frame delta and reshape the reward function

        # Take a step
        obs, reward, done, info = self.game.step(action)  # New step based on an action

        obs = self.preprocess(obs)  # We preprocess the observation

        if hasattr(self, 'prev_action') and np.array_equal(action, self.prev_action):
            reward += self.w_repeat_action
        self.prev_action = action


        # Frame delta

        # We subtract the current one from the previous one and then we set the current as the last one.
        frame_delta = obs  # - self.previous_frame
        # self.previous_frame = obs

        delta_enemy = self.enemy_health - info['enemy_health']
        delta_self = info['health'] - self.health
        delta_score = info['score'] - self.score

        match_result = 0

        if info['matches_won'] > self.matches_won:
            match_result = self.w_match_won
        if info['enemy_matches_won'] > self.enemy_matches_won:
            match_result = -self.w_match_won


        reward = (self.w_enemy_damage * delta_enemy) + (self.w_self_damage * delta_self) + (self.w_score * delta_score) + match_result

       # Update values
        self.health = info['health']
        self.enemy_health = info['enemy_health']
        self.matches_won = info['matches_won']
        self.enemy_matches_won = info['enemy_matches_won']
        self.score = info['score']

        return frame_delta, reward, done, info


    def render(self, *args, **kwargs):
        # We render the game
        self.game.render()

    def close(self):
        # We close the game
        self.game.close()

In [4]:
env = StreetFighter()
env.close()

In [5]:
# Directories where saved optimization models are going to be saved

LOG_DIR = './logs/'  # SB3 has the ability to log out to a support log
OPT_DIR = './opt/'  # Location to save every single model after every try

In [6]:
# Hyperparameter function to return test hyperparameters - define the objective function

def optimize_ppo(trial):  # i.e. objective
    return {
        # Ranges of possible values that will be optimized
        'n_steps': trial.suggest_int('n_steps', 2048, 8192, step=64),  # SB3 requires  the range to be a multiple of 64  
        'gamma': trial.suggest_float('gamma', 0.8, 0.999, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-8, 1e-5, log=True),
        'clip_range': trial.suggest_float('clip_range', 0.1, 0.4),
        'gae_lambda': trial.suggest_float('gae_lambda', 0.8, 0.99),
    }

# When we train we will get a set of best parameters

In [7]:
# Hyperparameter function to run a training loop and return mean 

def optimize_agent(trial):
    try:
        model_params = optimize_ppo(trial)  # Variable where we store the parameters from the previous function

        # número de entornos paralelos que quieres
        n_envs = 16

        # Asegúrate de que LOG_DIR existe
        os.makedirs(LOG_DIR, exist_ok=True)

        # factory para generar cada env (monitoreado y con archivo único por env)
        def make_env(rank):
            def _init():
                env = StreetFighter()                                     # usa la clase modificada que ahora usa retrowrapper
                monitor_path = os.path.join(LOG_DIR, f"monitor_{rank}.csv")
                env = Monitor(env, filename=monitor_path)                 # evita colisiones de Monitor
                return env
            return _init

        env = DummyVecEnv([make_env(i) for i in range(n_envs)])
        env = VecFrameStack(env, 4, channels_order='last')

        # Create training algorithm
        # model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params)  # We unpack the model parameters obtained from the tuner and pass them to the PPO model
        model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
        model.learn(total_timesteps=200000)  # We train the model. Longer timesteps means a better model, but also a longer training time. 100k is good, 30k is quick but inaccurate
        
        # Evaluate model
        mean_reward = evaluate_policy(model, env, n_eval_episodes=16)  # We unpack the results obtained from evaluate policy. We will evaluate the model on 5 different games (more == better)
        env.close()

        os.makedirs(OPT_DIR, exist_ok=True)

        # Recupera el mejor valor guardado (si existe)
        best_path = os.path.join(OPT_DIR, "best_overall.txt")
        best_reward = None
        if os.path.exists(best_path):
            with open(best_path, "r") as f:
                try:
                    best_reward = float(f.read().strip())
                except:
                    best_reward = None

        # Normaliza mean_reward a escalar
        if isinstance(mean_reward, (tuple, list)):
            mean_reward = mean_reward[0]

        # Si es mejor, guardamos y actualizamos registro
        if best_reward is None or mean_reward > best_reward:
            save_path = os.path.join(OPT_DIR, f"best_model_trial_{trial.number}_reward_{mean_reward:.2f}.zip")
            model.save(save_path)
            with open(best_path, "w") as f:
                f.write(str(mean_reward))

        env.close()
        return mean_reward

    except Exception as e:
        print(e)
        return -1000  # Model did not work, we resume training


In [8]:
import torch

print("Versión de PyTorch:", torch.__version__)
print("CUDA disponible:", torch.cuda.is_available())

if torch.cuda.is_available():
    print("Dispositivo actual:", torch.cuda.current_device())
    print("Nombre GPU:", torch.cuda.get_device_name(0))
    print("Número de GPUs visibles:", torch.cuda.device_count())
    print("Capacidad de cómputo:", torch.cuda.get_device_capability(0))


Versión de PyTorch: 2.1.2+cu121
CUDA disponible: True
Dispositivo actual: 0
Nombre GPU: NVIDIA GeForce RTX 4090
Número de GPUs visibles: 1
Capacidad de cómputo: (8, 9)


In [9]:
# Tuning

study = optuna.create_study(direction='maximize')  # We create the experiment / study that seeks to maximize the mean reward
study.optimize(optimize_agent, n_trials=100, n_jobs=1)  # We optimize the study based on the agent created, and how many sets we will set. 10 is good for testing, 100+ is recommended for a good model

# NOTE: Using 100k timesteps on the model and 100 trials can take a long time to train (depending on the strength of the gpu from a few hours to a couple of days)

# If we wanted to speed things up whilst keeping accuracy, we could raise n_jobs, however retro does not support more than one environment at once. We can fix
# this by using retrowrapper: https://github.com/MaxStrange/retrowrapper. This allows for multiple instances at once which exponentially speeds trainig up.

[I 2025-09-29 15:02:57,363] A new study created in memory with name: no-name-ad399f33-d55f-423f-a76d-783fb2c00b1c
[W 2025-09-29 15:02:58,899] Trial 0 failed with parameters: {'n_steps': 7552, 'gamma': 0.9289923804138016, 'learning_rate': 1.0057960818473213e-08, 'clip_range': 0.24373743985828955, 'gae_lambda': 0.8119359800584447} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "d:\Code\Machine Learning\Stree Fighter Final\Optimized\venv\lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\Santiago64\AppData\Local\Temp\ipykernel_37548\2178521669.py", line 22, in optimize_agent
    env = DummyVecEnv([make_env(i) for i in range(n_envs)])
  File "d:\Code\Machine Learning\Stree Fighter Final\Optimized\venv\lib\site-packages\stable_baselines3\common\vec_env\dummy_vec_env.py", line 26, in __init__
    self.envs = [fn() for fn in env_fns]
  File "d:\Code\Machine Learning\Stree Figh

KeyboardInterrupt: 

In [None]:
# on logs cmd: tensorboard --logdir=.

In [10]:
study.best_params

ValueError: No trials are completed yet.

In [11]:
# Setup Callback

class TrainAndLoggingCallback(BaseCallback):
    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path
    
    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)
        
        return True

In [12]:
CHECKPOINT_DIR = './train/'

In [13]:
callback = TrainAndLoggingCallback(check_freq=50000, save_path=CHECKPOINT_DIR)  # We will save the model every 10k steps

In [13]:
model_params = study.best_params

In [14]:
# Model definition
N_ENVS = 16

def make_env(rank):
    def _init():
        env = StreetFighter()
        monitor_path = os.path.join(LOG_DIR, f"monitor_{rank}.csv")
        env = Monitor(env, filename=monitor_path)
        return env
    return _init

env = DummyVecEnv([make_env(i) for i in range(N_ENVS)])
env = VecFrameStack(env, 4, channels_order='last')

In [16]:
model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, **model_params)

model.load(os.path.join(OPT_DIR, 'best_model_trial_93_reward_420.70.zip'), env=env, tensorboard_log=LOG_DIR, verbose=1, **model_params)  # We reload previous weights from HPO

Using cuda device
Wrapping the env in a VecTransposeImage.


<stable_baselines3.ppo.ppo.PPO at 0x23fccbce700>

In [17]:
# Training
model.learn(total_timesteps=20000000, callback=callback)  # Bigger is better, for example 5 million
# model.learn(total_timesteps=5000000)

Logging to ./logs/PPO_101
------------------------------
| time/              |       |
|    fps             | 778   |
|    iterations      | 1     |
|    time_elapsed    | 47    |
|    total_timesteps | 36864 |
------------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 576          |
|    iterations           | 2            |
|    time_elapsed         | 127          |
|    total_timesteps      | 73728        |
| train/                  |              |
|    approx_kl            | 4.070882e-07 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -8.32        |
|    explained_variance   | 0.000261     |
|    learning_rate        | 3.4e-08      |
|    loss                 | 0.9          |
|    n_updates            | 10           |
|    policy_gradient_loss | -5.14e-06    |
|    value_loss           | 25.1         |
--------------------------

<stable_baselines3.ppo.ppo.PPO at 0x23fccbce610>

In [15]:
model = PPO.load('./train/best_model_1450000.zip', env=env)

Wrapping the env in a VecTransposeImage.


In [None]:
# Para rapido
model.learning_rate = 1e-7   # o 1e-6 para más agresivo

model.learn(
    total_timesteps=100_000_000,   # otros 20M
    reset_num_timesteps=False,
    tb_log_name="PPO_lr_up",
    callback=callback
)


Logging to ./logs/PPO_lr_up_0
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.19e+04 |
|    ep_rew_mean     | 481      |
| time/              |          |
|    fps             | 838      |
|    iterations      | 1        |
|    time_elapsed    | 43       |
|    total_timesteps | 35991808 |
---------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.19e+04      |
|    ep_rew_mean          | 480           |
| time/                   |               |
|    fps                  | 588           |
|    iterations           | 2             |
|    time_elapsed         | 125           |
|    total_timesteps      | 36028672      |
| train/                  |               |
|    approx_kl            | 1.4876761e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -7.93         |
|    explained

In [None]:
# Para lento
# bajar el learning rate para el entrenamiento largo y estable
model.learning_rate = 1e-7

model.learn(
    total_timesteps=200_000_000,  # 100M–200M según quieras
    reset_num_timesteps=False,
    tb_log_name="PPO_lr_up",
    callback=callback
)


Logging to ./logs/PPO_lr_down_0
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.47e+04 |
|    ep_rew_mean     | 518      |
| time/              |          |
|    fps             | 851      |
|    iterations      | 1        |
|    time_elapsed    | 43       |
|    total_timesteps | 32808960 |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.49e+04     |
|    ep_rew_mean          | 522          |
| time/                   |              |
|    fps                  | 620          |
|    iterations           | 2            |
|    time_elapsed         | 118          |
|    total_timesteps      | 32845824     |
| train/                  |              |
|    approx_kl            | 9.496619e-06 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -7.91        |
|    explained_variance   

In [None]:
model = PPO.load('./train/best_model_5460000.zip', env=env)

In [None]:
# If one wants to train the model even more:
# model.learn(total_timesteps=5_000_000, callback=callback, reset_num_timesteps=False)

In [None]:
mean_reward, _ = evaluate_policy(model, env, render=False, n_eval_episodes=5)
mean_reward

In [None]:
# Test to see everything working

# Reset game to starting state
obs = env.reset()

# Flag to false
done = False

# We only play one game
for game in range(1):

    # If game is not over.
    while not done:
        if done:
            # We reset the game
            obs = env.reset()

        # Render environment
        env.render()

        action = model.predict(obs)[0]
        obs, reward, done, info = env.step(action)

        # We slow down the renders so they are watchable
        time.sleep(0.01)

        # We print the reward
        if reward > 0:
            print(reward)