# Environment Setup

In [1]:
#!pip install gym_super_mario_bros
#!pip install stable_baselines3

In [2]:
# PPO algorithm
from stable_baselines3 import PPO
# Monitor for logging 
from stable_baselines3.common.monitor import Monitor
# Vec wrappers to vectorize and frame stack
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
# Import simpler movements
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
# Import to reduce number of movements
from nes_py.wrappers import JoypadSpace
# To deal with filepaths
import os

In [3]:
# Run Environment_Setup.ipynb in this kernel to set up the custom Mario environment.
# NOTE(review): `Mario`, used below, is not imported anywhere in this notebook, so it is
# presumably defined by that notebook - confirm.
%run Environment_Setup.ipynb

In [4]:
# Directory passed to Monitor and used as the TensorBoard log location for the models.
LOG_DIR = './logs/'

In [5]:
# Build the training environment stage by stage:
#   custom Mario env -> SIMPLE_MOVEMENT action set -> Monitor logging to LOG_DIR
#   -> single-env vectorization -> stack of 4 frames (channels last).
joypad_env = JoypadSpace(Mario(), SIMPLE_MOVEMENT)
monitored_env = Monitor(joypad_env, LOG_DIR)
vectorized_env = DummyVecEnv([lambda: monitored_env])
env = VecFrameStack(vectorized_env, 4, channels_order='last')

# Setup Callback To Monitor Training Process

In [6]:
#!pip install stable-baselines3[extra]

In [7]:
# Import base callback 
from stable_baselines3.common.callbacks import BaseCallback

In [8]:
class TrainAndLoggingCallback(BaseCallback):
    """Callback that periodically saves the model being trained.

    Every ``check_freq`` callback calls, the current model is written to
    ``save_path`` under the name ``best_model_<n_calls>``. Despite the file
    name prefix, no quality comparison is made: the model is saved
    unconditionally whenever the call counter hits a multiple of
    ``check_freq``.

    Args:
        check_freq: Number of callback calls between two saves.
        save_path: Directory that receives the checkpoints. ``None``
            disables checkpointing entirely.
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        """Create the checkpoint directory if a save path was given."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        """Save a checkpoint on every ``check_freq``-th call.

        Returns:
            Always ``True``.
        """
        # `_init_callback` accepts `save_path=None`, so honour it here as well
        # instead of failing inside `os.path.join(None, ...)`.
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [9]:
# Directory handed to the callback as its `save_path` for periodic checkpoints.
CHECKPOINT_DIR = './train/'

In [10]:
# Checkpoint the model into CHECKPOINT_DIR every 100_000 callback calls.
callback = TrainAndLoggingCallback(check_freq=100_000, save_path=CHECKPOINT_DIR)

# Continuing Training Best Model

In [11]:
# Load the best model so far and attach it to the current environment.
best_model_path = "train/old_gen/Newestt_Mario_PPO"
model = PPO.load(best_model_path, env, verbose=1, tensorboard_log=LOG_DIR)

# Show the hyperparameters the model was saved with: learning rate, then rollout length.
for loaded_value in (model.learning_rate, model.n_steps):
    print(loaded_value)

Wrapping the env in a VecTransposeImage.
1.2134019828545453e-07
6144


In [14]:
# Lowering learning rate if needed.
# NOTE: this cell is not idempotent - every execution halves the rate again,
# so run it exactly once after loading the model.
model.learning_rate /= 2
# stable-baselines3 builds `lr_schedule` from `learning_rate` when the model is
# set up (which includes loading), and the optimizer is updated from that
# schedule. Changing the attribute alone therefore does not affect training;
# rebuild the schedule so the new value is actually used by `learn()`.
model._setup_lr_schedule()
print(model.learning_rate)

9.707215862836362e-07


In [15]:
# Continue training the loaded model for 600_000 timesteps; `callback` writes
# periodic checkpoints along the way.
model.learn(total_timesteps=600_000, callback=callback)

Logging to ./logs/PPO_3
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.44e+03 |
|    ep_rew_mean     | 226      |
| time/              |          |
|    fps             | 100      |
|    iterations      | 1        |
|    time_elapsed    | 61       |
|    total_timesteps | 6144     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.51e+03     |
|    ep_rew_mean          | 363          |
| time/                   |              |
|    fps                  | 88           |
|    iterations           | 2            |
|    time_elapsed         | 138          |
|    total_timesteps      | 12288        |
| train/                  |              |
|    approx_kl            | 0.0003369546 |
|    clip_fraction        | 0            |
|    clip_range           | 0.187        |
|    entropy_loss         | -0.589       |
|    explained_variance   | 0.304 

KeyboardInterrupt: 

In [18]:
# Save the current state of the model by hand (the recorded training run above
# ended with a KeyboardInterrupt).
model.save('train/best_model_1700000')

# Continuing Training Many Best Models

In [None]:
# File names of the saved models to continue training, built from the trial
# numbers they belong to.
best_models = [f"trial_{trial}_best_model" for trial in (10, 11, 14, 15)]

In [None]:
# Continue training each listed model in turn with a third of its saved learning rate.
for model_name in best_models:
    # Loading model
    model = PPO.load(f"opt/{model_name}", env, verbose=1, tensorboard_log=LOG_DIR)
    # Lowering learning rate
    model.learning_rate /= 3
    # The schedule the optimizer reads was built from the old value while
    # loading; rebuild it so the lowered rate is actually applied in `learn()`.
    model._setup_lr_schedule()
    # Training
    model.learn(total_timesteps=2_000_000, callback=callback)
    # Saving model
    model.save(f'train/old_gen/{model_name}')

# Creating New Model And Loading Parameters of a Different One

In [None]:
# Best hyperparameters, passed as keyword arguments to the PPO constructor.
parameters = dict(
    gamma=0.9905260007134729,
    n_steps=6_144,
    learning_rate=6.0670099142727265e-06,
    clip_range=0.1865481147395506,
    gae_lambda=0.9758617427012113,
)

In [None]:
# Create a fresh PPO model with a CNN policy on the current environment, using
# the hyperparameters from `parameters`.
model = PPO('CnnPolicy', env, **parameters, verbose=1, tensorboard_log=LOG_DIR)

In [None]:
# Load the parameters stored in a previously saved model into the new model.
# NOTE(review): presumably this copies the trained weights while keeping the
# hyperparameters chosen above - confirm against the SB3 `set_parameters` docs.
model.set_parameters("train/old_gen/New_Mario_PPO")

In [None]:
# Train the new model for 1_000_000 timesteps; `callback` writes periodic checkpoints.
model.learn(total_timesteps=1_000_000, callback=callback)

In [None]:
# Save the trained model.
model.save("train/Mario_PPO.zip")