### Install and Import Required Libraries

In [1]:
!pip install gym_super_mario_bros==7.3.0 nes_py
# Install pytorch
!pip install torch==1.10.1+cu113 torchvision==0.11.2+cu113 torchaudio===0.10.1+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
# Installation of stable baselines for Reinforced Learning
!pip install stable-baselines3[extra]

Collecting gym_super_mario_bros==7.3.0
  Downloading gym_super_mario_bros-7.3.0-py2.py3-none-any.whl (198 kB)
Collecting nes_py
  Downloading nes_py-8.1.8.tar.gz (76 kB)
Collecting gym>=0.17.2
  Downloading gym-0.23.1.tar.gz (626 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
    Preparing wheel metadata: started
    Preparing wheel metadata: finished with status 'done'
Collecting pyglet<=1.5.11,>=1.4.0
  Downloading pyglet-1.5.11-py3-none-any.whl (1.1 MB)
Collecting importlib-metadata>=4.10.0
  Downloading importlib_metadata-4.11.4-py3-none-any.whl (18 kB)
Collecting gym-notices>=0.0.4
  Downloading gym_notices-0.0.6-py3-none-any.whl (2.7 kB)
Building wheels for collected packages: nes-py, gym
  Building wheel for nes-py (setup.py): started
  Building wheel for nes-py (setup.py): finished with status 'done'
  Creat

In [2]:
# importing the game from gym
import gym_super_mario_bros
# Import the JoypadSpace wrapper from nes_py
from nes_py.wrappers import JoypadSpace
# Import the SIMPLE_MOVEMENT action list (simplified controls)
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
# Importing os for management of file path
import os 
# Import DQN and A2C as RL learning agents
from stable_baselines3 import DQN
from stable_baselines3 import A2C
# Import Base Callback to save models
from stable_baselines3.common.callbacks import BaseCallback

### Setup Mario

In [3]:
# Create the Super Mario Bros environment and wrap it in JoypadSpace
# together with the SIMPLE_MOVEMENT action list.
env = JoypadSpace(
    gym_super_mario_bros.make('SuperMarioBros-v0'),
    SIMPLE_MOVEMENT,
)

### Train the RL Model

In [4]:
class TrainAndLoggingCallback(BaseCallback):
    """Callback that saves a checkpoint of the model at a fixed call interval.

    Args:
        freq_check: A checkpoint is saved whenever the callback's call
            counter (``self.n_calls``) is a multiple of this value.
        store_path: Directory that receives the checkpoints. When ``None``,
            no directory is created and no checkpoint is written.
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, freq_check, store_path, verbose=1):
        super().__init__(verbose)
        self.freq_check = freq_check
        self.store_path = store_path

    def _init_callback(self):
        """Create the checkpoint directory if a store path was given."""
        if self.store_path is not None:
            os.makedirs(self.store_path, exist_ok=True)

    def _on_step(self):
        """Save the model every ``freq_check`` calls.

        Returns:
            Always ``True``; this callback never asks for training to stop.
        """
        # Apply the same None guard as _init_callback: without it,
        # os.path.join(None, ...) would raise TypeError at the first checkpoint.
        if self.store_path is not None and self.n_calls % self.freq_check == 0:
            model_path = os.path.join(self.store_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [5]:
# Directory passed to the model as its TensorBoard log location.
LOG_DIR = './logs/'
# Directory where the checkpoint callback stores saved models.
CHECKPOINT_DIR = './train/'

In [6]:
# Create the checkpoint callback: it saves the model into CHECKPOINT_DIR
# every 10,000 callback calls.
callback = TrainAndLoggingCallback(
    store_path=CHECKPOINT_DIR,
    freq_check=10_000,
)

In [7]:
# Build the A2C agent on the Mario environment with a fixed seed,
# logging training metrics for TensorBoard under LOG_DIR.
# NOTE(review): the recorded output shows the env being wrapped in
# VecTransposeImage, which suggests image observations; 'CnnPolicy' may be a
# better fit than 'MlpPolicy' here -- confirm before changing.
model = A2C(
    policy='MlpPolicy',
    env=env,
    learning_rate=1e-6,
    n_steps=256,
    tensorboard_log=LOG_DIR,
    verbose=1,
    seed=10,
)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


In [8]:
# Run training for two million timesteps; the checkpoint callback is invoked
# during training and periodically saves the model.
model.learn(callback=callback, total_timesteps=2_000_000)

Logging to ./logs/A2C_1


  return (self.ram[0x86] - self.ram[0x071c]) % 256


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 2.41e+04 |
|    ep_rew_mean        | 529      |
| time/                 |          |
|    fps                | 302      |
|    iterations         | 100      |
|    time_elapsed       | 84       |
|    total_timesteps    | 25600    |
| train/                |          |
|    entropy_loss       | -1.95    |
|    explained_variance | 0.0141   |
|    learning_rate      | 1e-06    |
|    n_updates          | 99       |
|    policy_loss        | -0.256   |
|    value_loss         | 0.588    |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 1.62e+04  |
|    ep_rew_mean        | 576       |
| time/                 |           |
|    fps                | 303       |
|    iterations         | 200       |
|    time_elapsed       | 168       |
|    total_timesteps    | 51200     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1.93e+04 |
|    ep_rew_mean        | 569      |
| time/                 |          |
|    fps                | 302      |
|    iterations         | 1400     |
|    time_elapsed       | 1183     |
|    total_timesteps    | 358400   |
| train/                |          |
|    entropy_loss       | -1.95    |
|    explained_variance | -0.00102 |
|    learning_rate      | 1e-06    |
|    n_updates          | 1399     |
|    policy_loss        | -0.177   |
|    value_loss         | 0.42     |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 1.95e+04  |
|    ep_rew_mean        | 566       |
| time/                 |           |
|    fps                | 302       |
|    iterations         | 1500      |
|    time_elapsed       | 1269      |
|    total_timesteps    | 384000    |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1.93e+04 |
|    ep_rew_mean        | 561      |
| time/                 |          |
|    fps                | 302      |
|    iterations         | 2700     |
|    time_elapsed       | 2282     |
|    total_timesteps    | 691200   |
| train/                |          |
|    entropy_loss       | -1.95    |
|    explained_variance | 0.00236  |
|    learning_rate      | 1e-06    |
|    n_updates          | 2699     |
|    policy_loss        | -0.0574  |
|    value_loss         | 0.715    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1.95e+04 |
|    ep_rew_mean        | 560      |
| time/                 |          |
|    fps                | 302      |
|    iterations         | 2800     |
|    time_elapsed       | 2366     |
|    total_timesteps    | 716800   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1.95e+04 |
|    ep_rew_mean        | 566      |
| time/                 |          |
|    fps                | 302      |
|    iterations         | 4100     |
|    time_elapsed       | 3466     |
|    total_timesteps    | 1049600  |
| train/                |          |
|    entropy_loss       | -1.95    |
|    explained_variance | -0.00624 |
|    learning_rate      | 1e-06    |
|    n_updates          | 4099     |
|    policy_loss        | -0.911   |
|    value_loss         | 0.434    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1.95e+04 |
|    ep_rew_mean        | 567      |
| time/                 |          |
|    fps                | 302      |
|    iterations         | 4200     |
|    time_elapsed       | 3550     |
|    total_timesteps    | 1075200  |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1.88e+04 |
|    ep_rew_mean        | 576      |
| time/                 |          |
|    fps                | 303      |
|    iterations         | 5500     |
|    time_elapsed       | 4641     |
|    total_timesteps    | 1408000  |
| train/                |          |
|    entropy_loss       | -1.95    |
|    explained_variance | 0.00263  |
|    learning_rate      | 1e-06    |
|    n_updates          | 5499     |
|    policy_loss        | 0.314    |
|    value_loss         | 0.222    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1.87e+04 |
|    ep_rew_mean        | 580      |
| time/                 |          |
|    fps                | 303      |
|    iterations         | 5600     |
|    time_elapsed       | 4725     |
|    total_timesteps    | 1433600  |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1.84e+04 |
|    ep_rew_mean        | 587      |
| time/                 |          |
|    fps                | 303      |
|    iterations         | 6800     |
|    time_elapsed       | 5737     |
|    total_timesteps    | 1740800  |
| train/                |          |
|    entropy_loss       | -1.94    |
|    explained_variance | 0.00573  |
|    learning_rate      | 1e-06    |
|    n_updates          | 6799     |
|    policy_loss        | -0.778   |
|    value_loss         | 0.58     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1.84e+04 |
|    ep_rew_mean        | 589      |
| time/                 |          |
|    fps                | 303      |
|    iterations         | 6900     |
|    time_elapsed       | 5821     |
|    total_timesteps    | 1766400  |
| train/                |          |
|

<stable_baselines3.a2c.a2c.A2C at 0x18b88536790>

In [9]:
# Persist the final trained agent (A2C, MLP policy, seed 10) to disk.
model.save(path="a2c_mlp_model_seed10")