### Import Required Libraries

In [1]:
!pip install gym_super_mario_bros==7.3.0 nes_py
# Install pytorch
!pip install torch==1.10.1+cu113 torchvision==0.11.2+cu113 torchaudio===0.10.1+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
# Installation of stable baselines for Reinforced Learning
!pip install stable-baselines3[extra]

Collecting gym_super_mario_bros==7.3.0
  Downloading gym_super_mario_bros-7.3.0-py2.py3-none-any.whl (198 kB)
Collecting nes_py
  Downloading nes_py-8.1.8.tar.gz (76 kB)
Collecting gym>=0.17.2
  Downloading gym-0.23.1.tar.gz (626 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
    Preparing wheel metadata: started
    Preparing wheel metadata: finished with status 'done'
Collecting pyglet<=1.5.11,>=1.4.0
  Downloading pyglet-1.5.11-py3-none-any.whl (1.1 MB)
Collecting gym-notices>=0.0.4
  Downloading gym_notices-0.0.6-py3-none-any.whl (2.7 kB)
Collecting importlib-metadata>=4.10.0
  Downloading importlib_metadata-4.11.4-py3-none-any.whl (18 kB)
Building wheels for collected packages: nes-py, gym
  Building wheel for nes-py (setup.py): started
  Building wheel for nes-py (setup.py): finished with status 'done'
  Creat

In [2]:
# importing the game from gym
import gym_super_mario_bros
# Import the JoypadSpace wrapper from nes_py
from nes_py.wrappers import JoypadSpace
# Import the SIMPLE_MOVEMENT control set
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
# Importing os for management of file path
import os 
# Import DQN and A2C as RL learning agents
from stable_baselines3 import DQN
from stable_baselines3 import A2C
# Import Base Callback to save models
from stable_baselines3.common.callbacks import BaseCallback

### Setup Mario

In [3]:
# Create the 'SuperMarioBros-v0' game and wrap it in JoypadSpace
# with the SIMPLE_MOVEMENT action list, in a single expression.
env = JoypadSpace(
    gym_super_mario_bros.make('SuperMarioBros-v0'),
    SIMPLE_MOVEMENT,
)

### Train the RL Model

In [4]:
class TrainAndLoggingCallback(BaseCallback):
    """Callback that periodically saves the model being trained.

    Every ``freq_check`` calls to ``_on_step`` the current model is saved
    inside ``store_path`` under the name ``best_model_<n_calls>``.

    Args:
        freq_check: Number of callback calls between two consecutive saves.
        store_path: Directory that receives the saved models. When ``None``,
            no directory is created and no model is saved.
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, freq_check, store_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.freq_check = freq_check
        self.store_path = store_path

    def _init_callback(self):
        """Create the checkpoint directory if one was configured."""
        if self.store_path is not None:
            os.makedirs(self.store_path, exist_ok=True)

    def _on_step(self):
        """Save the model on every ``freq_check``-th call.

        Returns:
            Always ``True``.
        """
        # Mirror the None check in _init_callback: without a target directory
        # there is nowhere to save, and os.path.join(None, ...) would raise.
        if self.store_path is not None and self.n_calls % self.freq_check == 0:
            # NOTE: despite the "best_model" prefix, this is a periodic
            # snapshot; no comparison against earlier models is performed.
            model_path = os.path.join(self.store_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [5]:
# Directory passed to the model as its TensorBoard log location.
LOG_DIR = './logs/'
# Directory passed to the save callback as its storage path.
CHECKPOINT_DIR = './train/'

In [6]:
# Checkpoint callback: saves into CHECKPOINT_DIR every 10000 callback calls
callback = TrainAndLoggingCallback(
    store_path=CHECKPOINT_DIR,
    freq_check=10000,
)

In [7]:
# Build the A2C agent with a CNN policy on the Mario environment.
# Training metrics are logged for TensorBoard under LOG_DIR.
model = A2C(
    'CnnPolicy',
    env,
    learning_rate=1e-6,
    n_steps=256,
    seed=20,
    tensorboard_log=LOG_DIR,
    verbose=1,
)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


In [8]:
# Run training for two million timesteps; `callback` is invoked along the way
model.learn(
    callback=callback,
    total_timesteps=2_000_000,
)

Logging to ./logs/A2C_1


  return (self.ram[0x86] - self.ram[0x071c]) % 256


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 2.41e+04 |
|    ep_rew_mean        | 657      |
| time/                 |          |
|    fps                | 219      |
|    iterations         | 100      |
|    time_elapsed       | 116      |
|    total_timesteps    | 25600    |
| train/                |          |
|    entropy_loss       | -1.95    |
|    explained_variance | -0.0245  |
|    learning_rate      | 1e-06    |
|    n_updates          | 99       |
|    policy_loss        | -0.0647  |
|    value_loss         | 0.264    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 2.01e+04 |
|    ep_rew_mean        | 650      |
| time/                 |          |
|    fps                | 221      |
|    iterations         | 200      |
|    time_elapsed       | 231      |
|    total_timesteps    | 51200    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1.64e+04 |
|    ep_rew_mean        | 588      |
| time/                 |          |
|    fps                | 228      |
|    iterations         | 1500     |
|    time_elapsed       | 1683     |
|    total_timesteps    | 384000   |
| train/                |          |
|    entropy_loss       | -1.95    |
|    explained_variance | 0.0261   |
|    learning_rate      | 1e-06    |
|    n_updates          | 1499     |
|    policy_loss        | -0.712   |
|    value_loss         | 0.338    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1.67e+04 |
|    ep_rew_mean        | 581      |
| time/                 |          |
|    fps                | 228      |
|    iterations         | 1600     |
|    time_elapsed       | 1794     |
|    total_timesteps    | 409600   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1.76e+04 |
|    ep_rew_mean        | 589      |
| time/                 |          |
|    fps                | 227      |
|    iterations         | 2900     |
|    time_elapsed       | 3259     |
|    total_timesteps    | 742400   |
| train/                |          |
|    entropy_loss       | -1.94    |
|    explained_variance | 0.0836   |
|    learning_rate      | 1e-06    |
|    n_updates          | 2899     |
|    policy_loss        | 0.434    |
|    value_loss         | 0.245    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1.75e+04 |
|    ep_rew_mean        | 588      |
| time/                 |          |
|    fps                | 226      |
|    iterations         | 3000     |
|    time_elapsed       | 3391     |
|    total_timesteps    | 768000   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1.82e+04 |
|    ep_rew_mean        | 591      |
| time/                 |          |
|    fps                | 226      |
|    iterations         | 4200     |
|    time_elapsed       | 4744     |
|    total_timesteps    | 1075200  |
| train/                |          |
|    entropy_loss       | -1.94    |
|    explained_variance | -0.00483 |
|    learning_rate      | 1e-06    |
|    n_updates          | 4199     |
|    policy_loss        | 0.744    |
|    value_loss         | 0.329    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1.82e+04 |
|    ep_rew_mean        | 589      |
| time/                 |          |
|    fps                | 226      |
|    iterations         | 4300     |
|    time_elapsed       | 4856     |
|    total_timesteps    | 1100800  |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1.78e+04 |
|    ep_rew_mean        | 605      |
| time/                 |          |
|    fps                | 227      |
|    iterations         | 5600     |
|    time_elapsed       | 6299     |
|    total_timesteps    | 1433600  |
| train/                |          |
|    entropy_loss       | -1.94    |
|    explained_variance | 0.0804   |
|    learning_rate      | 1e-06    |
|    n_updates          | 5599     |
|    policy_loss        | 0.519    |
|    value_loss         | 0.234    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1.79e+04 |
|    ep_rew_mean        | 603      |
| time/                 |          |
|    fps                | 227      |
|    iterations         | 5700     |
|    time_elapsed       | 6409     |
|    total_timesteps    | 1459200  |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1.81e+04 |
|    ep_rew_mean        | 598      |
| time/                 |          |
|    fps                | 226      |
|    iterations         | 7000     |
|    time_elapsed       | 7913     |
|    total_timesteps    | 1792000  |
| train/                |          |
|    entropy_loss       | -1.94    |
|    explained_variance | 0.0139   |
|    learning_rate      | 1e-06    |
|    n_updates          | 6999     |
|    policy_loss        | -0.0985  |
|    value_loss         | 0.107    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1.79e+04 |
|    ep_rew_mean        | 598      |
| time/                 |          |
|    fps                | 226      |
|    iterations         | 7100     |
|    time_elapsed       | 8026     |
|    total_timesteps    | 1817600  |
| train/                |          |
|

<stable_baselines3.a2c.a2c.A2C at 0x1cc6e4a8760>

In [9]:
# Persist the final trained agent (A2C, CNN policy, seed 20)
final_model_path = "a2c_cnn_model_seed20"
model.save(final_model_path)