In [86]:
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
import time
from matplotlib import pyplot as plt
from stable_baselines3 import PPO
from gym import Env
from gym.spaces import MultiBinary, Box
import numpy as np
from gym.wrappers import GrayScaleObservation
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.vec_env import VecFrameStack
import os
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.env_util import make_vec_env
import gym
from stable_baselines3.common.atari_wrappers import AtariWrapper
from gym import Wrapper
from stable_baselines3.common.vec_env import VecFrameStack, VecTransposeImage

In [87]:
# Custom reward-shaping wrapper around the Mario environment
class Mario(Wrapper):
    """Reward-shaping wrapper for a Super Mario Bros environment.

    On every step the wrapped environment's reward is adjusted as follows:

    * the increase of ``info['score']`` since the previous step, divided by
      40, is added;
    * when the episode ends, 350 is added if ``info['flag_get']`` is truthy,
      otherwise 50 is subtracted;
    * the total is finally divided by 10.

    :param env: the environment to wrap; its ``step`` must return the
        4-tuple ``(state, reward, done, info)`` with ``'score'`` and
        ``'flag_get'`` keys in ``info``.
    """

    def __init__(self, env):
        super(Mario, self).__init__(env)
        # Score reported by the last step; used to compute the per-step delta.
        self._current_score = 0

    def step(self, action):
        """Step the wrapped env and return the shaped reward.

        :param action: action forwarded unchanged to the wrapped environment.
        :return: ``(state, shaped_reward, done, info)``.
        """
        state, reward, done, info = self.env.step(action)
        reward += (info['score'] - self._current_score) / 40.0
        self._current_score = info['score']
        if done:
            if info['flag_get']:
                print('We got it!!!!!')
                reward += 350.0
            else:
                reward -= 50.0
        return state, reward / 10.0, done, info

    def reset(self, **kwargs):
        """Reset the environment and return the initial observation."""
        # Forget the previous episode's score; otherwise the first step of the
        # new episode would be charged the difference to the stale score.
        self._current_score = 0
        return self.env.reset(**kwargs)

    def render(self, *args, **kwargs):
        """Render the wrapped env, forwarding arguments and the result."""
        # Forwarding matters for callers that request e.g. an image mode and
        # need the returned frame.
        return self.env.render(*args, **kwargs)

    def close(self):
        """Close the wrapped environment."""
        return self.env.close()


In [88]:
# Log directory (later passed to PPO as its tensorboard_log location);
# make sure it exists before training starts.
monitor_dir = './logs/'
os.makedirs(name=monitor_dir, exist_ok=True)

In [89]:
def mario_wrapper(env):
    """Wrap a raw Mario env with the preprocessing stack used in this notebook.

    The layers are applied innermost-first: ``JoypadSpace`` with
    ``SIMPLE_MOVEMENT``, then ``AtariWrapper`` (life-loss termination and
    reward clipping both disabled), then the ``Mario`` reward wrapper.

    :param env: environment instance to wrap.
    :return: the fully wrapped environment.
    """
    joypad_env = JoypadSpace(env, SIMPLE_MOVEMENT)
    preprocessed_env = AtariWrapper(
        joypad_env,
        terminal_on_life_loss=False,
        clip_reward=False,
    )
    return Mario(preprocessed_env)

In [90]:
# Training environment: 16 wrapped Mario envs, stacked over the last 4 frames
# (channels-last), then transposed by VecTransposeImage.
env = VecTransposeImage(
    VecFrameStack(
        make_vec_env(
            'SuperMarioBros-v0',
            n_envs=16,
            seed=3994448089,
            wrapper_class=mario_wrapper,
        ),
        4,
        channels_order='last',
    )
)

In [91]:
class SaveOnBestTrainingRewardCallback(BaseCallback):
    """
    Callback that saves a checkpoint of the model every ``check_freq`` calls.

    Despite its name, this callback does not compare training rewards: it
    unconditionally writes ``model_<n_calls>`` into ``<save_model_dir>/best_model/``
    whenever ``n_calls`` is a multiple of ``check_freq``. ``best_mean_reward``
    is initialised for compatibility but never updated here
    (for reward-based selection consider ``EvalCallback``).

    :param check_freq: (int) Save whenever ``n_calls`` is a multiple of this value.
    :param save_model_dir: (str) Directory under which the ``best_model/``
      checkpoint folder is created.
    :param verbose: (int) When greater than 0, print ``n_calls`` each time a
      checkpoint is written.
    """

    def __init__(self, check_freq, save_model_dir, verbose=1):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = os.path.join(save_model_dir, 'best_model/')
        self.best_mean_reward = -np.inf

    def _init_callback(self) -> None:
        # save_path is always a string built in __init__, so create it directly.
        os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self) -> bool:
        if self.n_calls % self.check_freq == 0:
            # Honour the verbosity level passed to the constructor.
            if self.verbose > 0:
                print('self.n_calls: ',self.n_calls)
            checkpoint_path = os.path.join(self.save_path, 'model_{}'.format(self.n_calls))
            self.model.save(checkpoint_path)

        # Returning True tells the training loop to keep going.
        return True

In [92]:
# Root folder for checkpoints written by the callback during training.
save_model_dir = './train/'
callback1 = SaveOnBestTrainingRewardCallback(
    check_freq=10000,
    save_model_dir=save_model_dir,
)

# Training

In [93]:
# Keyword arguments forwarded to the PPO constructor.
model_param_1 = dict(
    n_steps=7424,
    gamma=0.8692871366327747,
    learning_rate=6.442559213980066e-05,
    clip_range=0.31688308594665404,
    gae_lambda=0.8710254680014865,
)

In [94]:
# NOTE(review): `tensorboard_log` is defined here but not used below -- PPO is
# given `monitor_dir` as its tensorboard_log location. Confirm which directory
# is intended before changing it, since existing runs were logged there.
tensorboard_log = r'./tensorboard_log/'

model = PPO("CnnPolicy", env, verbose=1,
            tensorboard_log=monitor_dir, **model_param_1)
try:
    model.learn(total_timesteps=4000000, callback=callback1)
finally:
    # Save the current weights even when training is stopped early (for
    # example by interrupting the kernel); the exception still propagates.
    model.save("mario_model")

Using cuda device
Logging to ./logs/PPO_5
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 4e+03    |
|    ep_rew_mean     | 1.58e+03 |
| time/              |          |
|    fps             | 110      |
|    iterations      | 1        |
|    time_elapsed    | 1071     |
|    total_timesteps | 118784   |
---------------------------------
self.n_calls:  10000
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.32e+03    |
|    ep_rew_mean          | 1.79e+03    |
| time/                   |             |
|    fps                  | 100         |
|    iterations           | 2           |
|    time_elapsed         | 2362        |
|    total_timesteps      | 237568      |
| train/                  |             |
|    approx_kl            | 0.115697004 |
|    clip_fraction        | 0.202       |
|    clip_range           | 0.317       |
|    entropy_loss         | -1.89       |
|    expl

KeyboardInterrupt: 

# Testing

In [114]:
# Evaluation environment: a single wrapped Mario env with the same frame
# stacking and VecTransposeImage layers as the training setup.
# Alternative level: make_vec_env('SuperMarioBros-1-4-v0', n_envs=1, seed=3994448089, wrapper_class=mario_wrapper)
env = VecTransposeImage(
    VecFrameStack(
        make_vec_env(
            'SuperMarioBros-v0',
            n_envs=1,
            seed=3994448089,
            wrapper_class=mario_wrapper,
        ),
        4,
        channels_order='last',
    )
)

In [115]:
# NOTE(review): absolute, machine-specific checkpoint path -- consider pointing
# this at a path relative to the notebook so it runs on other machines.
model = PPO.load(r'C:\Users\admin\Desktop\NZH\Mario Code\3.train_game2_multiple_agent\best_model\model_140000.zip')


# Play indefinitely with the loaded policy; stop by interrupting the kernel.
obs = env.reset()
obs = obs.copy()
while True:
    action, _states = model.predict(obs)
    # No manual reset on `done`: the vectorized env resets a finished episode
    # inside step() and the returned `obs` is already the new episode's first
    # observation. Resetting again and discarding the result would leave the
    # policy predicting on an observation that no longer matches the env.
    obs, rewards, done, info = env.step(action)
    obs = obs.copy()
    env.render()
    time.sleep(0.01)

KeyboardInterrupt: 

In [116]:
# Close the evaluation environment once the loop above has been interrupted.
env.close()