In [1]:
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
import numpy as np 
# Create the Super Mario Bros environment (gym API-compatibility layer on,
# human rendering) and limit its action set to SIMPLE_MOVEMENT.
env = JoypadSpace(
    gym_super_mario_bros.make(
        'SuperMarioBros-v0',
        apply_api_compatibility=True,
        render_mode="human",
    ),
    SIMPLE_MOVEMENT,
)


  logger.warn(
  logger.warn(


In [2]:
from gym.spaces import Box
from gym import Wrapper, ObservationWrapper
from gym.wrappers import FrameStack, GrayScaleObservation, ResizeObservation

class CustomWrapper(Wrapper):
    """Wrapper whose ``reset`` discards the ``seed`` and ``options`` keywords.

    Every other keyword argument is forwarded unchanged to the wrapped
    environment's ``reset``.
    """

    def reset(self, **kwargs):
        """Reset the wrapped env without the ``seed``/``options`` keywords."""
        dropped = ('seed', 'options')
        forwarded = {key: value for key, value in kwargs.items() if key not in dropped}
        return self.env.reset(**forwarded)

# Wrap the environment so that `seed` / `options` keywords passed to reset()
# are dropped instead of being forwarded to the underlying env.
env = CustomWrapper(env)

class SkipFrame(Wrapper):
    """Repeat each action for ``skip`` consecutive frames.

    Only the observation of the last executed frame is returned; the rewards
    of all executed frames are summed.
    """

    def __init__(self, env, skip):
        """Return only every `skip`-th frame.

        :param env: environment to wrap.
        :param skip: number of frames each action is repeated for (>= 1).
        :raises ValueError: if ``skip`` is smaller than 1, because ``step``
            would then have no transition to return.
        """
        super().__init__(env)
        if skip < 1:
            raise ValueError("skip must be >= 1, got {}".format(skip))
        self._skip = skip

    def step(self, action):
        """Repeat action, and sum reward.

        :param action: action applied on every repeated frame.
        :return: ``(obs, total_reward, done, trunk, info)`` where everything
            except ``total_reward`` comes from the last frame executed.
        """
        total_reward = 0.0
        for _ in range(self._skip):
            # Accumulate reward and repeat the same action
            obs, reward, done, trunk, info = self.env.step(action)
            total_reward += reward
            # Stop on termination *or* truncation: stepping an episode that
            # has already ended is not valid until the env is reset.
            if done or trunk:
                break
        return obs, total_reward, done, trunk, info

    
class RemoveChannelDim(ObservationWrapper):
    """Observation wrapper that removes every length-1 axis from a frame."""

    def observation(self, observation):
        """Return ``observation`` with all singleton dimensions squeezed out."""
        squeezed = np.squeeze(observation)
        return squeezed
    
# Apply Wrappers to environment (order matters: each wrapper consumes the
# output of the one before it).
# Repeat every action for 4 frames and sum the rewards.
env = SkipFrame(env, skip=4)
# Convert frames to grayscale.
env = GrayScaleObservation(env)
# Resize frames; shape=84 is passed as a single int.
env = ResizeObservation(env, shape=84)
# Squeeze out any length-1 axes left on the frame.
env = RemoveChannelDim(env)
# Stack the 4 most recent frames into one observation.
env = FrameStack(env, num_stack=4)


In [3]:
from stable_baselines3 import DQN
from stable_baselines3.common.callbacks import BaseCallback
import os

2023-06-10 23:52:40.167243: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-10 23:52:40.212135: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
class TrainAndLoggingCallback(BaseCallback):
    """Callback that saves the model every ``check_freq`` callback calls.

    :param check_freq: number of callback calls between two checkpoints.
    :param save_path: directory receiving the checkpoints, or ``None`` to
        disable checkpointing altogether.
    :param verbose: verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        """Create the checkpoint directory if one was configured."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        """Save a checkpoint when due; always return True to keep training."""
        # Mirror the None check of _init_callback: without a save path there
        # is nowhere to write, and os.path.join(None, ...) would raise.
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [5]:
# Directory handed to the checkpoint callback as its save path.
CHECKPOINT_DIR = './dqn/train/'
# Directory used for TensorBoard logs and for the stdout/CSV training logger.
LOG_DIR = './dqn/logs/'

In [6]:
# Setup model saving callback: write a checkpoint into CHECKPOINT_DIR every
# 1000 callback calls.
callback = TrainAndLoggingCallback(check_freq=1000, save_path=CHECKPOINT_DIR)

In [7]:
import torch as th
import torch.nn as nn
from gymnasium import spaces

from stable_baselines3 import PPO
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor


class CustomCNN(BaseFeaturesExtractor):
    """
    Convolutional feature extractor: three conv layers followed by a
    two-layer fully connected head.

    :param observation_space: (gym.Space) Image observation space, assumed
        to be channels-first (CxHxW).
    :param features_dim: (int) Number of features extracted.
        This corresponds to the number of unit for the last layer.
    """

    def __init__(self, observation_space: spaces.Box, features_dim: int = 256):
        super().__init__(observation_space, features_dim)
        # We assume CxHxW images (channels first)
        # Re-ordering will be done by pre-preprocessing or wrapper
        n_input_channels = observation_space.shape[0]
        self.cnn = nn.Sequential(
            nn.Conv2d(in_channels=n_input_channels, out_channels=32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
        )

        # Compute shape by doing one forward pass
        with th.no_grad():
            n_flatten = self.cnn(
                th.as_tensor(observation_space.sample()[None]).float()
            ).shape[1]

        # Size the first dense layer from the measured conv output rather
        # than a hard-coded value, so other input resolutions or stack
        # depths work too (4x84x84 inputs still give 3136).
        self.linear = nn.Sequential(nn.Linear(n_flatten, 512),
                                    nn.ReLU(),
                                    nn.Linear(512, features_dim))

    def forward(self, observations: th.Tensor) -> th.Tensor:
        """Map a batch of observations to ``features_dim``-sized features."""
        return self.linear(self.cnn(observations))

# Policy configuration: use CustomCNN as the feature extractor and ask it
# for 128 output features.
policy_kwargs = {
    "features_extractor_class": CustomCNN,
    "features_extractor_kwargs": {"features_dim": 128},
}


In [8]:
# DQN agent on the wrapped environment, using the custom feature extractor
# and logging TensorBoard data under LOG_DIR.
model = DQN(
    'CnnPolicy',
    env,
    learning_rate=1e-5,
    buffer_size=80_000,
    policy_kwargs=policy_kwargs,
    tensorboard_log=LOG_DIR,
    verbose=1,
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


  return torch._C._cuda_getDeviceCount() > 0


In [9]:
from stable_baselines3.common.logger import configure

def train_model():
    """Reset the env, attach a stdout + CSV logger, and train for 40000 steps."""
    env.reset()
    # Route training metrics to the console and to a CSV file under LOG_DIR.
    model.set_logger(configure(LOG_DIR, ["stdout", "csv"]))
    model.learn(callback=callback, total_timesteps=40000)


train_model()

Logging to ./dqn/logs/
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.11e+03 |
|    ep_rew_mean      | 1.54e+03 |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 88       |
|    time_elapsed     | 50       |
|    total_timesteps  | 4443     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.05e+03 |
|    ep_rew_mean      | 1.8e+03  |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 90       |
|    time_elapsed     | 93       |
|    total_timesteps  | 8435     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.08e+03 |
|    ep_rew_mean      | 1.75e+03 |
|    exploration_rate | 0.05     |
| time/               |         

In [10]:
# Persist the trained model under the name 'test_model'.
model.save('test_model')

In [11]:
# Close the environment now that training and saving are done.
env.close()

In [None]:
from stable_baselines3.common.evaluation import evaluate_policy

# `env` was closed by the earlier `env.close()` cell, so it must not be reused.
# Rebuild the same wrapper stack that was used for training before evaluating.
env = gym_super_mario_bros.make('SuperMarioBros-v0', apply_api_compatibility=True, render_mode="human")
env = JoypadSpace(env, SIMPLE_MOVEMENT)
env = CustomWrapper(env)
env = SkipFrame(env, skip=4)
env = GrayScaleObservation(env)
env = ResizeObservation(env, shape=84)
env = RemoveChannelDim(env)
env = FrameStack(env, num_stack=4)

model = DQN.load("test_model")

# evaluate_policy resets the environment itself, so no manual reset is needed.
print(evaluate_policy(model, env, n_eval_episodes=2, deterministic=False, render=True))

print("Done.")


In [None]:
# Close the environment once evaluation has finished.
env.close()

In [None]:
import pandas as pd

# The CSV logger set up for training writes `progress.csv` into LOG_DIR
# ('./dqn/logs/'); read it from there rather than from an unrelated ppo/ path.
df = pd.read_csv(os.path.join(LOG_DIR, "progress.csv"))
df

In [None]:
# Replace missing entries in the training log with zeros.
df = df.fillna(value=0)
df

In [None]:
import matplotlib.pyplot as plt
# Mean episode reward over the logged training updates.
ypoints = df['rollout/ep_rew_mean'].to_numpy()
plt.plot(ypoints, color='b')
# Save before show(): once the figure has been shown it is released, and a
# later savefig() would write an empty image.
plt.savefig('dqn/dqn.png')
plt.show()