# 1. Install and Import Dependencies

In [None]:
!pip install nes-py gym-super-mario-bros
!pip install stable-baselines3[extra]

# Only works on NVIDIA GPUs with CUDA installed
!conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch

In [None]:
# For the Environment
import gym_super_mario_bros
from nes_py.wrappers import JoypadSpace
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
from gym.wrappers import GrayScaleObservation
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv, VecTransposeImage

# For the Learning Model
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback, CallbackList
from stable_baselines3 import PPO

# 2. Define Constants

In [None]:
# ID passed to gym_super_mario_bros.make() for every environment in this notebook
ENV_NAME = 'SuperMarioBros-v3'
# Passed as save_freq to CheckpointCallback
SAVE_FREQ = 500_000
# Passed as eval_freq to EvalCallback
EVAL_FREQ = 100_000
# Passed as total_timesteps to every model.learn() call
TOT_TIMESTEPS = 6_000_000

# 3. Create and Preprocess Environments

In [None]:
def create_and_preprocess_env():
    """Build one fully wrapped Super Mario Bros environment.

    The wrappers are applied in this order:
      1. JoypadSpace limits the action set to SIMPLE_MOVEMENT.
      2. GrayScaleObservation converts frames to grayscale (keep_dim=True).
      3. Monitor wraps the environment for episode statistics.
      4. DummyVecEnv turns the single environment into a vectorized one.
      5. VecFrameStack stacks 4 frames (channels_order='last').
      6. VecTransposeImage transposes the image observations.

    Returns:
        The vectorized, frame-stacked environment.
    """
    mario = gym_super_mario_bros.make(ENV_NAME)
    mario = JoypadSpace(mario, SIMPLE_MOVEMENT)
    gray = GrayScaleObservation(mario, keep_dim=True)
    monitored = Monitor(gray)
    # DummyVecEnv takes a list of zero-argument factories, one per environment
    vectorized = DummyVecEnv([lambda: monitored])
    stacked = VecFrameStack(vectorized, 4, channels_order='last')
    return VecTransposeImage(stacked)
    
# Two independent copies of the same pipeline: agents learn in train_env and
# are scored and watched in eval_env (train_env is created first).
train_env, eval_env = create_and_preprocess_env(), create_and_preprocess_env()

Note: Two environments are created, one for training and one for evaluation. Most learning algorithms add exploration noise to their actions during training, so the agent is evaluated in a separate environment; this keeps evaluation episodes from interfering with the training rollouts, and vice versa.

# 4. Create and Train Agents

## A. Control Agent

Note: The hyperparameters for the Control Agent are set to the default values of the PPO implementation. The Control Agent is equivalent to the agent created in 'MarioAI'.

#### Train

In [None]:
# Output locations for the Control agent
save_path = './Control/Saved Models/'
log_path = './Control/Logs/'

# Periodic snapshots of the model, written every SAVE_FREQ steps
checkpoint_callback = CheckpointCallback(save_freq=SAVE_FREQ, save_path=save_path, name_prefix='Control')

# Periodic evaluation on eval_env every EVAL_FREQ steps; the best model is saved next to the checkpoints
eval_callback = EvalCallback(eval_env, eval_freq=EVAL_FREQ, best_model_save_path=save_path)

# Both callbacks are handed to learn() as a single object
callback = CallbackList([checkpoint_callback, eval_callback])

In [None]:
# Control agent: PPO on train_env, logging to TensorBoard under log_path.
model = PPO(
    'CnnPolicy',
    train_env,
    verbose=1,
    tensorboard_log=log_path,
    # These are the default values, spelled out so they can be compared with the experiments
    learning_rate=3e-4,
    n_steps=2048,
    batch_size=64,
    n_epochs=10,
)

# Train for the full budget; `callback` handles checkpointing and periodic evaluation
model.learn(total_timesteps=TOT_TIMESTEPS, callback=callback)

#### Evaluate

In [None]:
# Watch the trained Control agent play. The loop has no exit condition:
# stop it by interrupting the kernel.
state = eval_env.reset()
while True:
    # deterministic=False is the default, stated explicitly: actions are sampled from the policy
    action, _ = model.predict(state, deterministic=False)
    state, reward, done, info = eval_env.step(action)
    eval_env.render()

In [None]:
# Close the evaluation environment once playback has been stopped.
# NOTE(review): later sections pass eval_env to EvalCallback again; confirm the
# environment is still usable after close(), or recreate it before reuse.
eval_env.close()

## B. Experiment Agent 1

Note: The hyperparameters for Experiment Agent 1 were chosen based on the auto-tuned hyperparameters for the CarRacing-v0 environment found here: <u>https://github.com/DLR-RM/rl-baselines3-zoo/blob/master/hyperparams/ppo.yml#LC320</u>.

#### Train

In [None]:
# Output locations for Experiment Agent 1
save_path = './Experiment1/Saved Models/'
log_path = './Experiment1/Logs/'

# The previous section ends with eval_env.close(), and a closed environment
# cannot be reset or stepped again. Build a fresh evaluation environment so
# EvalCallback (and the Evaluate cell below) have a live one to use.
eval_env = create_and_preprocess_env()

# Periodic snapshots of the model, written every SAVE_FREQ steps
checkpoint_callback = CheckpointCallback(
    save_freq=SAVE_FREQ,
    save_path=save_path,
    name_prefix='Experiment1')

# Periodic evaluation on eval_env every EVAL_FREQ steps; the best model is saved to save_path
eval_callback = EvalCallback(
    eval_env,
    eval_freq=EVAL_FREQ,
    best_model_save_path=save_path)

callback = CallbackList([checkpoint_callback, eval_callback])

In [None]:
# Experiment Agent 1: PPO on train_env, logging to TensorBoard under log_path.
model = PPO(
    'CnnPolicy',
    train_env,
    verbose=1,
    tensorboard_log=log_path,
    # Values based on the tuned CarRacing-v0 hyperparameters referenced in the note above
    learning_rate=3e-5,
    n_steps=512,
    batch_size=128,
    n_epochs=20,
)

# Train for the full budget; `callback` handles checkpointing and periodic evaluation
model.learn(total_timesteps=TOT_TIMESTEPS, callback=callback)

#### Evaluate

In [None]:
# Watch the trained Experiment 1 agent play. The loop has no exit condition:
# stop it by interrupting the kernel.
state = eval_env.reset()
while True:
    # deterministic=False is the default, stated explicitly: actions are sampled from the policy
    action, _ = model.predict(state, deterministic=False)
    state, reward, done, info = eval_env.step(action)
    eval_env.render()

In [None]:
# Close the evaluation environment once playback has been stopped.
# NOTE(review): later sections pass eval_env to EvalCallback again; confirm the
# environment is still usable after close(), or recreate it before reuse.
eval_env.close()

## C. Experiment Agent 2

Note: The hyperparameters for Experiment Agent 2 were chosen based on the auto-tuned hyperparameters for the Atari environments found here: <u>https://github.com/DLR-RM/rl-baselines3-zoo/blob/master/hyperparams/ppo.yml#LC1</u>.

#### Train

In [None]:
# Output locations for Experiment Agent 2
save_path = './Experiment2/Saved Models/'
log_path = './Experiment2/Logs/'

# The previous section ends with eval_env.close(), and a closed environment
# cannot be reset or stepped again. Build a fresh evaluation environment so
# EvalCallback (and the Evaluate cell below) have a live one to use.
eval_env = create_and_preprocess_env()

# Periodic snapshots of the model, written every SAVE_FREQ steps
checkpoint_callback = CheckpointCallback(
    save_freq=SAVE_FREQ,
    save_path=save_path,
    name_prefix='Experiment2')

# Periodic evaluation on eval_env every EVAL_FREQ steps; the best model is saved to save_path
eval_callback = EvalCallback(
    eval_env,
    eval_freq=EVAL_FREQ,
    best_model_save_path=save_path)

callback = CallbackList([checkpoint_callback, eval_callback])

In [None]:
# Experiment Agent 2: PPO on train_env, logging to TensorBoard under log_path.
# NOTE(review): train_env wraps a single environment, so one rollout holds
# n_steps = 128 samples, which is fewer than batch_size = 256. Confirm this is
# intended; the referenced Atari configuration may assume several parallel environments.
model = PPO(
    'CnnPolicy',
    train_env,
    verbose=1,
    tensorboard_log=log_path,
    # Values based on the tuned Atari hyperparameters referenced in the note above
    learning_rate=2.5e-4,
    n_steps=128,
    batch_size=256,
    n_epochs=4,
)

# Train for the full budget; `callback` handles checkpointing and periodic evaluation
model.learn(total_timesteps=TOT_TIMESTEPS, callback=callback)

#### Evaluate

In [None]:
# Watch the trained Experiment 2 agent play. The loop has no exit condition:
# stop it by interrupting the kernel.
state = eval_env.reset()
while True:
    # deterministic=False is the default, stated explicitly: actions are sampled from the policy
    action, _ = model.predict(state, deterministic=False)
    state, reward, done, info = eval_env.step(action)
    eval_env.render()

In [None]:
# Close the evaluation environment once playback has been stopped.
# NOTE(review): the next section passes eval_env to EvalCallback again; confirm the
# environment is still usable after close(), or recreate it before reuse.
eval_env.close()

## D. Experiment Agent 3

Note: The hyperparameters for Experiment Agent 3 were chosen by hand, based on my own best estimates.

#### Train

In [None]:
# Output locations for Experiment Agent 3
save_path = './Experiment3/Saved Models/'
log_path = './Experiment3/Logs/'

# The previous section ends with eval_env.close(), and a closed environment
# cannot be reset or stepped again. Build a fresh evaluation environment so
# EvalCallback (and the Evaluate cell below) have a live one to use.
eval_env = create_and_preprocess_env()

# Periodic snapshots of the model, written every SAVE_FREQ steps
checkpoint_callback = CheckpointCallback(
    save_freq=SAVE_FREQ,
    save_path=save_path,
    name_prefix='Experiment3')

# Periodic evaluation on eval_env every EVAL_FREQ steps; the best model is saved to save_path
eval_callback = EvalCallback(
    eval_env,
    eval_freq=EVAL_FREQ,
    best_model_save_path=save_path)

callback = CallbackList([checkpoint_callback, eval_callback])

In [None]:
# Experiment Agent 3: PPO on train_env, logging to TensorBoard under log_path.
model = PPO(
    'CnnPolicy',
    train_env,
    verbose=1,
    tensorboard_log=log_path,
    # Hand-picked values (see the note above)
    learning_rate=5e-7,
    n_steps=512,
    batch_size=128,
    n_epochs=15,
)

# Train for the full budget; `callback` handles checkpointing and periodic evaluation
model.learn(total_timesteps=TOT_TIMESTEPS, callback=callback)

#### Evaluate

In [None]:
# Watch the trained Experiment 3 agent play. The loop has no exit condition:
# stop it by interrupting the kernel.
state = eval_env.reset()
while True:
    # deterministic=False is the default, stated explicitly: actions are sampled from the policy
    action, _ = model.predict(state, deterministic=False)
    state, reward, done, info = eval_env.step(action)
    eval_env.render()

In [None]:
# Close the evaluation environment once playback has been stopped.
eval_env.close()