## Start by installing all the necessary libraries

In [None]:
pip install -r requirements.txt

## Don't forget to restart the kernel when the installation is completed, then you can execute the cell below

In [1]:
import gym_super_mario_bros
import os
import gym
import csv
import numpy as np
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack, VecTransposeImage, DummyVecEnv, VecVideoRecorder, VecNormalize
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import CheckpointCallback
from gym_super_mario_bros import make
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT, RIGHT_ONLY
from gym.wrappers import GrayScaleObservation, ResizeObservation, TimeLimit, AtariPreprocessing
from nes_py.wrappers import JoypadSpace
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.env_util import make_vec_env, SubprocVecEnv




TensorBoard graphs and your models will be saved in LOG_DIR; the rewards for each episode
will be saved in reward_dir. It's not necessary to save the rewards, but they can be
a good indication of how well your agent is performing (especially if you're new to
reinforcement learning and you don't possess a strong mathematical background).

In [2]:
# Output locations, relative to the notebook's working directory.
LOG_DIR = "./mariologs/"  # TensorBoard graphs and saved models
reward_dir = "./mariologreward/"  # per-episode rewards (used as monitor_dir below)

## Preprocessing the environment

I used RIGHT_ONLY, which restricts my agent to a small set of actions: do nothing, or move right, optionally combined with jumping (A) and/or running (B).
If you want to change the controls, just check the documentation: https://github.com/Kautenja/gym-super-mario-bros
If you want to change the level in which Mario is playing, you can change the argument of the gym.make function
according to this template: 'SuperMarioBros-<world>-<stage>-v<version>'.
TimeLimit is used so that the episode stops after 2000 steps; this prevents Mario from getting stuck for too long
in front of a pipe.
make_vec_env is used to create parallel environments, which accelerates the training.
VecFrameStack is used so that the agent receives the last 4 frames whenever it has to take
an action. This gives it a "sense" of direction.

In [3]:
def make_env():
    """Build one Super Mario Bros environment for 'SuperMarioBros-1-1-v0'.

    The raw environment is restricted to the RIGHT_ONLY action set and each
    episode is capped at 2000 steps.

    Returns:
        The wrapped environment, suitable as the factory result for make_vec_env.
    """
    base_env = gym.make('SuperMarioBros-1-1-v0')
    right_only_env = JoypadSpace(base_env, RIGHT_ONLY)
    return TimeLimit(right_only_env, max_episode_steps=2000)
# Two copies of the environment are built from make_env (monitor output goes to
# reward_dir), then each observation is extended to a stack of 4 frames.
env = VecFrameStack(
    make_vec_env(make_env, n_envs=2, monitor_dir=reward_dir),
    n_stack=4,
)

## Start the training

In [None]:
# Checkpoint frequency. The callback counts calls to env.step(), and each call
# advances every parallel environment by one step, so the desired number of
# timesteps between checkpoints is divided by the actual number of environments
# (read from the vectorized env instead of hard-coding 2).
save_freq = 400000
adjusted_save_freq = max(save_freq // env.num_envs, 1)

# NOTE: this rebinds LOG_DIR, so this run writes to a different directory than
# the one set in the configuration cell above.
LOG_DIR = './mariologsgen4.6/'

# Periodically save a checkpoint of the model into LOG_DIR. Every checkpoint is
# written at the given frequency; the callback does not pick a "best" model.
checkpoint_callback = CheckpointCallback(save_freq=adjusted_save_freq, save_path=LOG_DIR)

# Create the PPO agent. The observations are stacked image frames (4 per
# observation), hence the CnnPolicy.
model = PPO(
    policy="CnnPolicy",
    env=env,
    verbose=1,
    learning_rate=0.00003,
    tensorboard_log=LOG_DIR,
)

# Train the model
model.learn(total_timesteps=20000000, callback=checkpoint_callback)

## Start the training of a previously trained model

When you want to resume the training, it's very important that your model is trained
in exactly the same environment it started its training in. This is especially important
when using online solutions like Google Colab or Paperspace Gradient.

In [None]:
# Output locations for the resumed run, relative to the working directory.
LOG_DIR = "./mariologsgen4.5/"  # where the checkpoints of this run are saved
reward_dir = "./mariologsgen4.5reward/"  # monitor_dir for the environments below

def make_env():
    """Build one Super Mario Bros environment for 'SuperMarioBros-1-1-v0'.

    The raw environment is restricted to the RIGHT_ONLY action set and each
    episode is capped at 2000 steps.

    Returns:
        The wrapped environment, suitable as the factory result for make_vec_env.
    """
    base_env = gym.make('SuperMarioBros-1-1-v0')
    right_only_env = JoypadSpace(base_env, RIGHT_ONLY)
    return TimeLimit(right_only_env, max_episode_steps=2000)
# Create a vectorized environment using make_vec_env and the make_env function;
# two parallel environments are created
env = make_vec_env(make_env, n_envs=2, monitor_dir=reward_dir)
# Wrap the environment with VecFrameStack
env = VecFrameStack(env, n_stack=4)

# Checkpoint frequency. The callback counts calls to env.step(), and each call
# advances every parallel environment by one step, so the desired number of
# timesteps between checkpoints is divided by the actual number of environments
# (read from the vectorized env instead of hard-coding 2).
save_freq = 200000
adjusted_save_freq = max(save_freq // env.num_envs, 1)
# Periodically save a checkpoint of the model into LOG_DIR. Every checkpoint is
# written at the given frequency; the callback does not pick a "best" model.
checkpoint_callback = CheckpointCallback(save_freq=adjusted_save_freq, save_path=LOG_DIR)

# Load the saved model (replace the placeholder path with your own checkpoint)
model = PPO.load("path/to/your/saved/model", env=env)

# Print the number of timesteps the model has trained for before continuing the training, just to make sure
# you don't start from 0
print("Number of timesteps before continuing the training:", model.num_timesteps)

# Train the model for an additional "total_timesteps"; reset_num_timesteps=False
# keeps the timestep counter of the loaded model instead of restarting it at 0
model.learn(total_timesteps=17000000, reset_num_timesteps=False, callback=checkpoint_callback)



## This is the part where you can record videos of the agent in the environment

Visualizing your agent in its environment inside a notebook cell is a true nightmare, which is why I wrote
this bit of code to record a video instead. Don't forget to create exactly the same environment as the one
your agent was trained in. The only modification you should make is to change the number of environments to one if you used two or more parallel environments.
If you want to see how the model behaves in another level, you can change the gym.make argument. For example, if
you want to see how the agent behaves in world 2, level 2: 'SuperMarioBros-2-2-v0'. For more information,
just check: https://github.com/Kautenja/gym-super-mario-bros.

In [None]:
def make_env():
    """Build one Super Mario Bros environment for 'SuperMarioBros-1-1-v0'.

    The raw environment is restricted to the RIGHT_ONLY action set and each
    episode is capped at 2000 steps.

    Returns:
        The wrapped environment, suitable as the factory result for make_vec_env.
    """
    base_env = gym.make('SuperMarioBros-1-1-v0')
    right_only_env = JoypadSpace(base_env, RIGHT_ONLY)
    return TimeLimit(right_only_env, max_episode_steps=2000)
# Create a vectorized environment with a single copy of the environment
env = make_vec_env(make_env, n_envs=1)
# Wrap the environment with VecFrameStack
env = VecFrameStack(env, n_stack=4)

# Load the saved model
model = PPO.load("mariologsgen4.5/rl_model_1000000_steps.zip")

# Create a video recorder
video_length = 160000  # maximum number of steps to record
video_fps = 30  # NOTE: currently unused, it is never passed to VecVideoRecorder
video_folder = "videos/"
# name_prefix is only a prefix: the recorder appends the step range and the
# file extension to it, so the prefix itself must not end in ".mp4".
video_name = "mario"
env = VecVideoRecorder(
    env,
    video_folder,
    record_video_trigger=lambda step: step == 0,  # start recording at the very first step
    video_length=video_length,
    name_prefix=video_name,
)

# Run the model in the environment for one episode. The recorder is closed in a
# `finally` block so the video is still written out if the run is interrupted.
try:
    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        # `done` is an array with one entry per environment; there is exactly one here
        if done[0]:
            break
finally:
    # Closing the recorder saves the video
    env.close()

## This is the part where you can evaluate the performance of your agent

The evaluation method (computing the mean, the standard deviation and the
number of flags reached, i.e. successful completions of the level) is based on
the master thesis from Zone, which you can access in my GitHub repo.

As always, you can change the world or level by changing the gym.make
argument. Also remember to apply exactly the same environment preprocessing
you used when first training your model; just change the number of environments
"n_envs" to one if you used two or more parallel environments during the training.
If you want to evaluate the model I created, just change the path of the saved
model to the one I uploaded to my GitHub repo.

In [None]:
def make_env():
    """Build one Super Mario Bros environment for 'SuperMarioBros-1-1-v0'.

    The raw environment is restricted to the RIGHT_ONLY action set and each
    episode is capped at 2000 steps.

    Returns:
        The wrapped environment, suitable as the factory result for make_vec_env.
    """
    base_env = gym.make('SuperMarioBros-1-1-v0')
    right_only_env = JoypadSpace(base_env, RIGHT_ONLY)
    return TimeLimit(right_only_env, max_episode_steps=2000)

# Create a vectorized environment with a single copy of the environment
env = make_vec_env(make_env, n_envs=1)

# Wrap the environment with VecFrameStack
env = VecFrameStack(env, n_stack=4)

# Load the saved model (replace the placeholder path with your own checkpoint)
model = PPO.load("path/to/your/saved/model")

# Number of evaluation episodes; used by the loop and by the summary message
n_episodes = 1000
episode_rewards = []
flags_reached = 0

for i in range(n_episodes):
    obs = env.reset()
    episode_reward = 0.0
    flag_reached = False
    done = False

    while not done:
        action, _states = model.predict(obs)
        obs, reward, dones, info = env.step(action)
        # The vectorized env returns one entry per environment. Unwrap the
        # single entry into plain Python scalars so the totals (and the CSV
        # below) hold numbers rather than one-element arrays.
        episode_reward += float(reward[0])
        done = bool(dones[0])

        # Check if the flag has been reached
        if info[0]['flag_get']:
            flag_reached = True

    # Count a completed level at most once per episode
    if flag_reached:
        flags_reached += 1
        print("Agent completed the level in episode {}".format(i+1))

    episode_rewards.append(episode_reward)
    print("Reward for episode {}: {}".format(i+1, episode_reward))

# Release the environment now that the evaluation is finished
env.close()

# Calculate statistics (converted to plain floats for printing and CSV output)
mean_reward = float(np.mean(episode_rewards))
std_reward = float(np.std(episode_rewards))
max_reward = float(np.max(episode_rewards))

# Print statistics
print("Mean reward over {} episodes:".format(n_episodes), mean_reward)
print("Standard deviation of rewards:", std_reward)
print("Maximum reward:", max_reward)
print("Number of flags reached:", flags_reached)

# Write statistics to a CSV file: one row per episode, an empty row, then the summary
csv_file = "statistics.csv"
with open(csv_file, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Episode", "Reward"])
    for episode_number, total_reward in enumerate(episode_rewards, start=1):
        writer.writerow([episode_number, total_reward])
    writer.writerow([])  # Empty row
    writer.writerow(["Mean Reward", mean_reward])
    writer.writerow(["Standard Deviation", std_reward])
    writer.writerow(["Maximum Reward", max_reward])
    writer.writerow(["Flags Reached", flags_reached])

print("Statistics written to", csv_file)


## Last note for visualizing tensorboard logs

You should have tensorboard installed on your computer. You should know the path to
the tensorboard logs. From there you can just execute "python -m tensorboard.main --logdir=C:\Users\path\to\your\tensorboardlogs". The console will give you a link that you can put in your browser and you'll
be able to visualize the graphs.