# Double Deep Q Network (DDQN) for Super Mario Bros.


## Imports


In [None]:
import os
import warnings

import gym_super_mario_bros
import numpy as np
import torch
from gym.wrappers import (
    FrameStack,
    GrayScaleObservation,
    Monitor,
    ResizeObservation,
    TransformObservation,
)
from nes_py.wrappers import JoypadSpace
from tqdm import tqdm

from setup import create_directories
from src.DDQNAgent import MarioAgentEpsilonGreedy
from src.utils import plot_results

# One seed shared by every random number generator seeded in this cell.
SEED = 42

# Pick CUDA when PyTorch can see a GPU, otherwise fall back to the CPU.
# In the code shown in this notebook `device` is only printed.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Apple-silicon alternative (swap in manually when running on an MPS machine):
# device = torch.device(
#     "mps"
#     if torch.backends.mps.is_available() and torch.backends.mps.is_built()
#     else "cpu"
# )
np.random.seed(SEED)
# Seeding NumPy does not seed PyTorch's generator, so seed it explicitly;
# otherwise anything drawn from torch's RNG differs between runs.
torch.manual_seed(SEED)
# Let the process continue when more than one copy of the OpenMP runtime is
# loaded (a common clash between NumPy and PyTorch builds).
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"
# Silence every Python warning for the rest of the session.
warnings.simplefilter("ignore")
print(device)

## Setup


### Set all Paths


In [None]:
# Optional suffix that keeps several training runs apart; None selects the
# unversioned default folders.
train_version = None
agent = "ddqn"
# NOTE(review): directories are created for the plain agent name only; confirm
# that `create_directories` also covers the "<agent>_v<version>" folders when
# `train_version` is set.
create_directories(agent)

# Every output location shares one run-specific directory name: the plain agent
# name, or "<agent>_v<version>" for a versioned run. The None sentinel is
# tested by identity (`is None`), which is the idiomatic and unambiguous check.
run_name = agent if train_version is None else f"{agent}_v{train_version}"

model_folder = os.path.join("models", run_name)
checkpoint_folder = os.path.join(model_folder, "checkpoints")

videos_folder = os.path.join("references", run_name, "videos")
plot_folder = os.path.join("references", run_name, "images")
evaluation_folder = os.path.join("references", run_name, "evaluation")


# Path of a checkpoint to load, handed to the agent as `starting_point`;
# None means no checkpoint path is given.
starting_point = None  # e.g. os.path.join(checkpoint_folder, "model_ep850.pth")

### Set Hyperparameters


In [None]:
# Button combinations the agent may choose from; the list is handed to
# JoypadSpace and its length is used as the number of actions.
action_space = [
    ["NOOP"],
    ["A"],
    ["B"],
    ["right"],
    ["left"],
    ["right", "A"],
    ["right", "B"],
    ["right", "A", "B"],
]

# --- Values forwarded to the MarioAgentEpsilonGreedy constructor -----------
# Replay memory and optimisation
buffer_size = 25_000
batch_size = 64
learning_rate = 9e-5
gamma = 0.99

# Update scheduling (units are defined by the agent implementation)
online_update_every = 3
exp_before_target_sync = 5_000
exp_before_training = batch_size + 5  # a little more than one full batch

# Epsilon-greedy exploration schedule
epsilon_start = 1.0
epsilon_min = 0.01
epsilon_decay = 0.001

# --- Environment / run settings --------------------------------------------
stacking_number = 10  # frames per stacked observation (FrameStack num_stack)
num_episodes = 1_000
plot_every = 25
save_every = 50

## Train Agent


### Initialize Environment and other variables


In [None]:
# Wrappers in application order (innermost first). Each entry takes the
# environment built so far and returns it wrapped once more.
wrapper_chain = [
    # Restrict the controller to the button combinations in `action_space`.
    lambda e: JoypadSpace(e, action_space),
    # Record a video of every episode; force=True lets the monitor reuse the folder.
    lambda e: Monitor(
        e, videos_folder, video_callable=lambda episode_id: True, force=True
    ),
    lambda e: GrayScaleObservation(e, keep_dim=False),
    lambda e: ResizeObservation(e, shape=84),
    # Drop the trailing axis of the observation.
    lambda e: TransformObservation(e, lambda obs: np.squeeze(obs, axis=-1)),
    # Divide pixel values by 255.
    lambda e: TransformObservation(e, f=lambda x: x / 255.0),
    lambda e: FrameStack(e, num_stack=stacking_number),
]

env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0")
for wrap in wrapper_chain:
    env = wrap(env)

# One reset is needed to learn the shape of a stacked observation.
state = env.reset()
state_shape = state.shape


# Collect the constructor arguments first so the call itself stays short.
agent_settings = dict(
    num_actions=len(action_space),
    state_shape=state_shape,
    checkpoint_folder=checkpoint_folder,
    model_folder=model_folder,
    wantcuda=True,
    starting_point=starting_point,
    learning_rate=learning_rate,
    epsilon_start=epsilon_start,
    epsilon_min=epsilon_min,
    epsilon_decay=epsilon_decay,
    batch_size=batch_size,
    gamma=gamma,
    buffer_size=buffer_size,
    exp_before_training=exp_before_training,
    online_update_every=online_update_every,
    exp_before_target_sync=exp_before_target_sync,
    save_every=save_every,
)
mario = MarioAgentEpsilonGreedy(**agent_settings)

# Per-episode histories filled by the training loop (five independent lists).
reward_list, steps_list, q_list, loss_list, epsilon_list = ([] for _ in range(5))

### Train Agent


In [None]:
def _training_snapshot():
    """Return the checkpoint payload: online-network weights, optimizer state and epsilon."""
    return dict(
        model=mario.online.state_dict(),
        optimizer=mario.optimizer.state_dict(),
        epsilon=mario.epsilon,
    )


for episode in tqdm(range(1, num_episodes + 1), total=num_episodes, desc="Training"):
    state = env.reset()
    total_reward = 0
    steps = 0
    mean_episode_q = []
    mean_episode_loss = []
    # Play the current episode until the environment ends it or the flag is reached
    while True:
        # To visualize the training, uncomment the following line
        # env.render()
        action = mario.selectAction(state)
        next_state, reward, resetnow, info = env.step(action)
        mario.saveExp(state, action, next_state, reward, resetnow)
        q, loss = mario.learn_get_TDest_loss()
        state = next_state
        total_reward += reward
        steps += 1
        mean_episode_q.append(q)
        mean_episode_loss.append(loss)
        if resetnow or info["flag_get"]:
            break
    print(
        f"Episode {episode} abgeschlossen mit {steps} Schritten, Gesamtbelohnung: {total_reward}, Epsilon: {mario.epsilon}\n\n"
    )
    # Save the results of the current episode
    reward_list.append(total_reward)
    steps_list.append(steps)
    q_list.append(np.mean(mean_episode_q))
    loss_list.append(np.mean(mean_episode_loss))
    epsilon_list.append(mario.epsilon)

    # Plot the results of all episodes every `plot_every` episodes, and once
    # more after the last episode so the final curves are always on disk.
    # (The interval was previously hard-coded to 1, i.e. a plot per episode.)
    if episode % plot_every == 0 or episode == num_episodes:
        plot_results(
            reward_list=reward_list,
            steps_list=steps_list,
            q_list=q_list,
            loss_list=loss_list,
            epsilon_list=epsilon_list,
            save_fig=True,
            save_path=os.path.join(plot_folder, f"plot_{episode}.png"),
        )

    # Write a checkpoint every `save_every` episodes (previously hard-coded to
    # 1, i.e. one checkpoint file per episode).
    if episode % save_every == 0:
        torch.save(
            _training_snapshot(),
            os.path.join(checkpoint_folder, f"model_ep{episode}.pth"),
        )

    # Decay the epsilon value at the end of each episode
    mario.decayEpsilon(strat="lin")

# Save the final model after all episodes are completed
torch.save(_training_snapshot(), os.path.join(model_folder, "final_model.pth"))

# Close the environment
env.close()

## Evaluate Agent


In [None]:
# Weights file used for evaluation: "<agent>.pth" inside the model folder.
# NOTE(review): the training cell writes "final_model.pth" into this folder;
# confirm that "<agent>.pth" is produced elsewhere (or copied/renamed by hand).
model_filename = f"{agent}.pth"
starting_point = os.path.join(model_folder, model_filename)

### Initialize Environment and other variables


In [None]:
# Evaluation settings: frames per stacked observation and episodes to play.
stacking_number = 10
num_episodes = 10

# Wrappers in application order (innermost first); the same stack as used for
# training. Each entry wraps the environment built so far once more.
# NOTE(review): the monitor writes into `videos_folder` with force=True, the
# same folder the training cell records to - confirm that reusing it is intended.
wrapper_chain = [
    lambda e: JoypadSpace(e, action_space),
    lambda e: Monitor(
        e, videos_folder, video_callable=lambda episode_id: True, force=True
    ),
    lambda e: GrayScaleObservation(e, keep_dim=False),
    lambda e: ResizeObservation(e, shape=84),
    # Drop the trailing axis of the observation.
    lambda e: TransformObservation(e, lambda obs: np.squeeze(obs, axis=-1)),
    # Divide pixel values by 255.
    lambda e: TransformObservation(e, f=lambda x: x / 255.0),
    lambda e: FrameStack(e, num_stack=stacking_number),
]

env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0")
for wrap in wrapper_chain:
    env = wrap(env)


# One reset is needed to learn the shape of a stacked observation.
state = env.reset()
state_shape = state.shape

In [None]:
# Build the agent used for evaluation from the weights at `starting_point`.
# NOTE: from here on `agent` refers to the agent object, no longer the name
# string ("ddqn") used when building paths; re-run the path cells before
# re-running anything that formats `agent` into a file name.
agent = mario = MarioAgentEpsilonGreedy(
    num_actions=len(action_space),
    state_shape=state_shape,
    wantcuda=True,
    checkpoint_folder=None,
    model_folder=None,
    starting_point=starting_point,
)
# Epsilon 0 disables random exploration in the epsilon-greedy policy.
agent.epsilon = 0.0
# Put the network into inference mode. The agent's network is the `online`
# attribute (the one whose state_dict the training cell saves); the previous
# `agent.model` is not an attribute used anywhere else in this notebook.
agent.online.eval()

# Per-episode histories filled by the evaluation loop.
reward_list = []
steps_list = []
q_list = []
loss_list = []
epsilon_list = []

### Evaluation Loop


In [None]:
for episode in tqdm(range(1, num_episodes + 1)):
    state = env.reset()
    total_reward = 0
    steps = 0
    mean_episode_q = []
    mean_episode_loss = []
    resetnow = False
    # Play the current episode until the environment reports that it is over
    while not resetnow:
        # To visualize the game
        env.render()
        action = agent.selectAction(state)
        next_state, reward, resetnow, info = env.step(action)
        # NOTE(review): storing experiences and calling the learning step during
        # evaluation presumably keeps updating the network; it is kept because
        # it supplies the q/loss values plotted below - confirm this is intended.
        agent.saveExp(state, action, next_state, reward, resetnow)
        q, loss = agent.learn_get_TDest_loss()
        state = next_state
        total_reward += reward
        steps += 1
        mean_episode_q.append(q)
        mean_episode_loss.append(loss)
    print(
        f"Episode {episode} abgeschlossen mit {steps} Schritten, Gesamtbelohnung: {total_reward}, Epsilon: {agent.epsilon}\n\n"
    )
    # Save the results of the current episode
    reward_list.append(total_reward)
    steps_list.append(steps)
    q_list.append(np.mean(mean_episode_q))
    loss_list.append(np.mean(mean_episode_loss))
    epsilon_list.append(agent.epsilon)

# Plot the results of all evaluation episodes once, after the loop.
# The figure goes to the dedicated evaluation folder so it cannot overwrite a
# training plot with the same episode number, and the arguments are passed by
# keyword exactly like the training cell does, so the path reaches `save_path`
# rather than whichever parameter happens to be sixth.
os.makedirs(evaluation_folder, exist_ok=True)
plot_results(
    reward_list=reward_list,
    steps_list=steps_list,
    q_list=q_list,
    loss_list=loss_list,
    epsilon_list=epsilon_list,
    save_fig=True,
    save_path=os.path.join(evaluation_folder, f"plot_{episode}.png"),
)

# Close the environment
env.close()