# Import

In [1]:
import os

# Set before the torch/numpy imports below so it is already in the environment
# when those libraries load.
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime"
# abort seen with some torch/numpy installs — confirm it is still required.
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

from nes_py.wrappers import JoypadSpace
import torch
import numpy as np
import gym
import gym_super_mario_bros
from gym.wrappers import (
    FrameStack,
    GrayScaleObservation,
    ResizeObservation,
    TransformObservation,
    Monitor,
)

# Project-local helpers: the result plotting routine and the epsilon-greedy agent
from src.helper_functions.create_Plot import plot_results
from src.helper_functions.create_Agent import MarioAgentEpsilonGreedy

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced by the cells visible in this
# notebook (the agent is configured through `wantcuda` instead) — confirm
# whether it is still needed.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set Hyperparameters

In [2]:
# Button combinations handed to JoypadSpace; every entry is a list of button
# names, and its position in the outer list is the discrete action index.
action_space = [
    combo.split("+")
    for combo in (
        "NOOP",
        "A",
        "B",
        "right",
        "left",
        "right+A",
        "right+B",
        "right+A+B",
    )
]

# Settings forwarded unchanged to MarioAgentEpsilonGreedy
buffer_size = 25_000
batch_size = 64
learning_rate = 9e-5
online_update_every = 3
exp_before_target_sync = 5_000
gamma = 0.99

# Exploration schedule (also forwarded to the agent); the training loop calls
# decayEpsilon(strat="lin") once per episode.
epsilon_start = 1.0
epsilon_min = 1e-2
epsilon_decay = 1e-3

# Number of frames combined by the FrameStack wrapper
stacking_number = 10

# Training-loop controls: total episodes, plotting interval, checkpoint interval
num_episodes = 1_000
plot_every = 25
save_every = 50

# Set all Paths

In [3]:
# Bump this version tag to send all outputs of a run to a fresh set of folders
train_version = "v2"

trainfolder = "training_" + train_version

# Video directory handed to the Monitor wrapper. It is not created here.
# NOTE(review): presumably Monitor creates it on demand — confirm.
vid_folder = os.path.join("res", trainfolder, "all_videos")

# Root folder for saved models; the final model is written directly into it
model_folder = os.path.join("models")
# Periodic checkpoints of the current training version
checkpoint_folder = os.path.join(model_folder, trainfolder, "checkpoints")
# Progress plots of the current training version
plot_folder = os.path.join("res", trainfolder, "plots")

# exist_ok=True keeps this cell safe to re-run and avoids the race between a
# separate existence check and the directory creation.
os.makedirs(model_folder, exist_ok=True)
os.makedirs(checkpoint_folder, exist_ok=True)
os.makedirs(plot_folder, exist_ok=True)

# Passed to the agent as `starting_point`; None is the default used here.
# NOTE(review): presumably a checkpoint path resumes training from that file
# (see the commented example) — confirm against the agent implementation.
starting_point = None  # os.path.join(checkpoint_folder, "model_ep850.pth")

# Initialize the environment, the agent and the metric logs

In [4]:
# Slightly more than one batch; forwarded to the agent as `exp_before_training`.
# NOTE(review): presumably the number of stored experiences required before
# the agent starts learning — confirm against the agent implementation.
exp_before_training = batch_size + 5

# Build the environment. Each wrapper consumes the output of the one before
# it, so the order of these statements must not change.
env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0")
env = JoypadSpace(env, action_space)
# The callable returns True for every episode id, so no episode is skipped.
# NOTE(review): force=True presumably lets Monitor overwrite earlier
# recordings in vid_folder — confirm.
env = Monitor(env, vid_folder, force=True, video_callable=lambda episode_id: True)
env = GrayScaleObservation(env, keep_dim=False)
env = ResizeObservation(env, shape=84)
# Drop the trailing length-1 axis from each observation
env = TransformObservation(env, f=lambda frame: np.squeeze(frame, axis=-1))
# Scale pixel values by 1/255
env = TransformObservation(env, f=lambda frame: frame / 255.0)
env = FrameStack(env, num_stack=stacking_number)

# One reset is performed only to read the shape of a fully wrapped observation
state = env.reset()
state_shape = state.shape


mario = MarioAgentEpsilonGreedy(
    # network input/output sizes
    state_shape=state_shape,
    num_actions=len(action_space),
    # persistence
    model_folder=model_folder,
    checkpoint_folder=checkpoint_folder,
    starting_point=starting_point,
    save_every=save_every,
    # hardware preference
    wantcuda=True,
    # learning settings
    learning_rate=learning_rate,
    gamma=gamma,
    batch_size=batch_size,
    buffer_size=buffer_size,
    exp_before_training=exp_before_training,
    online_update_every=online_update_every,
    exp_before_target_sync=exp_before_target_sync,
    # exploration schedule
    epsilon_start=epsilon_start,
    epsilon_min=epsilon_min,
    epsilon_decay=epsilon_decay,
)

# Per-episode metric logs, appended to by the training loop and later plotted
reward_list, steps_list, q_list, loss_list, epsilon_list = [], [], [], [], []

# Start the training

In [5]:
def _save_checkpoint(agent, path):
    """Write the agent's model weights, optimizer state and epsilon to ``path``.

    Args:
        agent: object exposing ``model``, ``optimizer`` (both providing
            ``state_dict()``) and an ``epsilon`` attribute.
        path: destination file handed to ``torch.save``.
    """
    torch.save(
        dict(
            model=agent.model.state_dict(),
            optimizer=agent.optimizer.state_dict(),
            epsilon=agent.epsilon,
        ),
        path,
    )


try:
    for episode in range(1, num_episodes + 1):
        state = env.reset()
        total_reward = 0
        steps = 0
        # Per-step values returned by the agent's learning call; averaged
        # once the episode is over
        mean_episode_q = []
        mean_episode_loss = []
        # Play the current episode until the environment reports its end or
        # the flag is reached
        while True:
            # To visualize the training, uncomment the following line
            # env.render()
            action = mario.selectAction(state)
            next_state, reward, resetnow, info = env.step(action)
            mario.saveExp(state, action, next_state, reward, resetnow)
            q, loss = mario.learn_get_TDest_loss()
            state = next_state
            total_reward += reward
            steps += 1
            mean_episode_q.append(q)
            mean_episode_loss.append(loss)
            if resetnow or info["flag_get"]:
                break
        print(
            f"Episode {episode} abgeschlossen mit {steps} Schritten, Gesamtbelohnung: {total_reward}, Epsilon: {mario.epsilon}\n\n"
        )
        # Record the metrics of the finished episode
        reward_list.append(total_reward)
        steps_list.append(steps)
        q_list.append(np.mean(mean_episode_q))
        loss_list.append(np.mean(mean_episode_loss))
        epsilon_list.append(mario.epsilon)

        # Plot the results of all episodes so far at the defined intervals
        if episode % plot_every == 0:
            plot_results(
                reward_list,
                steps_list,
                q_list,
                loss_list,
                epsilon_list,
                os.path.join(plot_folder, f"plot_{episode}.png"),
            )

        # Save a checkpoint at the defined intervals
        if episode % save_every == 0:
            _save_checkpoint(
                mario, os.path.join(checkpoint_folder, f"model_ep{episode}.pth")
            )

        # Decay epsilon after the episode's value has been logged and saved
        mario.decayEpsilon(strat="lin")

    # Save the final model only after all episodes completed
    _save_checkpoint(mario, os.path.join(model_folder, "final_model.pth"))
finally:
    # Close the environment even when training fails or is interrupted, so
    # the wrappers (including Monitor) get the chance to release their resources
    env.close()

  state = torch.tensor(state, dtype=torch.float32, device=self.device)
  return (self.ram[0x86] - self.ram[0x071c]) % 256


Episode 1 abgeschlossen mit 8019 Schritten, Gesamtbelohnung: 138.0, Epsilon: 1.0


Episode 2 abgeschlossen mit 166 Schritten, Gesamtbelohnung: 235.0, Epsilon: 0.999


Episode 3 abgeschlossen mit 8019 Schritten, Gesamtbelohnung: 136.0, Epsilon: 0.998


Episode 4 abgeschlossen mit 214 Schritten, Gesamtbelohnung: 236.0, Epsilon: 0.997


Episode 5 abgeschlossen mit 8019 Schritten, Gesamtbelohnung: 139.0, Epsilon: 0.996


Episode 6 abgeschlossen mit 8019 Schritten, Gesamtbelohnung: 139.0, Epsilon: 0.995


Episode 7 abgeschlossen mit 8019 Schritten, Gesamtbelohnung: 137.0, Epsilon: 0.994


Episode 8 abgeschlossen mit 684 Schritten, Gesamtbelohnung: 633.0, Epsilon: 0.993


Episode 9 abgeschlossen mit 8019 Schritten, Gesamtbelohnung: 139.0, Epsilon: 0.992


Episode 10 abgeschlossen mit 180 Schritten, Gesamtbelohnung: 230.0, Epsilon: 0.991


Episode 11 abgeschlossen mit 232 Schritten, Gesamtbelohnung: 239.0, Epsilon: 0.99


