In [1]:
import sys
from itertools import count

import cv2
import gym_super_mario_bros
from gym.wrappers import Monitor
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT, RIGHT_ONLY
from nes_py.wrappers import JoypadSpace

import matplotlib.pyplot as plt
import numpy as np
import torch

from policy_agent import Agent
from wrappers import wrapper

In [2]:
# --- environment settings ---------------------------------------------------
WORLD = 1
STAGE = 1
LEVEL_NAME = f"SuperMarioBros-{WORLD}-{STAGE}-v0"  # id passed to gym_super_mario_bros.make
# Frame shape handed to the wrapper and to the agent.
# Alternative: (120, 128, 4); the original image size is 240x256.
FRAME_DIM = (84, 84, 4)
ACTION_SPACE = SIMPLE_MOVEMENT  # button combinations given to JoypadSpace
RENDER_GAME = True              # call env.render() while training
LOAD_MODEL = False              # load weights from MODEL_PATH before starting
CHECKPOINT_INTERVAL = 100       # save the model every N episodes
# to create a new model set it to ""
MODEL_PATH = "./saved_models/policy_gradient_model_world1-1"

# --- training hyperparameters -----------------------------------------------
TRAIN_MODEL = True      # False: record a single episode and exit instead of training
LEARNING_RATE = 7e-6
NUM_EPOCHS = 1_001      # the training loop iterates over range(1, NUM_EPOCHS)
GAMMA = 0.99

# --- reporting ---------------------------------------------------------------
LOG_INTERVAL = 1        # print a progress line every N episodes
PLOT_INTERVAL = 50      # plot the reward curves every N episodes
VIDEO_INTERVAL = 50     # only referenced by a commented-out check in the training loop

In [3]:
# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
    print(f"Available GPU: {torch.cuda.get_device_name(0)}")
else:
    DEVICE = torch.device("cpu")

Available GPU: GeForce GTX 1050 Ti with Max-Q Design


In [4]:
def create_environment():
    """Build and return the wrapped Super Mario Bros environment.

    The environment named by ``LEVEL_NAME`` is restricted to the button
    combinations in ``ACTION_SPACE`` through ``JoypadSpace`` and then handed to
    the project's ``wrapper`` together with ``FRAME_DIM``.
    """
    base_env = gym_super_mario_bros.make(LEVEL_NAME)
    joypad_env = JoypadSpace(base_env, ACTION_SPACE)
    return wrapper(joypad_env, FRAME_DIM)

In [5]:
def lazy_frame_to_tensor(lazy_frame):
    """Convert one observation into a batched float32 tensor on ``DEVICE``.

    Args:
        lazy_frame: Array-like observation with exactly three axes (anything
            ``np.asarray`` accepts).

    Returns:
        A ``torch.float32`` tensor of shape ``(1, d2, d1, d0)`` for an input of
        shape ``(d0, d1, d2)``, moved to ``DEVICE``.
    """
    # Cast straight to float32. Going through float64 first (and downcasting
    # afterwards) only allocated a temporary twice the size; for 8-bit or
    # floating-point frames the resulting values are the same.
    frame = np.asarray(lazy_frame).astype(np.float32)

    # Reverse the axis order so the last input axis comes first, then add a
    # leading batch axis of size 1.
    # NOTE(review): for an (H, W, C) frame this yields (C, W, H), i.e. the image
    # is transposed relative to the (C, H, W) layout torch.nn.Conv2d documents.
    # Kept unchanged because the consuming network is not visible here --
    # confirm before switching to transpose((2, 0, 1)).
    batched = np.expand_dims(frame.transpose((2, 1, 0)), axis=0)

    return torch.from_numpy(batched).to(DEVICE)


In [6]:
def plot_rewards(reward_list, reward_mean_history):
    """Show the reward curve and the mean-reward curve on a single figure.

    Args:
        reward_list: Sequence of rewards, plotted against its index.
        reward_mean_history: Sequence of mean rewards, plotted against its index.
    """
    label_size = 18

    _, ax = plt.subplots(figsize=(12, 6))
    ax.plot(reward_list, "#b300ff", label="Reward History")
    ax.plot(reward_mean_history, "#ff8c00", label="Mean Reward History")
    ax.set_ylabel("Rewards", fontsize=label_size)
    ax.set_xlabel("Episodes", fontsize=label_size)
    ax.set_title("Reward Per Episode", fontsize=label_size)
    ax.legend()
    plt.show()


In [7]:
def record_one_episode(agent, episode=0):
    """Play one episode with ``agent`` in a fresh, monitored environment.

    A new environment is built the same way as the training one, except that a
    gym ``Monitor`` (``force=True``) is inserted before the project's
    ``wrapper``; it is given ``./video/video-episode-<episode>`` as its
    directory.

    Args:
        agent: Object whose ``select_action_based_on_state(state)`` returns an
            ``(action, log_prob)`` pair.
        episode: Number used in the output directory name, zero-padded to five
            digits. Defaults to 0 so the function can be called without a
            training-episode counter.

    Returns:
        The sum of the rewards returned by every step of the episode, including
        the final one.
    """
    tmp_env = gym_super_mario_bros.make(LEVEL_NAME)
    tmp_env = JoypadSpace(tmp_env, ACTION_SPACE)
    tmp_env = Monitor(tmp_env, './video/video-episode-{0:05d}'.format(episode), force=True)
    tmp_env = wrapper(tmp_env, FRAME_DIM)

    total_reward = 0
    try:
        state = lazy_frame_to_tensor(tmp_env.reset())

        # The log-probability is discarded here, so there is nothing to
        # back-propagate; disabling autograd avoids keeping per-step state alive.
        with torch.no_grad():
            while True:
                action, _ = agent.select_action_based_on_state(state)
                next_state, reward, done, _info = tmp_env.step(action)
                total_reward += reward

                if done:
                    break

                # Only convert observations that will actually be fed to the agent.
                state = lazy_frame_to_tensor(next_state)
    finally:
        # Always release the environment; closing is also what lets the Monitor
        # finish its recording (see the gym Monitor documentation).
        tmp_env.close()

    return total_reward


In [8]:
# Build the training environment and the policy-gradient agent.
env = create_environment()

agent = Agent(env.action_space.n, FRAME_DIM, LEARNING_RATE, GAMMA, DEVICE, MODEL_PATH)
if LOAD_MODEL:
    agent.load_model(model_path=MODEL_PATH)

if not TRAIN_MODEL:
    # Evaluation-only run: record a single episode and stop.
    # record_one_episode needs an episode number for the video directory name;
    # there is no training episode in this mode, so 0 is used.
    record_one_episode(agent, 0)
    sys.exit()

In [9]:
# Total reward of every finished episode, and the running mean of that list.
# The mean list starts at [0] and only grows from episode 25 onwards, so the
# logged "Average reward" stays 0.00 until then.
reward_history = []
reward_mean_history = [0]

# Per-step buffers for the current episode; consumed by agent.update().
step_log_prob_history = []
step_reward_history = []

# save one example warped image for preview
state = env.reset()
cv2.imwrite("exampleImage.jpg", np.asarray(state))

for episode in range(1, NUM_EPOCHS):
    # NOTE: the former per-episode torch.cuda.memory_summary(...) call was
    # dropped: its return value was discarded, and it initialises CUDA, which
    # fails when DEVICE has fallen back to the CPU.
    state, last_reward = lazy_frame_to_tensor(env.reset()), 0
    if episode % 100 == 0:
        torch.cuda.empty_cache()
    for step in count():
        # perform an action
        action, log_prob = agent.select_action_based_on_state(state)
        step_log_prob_history.append(log_prob)
        # delete the last state to prevent memory overflow
        del state
        state, reward, done, info = env.step(action)

        if done:
            # A positive reward on the terminal step is treated as having solved
            # the level; anything else (including exactly 0, which previously
            # fell through and kept stepping a finished environment) is treated
            # as a death.
            finished_level = reward > 0
            if finished_level:
                # if we solved the current level give mario the highest possible reward of 15
                reward = 15

            step_reward_history.append(reward)
            last_reward += reward
            reward_history.append(last_reward)
            if episode >= 25:
                reward_mean_history.append(np.mean(reward_history))
            if finished_level:
                print("Finished the level")
            break

        state = lazy_frame_to_tensor(state)

        if RENDER_GAME:
            env.render()

        step_reward_history.append(reward)
        last_reward += reward

    loss = agent.update(step_log_prob_history, step_reward_history)

    if episode % LOG_INTERVAL == 0:
        print("Episode {}\tLast Reward: {:.2f}\tAverage reward: {:.2f}\tLoss: {:.2f}".format(episode, last_reward,
                                                                                             reward_mean_history[-1],
                                                                                             loss))

    if episode % PLOT_INTERVAL == 0:
        plot_rewards(reward_history, reward_mean_history)
    # if episode % VIDEO_INTERVAL == 0:
    if last_reward >= 1800:
        print(f"Saving Video for reward: {last_reward} at episode:{episode}")
        record_one_episode(agent, episode)
    if episode % CHECKPOINT_INTERVAL == 0:
        agent.save_model(model_path=MODEL_PATH)

    # Drop everything that belongs to the finished episode. Both buffers are
    # emptied in place (the original cleared the reward list twice and never
    # the log-prob list), so the stored log-probs are released before the next
    # episode starts.
    del loss
    step_reward_history.clear()
    step_log_prob_history.clear()

Episode 1	Last Reward: 620.00	Average reward: 0.00	Loss: 1.74
Episode 2	Last Reward: 558.00	Average reward: 0.00	Loss: 1.38
Episode 3	Last Reward: 721.00	Average reward: 0.00	Loss: 1.09
Episode 4	Last Reward: 1553.00	Average reward: 0.00	Loss: -7.78
Episode 5	Last Reward: 719.00	Average reward: 0.00	Loss: -13.25
Episode 6	Last Reward: 847.00	Average reward: 0.00	Loss: 3.60
Episode 7	Last Reward: 743.00	Average reward: 0.00	Loss: 0.61
Episode 8	Last Reward: 230.00	Average reward: 0.00	Loss: 4.39
Episode 9	Last Reward: 750.00	Average reward: 0.00	Loss: -6.15
Episode 10	Last Reward: 231.00	Average reward: 0.00	Loss: -5.94
Episode 11	Last Reward: 595.00	Average reward: 0.00	Loss: 22.01
Episode 12	Last Reward: 701.00	Average reward: 0.00	Loss: 6.61
Episode 13	Last Reward: 727.00	Average reward: 0.00	Loss: -10.05
Episode 14	Last Reward: 733.00	Average reward: 0.00	Loss: -28.57
Episode 15	Last Reward: 761.00	Average reward: 0.00	Loss: -36.10
Episode 16	Last Reward: 1242.00	Average reward: 0.0

RuntimeError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 4.00 GiB total capacity; 2.75 GiB already allocated; 1.38 MiB free; 2.91 GiB reserved in total by PyTorch)