In [1]:
import pickle
import random
from collections import deque

import gym_super_mario_bros
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from gym_super_mario_bros.actions import COMPLEX_MOVEMENT
from nes_py.wrappers import JoypadSpace
import cv2
import matplotlib.pyplot as plt

from wrappers import *

In [2]:
def arrange(s):
    if not type(s) == "numpy.ndarray":
        s = np.array(s)
    assert len(s.shape) == 3
    ret = np.transpose(s, (2, 0, 1))
    return np.expand_dims(ret, 0)


class replay_memory(object):
    def __init__(self, N):
        self.memory = deque(maxlen=N)

    def push(self, transition):
        self.memory.append(transition)

    def sample(self, n):
        return random.sample(self.memory, n)

    def __len__(self):
        return len(self.memory)


class model(nn.Module):
    def __init__(self, n_frame, n_action, device):
        super(model, self).__init__()
        self.layer1 = nn.Conv2d(n_frame, 32, 8, 4)
        self.layer2 = nn.Conv2d(32, 64, 3, 1)
        self.fc = nn.Linear(20736, 512)
        self.q = nn.Linear(512, n_action)
        self.v = nn.Linear(512, 1)

        self.device = device
        self.seq = nn.Sequential(self.layer1, self.layer2, self.fc, self.q, self.v)

        self.seq.apply(init_weights)

    def forward(self, x):
        if type(x) != torch.Tensor:
            x = torch.FloatTensor(x).to(self.device)
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        x = x.view(-1, 20736)
        x = torch.relu(self.fc(x))
        adv = self.q(x)
        v = self.v(x)
        q = v + (adv - 1 / adv.shape[-1] * adv.max(-1, True)[0])

        return q


def init_weights(m):
    if type(m) == nn.Conv2d:
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)


def train(q, q_target, memory, batch_size, gamma, optimizer, device):
    s, r, a, s_prime, done = list(map(list, zip(*memory.sample(batch_size))))
    s = np.array(s).squeeze()
    s_prime = np.array(s_prime).squeeze()
    a_max = q(s_prime).max(1)[1].unsqueeze(-1)
    r = torch.FloatTensor(r).unsqueeze(-1).to(device)
    done = torch.FloatTensor(done).unsqueeze(-1).to(device)
    with torch.no_grad():
        y = r + gamma * q_target(s_prime).gather(1, a_max) * done
    a = torch.tensor(a).unsqueeze(-1).to(device)
    q_value = torch.gather(q(s), dim=1, index=a.view(-1, 1).long())

    loss = F.smooth_l1_loss(q_value, y).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss


def copy_weights(q, q_target):
    q_dict = q.state_dict()
    q_target.load_state_dict(q_dict)

In [3]:
def plot_training_progress(rewards, steps, epsilons, episode):
    plt.figure(figsize=(12, 8))

    plt.subplot(3, 1, 1)
    plt.plot(rewards, label='Reward')
    plt.title('Training Progress')
    plt.xlabel('Episode')
    plt.ylabel('Reward')
    plt.legend()

    plt.subplot(3, 1, 2)
    plt.plot(steps, label='Steps')
    plt.title('Training Progress')
    plt.xlabel('Episode')
    plt.ylabel('Steps')
    plt.legend()

    plt.subplot(3, 1, 3)
    plt.plot(epsilons, label='Epsilon')
    plt.title('Training Progress')
    plt.xlabel('Episode')
    plt.ylabel('Epsilon')
    plt.legend()

    plt.tight_layout()
    plt.savefig(f'training_progress_episode_{episode}.png')
    plt.show()

In [4]:
def main(env, q, q_target, optimizer, device):
    t = 0
    gamma = 0.99
    batch_size = 64

    N = 50000
    eps = 1.0
    eps_decay = 0.9995
    eps_min = 0.1
    memory = replay_memory(N)
    update_interval = 50
    print_interval = 10

    rewards = []
    total_reward = 0
    losses = []
    total_loss = 0
    steps = []
    epsilons = []

    for k in range(1000000):
        s = arrange(env.reset())
        done = False

        while not done:
            if eps > np.random.rand():
                a = env.action_space.sample()
            else:
                if device == "cpu":
                    a = np.argmax(q(s).detach().numpy())
                else:
                    a = np.argmax(q(s).cpu().detach().numpy())
            s_prime, r, done, _ = env.step(a)
            s_prime = arrange(s_prime)
            total_reward += r
            r = np.sign(r) * (np.sqrt(abs(r) + 1) - 1) + 0.001 * r
            memory.push((s, float(r), int(a), s_prime, int(1 - done)))
            s = s_prime
            stage = env.unwrapped._stage
            if len(memory) > 2000:
                total_loss += train(q, q_target, memory, batch_size, gamma, optimizer, device)
                t += 1
            if t % update_interval == 0:
                copy_weights(q, q_target)
                torch.save(q.state_dict(), "mario_q.pth")
                torch.save(q_target.state_dict(), "mario_q_target.pth")

        if k % print_interval == 0:
            print(
                "%s |Epoch : %d | score : %f | loss : %.2f | stage : %d | epsilon : %.2f"
                % (
                    device,
                    k,
                    total_reward / print_interval,
                    total_loss / print_interval,
                    stage,
                    eps,
                )
            )
            rewards.append(total_reward / print_interval)
            steps.append(k)
            epsilons.append(eps)
            total_reward = 0.0
            total_loss = 0.0
            pickle.dump(rewards, open("rewards.p", "wb"))
            pickle.dump(steps, open("steps.p", "wb"))
            pickle.dump(epsilons, open("epsilons.p", "wb"))

        #if k % 5 == 0:
        #    plot_training_progress(rewards, steps, epsilons, k)

        eps = max(eps * eps_decay, eps_min)


if __name__ == "__main__":
    n_frame = 4
    env = gym_super_mario_bros.make("SuperMarioBros-v0")
    env = JoypadSpace(env, COMPLEX_MOVEMENT)
    env = wrap_mario(env)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    q = model(n_frame, env.action_space.n, device).to(device)
    q_target = model(n_frame, env.action_space.n, device).to(device)
    optimizer = optim.Adam(q.parameters(), lr=0.001)
    print(device)

    main(env, q, q_target, optimizer, device)

cpu


  return (self.ram[0x86] - self.ram[0x071c]) % 256


cpu |Epoch : 0 | score : 22.300000 | loss : 0.43 | stage : 1 | epsilon : 1.00


KeyboardInterrupt: 