In [1]:
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
import gym
import numpy as np

env = gym.make('SuperMarioBros-v0', apply_api_compatibility=True, render_mode="human")
env = JoypadSpace(env, SIMPLE_MOVEMENT)


  logger.warn(
  logger.warn(


In [2]:
from gym.spaces import Box
from gym.wrappers import FrameStack, GrayScaleObservation, ResizeObservation

class SkipFrame(gym.Wrapper):
    def __init__(self, env, skip):
        """Return only every `skip`-th frame"""
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        """Repeat action, and sum reward"""
        total_reward = 0.0
        for i in range(self._skip):
            # Accumulate reward and repeat the same action
            obs, reward, done, trunk, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return obs, total_reward, done, trunk, info

# Apply Wrappers to environment
env = SkipFrame(env, skip=4)
env = GrayScaleObservation(env)
env = ResizeObservation(env, shape=84)
env = FrameStack(env, num_stack=4)


In [3]:

import torch.nn as nn
import torch.nn.functional as F


class PPO(nn.Module):
    def __init__(self, num_inputs, num_actions):
        super(PPO, self).__init__()
        self.conv1 = nn.Conv2d(num_inputs, 32, 3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.linear = nn.Linear(32 * 6 * 6, 512)
        self.critic_linear = nn.Linear(512, 1)
        self.actor_linear = nn.Linear(512, num_actions)
        self._initialize_weights()

    def _initialize_weights(self):
        for module in self.modules():
            if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear):
                nn.init.orthogonal_(module.weight, nn.init.calculate_gain('relu'))
                # nn.init.xavier_uniform_(module.weight)
                # nn.init.kaiming_uniform_(module.weight)
                nn.init.constant_(module.bias, 0)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        #x = self.linear(x.view(x.size(0), -1))
        x = self.linear(x.reshape(x.size(0), -1))

        return self.actor_linear(x), self.critic_linear(x)

In [4]:
import torch
import torch.nn.functional as F
from collections import deque


def eval(global_model):
    torch.manual_seed(123)
    
    local_model = PPO(1, env.action_space.n)

    local_model.eval()
    #state = torch.from_numpy(env.reset())
    state, _ = env.reset()
    state = torch.from_numpy(np.array(state)).float().permute(0, 3, 1, 2)
   
    done = True
    curr_step = 0
    actions = deque(maxlen=200)
    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())
        logits, value = local_model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, trunc, info = env.step(action)

        # Uncomment following lines if you want to save model whenever level is completed
        # if info["flag_get"]:
        #     print("Finished")
        #     torch.save(local_model.state_dict(),
        #                "{}/ppo_super_mario_bros_{}_{}_{}".format(opt.saved_path, opt.world, opt.stage, curr_step))

        env.render()
        actions.append(action)
        if curr_step > 5e6 or actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state, _ = env.reset()
        state, _ = env.reset()
        state = torch.from_numpy(np.array(state)).float().permute(0, 3, 1, 2)
        

In [5]:
from torch.distributions import Categorical
import torch.nn.functional as F
import numpy as np
from gym.wrappers.frame_stack import LazyFrames


gamma = 0.9
num_local_steps = 512
tau = 1.0
beta = 0.01
num_epochs = 10
num_processes=1
batch_size = 32
lr = 0.0001
epsilon = 0.2

def train():

    torch.manual_seed(123)
    #if os.path.isdir(opt.log_path):
    #    shutil.rmtree(opt.log_path)
    #os.makedirs(opt.log_path)
    #if not os.path.isdir(opt.saved_path):
    #    os.makedirs(opt.saved_path)
    model = PPO(1, env.action_space.n)
    model.share_memory()
    #process = mp.Process(target=eval, args=(opt, model, envs.num_states, envs.num_actions))
    #process.start()
    #eval(model)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    #[agent_conn.send(("reset", None)) for agent_conn in envs.agent_conns]
    #curr_states = [agent_conn.recv() for agent_conn in envs.agent_conns]
    #curr_states = torch.from_numpy(np.concatenate(curr_states, 0))

    curr_episode = 0
    while True:
        # if curr_episode % opt.save_interval == 0 and curr_episode > 0:
        #     torch.save(model.state_dict(),
        #                "{}/ppo_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage))
        #     torch.save(model.state_dict(),
        #                "{}/ppo_super_mario_bros_{}_{}_{}".format(opt.saved_path, opt.world, opt.stage, curr_episode))
        curr_episode += 1
        
        curr_state, _ = env.reset()
        curr_state = torch.from_numpy(np.array(curr_state)).float().permute(0, 3, 1, 2)
        
        old_log_policies = []
        actions = []
        values = []
        states = []
        rewards = []
        dones = []
        for _ in range(num_local_steps):
            states.append(curr_state)
            
            if isinstance(curr_state, LazyFrames):
                curr_state = torch.from_numpy(np.array(curr_state._frames)).float().permute(0, 3, 1, 2)

            
            logits, value = model(curr_state.float())
            values.append(value.squeeze())
            policy = F.softmax(logits, dim=1)
            old_m = Categorical(policy)
            action = old_m.sample()
            actions.append(action)
            old_log_policy = old_m.log_prob(action)
            old_log_policies.append(old_log_policy)
          
            if action.nelement() > 1:  # if action has more than one element
                action = action[0] 

            curr_state, reward, done, trunc, info = env.step(action.item())
            
            if done:
                curr_state = env.reset() 
            
            if isinstance(curr_state, list):
                curr_state = np.stack(curr_state, axis=0)
                
            if isinstance(curr_state, tuple):
                #curr_state = torch.Tensor(curr_state).permute(0, 3, 1, 2).float()
                curr_state = torch.stack([torch.from_numpy(np.array(x._frames)) if isinstance(x, LazyFrames) else torch.from_numpy(x) for x in curr_state]).permute(0, 3, 1, 2).float()

            #curr_state = torch.from_numpy(np.array(curr_state)).permute(0, 3, 1, 2)
            if isinstance(curr_state, np.ndarray):
                curr_state = torch.from_numpy(curr_state).permute(0, 3, 1, 2).float()
            
           
            reward = torch.FloatTensor([reward])
            done = torch.FloatTensor([done])
            rewards.append(reward)
            dones.append(done)
            curr_states = states

        _, next_value, = model(curr_states)
        next_value = next_value.squeeze()
        old_log_policies = torch.cat(old_log_policies).detach()
        actions = torch.cat(actions)
        values = torch.cat(values).detach()
        states = torch.cat(states)
        gae = 0
        R = []
        for value, reward, done in list(zip(values, rewards, dones))[::-1]:
            gae = gae * gamma * tau
            gae = gae + reward + gamma * next_value.detach() * (1 - done) - value.detach()
            next_value = value
            R.append(gae + value)
        R = R[::-1]
        R = torch.cat(R).detach()
        advantages = R - values
        for i in range(num_epochs):
            indice = torch.randperm(num_local_steps * num_processes)
            for j in range(batch_size):
                batch_indices = indice[
                                int(j * (num_local_steps * num_processes / batch_size)): int((j + 1) * (
                                        num_local_steps * num_processes / batch_size))]
                logits, value = model(states[batch_indices])
                new_policy = F.softmax(logits, dim=1)
                new_m = Categorical(new_policy)
                new_log_policy = new_m.log_prob(actions[batch_indices])
                ratio = torch.exp(new_log_policy - old_log_policies[batch_indices])
                actor_loss = -torch.mean(torch.min(ratio * advantages[batch_indices],
                                                   torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon) *
                                                   advantages[
                                                       batch_indices]))
                # critic_loss = torch.mean((R[batch_indices] - value) ** 2) / 2
                critic_loss = F.smooth_l1_loss(R[batch_indices], value.squeeze())
                entropy_loss = torch.mean(new_m.entropy())
                total_loss = actor_loss + critic_loss - beta * entropy_loss
                optimizer.zero_grad()
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
                optimizer.step()
        print("Episode: {}. Total loss: {}".format(curr_episode, total_loss))

train()

TypeError: expected np.ndarray (got LazyFrames)

In [None]:
env.close()