# Preprocessing the frames

In [29]:
def preprocess(frame):
    """Collapse the last axis of `frame`, crop rows 20..209 and halve the resolution.

    The last axis is summed and divided by 765 (3 * 255, presumably mapping
    8-bit RGB to the [0, 1] range -- confirm against the caller).  Rows
    outside 20..209 are dropped and every second row and column is kept.

    Args:
        frame: array-like supporting `.sum(axis=-1)` and 2-D slicing.

    Returns:
        The down-sampled 2-D array.
    """
    gray = frame.sum(axis=-1) / 765
    # Crop and subsample the rows in one slice; columns are only subsampled.
    return gray[20:210:2, ::2]

In [30]:
import configparser


# Parse the hyperparameters from ./config.conf (relative to the working directory).
config = configparser.ConfigParser()
# NOTE: ConfigParser.read() skips files it cannot open without raising, so a
# missing config file only surfaces later as a KeyError on the first lookup.
config.read('./config.conf')


#------------------------
# Name of the config section every hyperparameter below is read from.
default = 'DEFAULT'
#------------------------
# Section proxy used throughout the file for option lookups.
default_config = config[default]

# Environment

In [31]:
# Training-method name, kept as the raw string from the config.
train_method = default_config['TrainMethod']
# Per-episode step limit parsed as an int.
# NOTE(review): neither value is referenced in the visible code -- confirm they are still needed.
max_step_per_episode = int(default_config['MaxStepPerEpisode'])

class Environment(Process):
    """Method skeleton for environment worker processes.

    Every method is a no-op placeholder meant to be overridden.
    NOTE(review): the `abstractmethod` markers are only enforced when the
    class uses `ABCMeta`; `Process` presumably does not, so this class can
    still be instantiated -- confirm before relying on it as an interface.
    """

    @abstractmethod
    def run(self):
        """Process entry point; does nothing here."""

    @abstractmethod
    def reset(self):
        """Reset the environment; does nothing here."""

    @abstractmethod
    def pre_proc(self, x):
        """Pre-process one raw observation `x`; does nothing here."""

    @abstractmethod
    def get_init_state(self, x):
        """Build the initial state from observation `x`; does nothing here."""


def unwrap(env):
    """Peel wrapper layers off `env` and return the innermost object.

    An `unwrapped` attribute wins and is returned as-is.  Otherwise the
    first of `env` / `leg_env` that exists is unwrapped recursively.  An
    object exposing none of these attributes is returned unchanged.
    """
    if hasattr(env, "unwrapped"):
        return env.unwrapped
    # Same precedence as before: `env` is tried before `leg_env`.
    for inner_name in ("env", "leg_env"):
        if hasattr(env, inner_name):
            return unwrap(getattr(env, inner_name))
    return env

In [32]:
class MaxAndSkipEnv(gym.Wrapper):
    """Gym wrapper that repeats each action for `skip` frames.

    Rewards are summed over the repeated frames and the returned observation
    is the element-wise maximum of the two-slot observation buffer.
    """

    def __init__(self, env, is_render, skip=4):
        """Wrap `env`; `is_render` turns on rendering after every inner step."""
        gym.Wrapper.__init__(self, env)
        # Two slots: the second-to-last and the last frame of a skip window.
        buffer_shape = (2,) + env.observation_space.shape
        self._obs_buffer = np.zeros(buffer_shape, dtype=np.uint8)
        self._skip = skip
        self.is_render = is_render

    def step(self, action):
        """Apply `action` up to `skip` times, stopping early once done.

        Returns:
            (max_frame, summed_reward, done, info) where `done` and `info`
            come from the last inner step that was executed.
        """
        reward_sum = 0.0
        done = None
        for frame_idx in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            if self.is_render:
                self.env.render()
            # slot is 0 on the second-to-last frame, 1 on the last and
            # negative before that, so only the final two frames are stored.
            slot = frame_idx - self._skip + 2
            if slot >= 0:
                self._obs_buffer[slot] = obs
            reward_sum += reward
            if done:
                break
        # When the loop stops early the buffer may still hold older frames;
        # the observation of a terminal step is not expected to matter.
        return self._obs_buffer.max(axis=0), reward_sum, done, info

    def reset(self, **kwargs):
        """Forward the reset to the wrapped environment."""
        return self.env.reset(**kwargs)


In [33]:
class MarioEnvironment(Process):
    """Worker process that drives one Super Mario Bros environment.

    The process waits for an action on `child_conn`, steps the environment
    and sends back `[history, reward, force_done, done, log_reward]`, where
    `history` is a stack of the last `history_size` pre-processed frames.

    Args:
        env_id: id passed to `gym_super_mario_bros.make`.
        is_render: render the environment before every step.
        env_idx: index of this worker, only used in the log line.
        child_conn: connection used to receive actions and send transitions.
        history_size: number of stacked frames kept in the state.
        life_done: report a lost life as a terminal transition (`force_done`).
        h, w: height and width of the pre-processed frames.
        movement: action set handed to `JoypadSpace`.
        sticky_action: repeat the previous action with probability `p`.
        p: sticky-action probability.
    """

    def __init__(
            self,
            env_id,
            is_render,
            env_idx,
            child_conn,
            history_size=4,
            life_done=True,
            h=84,
            w=84, movement=COMPLEX_MOVEMENT, sticky_action=True,
            p=0.25):
        super(MarioEnvironment, self).__init__()
        self.daemon = True
        # Use the caller-supplied action set (previously the argument was
        # ignored and COMPLEX_MOVEMENT was always used).
        self.env = JoypadSpace(
            gym_super_mario_bros.make(env_id), movement)

        self.is_render = is_render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.recent_rlist = deque(maxlen=100)
        self.child_conn = child_conn

        self.life_done = life_done
        self.sticky_action = sticky_action
        self.last_action = 0
        self.p = p

        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w

        self.reset()

    def run(self):
        """Serve actions received on `child_conn` until the process is killed."""
        super(MarioEnvironment, self).run()
        while True:
            action = self.child_conn.recv()
            if self.is_render:
                self.env.render()

            # Sticky action: with probability p repeat the previous action.
            if self.sticky_action:
                if np.random.rand() <= self.p:
                    action = self.last_action
                self.last_action = action

            obs, reward, done, info = self.env.step(action)

            # When Mario loses a life (but still has lives left) report the
            # transition as terminal without resetting the environment.
            if self.life_done:
                lost_life = self.lives > info['life'] > 0
                force_done = True if lost_life else done
                self.lives = info['life']
            else:
                force_done = done

            # reward range -15 ~ 15
            log_reward = reward / 15
            self.rall += log_reward

            r = log_reward

            # Slide the frame stack by one and append the newest frame; this
            # works for any history_size, not just the default of 4.
            self.history[:-1, :, :] = self.history[1:, :, :]
            self.history[-1, :, :] = self.pre_proc(obs)

            self.steps += 1
            # Track the furthest x position so the log line below reports it.
            self.max_pos = max(self.max_pos, info['x_pos'])

            if done:
                self.recent_rlist.append(self.rall)
                print(
                    "[Episode {}({})] Step: {}  Reward: {}  Recent Reward: {}  Stage: {} current x:{}   max x:{}".format(
                        self.episode,
                        self.env_idx,
                        self.steps,
                        self.rall,
                        np.mean(
                            self.recent_rlist),
                        info['stage'],
                        info['x_pos'],
                        self.max_pos))

                self.history = self.reset()

            self.child_conn.send([self.history[:, :, :], r, force_done, done, log_reward])

    def reset(self):
        """Reset the counters and the environment; return the initial frame stack."""
        self.last_action = 0
        self.steps = 0
        self.episode += 1
        self.rall = 0
        self.lives = 3
        self.stage = 1
        self.max_pos = 0
        self.get_init_state(self.env.reset())
        return self.history[:, :, :]

    def pre_proc(self, X):
        """Convert an RGB frame to grayscale and resize it to (h, w)."""
        # grayscaling
        x = cv2.cvtColor(X, cv2.COLOR_RGB2GRAY)
        # cv2.resize takes dsize as (width, height); the result has shape (h, w).
        x = cv2.resize(x, (self.w, self.h))

        return x

    def get_init_state(self, s):
        """Fill every slot of the frame stack with the pre-processed frame `s`."""
        frame = self.pre_proc(s)
        for i in range(self.history_size):
            self.history[i, :, :] = frame

# Flatten

In [34]:
class Flatten(nn.Module):
    """Module that flattens every dimension after the first into one."""

    def forward(self, input):
        """Return `input` viewed as (input.size(0), -1)."""
        leading = input.size(0)
        return input.view(leading, -1)

# PPO

In [35]:
class PPOModel(nn.Module):
    """Actor-critic network with one policy head and two value heads.

    A shared convolutional trunk maps a stack of 4 frames to a 448-d feature
    vector.  The actor produces `output_size` action logits; the critic has a
    shared residual layer followed by separate extrinsic and intrinsic value
    heads.  The 7 * 7 * 64 flatten size assumes 84x84 input frames.

    Args:
        input_size: accepted for interface compatibility; not used.
        output_size: number of discrete actions.
    """

    def __init__(self, input_size, output_size):
        super(PPOModel, self).__init__()

        linear = nn.Linear

        # Shared network (CNN part)
        self.feature = nn.Sequential(
            nn.Conv2d(
                in_channels=4,
                out_channels=32,
                kernel_size=8,
                stride=4
            ),
            nn.ELU(),
            nn.Conv2d(
                in_channels=32,
                out_channels=64,
                kernel_size=4,
                stride=2
            ),
            nn.ELU(),
            nn.Conv2d(
                in_channels=64,
                out_channels=64,
                kernel_size=3,
                stride=1
            ),
            nn.ELU(),
            Flatten(),
            linear(7 * 7 * 64, 256),
            nn.ELU(),
            # Project to the 448 features every head expects; without this
            # layer the 256-d trunk output cannot be fed to the heads.
            linear(256, 448),
            nn.ELU(),
        )

        self.actor = nn.Sequential(
            linear(448, 448),
            nn.ELU(),
            linear(448, output_size)
        )

        # Layer shared by the two value heads
        self.common_critic_layer = nn.Sequential(
            linear(448, 448),
            nn.ELU()
        )

        self.critic_ext = linear(448, 1)
        self.critic_int = linear(448, 1)

        # Default initialisation: orthogonal weights with gain sqrt(2) and
        # zero bias.  Only Conv2d/Linear modules are touched because the
        # activation modules have no weight or bias.
        for module in self.modules():
            if isinstance(module, (nn.Conv2d, nn.Linear)):
                self._orthogonal_init(module, np.sqrt(2))

        # The heads are then re-initialised with smaller gains.
        self._orthogonal_init(self.critic_ext, 0.01)
        self._orthogonal_init(self.critic_int, 0.01)

        for layer in self.actor:
            if isinstance(layer, nn.Linear):
                self._orthogonal_init(layer, 0.01)

        for layer in self.common_critic_layer:
            if isinstance(layer, nn.Linear):
                self._orthogonal_init(layer, 0.1)

    @staticmethod
    def _orthogonal_init(layer, gain):
        """Orthogonally initialise `layer.weight` with `gain` and zero its bias."""
        init.orthogonal_(layer.weight, gain)
        layer.bias.data.zero_()

    def forward(self, state):
        """Return (policy_logits, value_ext, value_int) for a batch of states."""
        x = self.feature(state)
        policy = self.actor(x)
        # Residual critic trunk, evaluated once and shared by both heads.
        critic_features = self.common_critic_layer(x) + x
        value_ext = self.critic_ext(critic_features)
        value_int = self.critic_int(critic_features)
        return policy, value_ext, value_int
        

# RND Model

In [36]:
class RNDModel(nn.Module):
    """Random Network Distillation model: a frozen target and a trained predictor.

    Both networks map a single-channel observation to a 512-d embedding
    through the same convolutional trunk.  The predictor has two extra
    hidden layers.  The 7 * 7 * 64 flatten size assumes 84x84 inputs.

    Args:
        input_size: stored on the instance; not used to build the layers.
        output_size: stored on the instance; not used to build the layers.
    """

    def __init__(self, input_size, output_size):
        super(RNDModel, self).__init__()

        self.input_size = input_size
        self.output_size = output_size

        feature_output = 7 * 7 * 64

        # Prediction network
        self.predictor = nn.Sequential(
            *self._conv_trunk(),
            nn.Linear(feature_output, 512),
            nn.ELU(),
            nn.Linear(512, 512),
            nn.ELU(),
            nn.Linear(512, 512)
        )

        # Target network
        self.target = nn.Sequential(
            *self._conv_trunk(),
            nn.Linear(feature_output, 512),
        )

        # Initialise the weights and biases: orthogonal weights with gain
        # sqrt(2) and zero bias for every Conv2d/Linear layer.
        for module in self.modules():
            if isinstance(module, (nn.Conv2d, nn.Linear)):
                init.orthogonal_(module.weight, np.sqrt(2))
                module.bias.data.zero_()

        # Freeze the target network.  The attribute must be spelled
        # `requires_grad`; a misspelled name is silently accepted by Python
        # and leaves the parameters trainable.
        for param in self.target.parameters():
            param.requires_grad = False

    @staticmethod
    def _conv_trunk():
        """Return fresh layers for the conv trunk shared by both networks."""
        return [
            nn.Conv2d(
                in_channels=1,
                out_channels=32,
                kernel_size=8,
                stride=4
            ),
            nn.ELU(),
            nn.Conv2d(
                in_channels=32,
                out_channels=64,
                kernel_size=4,
                stride=2
            ),
            nn.ELU(),
            nn.Conv2d(
                in_channels=64,
                out_channels=64,
                kernel_size=3,
                stride=1,
            ),
            nn.ELU(),
            Flatten(),
        ]

    def forward(self, next_obs):
        """Return (predict_feature, target_feature) for `next_obs`."""
        target_feature = self.target(next_obs)
        predict_feature = self.predictor(next_obs)

        return predict_feature, target_feature

# RND Agent

In [37]:
class RNDAgent(object):
    """PPO agent with an RND curiosity module.

    Holds a `PPOModel` (policy and two value heads) and an `RNDModel`
    (target/predictor pair) and optimises the PPO parameters together with
    the RND predictor using one Adam optimizer.

    Args:
        input_size, output_size: forwarded to both models.
        num_env, num_step: rollout dimensions, stored on the instance.
        gamma, lam: discount and GAE lambda, stored on the instance.
        learning_rate: Adam learning rate.
        ent_coef: entropy bonus coefficient.
        clip_grad_norm: stored on the instance.  NOTE(review): `train_model`
            only calls `global_grad_norm_` and discards the result, so this
            value is currently not applied -- confirm whether gradient
            clipping is intended.
        epoch: optimisation passes over each rollout.
        batch_size: minibatch size.
        ppo_eps: PPO clipping range.
        update_proportion: fraction of samples used for the predictor loss.
        use_gae: stored on the instance.
        use_cuda: run on 'cuda' when true, otherwise on 'cpu'.
    """

    def __init__(
        self,
        input_size,
        output_size,
        num_env,
        num_step,
        gamma,
        lam=0.95,
        learning_rate=1e-4,
        ent_coef=0.01,
        clip_grad_norm=0.5,
        epoch=3,
        batch_size=128,
        ppo_eps=0.1,
        update_proportion=0.25,
        use_gae=True,
        use_cuda=False):

        # Creating a PPO Model
        self.model = PPOModel(input_size, output_size)

        self.num_env = num_env
        self.output_size = output_size
        self.input_size = input_size
        self.num_step = num_step
        self.gamma = gamma
        self.lam = lam
        self.learning_rate = learning_rate
        self.ent_coef = ent_coef
        self.clip_grad_norm = clip_grad_norm
        self.epoch = epoch
        self.batch_size = batch_size
        self.ppo_eps = ppo_eps
        self.update_proportion = update_proportion
        self.use_gae = use_gae
        self.device = torch.device('cuda' if use_cuda else 'cpu')
        print("DEVICE: ", self.device)

        # Creating a RND Model
        self.rnd = RNDModel(input_size, output_size)

        # One Adam optimizer over the PPO model and the RND predictor; the
        # RND target is deliberately left out.
        self.optimizer = optim.Adam(
            list(self.model.parameters()) + list(self.rnd.predictor.parameters()),
            lr=learning_rate)

        self.rnd = self.rnd.to(self.device)
        self.model = self.model.to(self.device)

    def get_action(self, state):
        """Sample one action per row of `state`.

        Returns:
            (action, value_ext, value_int, policy): sampled action indices,
            the two value estimates as squeezed numpy arrays, and the
            detached policy logits tensor.
        """
        state = torch.FloatTensor(state).to(self.device)

        # Inference only: no autograd graph is needed here.
        with torch.no_grad():
            policy, value_ext, value_int = self.model(state)
            action_prob = F.softmax(policy, dim=-1).cpu().numpy()

        action = self.RandomChoiceProbIndex(action_prob)

        return (
            action,
            value_ext.cpu().numpy().squeeze(),
            value_int.cpu().numpy().squeeze(),
            policy.detach(),
        )

    def GetAction(self, state):
        """Legacy name for `get_action`; returns (action, value_ext, value_int)."""
        action, value_ext, value_int, _ = self.get_action(state)
        return action, value_ext, value_int

    @staticmethod
    def RandomChoiceProbIndex(p, axis=1):
        """Sample one index per row of the probability matrix `p`.

        Draws a uniform number per row and returns the first index whose
        cumulative probability exceeds it.
        """
        r = np.expand_dims(np.random.rand(p.shape[1 - axis]), axis=axis)
        return (p.cumsum(axis=axis) > r).argmax(axis=axis)

    def compute_intrinsic_reward(self, next_obs):
        """Return the RND prediction error for each observation.

        The reward is half the squared distance between the target and the
        predictor embeddings, returned as a numpy array with one entry per
        observation.
        """
        next_obs = torch.FloatTensor(next_obs).to(self.device)

        with torch.no_grad():
            target_next_feature = self.rnd.target(next_obs)
            predict_next_feature = self.rnd.predictor(next_obs)
            intrinsic_reward = (target_next_feature - predict_next_feature).pow(2).sum(1) / 2

        return intrinsic_reward.cpu().numpy()

    def ComputeIntrinisicReward(self, next_obs):
        """Legacy (misspelled) name for `compute_intrinsic_reward`."""
        return self.compute_intrinsic_reward(next_obs)

    def train_model(self, s_batch, target_ext_batch, target_int_batch, y_batch, adv_batch, next_obs_batch, old_policy):
        """Run `epoch` passes of minibatch PPO + RND updates over one rollout.

        Args:
            s_batch: states.
            target_ext_batch, target_int_batch: value targets for the
                extrinsic and intrinsic heads.
            y_batch: actions that were taken.
            adv_batch: combined advantages.
            next_obs_batch: observations fed to the RND networks.
            old_policy: list of policy-logit tensors recorded during the
                rollout, one per step.
        """
        s_batch = torch.FloatTensor(s_batch).to(self.device)
        target_ext_batch = torch.FloatTensor(target_ext_batch).to(self.device)
        target_int_batch = torch.FloatTensor(target_int_batch).to(self.device)
        y_batch = torch.LongTensor(y_batch).to(self.device)
        adv_batch = torch.FloatTensor(adv_batch).to(self.device)
        next_obs_batch = torch.FloatTensor(next_obs_batch).to(self.device)

        sample_range = np.arange(len(s_batch))
        forward_mse = nn.MSELoss(reduction="none")

        # Log-probabilities of the taken actions under the rollout policy.
        with torch.no_grad():
            # Swap the first two axes of the stacked logits before flattening
            # them to (-1, output_size) rows.
            policy_old_list = torch.stack(old_policy).permute(1, 0, 2).contiguous().view(
                -1, self.output_size).to(self.device)

            m_old = Categorical(F.softmax(policy_old_list, dim=-1))
            log_prob_old = m_old.log_prob(y_batch)

        for _ in range(self.epoch):
            # Fresh minibatch order every epoch.
            np.random.shuffle(sample_range)

            for j in range(len(s_batch) // self.batch_size):
                sample_idx = sample_range[self.batch_size * j:self.batch_size * (j + 1)]

                # ----------------------------------------------------------
                # Curiosity-driven calculation (RND)
                predict_next_state_feature, target_next_state_feature = self.rnd(next_obs_batch[sample_idx])

                forward_loss = forward_mse(predict_next_state_feature, target_next_state_feature.detach()).mean(-1)
                # Train the predictor on a random subset of the samples only.
                mask = torch.rand(len(forward_loss)).to(self.device)
                mask = (mask < self.update_proportion).float()
                forward_loss = (forward_loss * mask).sum() / torch.max(mask.sum(), torch.Tensor([1]).to(self.device))
                # ----------------------------------------------------------

                policy, value_ext, value_int = self.model(s_batch[sample_idx])
                m = Categorical(F.softmax(policy, dim=-1))
                log_prob = m.log_prob(y_batch[sample_idx])

                ratio = torch.exp(log_prob - log_prob_old[sample_idx])

                surr1 = ratio * adv_batch[sample_idx]
                surr2 = torch.clamp(ratio, 1.0 - self.ppo_eps, 1.0 + self.ppo_eps) * adv_batch[sample_idx]

                # Clipped surrogate objective (actor loss)
                actor_loss = -torch.min(surr1, surr2).mean()

                # Critic loss = extrinsic value loss + intrinsic value loss
                critic_ext_loss = F.mse_loss(value_ext.sum(1), target_ext_batch[sample_idx])
                critic_int_loss = F.mse_loss(value_int.sum(1), target_int_batch[sample_idx])
                critic_loss = critic_ext_loss + critic_int_loss

                # Entropy bonus discourages premature convergence to a
                # suboptimal deterministic policy.
                entropy = m.entropy().mean()

                self.optimizer.zero_grad()

                # Total loss = policy loss + 0.5 * value loss
                #              - entropy coefficient * entropy + RND forward loss
                loss = actor_loss + 0.5 * critic_loss - self.ent_coef * entropy + forward_loss

                loss.backward()
                # NOTE: the returned norm is discarded here (see class docstring).
                global_grad_norm_(list(self.model.parameters()) + list(self.rnd.predictor.parameters()))
                self.optimizer.step()

        

# Utils

In [38]:
from torch._six import inf

# GAE settings read from the config at module load; make_train_data() reads
# these two names as module globals.
use_gae = default_config.getboolean('UseGAE')
lam = float(default_config['Lambda'])

def make_train_data(reward, done, value, gamma, num_step, num_worker,
                    gae_enabled=None, gae_lambda=None):
    """Compute value targets and advantages for one rollout.

    Args:
        reward: array indexed as [worker, step].
        done: array indexed as [worker, step]; 1 marks a terminal step.
        value: array indexed as [worker, step] with one extra trailing
            column holding the bootstrap value after the last step.
        gamma: discount factor.
        num_step: number of steps per worker.
        num_worker: number of workers.
        gae_enabled: use GAE when true, plain discounted returns when false.
            Defaults to the module-level `use_gae`.
        gae_lambda: GAE lambda.  Defaults to the module-level `lam`.

    Returns:
        (discounted_return, adv), both flattened to 1-D in worker-major order.
    """
    if gae_enabled is None:
        gae_enabled = use_gae

    discounted_return = np.empty([num_worker, num_step])

    if gae_enabled:
        if gae_lambda is None:
            gae_lambda = lam
        # One running GAE accumulator per worker (float, shape [num_worker]).
        gae = np.zeros(num_worker)
        for t in range(num_step - 1, -1, -1):
            not_done = 1 - done[:, t]
            delta = reward[:, t] + gamma * value[:, t + 1] * not_done - value[:, t]
            gae = delta + gamma * gae_lambda * not_done * gae
            discounted_return[:, t] = gae + value[:, t]
    else:
        # Bootstrap from the value after the last step.
        running_add = value[:, -1]
        for t in range(num_step - 1, -1, -1):
            running_add = reward[:, t] + gamma * running_add * (1 - done[:, t])
            discounted_return[:, t] = running_add

    # Advantage for the actor: return minus the value baseline.
    adv = discounted_return - value[:, :-1]

    return discounted_return.reshape([-1]), adv.reshape([-1])


In [39]:
class RunningMeanStd(object):
    """Running mean and variance, merged batch by batch.

    Uses the parallel variance algorithm described at
    https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
    """

    def __init__(self, epsilon=1e-4, shape=()):
        """Start with zero mean, unit variance and a pseudo-count of `epsilon`."""
        self.mean = np.zeros(shape, 'float64')
        self.var = np.ones(shape, 'float64')
        self.count = epsilon

    def update(self, x):
        """Fold the batch `x` (samples along axis 0) into the running statistics."""
        self.update_from_moments(np.mean(x, axis=0), np.var(x, axis=0), x.shape[0])

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        """Merge a batch summarised by its mean, variance and sample count."""
        prev_count = self.count
        total = prev_count + batch_count
        shift = batch_mean - self.mean

        # Combined sum of squared deviations of the old and the new data.
        sum_sq = (self.var * prev_count
                  + batch_var * batch_count
                  + np.square(shift) * prev_count * batch_count / total)

        self.mean = self.mean + shift * batch_count / total
        self.var = sum_sq / total
        self.count = total


In [40]:
class RewardForwardFilter(object):
    """Running discounted sum of rewards: rewems <- rewems * gamma + rews."""

    def __init__(self, gamma):
        """Store the discount `gamma`; the running sum starts unset."""
        self.gamma = gamma
        self.rewems = None

    def update(self, rews):
        """Fold `rews` into the running sum and return the updated sum.

        The first call stores `rews` itself as the running sum.
        """
        first_call = self.rewems is None
        self.rewems = rews if first_call else self.rewems * self.gamma + rews
        return self.rewems


def softmax(z):
    """Row-wise softmax of the 2-D array `z`.

    Each row's maximum is subtracted before exponentiating so that large
    logits do not overflow.
    """
    assert z.ndim == 2
    # keepdims keeps the reduced axis so the result broadcasts against z.
    shifted = z - np.max(z, axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=1, keepdims=True)


def global_grad_norm_(parameters, norm_type=2):
    r"""Compute the global norm of the gradients of `parameters`.

    The norm is computed over all gradients together, as if they were
    concatenated into a single vector.  Despite the trailing underscore this
    function only measures the norm: no gradient is clipped or modified.

    Arguments:
        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
            single Tensor whose gradients are measured; entries without a
            gradient are skipped
        norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
            infinity norm.
    Returns:
        Total norm of the gradients (viewed as a single vector): a Python
        float for finite norms, a 0-dim tensor for the infinity norm, and
        0.0 when no parameter has a gradient.
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    grads = [p.grad.data for p in parameters if p.grad is not None]
    norm_type = float(norm_type)
    # float("inf") keeps this independent of the private torch._six module.
    if norm_type == float("inf"):
        return max((g.abs().max() for g in grads), default=0.0)

    total_norm = sum(g.norm(norm_type).item() ** norm_type for g in grads)
    return total_norm ** (1. / norm_type)


# Main

In [41]:
from torch.multiprocessing import Pipe
from tensorboardX import SummaryWriter

In [42]:
def main():
    """Train the RND + PPO agent on parallel Mario environments.

    Reads every hyperparameter from `default_config`, starts one
    environment worker process per `NumEnv`, warms up the observation
    normaliser with random actions and then loops forever: collect an
    n-step rollout, compute intrinsic rewards and advantages, train the
    agent, and save the three state dicts every 100 updates.

    Raises:
        NotImplementedError: if `EnvType` is anything other than 'mario'.
    """
    import os  # local import: only needed to create the checkpoint directory

    # Printing out the config hyperparameters
    print({section: dict(config[section]) for section in config.sections()})

    # Select the training environment
    env_id = default_config['EnvID']

    # Select the env_type.  Unsupported types are rejected up front so that
    # `env` can never be used unbound below.
    env_type = default_config['EnvType']
    if env_type == 'mario':
        env_class = MarioEnvironment
        env = JoypadSpace(gym_super_mario_bros.make(env_id), COMPLEX_MOVEMENT)
    # Add a legend of zelda environment later
    else:
        raise NotImplementedError

    # Getting the state size and action space from a throw-away environment
    input_size = env.observation_space.shape
    print("INPUT SIZE", input_size)
    output_size = env.action_space.n
    print("ACTION SPACE", output_size)

    env.close()

    is_load_model = False
    is_render = False

    # Defining the model path names
    model_path = 'models/{}.model'.format(env_id)
    predictor_path = 'models/{}.pred'.format(env_id)
    target_path = 'models/{}.target'.format(env_id)
    # Make sure the checkpoint directory exists before the first save.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    # Set the writer from Tensorboard
    writer = SummaryWriter()

    # Get the config hyperparameters
    use_cuda = default_config.getboolean('UseGPU')

    # GAE hyperparameters
    use_gae = default_config.getboolean('UseGAE')
    lam = float(default_config['Lambda'])

    # Number of different environments to run in parallel
    num_worker = int(default_config['NumEnv'])

    num_step = int(default_config['NumStep'])

    # PPO epsilon (defines the clip range)
    ppo_eps = float(default_config['PPOEps'])

    epoch = int(default_config['Epoch'])

    mini_batch = int(default_config['MiniBatch'])

    batch_size = int(num_step * num_worker / mini_batch)

    learning_rate = float(default_config['LearningRate'])

    entropy_coef = float(default_config['Entropy'])

    # Extrinsic reward discount rate
    gamma = float(default_config['Gamma'])

    # Intrinsic reward discount rate
    int_gamma = float(default_config['IntGamma'])

    # Gradient normalization clip
    clip_grad_norm = float(default_config['ClipGradNorm'])

    # Extrinsic reward coefficient
    ext_coef = float(default_config['ExtCoef'])

    # Intrinsic reward coefficient
    int_coef = float(default_config['IntCoef'])

    # Sticky-action settings
    sticky_action = default_config.getboolean('StickyAction')
    action_prob = float(default_config['ActionProb'])

    life_done = default_config.getboolean('LifeDone')

    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))
    pre_obs_norm_step = int(default_config['ObsNormStep'])
    discounted_reward = RewardForwardFilter(int_gamma)

    # Instantiate the agent
    agent = RNDAgent(
        input_size,
        output_size,
        num_worker,
        num_step,
        gamma,
        lam=lam,
        learning_rate=learning_rate,
        ent_coef=entropy_coef,
        clip_grad_norm=clip_grad_norm,
        epoch=epoch,
        batch_size=batch_size,
        ppo_eps=ppo_eps,
        use_cuda=use_cuda,
        use_gae=use_gae
    )

    # Loads models
    if is_load_model:
        if use_cuda:
            print("Loading PPO Saved Model using GPU")
            agent.model.load_state_dict(torch.load(model_path))
        else:
            print("Loading PPO Saved Model using CPU")
            agent.model.load_state_dict(torch.load(model_path, map_location='cpu'))

    works = []
    parent_conns = []
    child_conns = []

    # Generate the different environments
    for idx in range(num_worker):
        parent_conn, child_conn = Pipe()
        work = env_class(env_id, is_render, idx, child_conn, sticky_action=sticky_action, p=action_prob,
                         life_done=life_done)
        work.start()
        works.append(work)
        # Collect the pipe ends in the lists; calling .append on a
        # connection object itself raises AttributeError.
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    states = np.zeros([num_worker, 4, 84, 84])

    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_env_idx = 0
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    # normalize obs: drive the workers with random actions to seed obs_rms
    print('Start to initailize observation normalization parameter.....')
    next_obs = []
    for step in range(num_step * pre_obs_norm_step):
        actions = np.random.randint(0, output_size, size=(num_worker,))

        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)

        for parent_conn in parent_conns:
            s, r, d, rd, lr = parent_conn.recv()
            next_obs.append(s[3, :, :].reshape([1, 84, 84]))

        if len(next_obs) % (num_step * num_worker) == 0:
            next_obs = np.stack(next_obs)
            obs_rms.update(next_obs)
            next_obs = []
    print('End to initalize...')

    try:
        while True:
            total_state = []
            total_reward = []
            total_done = []
            total_action = []
            total_int_reward = []
            total_next_obs = []
            total_ext_values = []
            total_int_values = []
            total_policy = []
            total_policy_np = []

            global_step += (num_worker * num_step)
            global_update += 1

            # Step 1. n-step rollout
            for _ in range(num_step):
                actions, value_ext, value_int, policy = agent.get_action(np.float32(states) / 255.)

                for parent_conn, action in zip(parent_conns, actions):
                    parent_conn.send(action)

                next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], []
                for parent_conn in parent_conns:
                    s, r, d, rd, lr = parent_conn.recv()
                    next_states.append(s)
                    rewards.append(r)
                    dones.append(d)
                    real_dones.append(rd)
                    log_rewards.append(lr)
                    next_obs.append(s[3, :, :].reshape([1, 84, 84]))

                next_states = np.stack(next_states)
                rewards = np.hstack(rewards)
                dones = np.hstack(dones)
                real_dones = np.hstack(real_dones)
                next_obs = np.stack(next_obs)

                # total reward = int reward + ext Reward
                intrinsic_reward = agent.compute_intrinsic_reward(
                    ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5))
                intrinsic_reward = np.hstack(intrinsic_reward)
                sample_i_rall += intrinsic_reward[sample_env_idx]

                total_next_obs.append(next_obs)
                total_int_reward.append(intrinsic_reward)
                total_state.append(states)
                total_reward.append(rewards)
                total_done.append(dones)
                total_action.append(actions)
                total_ext_values.append(value_ext)
                total_int_values.append(value_int)
                total_policy.append(policy)
                total_policy_np.append(policy.cpu().numpy())

                states = next_states[:, :, :, :]

                sample_rall += log_rewards[sample_env_idx]

                sample_step += 1
                if real_dones[sample_env_idx]:
                    sample_episode += 1
                    writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode)
                    writer.add_scalar('data/reward_per_rollout', sample_rall, global_update)
                    writer.add_scalar('data/step', sample_step, sample_episode)
                    sample_rall = 0
                    sample_step = 0
                    sample_i_rall = 0

            # calculate last next value (bootstrap values for the final state)
            _, value_ext, value_int, _ = agent.get_action(np.float32(states) / 255.)
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)
            # --------------------------------------------------

            total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84])
            total_reward = np.stack(total_reward).transpose().clip(-1, 1)
            total_action = np.stack(total_action).transpose().reshape([-1])
            total_done = np.stack(total_done).transpose()
            total_next_obs = np.stack(total_next_obs).transpose([1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84])
            total_ext_values = np.stack(total_ext_values).transpose()
            total_int_values = np.stack(total_int_values).transpose()
            total_logging_policy = np.vstack(total_policy_np)

            # Step 2. calculate intrinsic reward
            # running mean intrinsic reward
            total_int_reward = np.stack(total_int_reward).transpose()
            total_reward_per_env = np.array([discounted_reward.update(reward_per_step) for reward_per_step in
                                             total_int_reward.T])
            mean, std, count = np.mean(total_reward_per_env), np.std(total_reward_per_env), len(total_reward_per_env)
            reward_rms.update_from_moments(mean, std ** 2, count)

            # normalize intrinsic reward
            total_int_reward /= np.sqrt(reward_rms.var)
            writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / num_worker, sample_episode)
            writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / num_worker, global_update)
            # -------------------------------------------------------------------------------------------

            # logging Max action probability
            writer.add_scalar('data/max_prob', softmax(total_logging_policy).max(1).mean(), sample_episode)

            # Step 3. make target and advantage
            # extrinsic reward calculate
            ext_target, ext_adv = make_train_data(total_reward,
                                                  total_done,
                                                  total_ext_values,
                                                  gamma,
                                                  num_step,
                                                  num_worker)

            # intrinsic reward calculate
            # None Episodic: the intrinsic stream never treats a step as terminal
            int_target, int_adv = make_train_data(total_int_reward,
                                                  np.zeros_like(total_int_reward),
                                                  total_int_values,
                                                  int_gamma,
                                                  num_step,
                                                  num_worker)

            # add ext adv and int adv
            total_adv = int_adv * int_coef + ext_adv * ext_coef
            # -----------------------------------------------

            # Step 4. update obs normalize param
            obs_rms.update(total_next_obs)
            # -----------------------------------------------

            # Step 5. Training
            agent.train_model(np.float32(total_state) / 255., ext_target, int_target, total_action,
                              total_adv, ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5),
                              total_policy)

            if global_step % (num_worker * num_step * 100) == 0:
                print("Num Step: ", num_step)
                print('Now Global Step :{}'.format(global_step))
                torch.save(agent.model.state_dict(), model_path)
                torch.save(agent.rnd.predictor.state_dict(), predictor_path)
                torch.save(agent.rnd.target.state_dict(), target_path)
    finally:
        # The loop only ends through an exception (e.g. KeyboardInterrupt);
        # close the writer so buffered TensorBoard events are not lost.
        writer.close()

In [43]:
# Start training only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()

{}
INPUT SIZE (240, 256, 3)
ACTION SPACE 12
DEVICE:  cuda


BrokenPipeError: [Errno 32] Broken pipe