In [None]:
import os

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.tensorboard import SummaryWriter
# TensorBoard writer used by the training loop's add_scalar calls;
# the log directory is ./tensorboard/A2C.
writer = SummaryWriter(log_dir='./tensorboard/A2C')

from a2c import ActorCritic, RolloutStorage

In [None]:
from minipackman import multi_env

# Build the batch of environments and remember the observation shape,
# which later cells use to size the model and to reshape rollout states.
num_envs, mode = 8, 'regular'
envs = multi_env(mode=mode, num_envs=num_envs)
state_shape = envs.observation_space.shape
# Bare expression so the notebook displays the shape as the cell output.
state_shape

In [None]:
# --- A2C hyperparameters ---
gamma = 0.99            # discount factor handed to rollout.get_batch_returns
entropy_coef = 0.01     # weight of the entropy term in the total loss
value_loss_coef = 0.5   # weight of the value loss in the total loss
max_grad_norm = 0.5     # gradient-norm clipping threshold
num_steps = 5           # environment steps collected per env before each update
# NOTE(review): the training loop iterates range(num_frames), i.e. this is the
# number of *updates* (= 1,000,000), not the number of environment frames.
num_frames = int(10e5)

# --- RMSprop hyperparameters ---
lr = 7e-4
eps = 1e-5
alpha = 0.99

# Reuse the observation shape computed when the environments were created.
ac = ActorCritic(state_shape, envs.action_space.n)
rollout = RolloutStorage(num_steps, num_envs, state_shape)

# Move the model (and rollout storage) to the GPU *before* building the
# optimizer, as recommended by the torch.optim documentation, so the optimizer
# is constructed over the parameters in their final location.
if torch.cuda.is_available():
    ac.cuda()
    rollout.cuda()

optimizer = optim.RMSprop(ac.parameters(), lr, eps=eps, alpha=alpha)

In [None]:
# Reset every environment and seed slot 0 of the rollout storage with the
# resulting observations (shape noted by the original author: [8, 3, 15, 19]).
initial_obs = np.asarray(envs.reset(), dtype=np.float32)
state = torch.FloatTensor(initial_obs)
rollout.states[0].copy_(state)

# Per-environment reward bookkeeping: running total for the episode in
# progress, and the total of the most recently finished episode.
episode_rewards = torch.zeros((num_envs, 1))
final_rewards = torch.zeros_like(episode_rewards)

In [None]:
# Device the model's parameters live on. Observations are moved there before
# acting so that a CUDA model is never fed CPU tensors.
device = next(ac.parameters()).device

# torch.save does not create missing parent directories.
os.makedirs("./model", exist_ok=True)

# Print progress and write a checkpoint once every `log_interval` updates.
log_interval = 1000

# NOTE(review): despite its name, num_frames is the number of *updates*; each
# update consumes num_steps * num_envs environment steps. Confirm intended.
for i_update in range(num_frames):
    ##### collect num_steps transitions from every environment
    for i_step in range(num_steps):
        action = ac.act(state.to(device))
        next_state, reward, done, _ = envs.step(action.squeeze(1).detach().cpu().numpy())
        reward = torch.FloatTensor(reward).unsqueeze(1)

        episode_rewards += reward
        # 1 where the episode continues, 0 where it just terminated.
        # Kept on the CPU on purpose: the reward accumulators below are CPU
        # tensors. (The original `masks.cuda()` discarded its result and was a
        # no-op.) rollout.insert presumably copies onto the storage's own
        # device -- TODO confirm in a2c.RolloutStorage.
        masks = torch.FloatTensor(1-np.array(done)).unsqueeze(1)

        # For finished episodes, publish the episode total into final_rewards
        # and reset the running total; unfinished episodes are left untouched.
        final_rewards *= masks
        final_rewards += (1-masks) * episode_rewards
        episode_rewards *= masks

        state = torch.FloatTensor(np.float32(next_state))
        rollout.insert(i_step, state, action.detach(), reward, masks)
    #####

    # Bootstrap value of the last stored state; no gradient is needed for it.
    with torch.no_grad():
        _, next_value = ac(rollout.states[-1])
    returns = rollout.get_batch_returns(next_value, gamma)

    # Flatten (step, env) into one batch dimension for a single forward pass.
    x = rollout.states[:-1].view(-1, *state_shape)
    in_action = rollout.actions.view(-1, 1)
    _, action_log_probs, values, entropy = ac.evaluate_actions(x, in_action)
    values = values.view(num_steps, num_envs, 1)
    action_log_probs = action_log_probs.view(num_steps, num_envs, 1)

    advantages = returns - values

    value_loss = advantages.pow(2).mean()
    # The advantage is detached so the policy term does not push gradients
    # through the value estimate.
    action_loss = -(advantages.detach() * action_log_probs).mean()

    # policy, value update
    optimizer.zero_grad()
    loss = value_loss * value_loss_coef + action_loss - entropy * entropy_coef
    loss.backward()
    nn.utils.clip_grad_norm_(ac.parameters(), max_grad_norm)
    optimizer.step()

    rollout.after_update()

    # log
    mean_final_reward = final_rewards.mean().item()
    writer.add_scalar('training reward', mean_final_reward, i_update)
    writer.add_scalar('training loss', loss.item(), i_update)
    # Parentheses matter: `i_update + 1 % 1000` parses as `i_update + (1 % 1000)`,
    # which is never 0, so the original condition never fired.
    if (i_update + 1) % log_interval == 0:
        print(f'{i_update} th Update :::: Rewards : {mean_final_reward} :::: Loss : {loss.item()}')
        torch.save(ac.state_dict(), "./model/a2c_" + mode + "_" + str(i_update))

writer.close()
envs.close()