In [None]:
import argparse
from . import BaseAgent
from .. import characters
import pommerman
from pommerman import agents
import pommerman.characters
import pommerman.envs
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import torch.nn.functional as F
import features
import matplotlib.pyplot as plt
import random
import numpy as np
from collections import deque

In [None]:
# Decided once, when this cell runs: should tensors be placed on the GPU?
use_cuda = torch.cuda.is_available()


def get_variable(x):
    """Return ``x`` moved to the GPU when CUDA is available, else ``x`` unchanged."""
    return x.cuda() if use_cuda else x


def get_numpy(x):
    """Return the data of tensor ``x`` as a numpy array (via host memory when CUDA is in use)."""
    host_tensor = x.cpu() if use_cuda else x
    return host_tensor.data.numpy()

In [None]:
# Training hyper-parameters. Every flag has a default, so no command-line
# arguments are required; the parsed values are read elsewhere through `args`.
parser = argparse.ArgumentParser(description='PyTorch actor-critic example')
parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
                    help='discount factor (default: 0.99)')
parser.add_argument('--lr', type=float, default=2.5e-4,
                    help='learning rate (default: 2.5e-4)')
parser.add_argument('--eps', type=float, default=1e-5,
                    help='RMSprop optimizer epsilon (default: 1e-5)')
parser.add_argument('--alpha', type=float, default=0.99,
                    help='RMSprop optimizer alpha (default: 0.99)')
parser.add_argument('--tau', type=float, default=1.00,
                    help='parameter for GAE (default: 1.00)')
parser.add_argument('--entropy-coef', type=float, default=0.01,
                    help='entropy term coefficient (default: 0.01)')
parser.add_argument('--value-loss-coef', type=float, default=0.5,
                    help='value loss coefficient (default: 0.5)')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                    help='interval between training status logs (default: 10)')
parser.add_argument('--num-processes', type=int, default=20,
                    help='how many training processes to use (default: 20)')
parser.add_argument('--num-steps', type=int, default=20,
                    help='number of forward steps in A2C (default: 20)')
# parse_known_args() tolerates arguments this parser does not declare. A Jupyter
# kernel is started with "-f <connection file>", which makes a plain
# parse_args() abort with "unrecognized arguments".
args, _unknown_args = parser.parse_known_args()

The next part is based on [Ross Wightman's work](https://github.com/rwightman/pytorch-pommerman-rl/blob/master/envs/pommerman.py) on the Pommerman challenge, where the observations are one-hot encoded and compressed. The `features` function at the bottom takes an observation as input and returns a $9\times11\times11$ observation matrix and a $1\times3$ feature vector.

In [None]:
# Default switches read by `featurize`; each key enables one encoding step.
DEFAULT_FEATURE_CONFIG = dict(
    recode_agents=True,
    compact_powerups=True,
    compact_structure=True,
    rescale=True,
)


def make_np_float(feature):
    """Convert ``feature`` to a numpy array and cast it to float32."""
    as_array = np.array(feature)
    return as_array.astype(np.float32)


def _rescale(x):
    return (x - 0.5) * 2.0


def featurize(obs, agent_id, config):
    """Encode one observation as a channel-first board array plus a small vector.

    Args:
        obs: mapping read with the keys "board", "bomb_blast_strength",
            "bomb_life", "ammo", "blast_strength" and "can_kick". "board" is
            used to index a one-hot table, and the two bomb maps are used as
            numpy arrays.
        agent_id: offset added to ``Item.Agent0.value`` to decide which board
            value is treated as "self".
        config: mapping with the boolean entries 'recode_agents',
            'compact_powerups', 'compact_structure' and 'rescale'.

    Returns:
        A pair of single-element lists: ``[board_array]`` with the channel
        axis first, and ``[vector]`` holding ammo, blast strength and can_kick
        concatenated in that order.
    """
    max_item = pommerman.constants.Item.Agent3.value

    ob = obs["board"]
    # Both bomb maps are divided by a pommerman constant before use.
    ob_bomb_blast_strength = obs["bomb_blast_strength"].astype(np.float32) / pommerman.constants.AGENT_VIEW_SIZE
    ob_bomb_life = obs["bomb_life"].astype(np.float32) / pommerman.constants.DEFAULT_BOMB_LIFE

    # one hot encode the board items
    ob_values = max_item + 1
    ob_hot = np.eye(ob_values)[ob]

    # replace agent item channels with friend, enemy, self channels
    if config['recode_agents']:
        self_value = pommerman.constants.Item.Agent0.value + agent_id
        # Every agent value other than self counts as an enemy here.
        enemies = np.logical_and(ob >= pommerman.constants.Item.Agent0.value, ob != self_value)
        # `self` is an ordinary local in this plain function, not an instance.
        self = (ob == self_value)
        friends = (ob == pommerman.constants.Item.AgentDummy.value)
        ob_hot[:, :, 9] = friends.astype(np.float32)
        ob_hot[:, :, 10] = self.astype(np.float32)
        ob_hot[:, :, 11] = enemies.astype(np.float32)
        # Drop every channel from index 12 onwards.
        ob_hot = np.delete(ob_hot, np.s_[12::], axis=2)

    if config['compact_powerups']:
        # replace powerups with single channel
        powerup = ob_hot[:, :, 6] * 0.5 + ob_hot[:, :, 7] * 0.66667 + ob_hot[:, :, 8]
        ob_hot[:, :, 6] = powerup
        ob_hot = np.delete(ob_hot, [7, 8], axis=2)

    # replace bomb item channel with bomb life
    ob_hot[:, :, 3] = ob_bomb_life

    if config['compact_structure']:
        ob_hot[:, :, 0] = 0.5 * ob_hot[:, :, 0] + ob_hot[:, :, 5]  # passage + fog
        ob_hot[:, :, 1] = 0.5 * ob_hot[:, :, 2] + ob_hot[:, :, 1]  # rigid + wood walls
        ob_hot = np.delete(ob_hot, [2], axis=2)
        # replace former fog channel with bomb blast strength
        # NOTE(review): deleting channel 2 just above shifts the old channel 5
        # (fog) to index 4, so this write lands on what was channel 6 -- the
        # merged power-up channel when 'compact_powerups' is on. Confirm
        # whether this is intended before changing it; saved models depend on
        # the current layout.
        ob_hot[:, :, 5] = ob_bomb_blast_strength
    else:
        # insert bomb blast strength next to bomb life
        ob_hot = np.insert(ob_hot, 4, ob_bomb_blast_strength, axis=2)

    # Scalar attributes, each wrapped as a one-element float32 array.
    self_ammo = make_np_float([obs["ammo"]])
    self_blast_strength = make_np_float([obs["blast_strength"]])
    self_can_kick = make_np_float([obs["can_kick"]])

    ob_hot = ob_hot.transpose((2, 0, 1))  # PyTorch tensor layout compat

    if config['rescale']:
        # _rescale computes (x - 0.5) * 2; ammo is divided by 10 and blast
        # strength by AGENT_VIEW_SIZE before that.
        ob_hot = _rescale(ob_hot)
        self_ammo = _rescale(self_ammo / 10)
        self_blast_strength = _rescale(self_blast_strength / pommerman.constants.AGENT_VIEW_SIZE)
        self_can_kick = _rescale(self_can_kick)

    return [ob_hot], [np.concatenate([self_ammo, self_blast_strength, self_can_kick])]


def features(obs, feature_config=DEFAULT_FEATURE_CONFIG, agent_id=0):
    """Featurize a single observation with ``featurize``.

    Args:
        obs: observation mapping, passed through to ``featurize``.
        feature_config: encoding switches; defaults to DEFAULT_FEATURE_CONFIG.
        agent_id: index of the agent whose board value should be marked as
            "self". Defaults to 0, the value this function previously
            hard-coded, so existing callers are unaffected.
            NOTE(review): callers observing as another agent may want to pass
            that agent's index -- confirm against how the saved models were
            trained before changing any call site.

    Returns:
        The ``(obs_im, obs_other)`` pair produced by ``featurize``.
    """
    obs_im, obs_other = featurize(obs, agent_id, feature_config)
    return obs_im, obs_other

In [None]:
class Actor_Critic(nn.Module):
    """Recurrent actor-critic network with a shared trunk.

    A convolutional stack plus MLP encodes the board, a small linear layer
    encodes the extra feature vector, and the concatenation is fed to a GRU
    cell. Two linear heads on the GRU output give the action scores and the
    state value.

    The instance also owns four Python lists (``rewards``, ``values``,
    ``entropies``, ``log_prob``) that the training code fills during a rollout
    and clears after each update.
    """

    def __init__(self, n_inputs, n_outputs, inputs_other, n_conv_output):
        """Build the layers.

        Args:
            n_inputs: input size of the GRU cell.
            n_outputs: accepted for interface compatibility; the actor head
                below is fixed at 6 outputs.
            inputs_other: length of the extra feature vector.
            n_conv_output: flattened size of the convolutional output.
        """
        # Was super(Policy, self): `Policy` is not defined in this file, so
        # constructing the network raised NameError.
        super(Actor_Critic, self).__init__()
        # network
        self.CNN = nn.Sequential(
                    nn.Conv2d(9, 64, 3, stride=1, padding=1),
                    nn.BatchNorm2d(64),
                    nn.ReLU(),
                    nn.Conv2d(64, 64, 3, stride=1, padding=1),
                    nn.BatchNorm2d(64),
                    nn.ReLU(),
                    nn.Conv2d(64, 64, 3, stride=1, padding=1),
                    nn.BatchNorm2d(64),
                    nn.ReLU(),
                    )
        self.CNN_mlp = nn.Sequential(
                    nn.Linear(n_conv_output, 1024, bias=True),
                    nn.ReLU(),
                    nn.Linear(1024, 512, bias=True),
                    nn.ReLU(),
                    )

        self.fnn_other = nn.Sequential(
                                    nn.Linear(inputs_other, inputs_other, bias=True),
                                    nn.ReLU(),
                                    )
        # Optional normalisation applied to the extra features when
        # batch_size > 1. forward() used to reference this attribute without
        # it ever being created. It is left as None so that no parameters are
        # added and existing state dicts still load; assign a module to enable.
        self.input_norm = None
        self.actor = nn.Sequential(
                                    nn.Linear(515, 6, bias=False),
                                    )
        self.state_value = nn.Sequential(
                                    nn.Linear(515, 1, bias=False),
                                    nn.Tanh(),
                                    )
        self.GRU = nn.GRUCell(n_inputs, 515)
        # Per-rollout buffers filled by the training loop.
        self.rewards = []
        self.values = []
        self.entropies = []
        self.log_prob = []

    def init_hidden(self):
        """Return an all-zero GRU hidden state of shape (1, 515)."""
        # This is what we'll initialise our hidden state as
        return get_variable(torch.zeros(1, 515))

    def forward(self, x_im, x_other, hxs, batch_size=1):
        """Run one recurrent step.

        Args:
            x_im: board input for the convolutional stack.
            x_other: extra feature input for ``fnn_other``.
            hxs: previous GRU hidden state.
            batch_size: number of samples; used to flatten the conv output.

        Returns:
            ``(action_scores, state_values[0], hxs)``: the actor head output,
            the first row of the value head output, and the new hidden state.
        """
        board_code = self.CNN(x_im)
        board_code = board_code.view(batch_size, -1)
        board_code = self.CNN_mlp(board_code)
        if batch_size > 1 and self.input_norm is not None:
            x_other = self.input_norm(x_other)
        other_code = self.fnn_other(x_other)
        out = torch.cat([board_code, other_code], dim=1)
        # The GRU output is both the feature for the heads and the next state.
        out = hxs = self.GRU(out, hxs)
        action_scores = self.actor(out)
        state_values = self.state_value(out)
        return action_scores, state_values[0], hxs

In [None]:
class PommermanAgent(BaseAgent):
    """Pommerman agent that acts greedily on an ``Actor_Critic`` network."""

    def __init__(self, character=characters.Bomber, mode='old', model="saved_models/policy_network_power_up3"):
        """Create the network and, unless ``mode == "new"``, load saved weights.

        Args:
            character: character class forwarded to ``BaseAgent``.
            mode: ``"new"`` starts from freshly initialised weights; any other
                value loads the state dict stored at ``model``.
            model: path of the saved state dict.
        """
        # Was super(PytorchAgent_actor_critic, self): that name is not defined
        # in this file, so constructing the agent raised NameError.
        super(PommermanAgent, self).__init__(character)

        n_inputs = 515
        n_conv_output = 7744
        inputs_other = 3
        n_outputs = 6

        self.net = Actor_Critic(n_inputs, n_outputs, inputs_other, n_conv_output)

        if use_cuda:
            self.net = self.net.cuda()

        # Compare by value: `is not` on a string literal tests object identity.
        if mode != "new":
            # NOTE: torch.load unpickles the file -- only load checkpoints
            # from a trusted source.
            # On a CPU-only host, remap storages so a checkpoint saved from a
            # GPU can still be read.
            map_location = None if use_cuda else "cpu"
            self.net.load_state_dict(torch.load(model, map_location=map_location))
        self.hxs = self.net.init_hidden()

    def act(self, obs, action_space=None):
        """Return the index of the highest-scoring action for ``obs``.

        The GRU hidden state is kept on the instance and carried over to the
        next call.
        """
        obs_im, obs_other = features(obs)
        self.net.eval()
        with torch.no_grad():
            action_scores, _, hxs = self.net(get_variable(torch.Tensor(obs_im)), get_variable(torch.Tensor(obs_other)), self.hxs)
            self.hxs = hxs
        return action_scores.argmax().item()

In [None]:
# Create a set of agents (exactly four)
# (`ommermanAgent` was a typo for PommermanAgent and raised NameError.)
agent_list = [
    PommermanAgent(model="saved_models/policy_network_team_agent0"),
    agents.SimpleAgent(),
    PommermanAgent(model="saved_models/policy_network_team_agent2"),
    agents.SimpleAgent(),
]
# Make the "Team" environment using the agent list
env = pommerman.make('PommeTeamCompetitionFast-v0', agent_list)
num_episodes = 80000000
# Index of the agent being trained, and the index used for the Agent0 alive check.
agent = 2
team_agent = 0

# The network being trained is a separate instance, loaded from the checkpoint
# of the agent selected above; the training loop overrides that agent's action.
model = PommermanAgent(mode="test", model="saved_models/policy_network_team_agent"+str(agent)).net
optimizer = optim.RMSprop(model.parameters(), lr=args.lr, eps=args.eps, alpha=args.alpha)
eps = np.finfo(np.float32).eps.item()
test_results = []

In [None]:
def select_action(state, hxs):
    """Sample an action for the training agent and record what the update needs.

    Args:
        state: per-agent observations; only ``state[agent]`` is used.
        hxs: current GRU hidden state.

    Returns:
        ``(action, hxs)``: the sampled action as a Python int and the new
        hidden state.

    Side effects:
        Appends the log-probability, state value and policy entropy of this
        step to ``model.log_prob``, ``model.values`` and ``model.entropies``.
    """
    # `features` is the function defined in this file. `def features` rebinds
    # the name of the imported module, so the former `features.features(...)`
    # raised AttributeError.
    obs_im, obs_other = features(state[agent])
    state_im = get_variable(torch.from_numpy(np.array(obs_im)).float())
    state_other = get_variable(torch.from_numpy(np.array(obs_other)).float())
    action_scores, state_value, hxs = model(state_im, state_other, hxs)
    probs = F.softmax(action_scores, dim=-1)
    m = Categorical(probs)
    action = m.sample().detach()
    log_prob = m.log_prob(action)
    entropy = m.entropy()

    model.log_prob.append(log_prob)
    model.values.append(state_value)
    model.entropies.append(entropy)

    return action.item(), hxs

In [None]:
def finish_episode(done, state, hxs):
    """Run one optimisation step on the rollout stored on ``model``.

    Args:
        done: truthy when the episode has ended. Otherwise the value of the
            current state is used to bootstrap the return.
        state: latest per-agent observations; only ``state[agent]`` is used.
        hxs: current GRU hidden state, used for the bootstrap forward pass.

    Side effects:
        Performs a backward pass and an optimizer step, then empties
        ``model.rewards``, ``model.entropies``, ``model.values`` and
        ``model.log_prob``.
    """
    entropies = model.entropies
    values = model.values
    rewards = model.rewards
    retain_graph = False
    log_probs = model.log_prob
    R = get_variable(torch.zeros(1, 1))
    if not done:
        retain_graph = True
        # `features` is the function defined in this file. `def features`
        # rebinds the name of the imported module, so the former
        # `features.features(...)` raised AttributeError.
        obs_im, obs_other = features(state[agent])
        state_im = get_variable(torch.from_numpy(np.array(obs_im)).float())
        state_other = get_variable(torch.from_numpy(np.array(obs_other)).float())
        _, state_value, hxs = model(state_im, state_other, hxs)
        R = state_value.detach()

    # values[i + 1] is needed for the last step, so append the bootstrap value.
    values.append(R)
    policy_loss = 0
    value_loss = 0
    gae = get_variable(torch.zeros(1, 1))

    for i in reversed(range(len(rewards))):
        R = args.gamma * R + rewards[i]
        # The discounted return is capped at 1.
        if R > 1:
            R = 1
        error = R - values[i]
        value_loss = value_loss + 0.5*error.pow(2)

        # Generalized Advantage Estimation
        delta_t = rewards[i] + args.gamma * \
            values[i + 1] - values[i]
        gae = gae * args.gamma * args.tau + delta_t

        # The advantage is detached so the policy term does not train the critic.
        policy_loss = policy_loss - \
            log_probs[i] * gae.detach() - args.entropy_coef * entropies[i]
    optimizer.zero_grad()
    (policy_loss + args.value_loss_coef * value_loss).backward(retain_graph=retain_graph)
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)

    optimizer.step()
    # The original called `scheduler.step()` at the top of this function, but
    # no `scheduler` is defined in this file, so that raised NameError. Step a
    # module-level scheduler only if one exists, and do it after
    # optimizer.step().
    lr_scheduler = globals().get("scheduler")
    if lr_scheduler is not None:
        lr_scheduler.step()
    del model.rewards[:]
    del model.entropies[:]
    del model.values[:]
    del model.log_prob[:]

In [None]:
def main():
    """Train the module-level ``model`` by playing episodes in ``env``.

    Each episode is played in chunks of ``args.num_steps`` environment steps.
    After every chunk ``finish_episode`` performs one update. The environment
    reward of the training agent is reshaped inside the loop (teammate or own
    death, teammate visible, bomb placed, power-ups collected). Statistics are
    printed every ``args.log_interval`` episodes.
    """
    win = []
    movement = []
    number_of_bombs = 0
    running_reward = 10
    seen_positions = []
    power_ups = 0
    model.train()
    for i_episode in range(num_episodes):
        total_steps = 0
        state = env.reset()
        # Highest attribute values seen so far; an increase counts as a power-up.
        old_blast_strength = state[agent]['blast_strength']
        old_ammo = state[agent]['ammo']
        old_can_kick = state[agent]['can_kick']
        ammo = 1
        hxs = model.init_hidden()
        model.zero_grad()
        alive0 = 1
        alive2 = 1
        for t in range(10000):  # Don't infinite loop while learning
            for steps in range(args.num_steps):
                total_steps += 1
                actions = env.act(state)
                # Override the training agent's action with one sampled from `model`.
                actions[agent], hxs = select_action(state, hxs)
                state, reward, done, _ = env.step(actions)
                # region rewardfunction
                # Agent0 was alive last step and is gone from the 'alive' list now.
                if alive0 and pommerman.constants.Item.Agent0.value not in state[team_agent]['alive']:
                    if agent == 0:
                        reward[agent] = -1
                        done = 1
                    elif agent == 2:
                        reward[agent] = -0.5
                # Same check for Agent2.
                if alive2 and pommerman.constants.Item.Agent2.value not in state[agent]['alive']:
                    if agent == 0:
                        reward[agent] = -0.5
                    elif agent == 2:
                        reward[agent] = -1
                        done = 1
                if state[agent]['teammate'].value in state[agent]['board']:
                    reward[agent] += 0.008
                if state[agent]['position'] not in seen_positions:
                    seen_positions.append(state[agent]['position'])
                    # reward[agent] += 0.002
                    movement.append(1)
                # Action 5 is rewarded as a bomb only if ammo was non-zero before the step.
                if actions[agent] == 5 and reward[agent] != -1 and ammo != 0:
                    reward[agent] += 0.007
                    number_of_bombs += 1
                if state[agent]['blast_strength'] > old_blast_strength:
                    reward[agent] += 0.8
                    power_ups += 1
                if state[agent]['ammo'] > old_ammo:
                    reward[agent] += 0.8
                    power_ups += 1
                if state[agent]['can_kick'] > old_can_kick:
                    reward[agent] += 0.8
                    power_ups += 1
                ammo = state[agent]['ammo'] != 0
                old_blast_strength = max(state[agent]['blast_strength'], old_blast_strength)
                old_ammo = max(state[agent]['ammo'], old_ammo)
                old_can_kick = max(state[agent]['can_kick'], old_can_kick)
                alive0 = pommerman.constants.Item.Agent0.value in state[team_agent]['alive']
                alive2 = pommerman.constants.Item.Agent2.value in state[agent]['alive']
                # endregion
                model.rewards.append(get_variable(torch.from_numpy(np.array(reward[agent])).float()))
                if done or reward[agent] == -1:
                    win.append(reward[agent] >= 1)
                    seen_positions = []
                    done = 1
                    break
            finish_episode(done, state, hxs)
            if done:
                break
            # Cut the autograd graph at the rollout boundary. Without this the
            # next backward pass runs through the previous rollouts, whose
            # weights the optimizer has just modified in place: recent PyTorch
            # raises a RuntimeError, older versions use stale values.
            hxs = hxs.detach()
        running_reward = running_reward * 0.99 + total_steps * 0.01
        if i_episode % args.log_interval == 0:
            # np.mean([]) warns and returns nan; report nan directly when no
            # episode result was recorded in this window.
            win_rate = np.mean(win) if win else float('nan')
            print('Episode {}\tPower ups per match: {:.2f}\tAverage length: {:.2f}\tWin percentage: {:.2f}\tBombs per match: {:.2f}\tNew Discoveries per match: {:.2f}'.format(
                i_episode, power_ups/args.log_interval, running_reward, win_rate, number_of_bombs/args.log_interval, sum(movement)/args.log_interval))
            win = []
            movement = []
            power_ups = 0
            number_of_bombs = 0

In [None]:
# Run training until main() returns or a KeyboardInterrupt is raised.
try:
    main()
    print('Done')
except KeyboardInterrupt:
    # Report the interrupt with a short message instead of a traceback.
    print('Keyboard interrupt')