# Pommerman - training
## By: Peter Ebert Christensen (s153758), Johan Bloch Madsen (s152991) and Mads Okholm Bjørn (s153413)

Pommerman can be installed by following the instructions at the following link: https://github.com/MultiAgentLearning/playground

Docker can be installed by following the instructions at the following link:
https://docs.docker.com/install/

## Import libraries

In [1]:
# will print:
# 'Import error NSDE! You will not be able to render --> Cannot connect to "None"' 
# if run on headless server, no need to worry
import pommerman
from pommerman import agents

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
from torch.distributions import Normal

import time

import math

import matplotlib.pyplot as plt
import matplotlib.image as mpimg

import numpy as np

import datetime

import random

%matplotlib inline
from IPython import display

# Our own files
from convertInputMapToTrainingLayers import *

Import error NSDE! You will not be able to render --> Cannot connect to "None"


# Setting up the networks

First our main network, an ActorCritic network

In [2]:
def init_weights(m):
    """Initialise a fully connected layer in place; leave other modules alone.

    Meant to be handed to ``nn.Module.apply``. For an ``nn.Linear`` the
    weights are drawn from a normal distribution (mean 0, std 0.1) and every
    bias entry is set to 0.1. Modules of any other type are not touched.
    """
    if not isinstance(m, nn.Linear):
        return
    nn.init.normal_(m.weight, mean=0., std=0.1)
    nn.init.constant_(m.bias, 0.1)

class ActorCritic(nn.Module):
    """Actor-critic network with an independent convolutional trunk per head.

    Both trunks take a 7-channel input and apply three un-padded 3x3
    convolutions (64 channels each) followed by a single ReLU. The result is
    flattened to 3*3*64 features, which means a 9x9 spatial input is expected
    (9 -> 7 -> 5 -> 3). The critic head maps those features to one value, the
    actor head to ``num_outputs`` means of a Normal distribution whose
    standard deviation is a learned, state-independent parameter.

    Args:
        num_inputs: accepted for interface compatibility; not read here.
        num_outputs: width of the actor head (and of ``log_std``).
        hidden_size: width of the hidden linear layer in both heads.
        std: initial value of every entry of ``log_std``.
    """

    # Feature count after the three un-padded convolutions on a 9x9 input.
    _FLAT_FEATURES = 3 * 3 * 64

    def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0):
        super().__init__()

        # Creation order is kept (critic trunk, critic head, actor trunk,
        # actor head) so parameter order and random initialisation match.
        self.critic_con = self._conv_trunk()
        self.critic_linear = self._head(hidden_size, 1)

        self.actor_con = self._conv_trunk()
        self.actor_linear = self._head(hidden_size, num_outputs)

        self.log_std = nn.Parameter(torch.ones(num_outputs) * std)

        # Re-initialise every nn.Linear; the convolutions keep their defaults.
        self.apply(init_weights)

    @staticmethod
    def _conv_trunk():
        """Build the 7 -> 64 -> 64 -> 64 convolution stack ending in a ReLU."""
        layers = []
        channels_in = 7
        for _ in range(3):
            layers.append(nn.Conv2d(in_channels=channels_in,
                                    out_channels=64,
                                    kernel_size=3,
                                    padding=0))
            channels_in = 64
        layers.append(nn.ReLU())
        return nn.Sequential(*layers)

    @classmethod
    def _head(cls, hidden_size, out_features):
        """Build a two-layer MLP head on top of the flattened trunk output."""
        return nn.Sequential(
            nn.Linear(cls._FLAT_FEATURES, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, out_features),
        )

    def forward(self, x):
        """Return ``(dist, value)`` for a batch of observations ``x``.

        ``dist`` is a ``Normal`` with the actor head's output as mean and
        ``exp(log_std)`` (broadcast to the mean's shape) as standard
        deviation; ``value`` is the critic head's output.
        """
        critic_features = self.critic_con(x).view(-1, self._FLAT_FEATURES)
        value = self.critic_linear(critic_features)

        actor_features = self.actor_con(x).view(-1, self._FLAT_FEATURES)
        mu = self.actor_linear(actor_features)

        std = self.log_std.exp().expand_as(mu)
        return Normal(mu, std), value

Then our random network for random network distillation:

In [3]:
class RND(nn.Module):
    """Feature network used for random network distillation.

    The same architecture serves as both the random target and the trained
    predictor: three un-padded 3x3 convolutions (7 -> 64 -> 64 -> 64
    channels), one ReLU, then a single linear layer from the flattened
    3*3*64 features to ``hidden_size`` outputs.

    Args:
        num_inputs: accepted for interface compatibility; not read here.
        num_outputs: accepted for interface compatibility; not read here.
        hidden_size: size of the embedding produced by ``forward``.
    """

    def __init__(self, num_inputs, num_outputs, hidden_size):
        super().__init__()
        conv_layers = [
            nn.Conv2d(in_channels=channels_in,
                      out_channels=64,
                      kernel_size=3,
                      padding=0)
            for channels_in in (7, 64, 64)
        ]
        self.Feature = nn.Sequential(*conv_layers, nn.ReLU())
        self.Feature_linear = nn.Sequential(
            nn.Linear(3 * 3 * 64, hidden_size),
        )

    def forward(self, x):
        """Embed a batch of observations into ``hidden_size`` features."""
        flat = self.Feature(x).view(-1, 3 * 3 * 64)
        return self.Feature_linear(flat)

Hyper parameters:

In [4]:
# --- Network dimensions ---
# num_inputs is passed to the network constructors, which do not read it.
num_inputs = 324
# Width of the actor head, i.e. length of the sampled action vector.
num_outputs = 6
# Width of the hidden linear layers (and of the RND embedding).
hidden_size = 1024

# --- Optimisation ---
# Adam step size for the ActorCritic model.
lr = 1e-6
# Adam step size for the RND predictor network.
lr_RND = 1e-3
# Number of samples drawn per PPO mini-batch.
mini_batch_size = 5
# Number of passes over a game's data in each PPO update.
ppo_epochs = 4
# PPO clips the probability ratio to [1 - clip_param, 1 + clip_param].
clip_param = 0.2

# --- Training progress ---
# The training loop runs until frame_idx reaches max_frames.
max_frames = 1500000
frame_idx = 0
game_idx = 0

# Hard-coded on purpose: a GPU is available, but we do not want to use it.
device = "cpu"

# Training Networks

We normally need to train for about 3 000 000 frames to get a win rate above 50 % against 3 simple agents, and for about 500 000 frames to see an improvement over the randomly initialised weights.
The win rate is based on test play against 3 simple agents in FFA mode:

In [5]:
# The policy/value network that PPO trains.
model = ActorCritic(num_inputs, num_outputs, hidden_size).to(device)

# Random network distillation pair. RandomNN never gets an optimizer, so it
# stays at its random initialisation; PredictorNN is trained to match it.
RandomNN = RND(num_inputs, num_outputs, hidden_size).to(device)
PredictorNN = RND(num_inputs, num_outputs, hidden_size).to(device)

optimizer = optim.Adam(model.parameters(), lr=lr)
predictorOptim = optim.Adam(PredictorNN.parameters(), lr=lr_RND)

# Mean squared error, used for the RND prediction error and the shaped reward.
loss = nn.MSELoss()

# Mean test reward per evaluation round.
test_rewards = []
# Mean wins, losses and ties (in that order) per evaluation round.
d3s = [[] for _ in range(3)]
# Running (count, mean, M2) aggregate consumed by varupdate / varfinalize.
existingAggregate = (0, 0, 0)

### PPO update functions

In [6]:
def varupdate(existingAggregate, newValue):
    """Fold one new sample into a running (count, mean, M2) aggregate.

    Args:
        existingAggregate: tuple ``(count, mean, M2)`` where ``count`` is the
            number of samples seen so far, ``mean`` their running mean and
            ``M2`` the accumulated squared distance from the mean.
        newValue: the sample to add.

    Returns:
        The updated ``(count, mean, M2)`` tuple.
    """
    seen, running_mean, sq_dist = existingAggregate
    seen += 1
    # Distance to the mean before and after the mean absorbs the new sample;
    # their product is the increment of M2.
    dist_before = newValue - running_mean
    running_mean += dist_before / seen
    dist_after = newValue - running_mean
    sq_dist += dist_before * dist_after
    return (seen, running_mean, sq_dist)

def varfinalize(existingAggregate):
    """Retrieve the mean, variance and sample variance from an aggregate.

    Args:
        existingAggregate: tuple ``(count, mean, M2)`` as produced by
            ``varupdate``.

    Returns:
        ``(mean, variance, sampleVariance)`` where ``variance`` is
        ``M2 / count`` and ``sampleVariance`` is ``M2 / (count - 1)``.
        With fewer than two samples these are undefined and ``float('nan')``
        is returned instead.
    """
    (count, mean, M2) = existingAggregate
    # Guard before dividing: with count 0 or 1 the divisions below would
    # raise ZeroDivisionError rather than yield the documented NaN.
    if count < 2:
        return float('nan')
    return (mean, M2 / count, M2 / (count - 1))
    
def ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantage):
    """Yield random mini-batches drawn from one rollout.

    ``states.size(0) // mini_batch_size`` batches are produced. Each batch
    picks ``mini_batch_size`` row indices uniformly at random, with
    replacement, from ``[0, states.size(0))`` and applies the same indices to
    all five tensors.

    Yields:
        Tuples ``(states, actions, log_probs, returns, advantage)`` restricted
        to the sampled rows.
    """
    total_rows = states.size(0)
    rollout = (states, actions, log_probs, returns, advantage)
    for _ in range(total_rows // mini_batch_size):
        picked = np.random.randint(0, total_rows, mini_batch_size)
        yield tuple(tensor[picked, :] for tensor in rollout)
        
        

def ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantages):
    """Run the clipped PPO update on the global ``model``.

    For ``ppo_epochs`` passes, mini-batches are drawn with ``ppo_iter`` and
    one optimizer step is taken per mini-batch on

        0.5 * critic_loss + actor_loss - 0.001 * entropy

    where the actor loss is the clipped surrogate objective (clip range taken
    from the global ``clip_param``) and the critic loss is the mean squared
    error between returns and predicted values. Uses the global ``optimizer``.
    """
    for _ in range(ppo_epochs):
        batches = ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantages)
        for state, action, old_log_probs, return_, advantage in batches:
            dist, value = model(state)
            entropy = dist.entropy().mean()

            # Probability ratio between the current and the rollout policy.
            ratio = torch.exp(dist.log_prob(action) - old_log_probs)
            clipped_ratio = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param)

            # Take the more pessimistic of the raw and the clipped objective.
            unclipped_term = ratio * advantage
            clipped_term = clipped_ratio * advantage
            actor_loss = -torch.min(unclipped_term, clipped_term).mean()

            critic_loss = torch.mean((return_ - value).pow(2))

            total_loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy

            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()

def compute_gae(next_value, rewards, masks, values, gamma=0.8, tau=0.9):
    """Compute generalized-advantage-estimation returns for one rollout.

    Args:
        next_value: value estimate for the state after the last step; used to
            bootstrap the final transition.
        rewards: per-step rewards.
        masks: per-step multipliers; a 0 cuts off bootstrapping and the
            accumulated advantage at that step.
        values: list of per-step value estimates (not modified).
        gamma: discount factor.
        tau: GAE smoothing factor.

    Returns:
        A list, in time order, of ``advantage + value`` for every step.
    """
    values = values + [next_value]
    gae = 0
    returns = []
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * tau * masks[step] * gae
        # Collected back to front; appending keeps the loop linear, whereas
        # inserting at index 0 shifts the whole list on every step.
        returns.append(gae + values[step])
    returns.reverse()
    return returns

### Setup environment

In [7]:
# The environment needs exactly four agents. Slot 0 is only a placeholder:
# its action is replaced by the network's action when stepping the
# environment, so the agent type used there does not matter.
agent_list = [agents.RandomAgent()]
# The remaining three slots are the scripted opponents.
agent_list += [agents.SimpleAgent() for _ in range(3)]
# A Docker-hosted agent can be listed as an opponent instead, e.g.:
# agents.DockerAgent("pommerman/simple-agent", port=12345)

# Make the "Free-For-All" environment using the agent list
env = pommerman.make('PommeFFACompetitionFast-v0', agent_list)

### Setup testing environment

In [8]:
def test_env(vis=False):
    """Play one evaluation game with the current model and report the result.

    Agent 0 acts greedily: the index of the largest component of the policy
    mean is played. The other agents play the actions returned by
    ``env.act``.

    Args:
        vis: when true, render the environment after the reset and after
            every step.

    Returns:
        ``(total_reward, wins, losses, ties)``. Exactly one of ``wins``,
        ``losses`` and ``ties`` is 1. ``total_reward`` is -1 for a loss and 0
        for a win or a tie.
    """
    state = env.reset()
    if vis:
        env.render()
    done = False
    # Evaluation only: nothing is back-propagated, so skip autograd tracking
    # instead of building a graph on every step.
    with torch.no_grad():
        while not done:
            stateOrginal = state
            state = torch.FloatTensor(stateToTorch(state)).to(device)
            dist, _ = model(state)
            actionsList = env.act(stateOrginal)
            our_action = dist.mean.cpu().data.numpy()[0].argmax()
            next_state, reward, done, info = env.step([our_action] + actionsList[1:])
            state = next_state
            if vis:
                env.render()

    # No "winners" entry in info is counted as a tie.
    if "winners" not in info:
        return 0, 0, 0, 1
    # NOTE(review): a win is scored 0, the same as a tie — confirm intended.
    if 0 in info["winners"]:
        return 0, 1, 0, 0
    return -1, 0, 1, 0

### Training loop
In the code below we train for 100 games (each game is one training batch), then test on 10 games, and repeat.

In [None]:
# Main training loop. Each iteration plays one game with agent 0 driven by
# `model`, then (every 100 games) evaluates on 10 test games, takes one
# gradient step on the RND predictor and runs a PPO update on the game's data.
# Stops once frame_idx reaches max_frames.
while frame_idx < max_frames:
    # Per-game rollout buffers, one entry per step.
    log_probs = []
    values    = []
    states    = []
    actions   = []
    rewards   = []
    RNDrewards = []
    masks     = []
    # Accumulated during the game but not read again in this cell.
    entropy = 0


    done = False
    gameSteps = 0
    info = ""
    dead = False

    state = env.reset()
    game_idx += 1

    # Per-game statistics: how often the network / the heuristic picked each
    # of the six actions. Only referenced by the commented-out print below.
    actionDist = [0, 0, 0, 0, 0, 0]
    actionDistLogic = [0, 0, 0, 0, 0, 0]
    actionsTaken = []
    avgP=0
    deadLastRound = 0
    while not done:
        gameSteps += 1
        stateOrginal = state
        state = stateToTorch(state)

        # Hand-crafted layers built from agent 0's observation (helpers
        # presumably come from convertInputMapToTrainingLayers — confirm).
        ai = [
            createLayerWalkable(stateOrginal[0]),
            createLayerBoxes(stateOrginal[0]),
            createLayerDangerMap(stateOrginal[0]),
            createLayerFriendsAndEnemies(stateOrginal[0])
        ]
        # d: heuristic cost for each of the six actions, built from the
        # walkable layer (ai[0]) and the danger layer (ai[2]).
        # Presumably [4][4] is the agent's own cell in an agent-centred view
        # and the entries follow Pommerman's action ids (stop, up, down,
        # left, right, bomb) — TODO confirm against the layer helpers.
        d = []
        if stateOrginal[0]["ammo"]==0:
            d.append(10)
        else:    
            d.append(ai[2][4][4])
        # Four neighbouring cells: cost is "not walkable" plus danger.
        d.append((1 - ai[0][3][4]) + ai[2][3][4])
        d.append((1 - ai[0][5][4]) + ai[2][5][4])
        d.append((1 - ai[0][4][3]) + ai[2][4][3])
        d.append((1 - ai[0][4][5]) + ai[2][4][5])
        # Last entry: total danger in the 3x3 window around [4][4], or the
        # fixed cost 10 when the agent has no ammo.
        sumDanger = 0
        for x in range(3, 6):
            for y in range(3, 6):
                sumDanger += ai[2][x][y]
        if stateOrginal[0]["ammo"]!=0 and sumDanger==0:
            d.append(0)
        elif stateOrginal[0]["ammo"]==0:
            d.append(10)
        else:
            d.append(sumDanger)
        # Negate so that larger means better; the heuristic's preferred
        # action is only tallied, never played.
        d = [-dd for dd in d]
        actionLogic = d.index(max(d))
        actionDistLogic[actionLogic] += 1

        dist, value = model(state)

        # Sample a 6-dim vector from the Normal policy; the index of its
        # largest component is the discrete action played by agent 0.
        action = dist.sample()
        actionsList = env.act(stateOrginal)
        # NOTE(review): `.data[0]` on the argmax result here and
        # `np.asscalar` below rely on older PyTorch/NumPy behaviour —
        # verify against the installed versions.
        actionNetwork = action.argmax().data[0].cpu().numpy();
        actionsTaken.append(actionNetwork)
        actionDist[actionNetwork] += 1
        # Slot 0 gets the network's action, the others keep env.act's choice.
        next_state, reward, done, info = env.step([actionNetwork] + actionsList[1:])
        # Shaping reward: negative MSE between the sampled action vector and
        # the negated heuristic costs.
        rewardGivenbyUs=np.asscalar(-loss(action.data[0],torch.FloatTensor(d)).data[0].cpu().numpy())
        # RND novelty bonus: prediction error of PredictorNN against the
        # fixed RandomNN on the next state.
        target=RandomNN(stateToTorch(next_state))
        predict=PredictorNN(stateToTorch(next_state))
        v=loss(target,predict)
        avgP+=v.data[0].cpu().numpy()
        # existingAggregate[2] is M2 (not the count). Once it exceeds 1 the
        # bonus is divided by the square root of var[1] (M2 / count); the
        # aggregate is fed the per-game sum of RND rewards at the end of the
        # outer loop. The environment reward of agent 0 is scaled by 100.
        # NOTE(review): v is not detached, so `rewards` and `RNDrewards`
        # hold tensors attached to the autograd graph.
        if existingAggregate[2]>1:
            var =varfinalize(existingAggregate)
            rewards.append((v/math.sqrt(var[1]))+rewardGivenbyUs+reward[0]*100)
        else:
            rewards.append(v+rewardGivenbyUs+reward[0]*100)
        RNDrewards.append(v)    
        log_prob = dist.log_prob(action)
        entropy += dist.entropy().mean()

        log_probs.append(log_prob)
        values.append(value)



        # deadLastRound is 0 at game start and the loop breaks as soon as it
        # becomes True, so every mask appended here equals 1.
        masks.append(torch.FloatTensor([1 - deadLastRound]).unsqueeze(1).to(device))

        deadLastRound = reward[0] == -1
        if (deadLastRound):
            dead = True
            # NOTE(review): breaking here skips the states/actions append
            # and the frame_idx increment for this step, so after a death
            # states/actions are one entry shorter than log_probs, values,
            # rewards and masks. ppo_iter samples indices below
            # states.size(0), so this last row is never sampled.
            break

        states.append(state)
        actions.append(action)

        state = next_state
        frame_idx += 1


    # Sum of this game's rewards; only used by the commented-out print.
    avgR = 0
    countR = 0
    for r in rewards:
        avgR += r
    #print(game_idx, "gameLength: ", gameSteps, dead, float(avgR)/gameSteps,float(avgP)/gameSteps, actionDist)

    # Every 100 games: play 10 evaluation games and record the mean test
    # reward and the mean win / loss / tie rates.
    if game_idx % 100 == 0:
        test_rewardsList = []
        winsList = []
        lossesList = []
        tiesList = []
        for _ in range(10):
            test_reward, wins, losses, ties = test_env()
            test_rewardsList.append(test_reward)
            winsList.append(wins)
            lossesList.append(losses)
            tiesList.append(ties)
        test_reward = np.mean(test_rewardsList)
        test_rewards.append(test_reward)

        wins = np.mean(winsList)
        losses = np.mean(lossesList)
        ties = np.mean(tiesList)

        d3s[0].append(wins)
        d3s[1].append(losses)
        d3s[2].append(ties)
        print("Out of 10 test games, we won: " + str(sum(winsList)) + ", we tied: " + str(sum(tiesList)) + ", we lost: " + str(sum(lossesList)))
        

    # Value of the final observation, used to bootstrap the returns.
    next_state = stateToTorch(next_state)

    next_state = torch.FloatTensor(next_state).to(device)

    _, next_value = model(next_state)
    # NOTE(review): this tests the number of evaluation rounds recorded so
    # far (test_rewards grows by one per 100 games), not the length of this
    # game; the "tie" message looks like it was meant to depend on the
    # number of rounds played — confirm.
    if (len(test_rewards) >= 800):
        print("Are we alive: " + str(not dead) + ", Game ended in a tie, total frames played: " + str(frame_idx))
    else:
        print("Are we alive: " + str(not dead) + ", Game ended after " + str(len(masks)) + " rounds, total frames played: " + str(frame_idx))
        
    returns = compute_gae(next_value, rewards, masks, values)



    returns   = torch.cat(returns).detach()
    # Update the running statistics with this game's total RND reward.
    # NOTE(review): the sum is not detached before it is stored.
    existingAggregate =varupdate(existingAggregate,sum(RNDrewards))

    log_probs = torch.cat(log_probs).detach()
    values    = torch.cat(values).detach()
    states    = torch.cat(states)
    actions   = torch.cat(actions)

    # Bring everything into (rows, features) form for ppo_iter.
    returns = returns.view(-1, 1)
    log_probs = log_probs.view(-1, num_outputs)
    values = values.view(-1, 1)
    states = states.view(-1, 7, 9, 9)
    actions = actions.view(-1, num_outputs)
    advantage = returns - values


    # One gradient step for the RND predictor on the states visited in this
    # game. Only PredictorNN has an optimizer, so RandomNN stays fixed. Note
    # that the bonus during the game was computed on next states.
    targets = RandomNN(states)
    predicts=PredictorNN(states)

    lossPredict = loss(targets,predicts)

    predictorOptim.zero_grad()
    lossPredict.backward()
    predictorOptim.step()


    ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantage)





Are we alive: False, Game ended after 76 rounds, total frames played: 75
Are we alive: False, Game ended after 10 rounds, total frames played: 84
Are we alive: False, Game ended after 10 rounds, total frames played: 93
Are we alive: False, Game ended after 10 rounds, total frames played: 102
Are we alive: False, Game ended after 232 rounds, total frames played: 333
Are we alive: False, Game ended after 337 rounds, total frames played: 669
Are we alive: False, Game ended after 117 rounds, total frames played: 785
Are we alive: False, Game ended after 522 rounds, total frames played: 1306
Are we alive: False, Game ended after 84 rounds, total frames played: 1389
Are we alive: False, Game ended after 260 rounds, total frames played: 1648
Are we alive: False, Game ended after 439 rounds, total frames played: 2086
