<a href="https://colab.research.google.com/github/MarcStorm/pommerman/blob/master/pommerman/AlexMathias.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
'''Install pytorch'''

# http://pytorch.org/
# Colab bootstrap cell: assembles the download URL of the torch 0.4.1 wheel
# from the interpreter's platform tag and the detected CUDA runtime, then
# installs it together with torchvision.
from os.path import exists
# NOTE(review): `wheel.pep425tags` is an internal module of the `wheel`
# package; presumably it is missing from newer `wheel` releases - confirm
# before re-running this cell on a current runtime.
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
# Tag made of implementation abbreviation + version and the ABI tag; it is
# substituted into the wheel filename below.
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
# Shell pipeline (IPython `!` escape): look up libcudart in the linker cache
# and rewrite a trailing ".<major>.<minor>" into "cu<major><minor>".
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
# Use the CUDA tag only when an NVIDIA device node exists; otherwise pick the CPU build.
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'

# `{accelerator}` and `{platform}` are interpolated by IPython from the Python variables above.
!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision
import torch

In [None]:
'''Install gym'''

# IPython shell escape; no version is pinned, so pip picks whichever gym
# release it resolves at install time.
!pip install gym

In [None]:
'''Clone and install MultiAgentLearning/playground'''

import os
!cd /content

!git clone https://github.com/MultiAgentLearning/playground
os.chdir('/content/playground')
!pip install -U .

In [1]:
'''An example to show how to set up an pommerman game programmatically'''
import time
import pommerman
from util import flatten_state, flatten_state_no_board, flatten_state_not_first_board
from pommerman import agents
from pommerman import constants as c
from pommerman.configs import ffa_v0_fast_env
from pommerman.envs.v0 import Pomme
from pommerman.characters import Bomber
from pommerman import utility
from pommerman import forward_model
from pommerman import constants

# Notebook 6.3
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class ReinforceModel(forward_model.ForwardModel):
    """Forward model whose FFA rewards are emitted on every step.

    While an FFA game is still running, living agents receive 0 and dead
    agents receive -1, so a death is penalised at the step it is observed.
    """

    def __init__(self):
        super().__init__()

    @staticmethod
    def get_rewards(agents, game_type, step_count, max_steps):
        """Return one reward per agent for the current step.

        Args:
            agents: Sequence of agent objects exposing an ``is_alive`` flag.
            game_type: A ``constants.GameType`` member.
            step_count: Number of steps played so far.
            max_steps: Step budget after which the game counts as timed out.

        Returns:
            A list with one integer reward per entry of ``agents``.
        """
        if game_type != constants.GameType.FFA:
            # The FFA-only logic used to fall through and return None here.
            # NOTE(review): assumes the base class exposes a static
            # get_rewards with this exact signature - confirm.
            return forward_model.ForwardModel.get_rewards(
                agents, game_type, step_count, max_steps)

        alive_count = sum(1 for agent in agents if agent.is_alive)

        if alive_count == 1:
            # An agent won. Give them +1, others -1.
            return [2 * int(agent.is_alive) - 1 for agent in agents]
        if step_count >= max_steps:
            # Game is over from time. Everyone gets -1 (sized from the
            # actual agent list rather than a hard-coded 4).
            return [-1] * len(agents)
        # Game running: 0 for alive, -1 for dead.
        return [int(agent.is_alive) - 1 for agent in agents]


class NewAgent(agents.BaseAgent):
    """Scripted agent that cycles through Right, Up, Left, Down, one move per call."""

    def __init__(self, Character=Bomber, *args, **kwargs):
        super(NewAgent, self).__init__(Character, *args, **kwargs)
        # Fixed action cycle and the position of the next action to emit.
        self.seq = [c.Action.Right, c.Action.Up, c.Action.Left, c.Action.Down]
        self.index = 0

    def act(self, obs, action_space):
        """Return the next action of the cycle.

        Args:
            obs: Current observation (unused; the agent is scripted).
            action_space: The environment's action space (unused).

        Returns:
            The integer value of the selected action.
        """
        # Wrap around using the sequence length instead of a magic 4.
        if self.index >= len(self.seq):
            self.index = 0
        action = self.seq[self.index]
        self.index += 1
        # The selected action used to be discarded in favour of a constant 0.
        # NOTE(review): assumes c.Action members are Enum values whose
        # `.value` is the integer the environment expects - confirm.
        return action.value
    
# List every environment id known to the Pommerman registry
print(pommerman.REGISTRY)

# Build the environment from the fast FFA configuration
config = ffa_v0_fast_env()
env = Pomme(**config["env_kwargs"])

# One agent per seat (exactly four); the seat index doubles as the agent id
agent_list = [
    agent_cls(config["agent"](seat, config["game_type"]))
    for seat, agent_cls in enumerate((
        NewAgent,
        agents.SimpleAgent,
        agents.SimpleAgent,
        agents.RandomAgent,
    ))
]

env.set_agents(agent_list)
# Agent 0 is the training agent: env.act does not call its act method
env.set_training_agent(0)
env.model = ReinforceModel()
env.set_init_game_state(None)

# Run the episodes just like OpenAI Gym
#for i_episode in range(1):
#    state = env.reset()
#    done = False
#    while not done:  
#        actions = env.act(state)
#        state, reward, done, info = env.step(actions)

        #Update


#    print('Episode {} finished'.format(i_episode))
#    print(info)
#time.sleep(2)
#env.close()

    

Cuda: False
['PommeFFACompetition-v0', 'PommeFFACompetitionFast-v0', 'PommeFFAFast-v0', 'PommeFFA-v1', 'PommeRadio-v2', 'PommeTeamCompetition-v0', 'PommeTeamCompetitionFast-v0', 'PommeTeamCompetition-v1', 'PommeTeam-v0', 'PommeTeamFast-v0']


## Notebook 6.3 Network and Train

In [2]:
import time
import pommerman
from util import flatten_state, flatten_state_no_board
from pommerman import agents
from pommerman import constants as c
from pommerman.configs import ffa_v0_fast_env
from pommerman.envs.v0 import Pomme
from pommerman.characters import Bomber
from pommerman import utility
from pommerman import forward_model
from pommerman import constants

# Notebook 6.3
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [6]:
class TrainingAgent(agents.BaseAgent):
    """Placeholder agent for the seat controlled by the learning policy.

    Its actions are supplied from outside, so ``act`` is not expected to be
    invoked and raises if it ever is.
    """

    def __init__(self, Character=Bomber, *args, **kwargs):
        super().__init__(Character, *args, **kwargs)

    def act(self, obs, action_space):
        """Always raise: this agent's actions are chosen by the training loop.

        Raises:
            Exception: On every call.
        """
        raise Exception('The act function of the training agent should never be called!')


class PolicyNet(nn.Module):
    """Policy network: three conv layers over the board plus a feed-forward head.

    The board of every observation is passed through the convolutions, the
    result is flattened and concatenated with the remaining features produced
    by ``flatten_state_not_first_board``, and the feed-forward stack maps that
    vector to action probabilities.
    """

    def __init__(self, n_inputs, n_hidden, n_outputs, learning_rate, batch_norm, conv1_in_channels, conv1_out_channels, conv2_out_channels, conv3_out_channels, kernel_size):
        """Build the layers and the Adam optimizer.

        Args:
            n_inputs: Width of the vector fed to the feed-forward stack
                (flattened conv output plus the extra features).
            n_hidden: Width of every hidden linear layer.
            n_outputs: Number of actions.
            learning_rate: Learning rate for Adam.
            batch_norm: If true, apply batch normalisation after each conv layer.
            conv1_in_channels: Input channels of the first conv layer.
            conv1_out_channels: Output channels of the first conv layer.
            conv2_out_channels: Output channels of the second conv layer.
            conv3_out_channels: Output channels of the third conv layer.
            kernel_size: Kernel size shared by all conv layers.
        """
        super(PolicyNet, self).__init__()
        self.other_shape = [3]

        # Input for conv2d is (batch_size, num_channels, width, height).
        # padding=2 keeps the spatial size unchanged only for kernel_size=5.
        self.conv1 = nn.Conv2d(in_channels=conv1_in_channels, out_channels=conv1_out_channels,
                               kernel_size=kernel_size, stride=1, padding=2)
        self.conv2 = nn.Conv2d(in_channels=conv1_out_channels, out_channels=conv2_out_channels,
                               kernel_size=kernel_size, stride=1, padding=2)
        self.conv3 = nn.Conv2d(in_channels=conv2_out_channels, out_channels=conv3_out_channels,
                               kernel_size=kernel_size, stride=1, padding=2)

        # NOTE(review): assumes an 11x11 board whose size the convolutions
        # preserve - confirm. Follows the real channel count of the last conv
        # layer instead of a literal 3.
        self.convolution_out_size = 11 * 11 * conv3_out_channels

        self.ffn_input_size = n_inputs

        self.ffn = nn.Sequential(
            nn.Linear(n_inputs, n_hidden),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(n_hidden, n_hidden),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(n_hidden, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_hidden),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(n_hidden, n_outputs),
        )

        self.activation = F.relu

        if batch_norm:
            # One normalisation layer per conv layer, sized by that layer's
            # output channels (a single BatchNorm2d(11) used to be shared).
            self.bn1 = nn.BatchNorm2d(conv1_out_channels)
            self.bn2 = nn.BatchNorm2d(conv2_out_channels)
            self.bn3 = nn.BatchNorm2d(conv3_out_channels)
        else:
            # An empty Sequential returns its input unchanged and, unlike a
            # lambda, keeps the module picklable.
            self.bn1 = nn.Sequential()
            self.bn2 = nn.Sequential()
            self.bn3 = nn.Sequential()

        self.ffn.apply(self.init_weights)

        # Created last so that every parameter registered above is optimised.
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def forward(self, x):
        """Compute action probabilities for a batch of observations.

        Args:
            x: Sequence of observation dicts. Each must provide a 'board'
                entry; the whole sequence is also handed to
                ``flatten_state_not_first_board`` for the remaining features.

        Returns:
            Tensor with one row per observation, softmax-normalised along dim 1.
        """
        # Stack all boards at once instead of growing a tensor with repeated cat.
        boards = np.stack([np.asarray(obs['board']) for obs in x]).astype(np.float32)
        # (batch, H, W) -> (batch, 1, H, W): a single input channel.
        board = torch.from_numpy(boards).unsqueeze(1)

        board = self.activation(self.bn1(self.conv1(board)))
        board = self.activation(self.bn2(self.conv2(board)))
        board = self.activation(self.bn3(self.conv3(board)))

        # Flatten per sample so rows can never bleed into each other if the
        # conv output size differs from convolution_out_size.
        x2 = board.view(board.size(0), -1)

        x = flatten_state_not_first_board(x)
        x = torch.cat([x2, x], dim=1)

        x = self.ffn(x)
        return F.softmax(x, dim=1)

    def loss(self, action_probabilities, returns):
        """Return the negated mean of log(probability) * return."""
        return -torch.mean(torch.mul(torch.log(action_probabilities), returns))

    @staticmethod
    def init_weights(m, *args):
        """Xavier-initialise the weight of a linear layer and set its bias to 0.01.

        Declared static so that ``module.apply(self.init_weights)`` receives
        the visited sub-module as ``m``; as a plain method the network itself
        was bound to ``m`` and no layer was ever initialised.

        Args:
            m: Module visited by ``nn.Module.apply``; anything other than
                ``nn.Linear`` is left untouched.
            *args: Ignored; kept for backward compatibility.
        """
        if isinstance(m, nn.Linear):
            torch.nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                m.bias.data.fill_(0.01)
# Example:

class Training(object):
    """REINFORCE training loop for a PolicyNet playing seat 0 of a Pommerman FFA game.

    After ``train()`` the per-episode histories are available as
    ``self.training_rewards`` and ``self.losses``.
    """

    def __init__(self, netKwargs, num_episodes, discount_factor, val_freq):
        """Create the environment and the policy network.

        Args:
            netKwargs: Keyword arguments for ``PolicyNet``. ``n_outputs`` is
                replaced by the environment's action count; the caller's dict
                is left untouched.
            num_episodes: Number of training episodes to play.
            discount_factor: Reward discount factor (gamma).
            val_freq: Run validation every ``val_freq`` episodes.
        """
        super().__init__()
        self.num_episodes = num_episodes
        self.discount_factor = discount_factor
        self.val_freq = val_freq
        self.env = self.set_up_env()
        # Per-episode histories, refreshed by every call to train().
        self.training_rewards = []
        self.losses = []
        # Work on a copy so the caller's settings dict is not mutated.
        net_kwargs = dict(netKwargs)
        net_kwargs['n_outputs'] = self.env.action_space.n
        # setup policy network
        self.policy = PolicyNet(**net_kwargs)

    def set_up_env(self):
        """Build the FFA environment with the training agent in seat 0."""
        # Instantiate the environment
        config = ffa_v0_fast_env()
        env = Pomme(**config["env_kwargs"])

        # Create a set of agents (exactly four)
        agent_list = [
            TrainingAgent(config["agent"](0, config["game_type"])),
            agents.SimpleAgent(config["agent"](1, config["game_type"])),
            agents.SimpleAgent(config["agent"](2, config["game_type"])),
            agents.RandomAgent(config["agent"](3, config["game_type"])),
        ]

        env.set_agents(agent_list)
        env.set_training_agent(0)  # env.act does not call act on the training agent
        env.set_init_game_state(None)

        return env

    def train(self):
        """Train the policy network and report progress.

        Plays ``num_episodes`` episodes, performs one policy-gradient update
        per episode and validates every ``val_freq`` episodes. A
        KeyboardInterrupt stops training early; the histories collected so
        far are kept.

        Returns:
            Tuple ``(training_rewards, losses)`` with one entry per finished
            episode (the same lists that are stored on the instance).
        """
        self.training_rewards, self.losses = [], []
        try:
            print('start training')
            for i in range(self.num_episodes):
                # Training mode for rollout and update; validation switches to eval.
                self.policy.train()
                rollout = self._run_training_episode()
                if i % 10 == 0:
                    print('done with episode:', i)

                episode_reward, loss_value = self._update_policy(rollout)
                # bookkeeping
                self.training_rewards.append(episode_reward)
                self.losses.append(loss_value)

                if (i + 1) % self.val_freq == 0:
                    validation_rewards = self._validate()
                    # `time` is already imported by the notebook; `datetime` never was.
                    t = time.strftime("%Y-%m-%d")
                    print('{:4d}. mean training reward: {:6.2f}, mean validation reward: {:6.2f}, mean loss: {:7.4f}, time:{}'.format(
                        i + 1,
                        np.mean(self.training_rewards[-self.val_freq:]),
                        np.mean(validation_rewards),
                        np.mean(self.losses[-self.val_freq:]),
                        t))
            self.env.close()
            print('done')
        except KeyboardInterrupt:
            print('interrupt')
        return self.training_rewards, self.losses

    def _run_training_episode(self):
        """Play one episode with sampled actions; return [(state, action, reward), ...] for seat 0."""
        rollout = []
        s = self.env.reset()
        done = False
        while not done:
            with torch.no_grad():
                a_prob = self.policy(np.atleast_1d(s[0]))
            # Inverse-CDF sampling: first index whose cumulative probability
            # exceeds a uniform draw.
            a = (np.cumsum(a_prob.numpy()) > np.random.rand()).argmax()

            # env.act yields the other agents' actions; ours goes in front.
            actions = self.env.act(s)
            actions.insert(0, a)

            s1, r, done, _ = self.env.step(actions)
            rollout.append((s[0], a, r[0]))
            s = s1
        return rollout

    def _update_policy(self, rollout):
        """Run one policy-gradient step on a rollout; return (episode reward, loss value)."""
        states = [step[0] for step in rollout]
        # gather() needs an int64 index of shape (T, 1).
        actions = np.array([step[1] for step in rollout], dtype=np.int64).reshape(-1, 1)
        rewards = np.array([step[2] for step in rollout], dtype=float)
        returns = self.compute_returns(rewards)

        self.policy.optimizer.zero_grad()
        a_probs = self.policy(states).gather(1, torch.from_numpy(actions)).view(-1)
        loss = self.policy.loss(a_probs, torch.from_numpy(returns).float())
        loss.backward()
        self.policy.optimizer.step()
        return float(rewards.sum()), loss.item()

    def _validate(self, num_games=10):
        """Play ``num_games`` greedy episodes in eval mode; return seat 0's total reward per game."""
        validation_rewards = []
        self.policy.eval()  # disable dropout while evaluating
        for _ in range(num_games):
            s = self.env.reset()
            reward = 0
            done = False
            while not done:
                with torch.no_grad():
                    probs = self.policy(np.atleast_1d(s[0]))
                a = probs.argmax().item()

                actions = self.env.act(s)
                actions.insert(0, a)

                # Step the trainer's own environment (a module-level `env`
                # used to be referenced here).
                s, r, done, _ = self.env.step(actions)
                reward += r[0]
            validation_rewards.append(reward)
        self.policy.train()
        return validation_rewards

    def compute_returns(self, rewards):
        """Compute discounted returns.

        Args:
            rewards: Sequence of per-step rewards; may be empty.

        Returns:
            Float array of the same length where entry ``t`` equals
            ``rewards[t] + discount_factor * returns[t + 1]``.
        """
        returns = np.zeros(len(rewards))
        running = 0.0
        # Accumulate from the last step backwards; an empty input simply
        # yields an empty array.
        for t in reversed(range(len(rewards))):
            running = rewards[t] + self.discount_factor * running
            returns[t] = running
        return returns

            
            

            
# Hyper-parameters of the training loop
trainingKwargs = dict(
    num_episodes=10,
    discount_factor=0.9,  # gamma; 1.0 would mean no discounting
    val_freq=25,  # validate every `val_freq` episodes
)

# Constructor arguments of the policy network
netKwargs = dict(
    n_inputs=614,
    n_hidden=500,
    n_outputs=0,  # placeholder: Training substitutes env.action_space.n when it builds the network
    learning_rate=0.001,
    batch_norm=False,
    conv1_in_channels=1,
    conv1_out_channels=3,
    conv2_out_channels=3,
    conv3_out_channels=3,
    kernel_size=5,
)

# Build the trainer (environment + policy) and start training
trainer = Training(netKwargs, **trainingKwargs)
trainer.train()

start training
done with episode: 0
done


### plot results
import matplotlib.pyplot as plt

def moving_average(a, n=10):
    """Trailing moving average of ``a`` with the same length as the input.

    Entry ``i`` is the mean of the last ``n`` values up to and including
    ``a[i]``. During the warm-up (fewer than ``n`` values seen) the mean is
    taken over the values available so far; previously those entries were
    divided by ``n`` and therefore biased towards zero.

    Args:
        a: One-dimensional sequence of numbers; may be empty.
        n: Window length (positive integer).

    Returns:
        Float array with ``len(a)`` entries.
    """
    ret = np.cumsum(a, dtype=float)
    # Turn cumulative sums into sums over the trailing window of length n.
    ret[n:] = ret[n:] - ret[:-n]
    # Number of samples that actually contribute to each entry.
    window = np.minimum(np.arange(1, len(ret) + 1), n)
    return ret / window

# The histories are read from the trainer when it exposes them; bare
# module-level `training_rewards` / `losses` never exist in this notebook.
# If the trainer has no such attributes the panels are simply left empty.
training_rewards = getattr(trainer, 'training_rewards', [])
losses = getattr(trainer, 'losses', [])

fig, axes = plt.subplots(2, 1, figsize=(16, 6))
panels = (
    (training_rewards, 'training reward', 'reward'),
    (losses, 'loss', 'loss'),
)
for ax, (series, label, ylabel) in zip(axes, panels):
    # Share one x axis (episode numbers starting at 1) between the raw
    # series and its moving average so the two curves line up.
    episodes = range(1, len(series) + 1)
    ax.plot(episodes, series, label=label)
    ax.plot(episodes, moving_average(series), label='moving average')
    ax.set_xlabel('episode')
    ax.set_ylabel(ylabel)
    ax.set_xlim((0, len(series)))
    ax.legend(loc=4)
    ax.grid()
fig.tight_layout()
plt.show()

In [None]:
import numpy as np
#completeBoard = [
#    [[[board[x,y]] for y in range(len(board))] for x in range(len(board))]
#    [[[blast_strength[x,y]] for y in range(len(board))] for x in range(len(board))]
#    [[[bomb_life[x,y]] for y in range(len(board))] for x in range(len(board))]
#    ]



# Two independent 8x5 grids of ones, first as nested lists and then as an array
abcd = [[[1] * 5 for _ in range(8)] for _ in range(2)]
abcd2 = np.asarray(abcd)

# A column of single-element rows
ask = [[4], [4], [4], [3], [2]]

# Show the rows and the shape of the stacked grids
print(list(ask))
print(abcd2.shape)
#print(as31)

In [None]:
# Example:
# def forward(self, x):
#     x = flatten_state(x)
# where x is np.atleast1d(S[0])

# Flattens a state s on the form list<dict> where each dict contains information of a state
#def flatten_state(s):
    #print("\n Flatten: ",s)
#    return torch.from_numpy(np.array([flatten_state_aux(x) for x in s])).float()

#def flatten_state_aux(s):
    # Lists
    #print("\n Augs: ",s)
#    alive = [1 if x in s['alive'] else 0 for x in range(10,14)]
    #print(alive)
#    board = s['board']
#    bomb_blast_strength = s['bomb_blast_strength']
#    bomb_life = s['bomb_life']
    # Tuples
#    position = s['position']
    # Ints
#    blast_strength = s['blast_strength']
#    can_kick = s['can_kick']
#    ammo = s['ammo']
    # Enums
#    teammate = s['teammate'] #9 for FFA
#    enemies = s['enemies'] #11,12,13 for FFA and training agent id = 0
    
#    a = np.append(np.array(alive),np.array(board).flatten())
#    a = np.append(a,np.array(bomb_blast_strength).flatten())
#    a = np.append(a,np.array(bomb_life).flatten())
#    a = np.append(a,position[0])
#    a = np.append(a,position[1])
#    a = np.append(a,blast_strength)
#    a = np.append(a,can_kick)
#    a = np.append(a,ammo)
    # Commented out as we get size 376 but expected 372. I assume we calculated wrong.
    # Makes sense to ignore these imo
    #a = np.append(a,teammate.value)
    #a = np.append(a,[e.value for e in enemies])
    #print(np.shape(a))
#    return a.astype(float)