# Prerequisites

### Download and install env

In [None]:
# Colab installs
#!pip install torch==1.8.1+cu111 torchvision==0.9.1+cu111 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html
#!pip install tqdm
#!pip install torchvision
#!pip install seaborn
#!pip install sklearn
#!pip install numpy
#!pip install opencv-python
#!pip install onnx onnx2pytorch

# ## commands to install the env
!git clone https://github.com/MultiAgentLearning/playground ./pommer_setup
!pip install -U ./pommer_setup
!rm -rf ./pommer_setup

In [None]:
!git clone https://github.com/RLCommunity/graphic_pomme_env ./graphic_pomme_env
!pip install -U ./graphic_pomme_env
!rm -rf ./graphic_pomme_env

### Imports and Constants

In [None]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.onnx
import shutil
from time import strftime, time
from collections import deque, namedtuple
from PIL import Image
import cv2
import matplotlib.pyplot as plt
import IPython.display
from functools import partial
from itertools import chain

from torch.distributions import Categorical
!pip install onnx
import onnx
!pip install onnx2pytorch
from onnx2pytorch import ConvertModel

import os
import sys
import pickle
import random

import gym
from gym import Env, Wrapper
from gym import logger as gymlogger
# Environment import and set logger level to display error only
gymlogger.set_level(40)  # error only

import gym
from gym import Env, Wrapper

from pommerman import make
from pommerman.agents import BaseAgent, RandomAgent, SimpleAgent
from graphic_pomme_env import graphic_pomme_env
from graphic_pomme_env.wrappers import PommerEnvWrapperFrameSkip2

# The pommerman import prints a rendering-related error on headless machines;
# tell the reader up front that it is harmless.
print('''Hint: just ignore the error "Import error NSDE! You will not be able to render --> Cannot connect to 'None'"''')
# List every registered gym environment whose id starts with 'Pomme'.
pomenvs = [es.id for es in gym.envs.registry.all() if es.id.startswith('Pomme')]
print("\n".join(pomenvs))
# Load the resources used by the graphic environment.
res = graphic_pomme_env.load_resources()
N_PLAYERS = 2 
NUM_STACK = 5      # number of stacked frames fed to the network (see DQNNet)
NUM_ACTIONS = 6    # size of the discrete action space, enumerated below
'''
0 Stop
1 Up
2 Down
3 Left
4 Right
5 Bomb
'''

# Prefer the GPU when one is available; tensors and networks are placed here.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

# Colab only: mount Google Drive so checkpoints/exports written under
# gdrive/MyDrive are stored on the mounted drive.
from google.colab import drive
drive.mount('/content/gdrive')

# Opponent Agents

In [None]:
def idle_actor(frame_stack):
    """Opponent policy that never acts.

    Params
    ======
        frame_stack: observation stack handed to the actor; ignored here

    Returns the constant action 0 ("Stop").
    """
    _ = frame_stack  # observation intentionally unused
    return 0
  
def random_actor(frame_stack):
    """Opponent policy that picks uniformly among all NUM_ACTIONS actions.

    Params
    ======
        frame_stack: observation stack handed to the actor; ignored here

    Returns an action index in [0, NUM_ACTIONS).
    """
    _ = frame_stack  # observation intentionally unused
    return np.random.randint(0, NUM_ACTIONS)

def no_bomb_random_actor(frame_stack):
    """Opponent policy that picks uniformly among all actions except the last.

    The last action index (NUM_ACTIONS - 1, i.e. "Bomb") is never drawn.

    Params
    ======
        frame_stack: observation stack handed to the actor; ignored here

    Returns an action index in [0, NUM_ACTIONS - 1).
    """
    _ = frame_stack  # observation intentionally unused
    return np.random.randint(0, NUM_ACTIONS - 1)
  
def model_actor(frame_stack, model):
    """Greedy opponent policy driven by a network.

    Params
    ======
        frame_stack: object exposing get_observation(), whose result is
            converted to a numpy array and then to a tensor
        model: callable mapping that tensor to a tensor of action scores

    Returns the index of the highest score (np.argmax over the flattened output).
    """
    frames = np.array(frame_stack.get_observation())
    scores = model(torch.from_numpy(frames))
    return np.argmax(scores.detach().cpu().numpy())

# DQN Agent

In [None]:
class DQNNet(nn.Module):
    """Convolutional Q-network mapping a stack of frames to one value per action."""

    def __init__(self, num_stack, num_actions):
        """ Create a DQN agent for Pommerman using Conv2d
        Params
        ======
            num_stack (int): number of stacked images
            num_actions (int): number of agent actions

        The first linear layer expects exactly 64 flattened features, so the
        convolution stack must reduce the input to a 1x1 map (e.g. 48x48 frames).
        """
        super().__init__()

        self.features = nn.Sequential(
            nn.Conv2d(num_stack, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=4, stride=2),
            nn.ReLU()
        )
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64, 256),
            nn.ReLU(),
            nn.Linear(256, num_actions)
        )

    def forward(self, x):
        """ Compute action values for one observation or a batch
        Params
        ======
            x (tensor / array / list of arrays): frames shaped
                (num_stack, H, W) or (batch, num_stack, H, W)

        Returns a float tensor of shape (batch, num_actions); a 3-D input is
        treated as a batch of one.
        """
        # Follow the module's own parameters instead of a notebook-level global,
        # so the network keeps working if that global is rebound later.
        target_device = next(self.parameters()).device
        if torch.is_tensor(x):
            # Convert in place of torch.tensor(x): no copy-construct warning, and
            # the conversion stays a traced op during torch.onnx.export rather
            # than being captured as a constant.
            x = x.to(device=target_device, dtype=torch.float)
        else:
            # np.asarray collapses a list of frames into one array first, which
            # avoids the slow element-wise tensor construction path.
            x = torch.as_tensor(np.asarray(x), dtype=torch.float, device=target_device)
        if x.dim() == 3:
            x = x.unsqueeze(dim=0)
        x = self.features(x)
        # self.fc starts with nn.Flatten, so no manual reshape is needed here.
        return self.fc(x)

### Helper Functions

In [None]:
# Directory (relative to the working directory) where checkpoints are written,
# and the file name of the checkpoint to resume from.
dqn_save_path = "gdrive/MyDrive/models"
dqn_load_path = "game_1.pth" # load up this checkpoint

# makedirs also creates missing parent directories and is a no-op when the
# directory already exists, so re-running the cell is safe.
os.makedirs(dqn_save_path, exist_ok=True)


def load_checkpoint(checkpoint_path='', device=device):
    """ Build the training objects and, if available, restore them from a checkpoint
    Params
    ======
        checkpoint_path (String): checkpoint .pth file name, relative to dqn_save_path
        device (device): current device (cuda/cpu)

    Returns
    =======
        (dqn, dqn_target, timesteps, next_epoch, optimizer, replay, mse) where
        next_epoch is one past the episode number parsed from the checkpoint
        file name (1 when nothing was loaded).
    """
    dqn = DQNNet(num_stack=NUM_STACK, num_actions=NUM_ACTIONS).to(device)
    dqn_target = DQNNet(num_stack=NUM_STACK, num_actions=NUM_ACTIONS).to(device)
    # Start the target network from the same weights as the online network;
    # otherwise the first targets come from an unrelated random initialisation.
    dqn_target.load_state_dict(dqn.state_dict())
    optimizer = optim.Adam(dqn.parameters(), lr=LEARNING_RATE)
    replay = ReplayBuffer(num_actions=NUM_ACTIONS, memory_len=BUFFER)
    mse = torch.nn.MSELoss()
    epoch = 0
    timesteps = 0

    # Check the very file that is going to be loaded (not the notebook-level
    # default), and require a regular file so an empty path, which resolves to
    # the save directory itself, is treated as "no checkpoint".
    checkpoint_file = os.path.join(dqn_save_path, checkpoint_path)
    if checkpoint_path and os.path.isfile(checkpoint_file):
        print(f'Loading checkpoint {checkpoint_path}')
        checkpoint_dict = torch.load(checkpoint_file, map_location=device)
        timesteps = checkpoint_dict['timesteps']
        dqn.load_state_dict(checkpoint_dict['model_params'])
        dqn_target.load_state_dict(checkpoint_dict['model_params'])
        optimizer.load_state_dict(checkpoint_dict['optimizer_state_dict'])
        # Parse the episode number from the file name only, so digits in any
        # directory component cannot leak into it.
        digits = ''.join(filter(str.isdigit, os.path.basename(checkpoint_path)))
        epoch = int(digits) if digits else 0

    return dqn, dqn_target, timesteps, epoch+1, optimizer, replay, mse


def store_checkpoint(game_id, dqn_net, timesteps, optimizer):
    """ Write a checkpoint file named game_<game_id>.pth into dqn_save_path
    Params
    ======
        game_id (int): episode number, used in the file name
        dqn_net (DQNNet): agent network whose state_dict is saved
        timesteps (int): number of passed timesteps (drives the epsilon schedule on resume)
        optimizer (state_dict): optimizer's state dictionary, stored as given
    """
    checkpoint_file = os.path.join(dqn_save_path, f'game_{game_id}.pth')
    payload = {
        'model_params': dqn_net.state_dict(),
        'optimizer_state_dict': optimizer,
        'timesteps': timesteps,
    }
    torch.save(payload, checkpoint_file)

In [None]:
def soft_update(local_model, target_model, tau):
    """ Blend the local network's weights into the target network
    θ_target = τ*θ_local + (1 - τ)*θ_target
    Params
    ======
        local_model (DQNNet): source of the new weights (left unchanged)
        target_model (DQNNet): network whose parameters are overwritten in place
        tau (float): interpolation factor; 1 copies local, 0 keeps target
    """
    keep = 1.0 - tau
    param_pairs = zip(local_model.parameters(), target_model.parameters())
    for source, destination in param_pairs:
        blended = tau * source.data + keep * destination.data
        destination.data.copy_(blended)

# Buffer

In [None]:
Transition = namedtuple("Transition", ["obs", "action", "reward", "next_obs", "done"])
import random

class ReplayBuffer():
    """Fixed-capacity store of transitions with uniform random sampling.

    Transitions live in the list `self.transition`. Once the list holds
    `memory_len` items, each new transition overwrites the oldest one, so an
    insert never shifts the whole list.
    """

    def __init__(self, num_actions, memory_len = 10000):
        """ create an empty buffer
        Params
        ======
            num_actions (int): size of the action space (width of the one-hot actions)
            memory_len (int): maximum number of stored transitions; integral
                floats such as 1e6 are accepted and converted to int
        Raises
        ======
            ValueError: if memory_len is smaller than 1
        """
        memory_len = int(memory_len)
        if memory_len < 1:
            raise ValueError("memory_len must be at least 1")
        self.memory_len = memory_len
        self.transition = []
        self.num_actions = num_actions
        # Slot holding the oldest transition once the buffer is full.
        self._oldest = 0

    def add(self, obs, action, reward, next_obs, done):
        """ add a transition to the buffer
        Params
        ======
            obs (array): current board observation
            action (int): action taken in the given observation
            reward (int): -1 (lost), 0 (draw), 1 (won)
            next_obs (array): next observation
            done (Boolean): True (game finished), False (game still running)
        """
        item = Transition(obs, action, reward, next_obs, done)
        if len(self.transition) < self.memory_len:
            self.transition.append(item)
        else:
            # Full: replace the oldest entry, keeping the size at memory_len.
            self.transition[self._oldest] = item
            self._oldest = (self._oldest + 1) % self.memory_len

    def sample_batch(self, batch_size = 32):
        """ draw a random minibatch (without replacement) from the buffer
        Params
        ======
            batch_size (int): number of transitions in the minibatch

        Returns
        =======
            (obs, actions, rewards, next_obs, done): obs and next_obs are numpy
            arrays; actions is a one-hot float32 tensor of shape
            (batch_size, num_actions); rewards is a float32 tensor; done is an
            integer tensor (1 = finished). Tensors are placed on `device`.
        Raises
        ======
            ValueError: if batch_size exceeds the number of stored transitions
        """
        minibatch = random.sample(self.transition, batch_size)
        obs_mb, a_, reward_mb, next_obs_mb, done_mb = map(np.array, zip(*minibatch))

        mb_reward = torch.from_numpy(reward_mb).to(device=device, dtype=torch.float32)
        mb_done = torch.from_numpy(done_mb.astype(int)).to(device=device)
        # One-hot encode the sampled action indices.
        a_ = a_.astype(int)
        a_mb = np.zeros((a_.size, self.num_actions), dtype=np.float32)
        a_mb[np.arange(a_.size), a_] = 1
        mb_a = torch.from_numpy(a_mb).to(device=device)
        return obs_mb, mb_a, mb_reward, next_obs_mb, mb_done

    def length(self):
        """ get length of buffer
        """
        return len(self.transition)

    def remove(self):
        """ remove the oldest transition from the buffer
        Raises
        ======
            IndexError: if the buffer is empty
        """
        if self._oldest:
            # Restore chronological order so the oldest entry sits at index 0
            # and later appends land after the newest entry.
            split = self._oldest
            self.transition = self.transition[split:] + self.transition[:split]
            self._oldest = 0
        self.transition.pop(0)

# Training

In [None]:
# Hyper-parameters of the DQN training run.
BUFFER = int(1e6)      # replay buffer capacity (an int, since it is a length)
MINIBATCH = 128        # transitions per gradient step
DISCOUNT = 0.99        # discount factor applied to the bootstrapped value
TAU = 0.01             # interpolation factor of the soft target update
LEARNING_RATE = 1e-6   # Adam learning rate
UPDATE_RATE = 1000     # minimum buffer size before learning starts
EPS_DECAY = 300000     # timesteps over which epsilon falls linearly
EPS_UB = 1.0           # initial epsilon
EPS_LB = 0.02          # final epsilon
# set seed
# The `random` module picks start positions and samples the replay buffer, so
# it has to be seeded alongside numpy and torch for a repeatable run.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

In [None]:
# Build (or restore) the networks, optimizer, replay buffer and loss.
network, target, timesteps, e, optimizer, replay, mse = load_checkpoint(dqn_load_path)

reward_history = deque(maxlen=100)  # display last 100 rewards
loss_history = deque(maxlen=100)    # display last 100 losses
win_history = 0                     # wins since the last stats print (every 100 games)

# curriculum learning: train against different agents
num_episodes =  [5000, 25000]
actors = [no_bomb_random_actor, None]
# One entry per episode: the first opponent repeated 5000 times, then the second 25000 times.
actor_curriculum = chain.from_iterable([[opponent] * count for count, opponent in zip(num_episodes, actors)])

for actor in actor_curriculum: 
    env = PommerEnvWrapperFrameSkip2(num_stack=5, start_pos=random.randint(0,1), board='GraphicOVOCompact-v0', opponent_actor=actor) # create the 6x6 board 
    obs, _ = env.reset()
    ret = 0
    done = False

    while not done: # play a game

        # action selection according to epsilon (linear decay from EPS_UB down to EPS_LB)
        epsilon = max(EPS_LB, EPS_UB - timesteps/ EPS_DECAY)
        if np.random.choice([0,1], p=[1-epsilon,epsilon]) == 1:
            a = np.random.randint(low=0, high=NUM_ACTIONS, size=1)[0]
        else:
            net_out = network(obs).detach().cpu().numpy()
            a = np.argmax(net_out)

        # perform action
        agent_step, _ = env.step(a)
        next_obs, r, done, info = agent_step
        ret += r
        
        # store transition in replay buffer
        replay.add(obs, a, r, next_obs, done)
        obs = next_obs
        timesteps +=  1

        # update policy using temporal difference
        if replay.length() > MINIBATCH and replay.length() > UPDATE_RATE:

            optimizer.zero_grad()

            # sample a minibatch randomly
            obs_mb, mb_a, mb_reward, next_obs_mb, mb_done = replay.sample_batch(MINIBATCH) 

            # compute predictions & targets
            # mb_a is one-hot, so argmax recovers the action index to gather.
            q = network(obs_mb).gather(1, torch.argmax(mb_a,1).unsqueeze(-1)).squeeze(-1)
            # The bootstrap target is a constant for this step: computing it
            # without autograd avoids building a graph through the target
            # network and accumulating gradients there that nothing consumes.
            with torch.no_grad():
                q_t = target(next_obs_mb)
                q_t[mb_done == 1] = 0.0  # no future value after a terminal step
                targets = mb_reward + (DISCOUNT * q_t.max(1)[0])
            predictions = q    

            # compute loss
            loss = mse(predictions, targets) 
            loss.backward(retain_graph=False)
            optimizer.step()
            loss_history.append(loss.item())

            # update policy
            soft_update(network, target, TAU)

    # game ended
    print('Episode '+ str(e)+' Reward ' +str(ret))    
    e += 1
    reward_history.append(ret)

    # r is the reward of the final step of the game
    if r>0: 
        win_history += 1

    if e % 100 == 0: 
        # No loss exists until learning has started; report nan without
        # triggering numpy's empty-mean warning.
        mean_loss = np.mean(loss_history) if loss_history else float('nan')
        print('\rStats for Episode {} (100 Games): \tAverage Score: {:.2f}, Average Loss: {:.2f}, Win Probabilty: {:.2f} %'.format(e, np.mean(reward_history), mean_loss, win_history))
        win_history = 0

    if e % 500 == 0: # create a checkpoint (to continue training later) & export trained model as onnx (to evaluate performance)
        store_checkpoint(e, network, timesteps, optimizer.state_dict()) 
        model_file = "gdrive/MyDrive/submission.onnx"
        # The latest observation serves as the example input for tracing.
        state_for_onnx = np.array(obs, dtype=np.float32)
        torch.onnx.export(network,
                          torch.from_numpy(state_for_onnx).float(), # example model input
                          model_file, # file path
                          export_params=True, # save trained parameters
                          opset_version=10,
                          do_constant_folding=True)

## Export Agent in ONNX format

In [None]:
# Export the trained network next to the notebook. `obs` (the last observation
# left over from training) is only used as the example input for tracing.
onnx_path = "./submission.onnx"
state_for_onnx = np.array(obs, dtype=np.float32)
torch.onnx.export(
    model=network,
    args=torch.from_numpy(state_for_onnx).float(),  # example model input
    f=onnx_path,                                    # file path
    export_params=True,                             # save trained parameters
    opset_version=10,
    do_constant_folding=True,
)

# Evaluation

In [None]:
# ignore prints to stdout of imports
# redirect_stdout restores sys.stdout even if the import raises, and the
# `with` block closes the sink instead of leaking an open file handle.
import contextlib
with open(os.devnull, 'w') as devnull, contextlib.redirect_stdout(devnull):
    from graphic_pomme_env import graphic_pomme_env
from graphic_pomme_env.wrappers import PommerEnvWrapperFrameSkip2

# Seed random number generators
if os.path.exists("seed.rnd"):
    with open("seed.rnd", "r") as f:
        seed = int(f.readline().strip())
    np.random.seed(seed)
    torch.manual_seed(seed)
    random.seed(seed)
else:
    seed = None

N_EPISODES = 50   # number of evaluation games
RAND_PERF = 0.5   # baseline win rate subtracted when normalising the score

model_file = "submission.onnx"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Network
net = ConvertModel(onnx.load(model_file), experimental=True)
net = net.to(device)
net.eval()

win_count = 0.0
env = PommerEnvWrapperFrameSkip2(num_stack=5, start_pos=0)

for i in range(N_EPISODES):
    if seed is not None:
        # NOTE(review): the drawn value is never passed to the environment; the
        # draw is kept so the numpy random stream is unchanged.
        seed = np.random.randint(1e7)
    
    done = False
    obs, opponent_obs = env.reset()
    while not done:
        obs = torch.from_numpy(np.array(obs)).to(device)
        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            net_out = net(obs).detach().cpu().numpy()
        action = np.argmax(net_out)

        agent_step, opponent_step = env.step(action)
        obs, r, done, info = agent_step
        
        if done and r > 0:
            win_count += 1.0
            
# Convert the win count to a win rate, then rescale so RAND_PERF maps to 0
# and a perfect record maps to 1.
win_count /= N_EPISODES
win_count = (win_count - RAND_PERF) / (1.0 - RAND_PERF)
            
print(win_count)