# Team Performance Assessment

We run multiple evaluation episodes in order to better assess how well a team or a culture performs. The reported statistics are averaged over these episodes.


In [2]:
import os
import random
import time
import platform
import torch
import gym
import numpy as np
import pickle

# This is the Gathering Game Environment based on Tribal Organization of agents
from tribes_env import GatheringEnv
from tribes_model import *

import matplotlib.pyplot as plt

%matplotlib inline
# Notebook-wide plotting defaults: figure size, no pixel smoothing, grayscale images
plt.rcParams.update({
    'figure.figsize': (10.0, 8.0),
    'image.interpolation': 'nearest',
    'image.cmap': 'gray',
})

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

# Record the interpreter and library versions this notebook was run with,
# so that the saved outputs below can be tied to a concrete environment.
print("Python version: ", platform.python_version())
print("Pytorch version: {}".format(torch.__version__))
print("OpenAI Gym version: {}".format(gym.__version__))

Python version:  3.6.4
Pytorch version: 0.4.1.post2
OpenAI Gym version: 0.9.2


In [76]:
import pickle
import numpy as np

import torch
from torch.autograd import Variable

dir_name = 'MA_models/no_fragging/p-1.0/'
episodes = 2000  # This is used to recall a model file trained to a # of episodes

# There will be 9 agents - 8 AI agents in 3 tribes (3 + 2 + 3) and 1 random agent.
# These counts must agree with the tribe rosters built later in this cell, where
# agents[0..7] are the AI agents and agents[8] is the random "Crazies" agent.
num_ai_agents = 8
num_rdn_agents = 1
num_agents = num_ai_agents+num_rdn_agents  # just the sum of the two


# Data structure for AI agents (agents will form their own Class later on)
agents = []
actions = []
tags = []

# Initialize environment
render = False
num_actions = 8                       # There are 8 actions defined in Gathering

# Initialize constants
num_frames = 4       # passed to Policy() when a fresh model is created
max_episodes = 30    # number of evaluation episodes to average over
max_frames = 1000    # upper bound on frames played per episode
verbose = False      # True prints per-episode statistics

def unpack_env_obs(env_obs):
    """
    Convert the environment's per-agent observations into Policy-ready tensors.

    env_obs holds one flat observation per agent (800 values according to the
    original notes: a 10x20 view box with 4 layers). Each observation is viewed
    as (1, 10, 20, layers) and then permuted to (1, layers, 10, 20), i.e.
    (batch_idx, in_channel, width, height).

    Returns a list with one such tensor per agent, in the same order as env_obs.
    """
    return [
        torch.Tensor(agent_view)      # convert to a float tensor
        .view(1, 10, 20, -1)          # environment layout: (batch, width, depth, layers)
        .permute(0, 3, 1, 2)          # Policy layout: (batch, layers, width, depth)
        for agent_view in env_obs
    ]


"""
For now, we do not implement LSTM            
# LSTM Change: Need to cycle hx and cx thru function
def select_action(model, state, lstm_hc, cuda):
    hx , cx = lstm_hc 
    num_frames, height, width = state.shape
    state = torch.FloatTensor(state.reshape(-1, num_frames, height, width))

    if cuda:
        state = state.cuda()

    probs, value, (hx, cx) = model((Variable(state), (hx, cx)))

    m = torch.distributions.Categorical(probs)
    action = m.sample()
    log_prob = m.log_prob(action)
    # LSTM Change: Need to cycle hx and cx thru function
    return action.data[0], log_prob, value, (hx, cx)
"""

def select_action(model, obs, cuda):
    """
    Sample one action from the agent's policy for the given observation.

    obs is expected to be a tensor of stacked frames laid out as
    (batch_idx, in_channel, width, height). It is moved to the GPU when cuda is
    truthy, fed to model (the agent's Policy), and the returned probabilities
    parameterise a torch.distributions.Categorical from which the action is drawn.

    Returns (action, log_prob): the sampled action as a Python number and the
    log-probability tensor of that action.
    """
    policy_input = obs.cuda() if cuda else obs

    action_dist = torch.distributions.Categorical(model(policy_input))
    sampled = action_dist.sample()

    return sampled.item(), action_dist.log_prob(sampled)


def load_info(agents, narrate=False, step_info=None, frame_idx=None):
    """
    Hand each agent its info record from the latest environment step.

    Args:
        agents: sequence of agent objects. Each must provide load_info(record);
            when narrate is True the attributes tagged, laser_fired, US_hit and
            THEM_hit are also read.
        narrate: when True, print a line for every agent that is tagged or that
            fired its laser.
        step_info: per-agent info records, indexed like agents. When omitted,
            the notebook-level variable `info` (assigned from env.step()) is used,
            which is what existing callers rely on.
        frame_idx: frame number shown in the narration. When omitted, the
            notebook-level loop variable `frame` is used.

    Returns:
        None.
    """
    if step_info is None:
        step_info = info   # backward-compatible fallback to the notebook global

    for i, agent in enumerate(agents):
        agent.load_info(step_info[i])
        if narrate:
            # Resolve the frame number lazily so non-narrating callers never
            # need a `frame` variable to exist.
            shown_frame = frame if frame_idx is None else frame_idx
            if agent.tagged:
                print('frame {}, agent{} is tagged'.format(shown_frame,i))
            if agent.laser_fired:
                print('frame {}, agent{} fires its laser'.format(shown_frame,i))
                print('and hit {} US and {} THEM'.format(agent.US_hit, agent.THEM_hit))


# ---------------------------------------------------------------------------
# Evaluate one set of trained agents over `max_episodes` episodes and print
# the statistics averaged over those episodes.
# ---------------------------------------------------------------------------

# Load models for AI agents
agents = []
if episodes > 0:
    # If episodes is provided (not 0), load the model for each AI agent
    for i in range(num_ai_agents):
        model_file = dir_name+'MA{}_Gather__ep{}.p'.format(i,episodes)
        try:
            with open(model_file, 'rb') as f:
                # Model file includes both model and optim parameters.
                # NOTE(security): pickle.load can execute arbitrary code - only
                # load checkpoint files that come from a trusted training run.
                saved_model = pickle.load(f)
        except OSError:
            print('Model file not found.')
            raise
        trained_policy, _ = saved_model   # the optimizer state is not needed here
        agents.append(trained_policy)
        print("Load saved model for agent {}".format(i))
else:
    # If episodes=0, start with a freshly initialized model for each AI agent
    for i in range(num_ai_agents):
        print("Load AI agent {}".format(i))
        agents.append(Policy(num_frames, num_actions, i))

# Load random agents
for i in range(num_ai_agents,num_agents):
    print("Load random agent {}".format(i))
    agents.append(Rdn_Policy())

# Establish tribal association
tribes = []
tribes.append(Tribe(name='Vikings',color='blue', agents=[agents[0], agents[1], agents[2]]))
tribes.append(Tribe(name='Saxons', color='red', agents=[agents[3], agents[4]]))
tribes.append(Tribe(name='Franks', color='purple', agents=[agents[5], agents[6], agents[7]]))
tribes.append(Tribe(name='Crazies', color='yellow', agents=[agents[8]]))   # random agents are crazy!!!

# 9 agents in 4 tribes, used map defined in default.txt
agent_colors = [agent.color for agent in agents]
agent_tribes = [agent.tribe for agent in agents]
env = GatheringEnv(n_agents=num_agents,agent_colors=agent_colors, agent_tribes=agent_tribes, map_name='default')

# Used to accumulate episode stats for averaging
cum_rewards = 0
cum_tags = 0
cum_US_hits = 0
cum_THEM_hits = 0
cum_agent_rewards = [0 for agent in agents]
cum_agent_tags = [0 for agent in agents]
cum_agent_US_hits = [0 for agent in agents]
cum_agent_THEM_hits = [0 for agent in agents]
# One slot per tribe (the 'Crazies' slot simply stays 0) so that the index
# from enumerate(tribes) is always valid, whatever the position of 'Crazies'.
cum_tribe_rewards = [0 for t in tribes]
total_frames = 0   # frames actually played, summed over all episodes

tag_action = 6     # action index that is counted as a "tag" (laser fired)

cuda = False
start = time.time()

for ep in range(max_episodes):

    print('.', end='')  # To show progress

    # Initialize AI and random agent data
    actions = [0 for i in range(num_agents)]
    tags = [0 for i in range(num_agents)]
    US_hits = [0 for i in range(num_agents)]
    THEM_hits = [0 for i in range(num_agents)]

    env_obs = env.reset()  # Environment return observations

    # Unpack observations into data structure compatible with agent Policy
    agents_obs = unpack_env_obs(env_obs)

    for i in range(num_ai_agents):    # Reset agent info - laser tag statistics
        agents[i].reset_info()

    if render:
        env.render()
        time.sleep(1/15)  # Change speed of video rendering

    # For now, we do not stack observations, and we do not implement LSTM

    frames_played = 0
    for frame in range(max_frames):

        for i in range(num_ai_agents):    # For AI agents
            actions[i], _ = select_action(agents[i], agents_obs[i], cuda=cuda)
            # Compare by value: `is 6` only worked through CPython's small-int cache
            if actions[i] == tag_action:
                tags[i] += 1   # record a tag for accessing aggressiveness

        for i in range(num_ai_agents, num_agents):   # For random agents
            actions[i] = agents[i].select_action(agents_obs[i])
            if actions[i] == tag_action:
                tags[i] += 1   # record a tag for accessing aggressiveness

        # Perform step
        env_obs, reward, done, info = env.step(actions)
        frames_played = frame + 1

        for i in range(num_ai_agents):
            agents[i].rewards.append(reward[i])  # Stack rewards

        # Unpack observations into data structure compatible with agent Policy
        agents_obs = unpack_env_obs(env_obs)
        load_info(agents, narrate=False)   # Load agent info (reads the `info` just returned)

        for i in range(num_agents):
            US_hits[i] += agents[i].US_hit
            THEM_hits[i] += agents[i].THEM_hit

        if render:
            env.render()
            time.sleep(1/15)  # Change speed of video rendering

        if any(done):
            print("Done after {} frames".format(frame))
            break

    total_frames += frames_played

    # Print out statistics of AI agents
    ep_rewards = 0
    ep_tags = 0
    ep_US_hits = 0
    ep_THEM_hits = 0

    if verbose:
        print ('\nStatistics by Agent')
        print ('===================')
    for i in range(num_ai_agents):
        agent_tags = sum(agents[i].tag_hist)
        ep_tags += agent_tags
        cum_agent_tags[i] += agent_tags

        agent_reward = sum(agents[i].rewards)
        ep_rewards += agent_reward
        cum_agent_rewards[i] += agent_reward

        agent_US_hits = sum(agents[i].US_hits)
        agent_THEM_hits = sum(agents[i].THEM_hits)
        ep_US_hits += agent_US_hits
        ep_THEM_hits += agent_THEM_hits
        cum_agent_US_hits[i] += agent_US_hits
        cum_agent_THEM_hits[i] += agent_THEM_hits

        if verbose:
            # Divide by the number of frames played, not the last frame index
            # (avoids an off-by-one and a division by zero on frame 0).
            print ("Agent{} aggressiveness is {:.2f}".format(i, agent_tags/max(frames_played, 1)))
            print ("Agent{} reward is {:d}".format(i, agent_reward))
            print('US agents hit = {}'.format(agent_US_hits))
            print('THEM agents hit = {}'.format(agent_THEM_hits ))

    cum_rewards += ep_rewards
    cum_tags += ep_tags
    cum_US_hits += ep_US_hits
    cum_THEM_hits += ep_THEM_hits

    if verbose:
        print ('\nStatistics in Aggregate')
        print ('=======================')
        print ('Total rewards gathered = {}'.format(ep_rewards))
        print ('Num laser fired = {}'.format(ep_tags))
        print ('Total US Hit (friendly fire) = {}'.format(ep_US_hits))
        print ('Total THEM Hit = {}'.format(ep_THEM_hits))
        # 1e-7 guards against division by zero when nothing was hit
        print ('friendly fire (%) = {0:.3f}'.format(ep_US_hits/(ep_US_hits+ep_THEM_hits+1e-7)))

    if verbose:
        print ('\nStatistics by Tribe')
        print ('===================')
    for i, t in enumerate(tribes):
        if t.name != 'Crazies':
            ep_tribe_reward = sum(t.sum_rewards())
            cum_tribe_rewards[i] += ep_tribe_reward
            if verbose:
                print ('Tribe {} has total reward of {}'.format(t.name, ep_tribe_reward))

    for i in range(num_ai_agents):
        agents[i].clear_history()

env.close()  # Close the rendering window
end = time.time()

print ('\nAverage Statistics in Aggregate')
print ('=================================')
print ('Total rewards gathered = {:.1f}'.format(cum_rewards/max_episodes))
print ('Num laser fired = {:.1f}'.format(cum_tags/max_episodes))
print ('Total US Hit (friendly fire) = {:.1f}'.format(cum_US_hits/max_episodes))
print ('Total THEM Hit = {:.1f}'.format(cum_THEM_hits/max_episodes))
print ('friendly fire (%) = {:.3f}'.format(cum_US_hits/(cum_US_hits+cum_THEM_hits+1e-7)))

print ('\nAverage Statistics by Tribe')
print ('=============================')
for i, t in enumerate(tribes):
    if t.name != 'Crazies':
        print ('Tribe {} has total reward of {:.1f}'.format(t.name, cum_tribe_rewards[i]/max_episodes))

print ('\nAverage Statistics by Agent')
print ('=============================')
for i in range(num_ai_agents):
    # Tags per frame actually played; equals the old max_episodes*max_frames
    # denominator whenever no episode ended early.
    print ("Agent{} of {} aggressiveness is {:.2f}".format(i, agents[i].tribe,
                                                           cum_agent_tags[i]/max(total_frames, 1)))
    print ("Agent{} reward is {:.1f}".format(i, cum_agent_rewards[i]/max_episodes))
    print('US agents hit = {:.1f}'.format(cum_agent_US_hits[i]/max_episodes))
    print('THEM agents hit = {:.1f}'.format(cum_agent_THEM_hits[i]/max_episodes))

print('Training time per epochs: {:.2f} sec'.format((end-start)/max_episodes))


Load saved model for agent 0
Load saved model for agent 1
Load saved model for agent 2
Load saved model for agent 3
Load saved model for agent 4
Load saved model for agent 5
Load saved model for agent 6
Load saved model for agent 7
Load random agent 8
..............................
Average Statistics in Aggregate
Total rewards gathered = 427.3
Num laser fired = 378.1
Total US Hit (friendly fire) = 20.0
Total THEM Hit = 131.0
friendly fire (%) = 0.133

Average Statistics by Tribe
Tribe Vikings has total reward of 96.0
Tribe Saxons has total reward of 219.6
Tribe Franks has total reward of 111.7

Average Statistics by Agent
Agent0 of Vikings aggressiveness is 0.00
Agent0 reward is 36.7
US agents hit = 0.0
THEM agents hit = 0.0
Agent1 of Vikings aggressiveness is 0.00
Agent1 reward is 28.2
US agents hit = 0.0
THEM agents hit = 0.0
Agent2 of Vikings aggressiveness is 0.00
Agent2 reward is 31.2
US agents hit = 0.0
THEM agents hit = 0.0
Agent3 of Saxons aggressiveness is 0.25
Agent3 reward i

In [26]:
import pickle
import numpy as np

import torch
from torch.autograd import Variable

# --- What to evaluate -------------------------------------------------------
dir_names = ['MA_models/3T-9L1R/Indiv/']      # one directory of checkpoints per experiment
episodes = [1000, 2000, 3000, 4000, 5000]     # training checkpoints (in episodes) to evaluate
culture = "individualist"

# av_agent_reward[d][e]: average reward per AI agent for directory d, checkpoint e
av_agent_reward = [[0] * len(episodes) for _ in dir_names]

# --- Population: 10 agents = 3 teams of 3 AI agents each + 1 random agent ----
num_ai_agents = 9
num_rdn_agents = 1
num_agents = num_ai_agents + num_rdn_agents

# Containers for AI agents (agents will form their own Class later on)
agents, actions, tags = [], [], []

# --- Environment -------------------------------------------------------------
render = False
num_actions = 8        # There are 8 actions defined in Gathering

# --- Evaluation constants ----------------------------------------------------
num_frames = 4
max_episodes = 30
max_frames = 1000
verbose = False

def unpack_env_obs(env_obs):
    """
    Convert the environment's per-agent observations into Policy-ready tensors.

    env_obs holds one flat observation per agent (800 values according to the
    original notes: a 10x20 view box with 4 layers). Each observation is viewed
    as (1, 10, 20, layers) and then permuted to (1, layers, 10, 20), i.e.
    (batch_idx, in_channel, width, height).

    Returns a list with one such tensor per agent, in the same order as env_obs.
    """
    return [
        torch.Tensor(agent_view)      # convert to a float tensor
        .view(1, 10, 20, -1)          # environment layout: (batch, width, depth, layers)
        .permute(0, 3, 1, 2)          # Policy layout: (batch, layers, width, depth)
        for agent_view in env_obs
    ]


"""
For now, we do not implement LSTM            
# LSTM Change: Need to cycle hx and cx thru function
def select_action(model, state, lstm_hc, cuda):
    hx , cx = lstm_hc 
    num_frames, height, width = state.shape
    state = torch.FloatTensor(state.reshape(-1, num_frames, height, width))

    if cuda:
        state = state.cuda()

    probs, value, (hx, cx) = model((Variable(state), (hx, cx)))

    m = torch.distributions.Categorical(probs)
    action = m.sample()
    log_prob = m.log_prob(action)
    # LSTM Change: Need to cycle hx and cx thru function
    return action.data[0], log_prob, value, (hx, cx)
"""

def select_action(model, obs, cuda):
    """
    Sample one action from the agent's policy for the given observation.

    obs is expected to be a tensor of stacked frames laid out as
    (batch_idx, in_channel, width, height). It is moved to the GPU when cuda is
    truthy, fed to model (the agent's Policy), and the returned probabilities
    parameterise a torch.distributions.Categorical from which the action is drawn.

    Returns (action, log_prob): the sampled action as a Python number and the
    log-probability tensor of that action.
    """
    policy_input = obs.cuda() if cuda else obs

    action_dist = torch.distributions.Categorical(model(policy_input))
    sampled = action_dist.sample()

    return sampled.item(), action_dist.log_prob(sampled)


def load_info(agents, narrate=False, step_info=None, frame_idx=None):
    """
    Hand each agent its info record from the latest environment step.

    Args:
        agents: sequence of agent objects. Each must provide load_info(record);
            when narrate is True the attributes tagged, laser_fired, US_hit and
            THEM_hit are also read.
        narrate: when True, print a line for every agent that is tagged or that
            fired its laser.
        step_info: per-agent info records, indexed like agents. When omitted,
            the notebook-level variable `info` (assigned from env.step()) is used,
            which is what existing callers rely on.
        frame_idx: frame number shown in the narration. When omitted, the
            notebook-level loop variable `frame` is used.

    Returns:
        None.
    """
    if step_info is None:
        step_info = info   # backward-compatible fallback to the notebook global

    for i, agent in enumerate(agents):
        agent.load_info(step_info[i])
        if narrate:
            # Resolve the frame number lazily so non-narrating callers never
            # need a `frame` variable to exist.
            shown_frame = frame if frame_idx is None else frame_idx
            if agent.tagged:
                print('frame {}, agent{} is tagged'.format(shown_frame,i))
            if agent.laser_fired:
                print('frame {}, agent{} fires its laser'.format(shown_frame,i))
                print('and hit {} US and {} THEM'.format(agent.US_hit, agent.THEM_hit))

# ---------------------------------------------------------------------------
# For every checkpoint directory and every training checkpoint, evaluate the
# agents over `max_episodes` episodes, print the averaged statistics, and store
# the average per-agent reward in av_agent_reward[dir_num][eps_num].
# ---------------------------------------------------------------------------
tag_action = 6     # action index that is counted as a "tag" (laser fired)

for dir_num, dir_name in enumerate(dir_names):
    print ("###### Dir = {} #######".format(dir_name))

    for eps_num, eps in enumerate(episodes):
        print ("###### Trained episodes = {} #######".format(eps))

        # Load models for AI agents
        agents = []
        for i in range(num_ai_agents):
            model_file = dir_name+'MA{}_Gather_ep_{}.p'.format(i,eps)
            try:
                with open(model_file, 'rb') as f:
                    # Model file includes both model and optim parameters.
                    # NOTE(security): pickle.load can execute arbitrary code - only
                    # load checkpoint files that come from a trusted training run.
                    saved_model = pickle.load(f)
            except OSError:
                print('Model file not found.')
                raise
            trained_policy, _ = saved_model   # the optimizer state is not needed here
            agents.append(trained_policy)
            print("Load saved model for agent {}".format(i))

        # Load random agents
        for i in range(num_ai_agents,num_agents):
            print("Load random agent {}".format(i))
            agents.append(Rdn_Policy())

        # Establish tribal association
        tribes = []
        tribes.append(Tribe(name='Vikings',color='blue', culture=culture,
                    agents=[agents[0], agents[1], agents[2]]))
        tribes.append(Tribe(name='Saxons', color='red', culture=culture,
                    agents=[agents[3], agents[4], agents[5]]))
        tribes.append(Tribe(name='Franks', color='purple', culture=culture,
                    agents=[agents[6], agents[7], agents[8]]))
        tribes.append(Tribe(name='Crazies', color='yellow', agents=[agents[9]]))   # random agents are crazy!!!

        # 10 agents in 4 tribes, used map defined in default.txt
        agent_colors = [agent.color for agent in agents]
        agent_tribes = [agent.tribe for agent in agents]

        env = GatheringEnv(n_agents=num_agents,agent_colors=agent_colors, agent_tribes=agent_tribes,
                       map_name='default')

        # Used to accumulate episode stats for averaging
        cum_rewards = 0
        cum_tags = 0
        cum_US_hits = 0
        cum_THEM_hits = 0
        cum_agent_rewards = [0 for agent in agents]
        cum_agent_tags = [0 for agent in agents]
        cum_agent_US_hits = [0 for agent in agents]
        cum_agent_THEM_hits = [0 for agent in agents]
        # One slot per tribe (the 'Crazies' slot simply stays 0) so that the index
        # from enumerate(tribes) is always valid, whatever the position of 'Crazies'.
        cum_tribe_rewards = [0 for t in tribes]
        total_frames = 0   # frames actually played, summed over all episodes

        cuda = False
        start = time.time()

        for ep in range(max_episodes):

            print('.', end='')  # To show progress

            # Initialize AI and random agent data
            actions = [0 for i in range(num_agents)]
            tags = [0 for i in range(num_agents)]
            US_hits = [0 for i in range(num_agents)]
            THEM_hits = [0 for i in range(num_agents)]

            env_obs = env.reset()  # Environment return observations

            # Unpack observations into data structure compatible with agent Policy
            agents_obs = unpack_env_obs(env_obs)

            for i in range(num_ai_agents):    # Reset agent info - laser tag statistics
                agents[i].reset_info()

            if render:
                env.render()
                time.sleep(1/15)  # Change speed of video rendering

            # For now, we do not stack observations, and we do not implement LSTM

            frames_played = 0
            for frame in range(max_frames):

                for i in range(num_ai_agents):    # For AI agents
                    actions[i], _ = select_action(agents[i], agents_obs[i], cuda=cuda)
                    # Compare by value: `is 6` only worked through CPython's small-int cache
                    if actions[i] == tag_action:
                        tags[i] += 1   # record a tag for accessing aggressiveness

                for i in range(num_ai_agents, num_agents):   # For random agents
                    actions[i] = agents[i].select_action(agents_obs[i])
                    if actions[i] == tag_action:
                        tags[i] += 1   # record a tag for accessing aggressiveness

                # Perform step
                env_obs, reward, done, info = env.step(actions)
                frames_played = frame + 1

                for i in range(num_ai_agents):
                    agents[i].rewards.append(reward[i])  # Stack rewards

                # Unpack observations into data structure compatible with agent Policy
                agents_obs = unpack_env_obs(env_obs)
                load_info(agents, narrate=False)   # Load agent info (reads the `info` just returned)

                for i in range(num_agents):
                    US_hits[i] += agents[i].US_hit
                    THEM_hits[i] += agents[i].THEM_hit

                if render:
                    env.render()
                    time.sleep(1/15)  # Change speed of video rendering

                if any(done):
                    print("Done after {} frames".format(frame))
                    break

            total_frames += frames_played

            # Print out statistics of AI agents
            ep_rewards = 0
            ep_tags = 0
            ep_US_hits = 0
            ep_THEM_hits = 0

            if verbose:
                print ('\nStatistics by Agent')
                print ('===================')
            for i in range(num_ai_agents):
                agent_tags = sum(agents[i].tag_hist)
                ep_tags += agent_tags
                cum_agent_tags[i] += agent_tags

                agent_reward = sum(agents[i].rewards)
                ep_rewards += agent_reward
                cum_agent_rewards[i] += agent_reward

                agent_US_hits = sum(agents[i].US_hits)
                agent_THEM_hits = sum(agents[i].THEM_hits)
                ep_US_hits += agent_US_hits
                ep_THEM_hits += agent_THEM_hits
                cum_agent_US_hits[i] += agent_US_hits
                cum_agent_THEM_hits[i] += agent_THEM_hits

                if verbose:
                    # Divide by the number of frames played, not the last frame index
                    # (avoids an off-by-one and a division by zero on frame 0).
                    print ("Agent{} aggressiveness is {:.2f}".format(i, agent_tags/max(frames_played, 1)))
                    print ("Agent{} reward is {:d}".format(i, agent_reward))
                    print('US agents hit = {}'.format(agent_US_hits))
                    print('THEM agents hit = {}'.format(agent_THEM_hits ))

            cum_rewards += ep_rewards
            cum_tags += ep_tags
            cum_US_hits += ep_US_hits
            cum_THEM_hits += ep_THEM_hits

            if verbose:
                print ('\nStatistics in Aggregate')
                print ('=======================')
                print ('Total rewards gathered = {}'.format(ep_rewards))
                print ('Num laser fired = {}'.format(ep_tags))
                print ('Total US Hit (friendly fire) = {}'.format(ep_US_hits))
                print ('Total THEM Hit = {}'.format(ep_THEM_hits))
                # 1e-7 guards against division by zero when nothing was hit
                print ('friendly fire (%) = {0:.3f}'.format(ep_US_hits/(ep_US_hits+ep_THEM_hits+1e-7)))

            if verbose:
                print ('\nStatistics by Tribe')
                print ('===================')
            for i, t in enumerate(tribes):
                if t.name != 'Crazies':
                    ep_tribe_reward = sum(t.sum_rewards())
                    cum_tribe_rewards[i] += ep_tribe_reward
                    if verbose:
                        print ('Tribe {} has total reward of {}'.format(t.name, ep_tribe_reward))

            for i in range(num_ai_agents):
                agents[i].clear_history()

        env.close()  # Close the rendering window
        end = time.time()

        print ('\nAverage Statistics in Aggregate')
        print ('=================================')
        print ('Total rewards gathered = {:.1f}'.format(cum_rewards/max_episodes))
        # Average reward per AI agent per episode for this (directory, checkpoint)
        av_agent_reward[dir_num][eps_num] = cum_rewards/max_episodes/num_ai_agents
        print (av_agent_reward[dir_num][eps_num])
        print ('Num laser fired = {:.1f}'.format(cum_tags/max_episodes))
        print ('Total US Hit (friendly fire) = {:.1f}'.format(cum_US_hits/max_episodes))
        print ('Total THEM Hit = {:.1f}'.format(cum_THEM_hits/max_episodes))
        print ('friendly fire (%) = {:.3f}'.format(cum_US_hits/(cum_US_hits+cum_THEM_hits+1e-7)))

        print ('\nAverage Statistics by Tribe')
        print ('=============================')
        for i, t in enumerate(tribes):
            if t.name != 'Crazies':
                print ('Tribe {} has total reward of {:.1f}'.format(t.name, cum_tribe_rewards[i]/max_episodes))

        print ('\nAverage Statistics by Agent')
        print ('=============================')
        for i in range(num_ai_agents):
            # Tags per frame actually played; equals the old max_episodes*max_frames
            # denominator whenever no episode ended early.
            print ("Agent{} of {} aggressiveness is {:.2f}".format(i, agents[i].tribe,
                                                           cum_agent_tags[i]/max(total_frames, 1)))
            print ("Agent{} reward is {:.1f}".format(i, cum_agent_rewards[i]/max_episodes))
            print('US agents hit = {:.1f}'.format(cum_agent_US_hits[i]/max_episodes))
            print('THEM agents hit = {:.1f}'.format(cum_agent_THEM_hits[i]/max_episodes))

        print('Training time per epochs: {:.2f} sec'.format((end-start)/max_episodes))

print (av_agent_reward)


###### Dir = MA_models/3T-9L1R/Indiv/ #######
###### Trained episodes = 1000 #######
Load saved model for agent 0
Load saved model for agent 1
Load saved model for agent 2
Load saved model for agent 3
Load saved model for agent 4
Load saved model for agent 5
Load saved model for agent 6
Load saved model for agent 7
Load saved model for agent 8
Load random agent 9
..............................
Average Statistics in Aggregate
Total rewards gathered = 305.2
33.91481481481482
Num laser fired = 588.8
Total US Hit (friendly fire) = 79.1
Total THEM Hit = 145.6
friendly fire (%) = 0.352

Average Statistics by Tribe
Tribe Vikings has total reward of 21.2
Tribe Saxons has total reward of 21.5
Tribe Franks has total reward of 262.6

Average Statistics by Agent
Agent0 of Vikings aggressiveness is 0.04
Agent0 reward is 0.0
US agents hit = 6.3
THEM agents hit = 14.1
Agent1 of Vikings aggressiveness is 0.11
Agent1 reward is 0.1
US agents hit = 11.4
THEM agents hit = 26.9
Agent2 of Vikings aggressive

## Average Agent Reward - Individualist

In [27]:
for reward in av_agent_reward:
    print(reward)

[33.91481481481482, 39.577777777777776, 36.029629629629625, 36.51111111111111, 38.196296296296296]


In [16]:
import pickle
import numpy as np

import torch
from torch.autograd import Variable

# --- What to evaluate -------------------------------------------------------
# One checkpoint directory per penalty setting (p-0.01 ... p-100.0)
dir_names = [
    "MA_models/3T-9L1R/pacifist/p-0.01/",
    "MA_models/3T-9L1R/pacifist/p-0.1/",
    "MA_models/3T-9L1R/pacifist/p-1.0/",
    "MA_models/3T-9L1R/pacifist/p-10.0/",
    "MA_models/3T-9L1R/pacifist/p-100.0/",
]
episodes = [1000, 2000, 3000, 4000, 5000]     # training checkpoints (in episodes) to evaluate
culture = "pacifist"

# av_agent_reward[d][e]: average reward per AI agent for directory d, checkpoint e
av_agent_reward = [[0] * len(episodes) for _ in dir_names]

# --- Population: 10 agents = 3 teams of 3 AI agents each + 1 random agent ----
num_ai_agents = 9
num_rdn_agents = 1
num_agents = num_ai_agents + num_rdn_agents

# Containers for AI agents (agents will form their own Class later on)
agents, actions, tags = [], [], []

# --- Environment -------------------------------------------------------------
render = False
num_actions = 8        # There are 8 actions defined in Gathering

# --- Evaluation constants ----------------------------------------------------
num_frames = 4
max_episodes = 30
max_frames = 1000
verbose = False

def unpack_env_obs(env_obs):
    """
    Convert the environment's per-agent observations into Policy-ready tensors.

    env_obs holds one flat observation per agent (800 values according to the
    original notes: a 10x20 view box with 4 layers). Each observation is viewed
    as (1, 10, 20, layers) and then permuted to (1, layers, 10, 20), i.e.
    (batch_idx, in_channel, width, height).

    Returns a list with one such tensor per agent, in the same order as env_obs.
    """
    return [
        torch.Tensor(agent_view)      # convert to a float tensor
        .view(1, 10, 20, -1)          # environment layout: (batch, width, depth, layers)
        .permute(0, 3, 1, 2)          # Policy layout: (batch, layers, width, depth)
        for agent_view in env_obs
    ]


"""
For now, we do not implement LSTM            
# LSTM Change: Need to cycle hx and cx thru function
def select_action(model, state, lstm_hc, cuda):
    hx , cx = lstm_hc 
    num_frames, height, width = state.shape
    state = torch.FloatTensor(state.reshape(-1, num_frames, height, width))

    if cuda:
        state = state.cuda()

    probs, value, (hx, cx) = model((Variable(state), (hx, cx)))

    m = torch.distributions.Categorical(probs)
    action = m.sample()
    log_prob = m.log_prob(action)
    # LSTM Change: Need to cycle hx and cx thru function
    return action.data[0], log_prob, value, (hx, cx)
"""

def select_action(model, obs, cuda):
    """
    Sample one action from the agent's policy.

    obs is expected to be a tensor of stacked frames with dimensions
    (batch_idx, in_channel, width, height); it is moved to the GPU first when
    cuda is truthy.

    model (the agent's Policy) is called on obs and its output is used as the
    probabilities of a torch.distributions.Categorical, from which the action
    is drawn.

    Returns a tuple (action, log_prob): the sampled action as a plain Python
    number and the log-probability of that action as a tensor.
    """
    policy_input = obs.cuda() if cuda else obs

    action_dist = torch.distributions.Categorical(model(policy_input))
    sampled = action_dist.sample()

    return sampled.item(), action_dist.log_prob(sampled)


def load_info(agents, narrate=False):
    """
    Hand each agent its entry of the latest step info.

    Reads three notebook-level names rather than taking them as arguments:
    num_agents (how many agents to update), info (indexed per agent, as returned
    by env.step in the evaluation loop) and frame (used only in the narration text).

    With narrate=True, also prints a line whenever an agent is tagged or fires
    its laser, including its US/THEM hit counts.
    """
    for idx in range(num_agents):
        member = agents[idx]
        member.load_info(info[idx])

        if not narrate:
            continue

        if member.tagged:
            print('frame {}, agent{} is tagged'.format(frame, idx))
        if member.laser_fired:
            print('frame {}, agent{} fires its laser'.format(frame, idx))
            print('and hit {} US and {} THEM'.format(member.US_hit, member.THEM_hit))
    return

# Evaluate every saved checkpoint: for each model directory and each training-episode
# count, load the 9 AI agents plus 1 random agent, play max_episodes games and report
# the averaged statistics. av_agent_reward[dir_num][eps_num] receives the average
# reward per AI agent for that checkpoint.
for dir_num, dir_name in enumerate(dir_names):
    print("###### Dir = {} #######".format(dir_name))

    for eps_num, eps in enumerate(episodes):
        print("###### Trained episodes = {} #######".format(eps))

        # Load models for AI agents
        agents = [[] for i in range(num_ai_agents)]
        for i in range(num_ai_agents):
            model_file = dir_name + 'MA{}_Gather__ep{}.p'.format(i, eps)
            try:
                with open(model_file, 'rb') as f:
                    # Model file includes both model and optim parameters; only the
                    # model (first element) is kept.
                    # NOTE(review): pickle.load can execute arbitrary code - only load
                    # checkpoint files that come from a trusted source.
                    saved_model = pickle.load(f)
                    agents[i], _ = saved_model
                    print("Load saved model for agent {}".format(i))
            except OSError:
                print('Model file not found.')
                raise

        # Load random agents
        for i in range(num_ai_agents, num_agents):
            print("Load random agent {}".format(i))
            agents.append(Rdn_Policy())

        # Establish tribal association
        tribes = []
        tribes.append(Tribe(name='Vikings', color='blue', culture=culture,
                            agents=[agents[0], agents[1], agents[2]]))
        tribes.append(Tribe(name='Saxons', color='red', culture=culture,
                            agents=[agents[3], agents[4], agents[5]]))
        tribes.append(Tribe(name='Franks', color='purple', culture=culture,
                            agents=[agents[6], agents[7], agents[8]]))
        tribes.append(Tribe(name='Crazies', color='yellow', agents=[agents[9]]))   # random agents are crazy!!!

        # 10 agents (9 AI + 1 random) in 4 tribes, using the map defined in default.txt
        # (color/tribe attributes are presumably set on the agents by Tribe - verify in tribes_model)
        agent_colors = [agent.color for agent in agents]
        agent_tribes = [agent.tribe for agent in agents]

        env = GatheringEnv(n_agents=num_agents, agent_colors=agent_colors,
                           agent_tribes=agent_tribes, map_name='default')

        # Used to accumulate episode stats for averaging
        cum_rewards = 0
        cum_tags = 0
        cum_US_hits = 0
        cum_THEM_hits = 0
        cum_agent_rewards = [0 for agent in agents]
        cum_agent_tags = [0 for agent in agents]
        cum_agent_US_hits = [0 for agent in agents]
        cum_agent_THEM_hits = [0 for agent in agents]
        # One slot per non-random tribe. It is indexed below with the tribe's position
        # in `tribes`, which is only valid because 'Crazies' is the last tribe.
        # Compare names with != (value equality), not `is not` (object identity).
        cum_tribe_rewards = [0 for t in tribes if t.name != 'Crazies']

        cuda = False
        start = time.time()

        for ep in range(max_episodes):

            print('.', end='')  # To show progress

            # Initialize AI and random agent data
            actions = [0 for i in range(num_agents)]
            tags = [0 for i in range(num_agents)]
            US_hits = [0 for i in range(num_agents)]
            THEM_hits = [0 for i in range(num_agents)]

            env_obs = env.reset()  # Environment returns observations

            # Unpack observations into data structure compatible with agent Policy
            agents_obs = unpack_env_obs(env_obs)

            for i in range(num_ai_agents):    # Reset agent info - laser tag statistics
                agents[i].reset_info()

            if render:
                env.render()
                time.sleep(1/15)  # Change speed of video rendering

            # For now, observations are not stacked over time and no LSTM is used.

            for frame in range(max_frames):

                for i in range(num_ai_agents):    # For AI agents
                    actions[i], _ = select_action(agents[i], agents_obs[i], cuda=cuda)
                    # select_action already returns a plain number (.item()), so compare
                    # by value; `is 6` only worked through CPython's small-int caching.
                    if actions[i] == 6:
                        tags[i] += 1   # record a tag for assessing aggressiveness

                for i in range(num_ai_agents, num_agents):   # For random agents
                    actions[i] = agents[i].select_action(agents_obs[i])
                    if actions[i] == 6:
                        tags[i] += 1   # record a tag for assessing aggressiveness

                # Perform step
                env_obs, reward, done, info = env.step(actions)

                for i in range(num_ai_agents):
                    agents[i].rewards.append(reward[i])  # Stack rewards

                # Unpack observations into data structure compatible with agent Policy
                agents_obs = unpack_env_obs(env_obs)
                # load_info reads the `info` and `frame` globals assigned just above
                load_info(agents, narrate=False)

                for i in range(num_agents):
                    US_hits[i] += agents[i].US_hit
                    THEM_hits[i] += agents[i].THEM_hit

                if render:
                    env.render()
                    time.sleep(1/15)  # Change speed of video rendering

                if any(done):
                    print("Done after {} frames".format(frame))
                    break

            # Collect this episode's statistics for the AI agents
            ep_rewards = 0
            ep_tags = 0
            ep_US_hits = 0
            ep_THEM_hits = 0

            if verbose:
                print('\nStatistics by Agent')
                print('===================')
            for i in range(num_ai_agents):
                agent_tags = sum(agents[i].tag_hist)
                ep_tags += agent_tags
                cum_agent_tags[i] += agent_tags

                agent_reward = sum(agents[i].rewards)
                ep_rewards += agent_reward
                cum_agent_rewards[i] += agent_reward

                agent_US_hits = sum(agents[i].US_hits)
                agent_THEM_hits = sum(agents[i].THEM_hits)
                ep_US_hits += agent_US_hits
                ep_THEM_hits += agent_THEM_hits
                cum_agent_US_hits[i] += agent_US_hits
                cum_agent_THEM_hits[i] += agent_THEM_hits

                if verbose:
                    # `frame` is the index of the last frame played, so the number of
                    # frames is frame + 1 (this also avoids dividing by zero when the
                    # episode ends on its very first frame).
                    print("Agent{} aggressiveness is {:.2f}".format(i, agent_tags/(frame + 1)))
                    print("Agent{} reward is {:d}".format(i, agent_reward))
                    print('US agents hit = {}'.format(agent_US_hits))
                    print('THEM agents hit = {}'.format(agent_THEM_hits))

            cum_rewards += ep_rewards
            cum_tags += ep_tags
            cum_US_hits += ep_US_hits
            cum_THEM_hits += ep_THEM_hits

            if verbose:
                print('\nStatistics in Aggregate')
                print('=======================')
                print('Total rewards gathered = {}'.format(ep_rewards))
                print('Num laser fired = {}'.format(ep_tags))
                print('Total US Hit (friendly fire) = {}'.format(ep_US_hits))
                print('Total THEM Hit = {}'.format(ep_THEM_hits))
                # 1e-7 keeps the ratio defined when nothing was hit
                print('friendly fire (%) = {0:.3f}'.format(ep_US_hits/(ep_US_hits+ep_THEM_hits+1e-7)))

            if verbose:
                print('\nStatistics by Tribe')
                print('===================')
            for i, t in enumerate(tribes):
                if t.name != 'Crazies':
                    ep_tribe_reward = sum(t.sum_rewards())
                    cum_tribe_rewards[i] += ep_tribe_reward
                    if verbose:
                        print('Tribe {} has total reward of {}'.format(t.name, ep_tribe_reward))

            # Clear per-episode history so the next episode starts from empty lists
            for i in range(num_ai_agents):
                agents[i].clear_history()

        env.close()  # Close the rendering window
        end = time.time()

        print('\nAverage Statistics in Aggregate')
        print('=================================')
        print('Total rewards gathered = {:.1f}'.format(cum_rewards/max_episodes))
        # Average reward per AI agent per episode for this (directory, checkpoint) pair
        av_agent_reward[dir_num][eps_num] = cum_rewards/max_episodes/num_ai_agents
        print(av_agent_reward[dir_num][eps_num])
        print('Num laser fired = {:.1f}'.format(cum_tags/max_episodes))
        print('Total US Hit (friendly fire) = {:.1f}'.format(cum_US_hits/max_episodes))
        print('Total THEM Hit = {:.1f}'.format(cum_THEM_hits/max_episodes))
        print('friendly fire (%) = {:.3f}'.format(cum_US_hits/(cum_US_hits+cum_THEM_hits+1e-7)))

        print('\nAverage Statistics by Tribe')
        print('=============================')
        for i, t in enumerate(tribes):
            if t.name != 'Crazies':
                print('Tribe {} has total reward of {:.1f}'.format(t.name, cum_tribe_rewards[i]/max_episodes))

        print('\nAverage Statistics by Agent')
        print('=============================')
        for i in range(num_ai_agents):
            # Normalised by max_frames, i.e. assuming every episode ran its full length
            print("Agent{} of {} aggressiveness is {:.2f}".format(
                i, agents[i].tribe, cum_agent_tags[i]/(max_episodes*max_frames)))
            print("Agent{} reward is {:.1f}".format(i, cum_agent_rewards[i]/max_episodes))
            print('US agents hit = {:.1f}'.format(cum_agent_US_hits[i]/max_episodes))
            print('THEM agents hit = {:.1f}'.format(cum_agent_THEM_hits[i]/max_episodes))

        print('Training time per epochs: {:.2f} sec'.format((end-start)/max_episodes))

print(av_agent_reward)

###### Dir = MA_models/3T-9L1R/pacifist/p-0.01/ #######
###### Trained episodes = 1000 #######
Load saved model for agent 0
Load saved model for agent 1
Load saved model for agent 2
Load saved model for agent 3
Load saved model for agent 4
Load saved model for agent 5
Load saved model for agent 6
Load saved model for agent 7
Load saved model for agent 8
Load random agent 9
..............................
Average Statistics in Aggregate
Total rewards gathered = 505.0
56.11481481481482
Num laser fired = 0.9
Total US Hit (friendly fire) = 0.4
Total THEM Hit = 1.2
friendly fire (%) = 0.265

Average Statistics by Tribe
Tribe Vikings has total reward of 66.8
Tribe Saxons has total reward of 168.3
Tribe Franks has total reward of 269.9

Average Statistics by Agent
Agent0 of Vikings aggressiveness is 0.00
Agent0 reward is 0.0
US agents hit = 0.0
THEM agents hit = 0.0
Agent1 of Vikings aggressiveness is 0.00
Agent1 reward is 66.8
US agents hit = 0.0
THEM agents hit = 0.0
Agent2 of Vikings aggres

..............................
Average Statistics in Aggregate
Total rewards gathered = 480.6
53.400000000000006
Num laser fired = 105.4
Total US Hit (friendly fire) = 25.5
Total THEM Hit = 64.1
friendly fire (%) = 0.285

Average Statistics by Tribe
Tribe Vikings has total reward of 62.1
Tribe Saxons has total reward of 141.2
Tribe Franks has total reward of 277.3

Average Statistics by Agent
Agent0 of Vikings aggressiveness is 0.00
Agent0 reward is 0.0
US agents hit = 0.0
THEM agents hit = 0.0
Agent1 of Vikings aggressiveness is 0.00
Agent1 reward is 23.0
US agents hit = 0.0
THEM agents hit = 0.0
Agent2 of Vikings aggressiveness is 0.00
Agent2 reward is 39.2
US agents hit = 0.0
THEM agents hit = 0.0
Agent3 of Saxons aggressiveness is 0.00
Agent3 reward is 31.2
US agents hit = 0.0
THEM agents hit = 0.0
Agent4 of Saxons aggressiveness is 0.00
Agent4 reward is 58.7
US agents hit = 0.0
THEM agents hit = 0.2
Agent5 of Saxons aggressiveness is 0.00
Agent5 reward is 51.3
US agents hit = 0.0


..............................
Average Statistics in Aggregate
Total rewards gathered = 511.5
56.82962962962963
Num laser fired = 0.2
Total US Hit (friendly fire) = 0.0
Total THEM Hit = 0.3
friendly fire (%) = 0.100

Average Statistics by Tribe
Tribe Vikings has total reward of 176.4
Tribe Saxons has total reward of 138.5
Tribe Franks has total reward of 196.5

Average Statistics by Agent
Agent0 of Vikings aggressiveness is 0.00
Agent0 reward is 32.1
US agents hit = 0.0
THEM agents hit = 0.0
Agent1 of Vikings aggressiveness is 0.00
Agent1 reward is 67.5
US agents hit = 0.0
THEM agents hit = 0.0
Agent2 of Vikings aggressiveness is 0.00
Agent2 reward is 76.8
US agents hit = 0.0
THEM agents hit = 0.1
Agent3 of Saxons aggressiveness is 0.00
Agent3 reward is 26.4
US agents hit = 0.0
THEM agents hit = 0.0
Agent4 of Saxons aggressiveness is 0.00
Agent4 reward is 57.5
US agents hit = 0.0
THEM agents hit = 0.0
Agent5 of Saxons aggressiveness is 0.00
Agent5 reward is 54.6
US agents hit = 0.0
THE

..............................
Average Statistics in Aggregate
Total rewards gathered = 537.7
59.74074074074073
Num laser fired = 0.1
Total US Hit (friendly fire) = 0.1
Total THEM Hit = 0.2
friendly fire (%) = 0.222

Average Statistics by Tribe
Tribe Vikings has total reward of 145.5
Tribe Saxons has total reward of 193.3
Tribe Franks has total reward of 198.9

Average Statistics by Agent
Agent0 of Vikings aggressiveness is 0.00
Agent0 reward is 50.4
US agents hit = 0.0
THEM agents hit = 0.0
Agent1 of Vikings aggressiveness is 0.00
Agent1 reward is 23.1
US agents hit = 0.0
THEM agents hit = 0.0
Agent2 of Vikings aggressiveness is 0.00
Agent2 reward is 72.0
US agents hit = 0.0
THEM agents hit = 0.0
Agent3 of Saxons aggressiveness is 0.00
Agent3 reward is 80.3
US agents hit = 0.0
THEM agents hit = 0.0
Agent4 of Saxons aggressiveness is 0.00
Agent4 reward is 56.1
US agents hit = 0.0
THEM agents hit = 0.0
Agent5 of Saxons aggressiveness is 0.00
Agent5 reward is 56.9
US agents hit = 0.0
THE

..............................
Average Statistics in Aggregate
Total rewards gathered = 524.9
58.32592592592592
Num laser fired = 0.0
Total US Hit (friendly fire) = 0.0
Total THEM Hit = 0.0
friendly fire (%) = 0.000

Average Statistics by Tribe
Tribe Vikings has total reward of 138.7
Tribe Saxons has total reward of 173.3
Tribe Franks has total reward of 212.9

Average Statistics by Agent
Agent0 of Vikings aggressiveness is 0.00
Agent0 reward is 43.4
US agents hit = 0.0
THEM agents hit = 0.0
Agent1 of Vikings aggressiveness is 0.00
Agent1 reward is 27.8
US agents hit = 0.0
THEM agents hit = 0.0
Agent2 of Vikings aggressiveness is 0.00
Agent2 reward is 67.5
US agents hit = 0.0
THEM agents hit = 0.0
Agent3 of Saxons aggressiveness is 0.00
Agent3 reward is 46.7
US agents hit = 0.0
THEM agents hit = 0.0
Agent4 of Saxons aggressiveness is 0.00
Agent4 reward is 60.3
US agents hit = 0.0
THEM agents hit = 0.0
Agent5 of Saxons aggressiveness is 0.00
Agent5 reward is 66.3
US agents hit = 0.0
THE

## Average Agent Reward - Pacifists

In [17]:
# Print one row per model directory; the columns follow the `episodes` checkpoints.
# The loop variable is deliberately not called `reward`, so that it does not overwrite
# the notebook-level `reward` left behind by env.step() in the evaluation cell.
for checkpoint_rewards in av_agent_reward:
    print(checkpoint_rewards)

[56.11481481481482, 44.03703703703704, 52.17407407407407, 53.33703703703704, 59.35555555555556]
[53.400000000000006, 56.49629629629629, 58.662962962962965, 62.89259259259259, 58.770370370370365]
[56.82962962962963, 62.48148148148149, 60.84444444444445, 61.129629629629626, 62.829629629629636]
[59.74074074074073, 62.67777777777778, 62.94814814814814, 65.3851851851852, 63.459259259259255]
[58.32592592592592, 63.048148148148144, 63.951851851851856, 63.15555555555555, 64.91111111111111]


In [18]:
import pickle
import numpy as np

import torch
from torch.autograd import Variable

culture = "no_fragging"

# One directory of saved models per penalty setting (the p-<value> suffix)
dir_names = [
    "MA_models/3T-9L1R/no_fragging/p-0.01/",
    "MA_models/3T-9L1R/no_fragging/p-0.1/",
    "MA_models/3T-9L1R/no_fragging/p-1.0/",
    "MA_models/3T-9L1R/no_fragging/p-10.0/",
    "MA_models/3T-9L1R/no_fragging/p-100.0/",
]
episodes = [1000, 2000, 3000, 4000, 5000]   # training checkpoints (episode counts) to evaluate

# Result grid: one row per entry of dir_names, one column per entry of episodes.
# The evaluation loop below stores the average per-agent reward in each slot.
av_agent_reward = [[0] * len(episodes) for _ in dir_names]

# Population: 3 teams of 3 AI agents each plus 1 random agent = 10 agents
num_ai_agents, num_rdn_agents = 9, 1
num_agents = num_ai_agents + num_rdn_agents

# Placeholders; the evaluation loop rebinds all three before using them
agents, actions, tags = [], [], []

# Environment settings
num_actions = 8        # There are 8 actions defined in Gathering
render = False

# Evaluation constants
max_frames = 1000      # frame cap per game episode
max_episodes = 30      # game episodes played (and averaged over) per checkpoint
num_frames = 4
verbose = False        # True prints per-episode statistics as well as the averages

def unpack_env_obs(env_obs):
    """
    Turn the raw observations returned by GatheringEnv into Policy-ready tensors.

    env_obs holds one flat observation per agent (800 elements each for this game:
    4 layers of 10x20 pixel frames laid out as (viewbox_width, viewbox_depth, 4)).

    Each observation is converted to a tensor, viewed as (1, 10, 20, layers) and then
    permuted into the stacked-frame layout accepted by the CNN Policy:
    (batch_idx, in_channel, width, height)

    Returns a list with one tensor per agent, in the same order as env_obs.
    """
    return [
        torch.Tensor(agent_view).view(1, 10, 20, -1).permute(0, 3, 1, 2)
        for agent_view in env_obs
    ]


"""
For now, we do not implement LSTM            
# LSTM Change: Need to cycle hx and cx thru function
def select_action(model, state, lstm_hc, cuda):
    hx , cx = lstm_hc 
    num_frames, height, width = state.shape
    state = torch.FloatTensor(state.reshape(-1, num_frames, height, width))

    if cuda:
        state = state.cuda()

    probs, value, (hx, cx) = model((Variable(state), (hx, cx)))

    m = torch.distributions.Categorical(probs)
    action = m.sample()
    log_prob = m.log_prob(action)
    # LSTM Change: Need to cycle hx and cx thru function
    return action.data[0], log_prob, value, (hx, cx)
"""

def select_action(model, obs, cuda):
    """
    Sample one action from the agent's policy.

    obs is expected to be a tensor of stacked frames with dimensions
    (batch_idx, in_channel, width, height); it is moved to the GPU first when
    cuda is truthy.

    model (the agent's Policy) is called on obs and its output is used as the
    probabilities of a torch.distributions.Categorical, from which the action
    is drawn.

    Returns a tuple (action, log_prob): the sampled action as a plain Python
    number and the log-probability of that action as a tensor.
    """
    policy_input = obs.cuda() if cuda else obs

    action_dist = torch.distributions.Categorical(model(policy_input))
    sampled = action_dist.sample()

    return sampled.item(), action_dist.log_prob(sampled)


def load_info(agents, narrate=False):
    """
    Hand each agent its entry of the latest step info.

    Reads three notebook-level names rather than taking them as arguments:
    num_agents (how many agents to update), info (indexed per agent, as returned
    by env.step in the evaluation loop) and frame (used only in the narration text).

    With narrate=True, also prints a line whenever an agent is tagged or fires
    its laser, including its US/THEM hit counts.
    """
    for idx in range(num_agents):
        member = agents[idx]
        member.load_info(info[idx])

        if not narrate:
            continue

        if member.tagged:
            print('frame {}, agent{} is tagged'.format(frame, idx))
        if member.laser_fired:
            print('frame {}, agent{} fires its laser'.format(frame, idx))
            print('and hit {} US and {} THEM'.format(member.US_hit, member.THEM_hit))
    return

# Evaluate every saved checkpoint: for each model directory and each training-episode
# count, load the 9 AI agents plus 1 random agent, play max_episodes games and report
# the averaged statistics. av_agent_reward[dir_num][eps_num] receives the average
# reward per AI agent for that checkpoint.
for dir_num, dir_name in enumerate(dir_names):
    print("###### Dir = {} #######".format(dir_name))

    for eps_num, eps in enumerate(episodes):
        print("###### Trained episodes = {} #######".format(eps))

        # Load models for AI agents
        agents = [[] for i in range(num_ai_agents)]
        for i in range(num_ai_agents):
            model_file = dir_name + 'MA{}_Gather__ep{}.p'.format(i, eps)
            try:
                with open(model_file, 'rb') as f:
                    # Model file includes both model and optim parameters; only the
                    # model (first element) is kept.
                    # NOTE(review): pickle.load can execute arbitrary code - only load
                    # checkpoint files that come from a trusted source.
                    saved_model = pickle.load(f)
                    agents[i], _ = saved_model
                    print("Load saved model for agent {}".format(i))
            except OSError:
                print('Model file not found.')
                raise

        # Load random agents
        for i in range(num_ai_agents, num_agents):
            print("Load random agent {}".format(i))
            agents.append(Rdn_Policy())

        # Establish tribal association
        tribes = []
        tribes.append(Tribe(name='Vikings', color='blue', culture=culture,
                            agents=[agents[0], agents[1], agents[2]]))
        tribes.append(Tribe(name='Saxons', color='red', culture=culture,
                            agents=[agents[3], agents[4], agents[5]]))
        tribes.append(Tribe(name='Franks', color='purple', culture=culture,
                            agents=[agents[6], agents[7], agents[8]]))
        tribes.append(Tribe(name='Crazies', color='yellow', agents=[agents[9]]))   # random agents are crazy!!!

        # 10 agents (9 AI + 1 random) in 4 tribes, using the map defined in default.txt
        # (color/tribe attributes are presumably set on the agents by Tribe - verify in tribes_model)
        agent_colors = [agent.color for agent in agents]
        agent_tribes = [agent.tribe for agent in agents]

        env = GatheringEnv(n_agents=num_agents, agent_colors=agent_colors,
                           agent_tribes=agent_tribes, map_name='default')

        # Used to accumulate episode stats for averaging
        cum_rewards = 0
        cum_tags = 0
        cum_US_hits = 0
        cum_THEM_hits = 0
        cum_agent_rewards = [0 for agent in agents]
        cum_agent_tags = [0 for agent in agents]
        cum_agent_US_hits = [0 for agent in agents]
        cum_agent_THEM_hits = [0 for agent in agents]
        # One slot per non-random tribe. It is indexed below with the tribe's position
        # in `tribes`, which is only valid because 'Crazies' is the last tribe.
        # Compare names with != (value equality), not `is not` (object identity).
        cum_tribe_rewards = [0 for t in tribes if t.name != 'Crazies']

        cuda = False
        start = time.time()

        for ep in range(max_episodes):

            print('.', end='')  # To show progress

            # Initialize AI and random agent data
            actions = [0 for i in range(num_agents)]
            tags = [0 for i in range(num_agents)]
            US_hits = [0 for i in range(num_agents)]
            THEM_hits = [0 for i in range(num_agents)]

            env_obs = env.reset()  # Environment returns observations

            # Unpack observations into data structure compatible with agent Policy
            agents_obs = unpack_env_obs(env_obs)

            for i in range(num_ai_agents):    # Reset agent info - laser tag statistics
                agents[i].reset_info()

            if render:
                env.render()
                time.sleep(1/15)  # Change speed of video rendering

            # For now, observations are not stacked over time and no LSTM is used.

            for frame in range(max_frames):

                for i in range(num_ai_agents):    # For AI agents
                    actions[i], _ = select_action(agents[i], agents_obs[i], cuda=cuda)
                    # select_action already returns a plain number (.item()), so compare
                    # by value; `is 6` only worked through CPython's small-int caching.
                    if actions[i] == 6:
                        tags[i] += 1   # record a tag for assessing aggressiveness

                for i in range(num_ai_agents, num_agents):   # For random agents
                    actions[i] = agents[i].select_action(agents_obs[i])
                    if actions[i] == 6:
                        tags[i] += 1   # record a tag for assessing aggressiveness

                # Perform step
                env_obs, reward, done, info = env.step(actions)

                for i in range(num_ai_agents):
                    agents[i].rewards.append(reward[i])  # Stack rewards

                # Unpack observations into data structure compatible with agent Policy
                agents_obs = unpack_env_obs(env_obs)
                # load_info reads the `info` and `frame` globals assigned just above
                load_info(agents, narrate=False)

                for i in range(num_agents):
                    US_hits[i] += agents[i].US_hit
                    THEM_hits[i] += agents[i].THEM_hit

                if render:
                    env.render()
                    time.sleep(1/15)  # Change speed of video rendering

                if any(done):
                    print("Done after {} frames".format(frame))
                    break

            # Collect this episode's statistics for the AI agents
            ep_rewards = 0
            ep_tags = 0
            ep_US_hits = 0
            ep_THEM_hits = 0

            if verbose:
                print('\nStatistics by Agent')
                print('===================')
            for i in range(num_ai_agents):
                agent_tags = sum(agents[i].tag_hist)
                ep_tags += agent_tags
                cum_agent_tags[i] += agent_tags

                agent_reward = sum(agents[i].rewards)
                ep_rewards += agent_reward
                cum_agent_rewards[i] += agent_reward

                agent_US_hits = sum(agents[i].US_hits)
                agent_THEM_hits = sum(agents[i].THEM_hits)
                ep_US_hits += agent_US_hits
                ep_THEM_hits += agent_THEM_hits
                cum_agent_US_hits[i] += agent_US_hits
                cum_agent_THEM_hits[i] += agent_THEM_hits

                if verbose:
                    # `frame` is the index of the last frame played, so the number of
                    # frames is frame + 1 (this also avoids dividing by zero when the
                    # episode ends on its very first frame).
                    print("Agent{} aggressiveness is {:.2f}".format(i, agent_tags/(frame + 1)))
                    print("Agent{} reward is {:d}".format(i, agent_reward))
                    print('US agents hit = {}'.format(agent_US_hits))
                    print('THEM agents hit = {}'.format(agent_THEM_hits))

            cum_rewards += ep_rewards
            cum_tags += ep_tags
            cum_US_hits += ep_US_hits
            cum_THEM_hits += ep_THEM_hits

            if verbose:
                print('\nStatistics in Aggregate')
                print('=======================')
                print('Total rewards gathered = {}'.format(ep_rewards))
                print('Num laser fired = {}'.format(ep_tags))
                print('Total US Hit (friendly fire) = {}'.format(ep_US_hits))
                print('Total THEM Hit = {}'.format(ep_THEM_hits))
                # 1e-7 keeps the ratio defined when nothing was hit
                print('friendly fire (%) = {0:.3f}'.format(ep_US_hits/(ep_US_hits+ep_THEM_hits+1e-7)))

            if verbose:
                print('\nStatistics by Tribe')
                print('===================')
            for i, t in enumerate(tribes):
                if t.name != 'Crazies':
                    ep_tribe_reward = sum(t.sum_rewards())
                    cum_tribe_rewards[i] += ep_tribe_reward
                    if verbose:
                        print('Tribe {} has total reward of {}'.format(t.name, ep_tribe_reward))

            # Clear per-episode history so the next episode starts from empty lists
            for i in range(num_ai_agents):
                agents[i].clear_history()

        env.close()  # Close the rendering window
        end = time.time()

        print('\nAverage Statistics in Aggregate')
        print('=================================')
        print('Total rewards gathered = {:.1f}'.format(cum_rewards/max_episodes))
        # Average reward per AI agent per episode for this (directory, checkpoint) pair
        av_agent_reward[dir_num][eps_num] = cum_rewards/max_episodes/num_ai_agents
        print(av_agent_reward[dir_num][eps_num])
        print('Num laser fired = {:.1f}'.format(cum_tags/max_episodes))
        print('Total US Hit (friendly fire) = {:.1f}'.format(cum_US_hits/max_episodes))
        print('Total THEM Hit = {:.1f}'.format(cum_THEM_hits/max_episodes))
        print('friendly fire (%) = {:.3f}'.format(cum_US_hits/(cum_US_hits+cum_THEM_hits+1e-7)))

        print('\nAverage Statistics by Tribe')
        print('=============================')
        for i, t in enumerate(tribes):
            if t.name != 'Crazies':
                print('Tribe {} has total reward of {:.1f}'.format(t.name, cum_tribe_rewards[i]/max_episodes))

        print('\nAverage Statistics by Agent')
        print('=============================')
        for i in range(num_ai_agents):
            # Normalised by max_frames, i.e. assuming every episode ran its full length
            print("Agent{} of {} aggressiveness is {:.2f}".format(
                i, agents[i].tribe, cum_agent_tags[i]/(max_episodes*max_frames)))
            print("Agent{} reward is {:.1f}".format(i, cum_agent_rewards[i]/max_episodes))
            print('US agents hit = {:.1f}'.format(cum_agent_US_hits[i]/max_episodes))
            print('THEM agents hit = {:.1f}'.format(cum_agent_THEM_hits[i]/max_episodes))

        print('Training time per epochs: {:.2f} sec'.format((end-start)/max_episodes))

print(av_agent_reward)

###### Dir = MA_models/3T-9L1R/no_fragging/p-0.01/ #######
###### Trained episodes = 1000 #######
Load saved model for agent 0
Load saved model for agent 1
Load saved model for agent 2
Load saved model for agent 3
Load saved model for agent 4
Load saved model for agent 5
Load saved model for agent 6
Load saved model for agent 7
Load saved model for agent 8
Load random agent 9
..............................
Average Statistics in Aggregate
Total rewards gathered = 423.5
47.05555555555556
Num laser fired = 205.5
Total US Hit (friendly fire) = 31.3
Total THEM Hit = 128.6
friendly fire (%) = 0.196

Average Statistics by Tribe
Tribe Vikings has total reward of 196.4
Tribe Saxons has total reward of 122.4
Tribe Franks has total reward of 104.7

Average Statistics by Agent
Agent0 of Vikings aggressiveness is 0.00
Agent0 reward is 22.9
US agents hit = 0.0
THEM agents hit = 0.0
Agent1 of Vikings aggressiveness is 0.00
Agent1 reward is 13.7
US agents hit = 0.0
THEM agents hit = 0.0
Agent2 of Viki

..............................
Average Statistics in Aggregate
Total rewards gathered = 393.6
43.733333333333334
Num laser fired = 371.7
Total US Hit (friendly fire) = 27.5
Total THEM Hit = 131.6
friendly fire (%) = 0.173

Average Statistics by Tribe
Tribe Vikings has total reward of 22.4
Tribe Saxons has total reward of 50.1
Tribe Franks has total reward of 321.1

Average Statistics by Agent
Agent0 of Vikings aggressiveness is 0.00
Agent0 reward is 10.6
US agents hit = 0.1
THEM agents hit = 0.0
Agent1 of Vikings aggressiveness is 0.00
Agent1 reward is 0.2
US agents hit = 0.0
THEM agents hit = 0.0
Agent2 of Vikings aggressiveness is 0.00
Agent2 reward is 11.6
US agents hit = 0.0
THEM agents hit = 0.1
Agent3 of Saxons aggressiveness is 0.00
Agent3 reward is 17.6
US agents hit = 0.0
THEM agents hit = 0.0
Agent4 of Saxons aggressiveness is 0.00
Agent4 reward is 16.5
US agents hit = 0.1
THEM agents hit = 0.3
Agent5 of Saxons aggressiveness is 0.00
Agent5 reward is 16.0
US agents hit = 0.3


..............................
Average Statistics in Aggregate
Total rewards gathered = 433.6
48.17407407407407
Num laser fired = 160.2
Total US Hit (friendly fire) = 30.1
Total THEM Hit = 90.7
friendly fire (%) = 0.249

Average Statistics by Tribe
Tribe Vikings has total reward of 76.7
Tribe Saxons has total reward of 102.9
Tribe Franks has total reward of 253.9

Average Statistics by Agent
Agent0 of Vikings aggressiveness is 0.00
Agent0 reward is 22.2
US agents hit = 0.0
THEM agents hit = 0.0
Agent1 of Vikings aggressiveness is 0.00
Agent1 reward is 28.8
US agents hit = 0.0
THEM agents hit = 0.0
Agent2 of Vikings aggressiveness is 0.00
Agent2 reward is 25.7
US agents hit = 0.0
THEM agents hit = 0.0
Agent3 of Saxons aggressiveness is 0.00
Agent3 reward is 22.0
US agents hit = 0.0
THEM agents hit = 0.2
Agent4 of Saxons aggressiveness is 0.00
Agent4 reward is 44.6
US agents hit = 0.0
THEM agents hit = 0.0
Agent5 of Saxons aggressiveness is 0.00
Agent5 reward is 36.3
US agents hit = 0.1


..............................
Average Statistics in Aggregate
Total rewards gathered = 530.6
58.959259259259255
Num laser fired = 0.6
Total US Hit (friendly fire) = 0.1
Total THEM Hit = 0.6
friendly fire (%) = 0.136

Average Statistics by Tribe
Tribe Vikings has total reward of 112.2
Tribe Saxons has total reward of 187.6
Tribe Franks has total reward of 230.9

Average Statistics by Agent
Agent0 of Vikings aggressiveness is 0.00
Agent0 reward is 0.0
US agents hit = 0.0
THEM agents hit = 0.0
Agent1 of Vikings aggressiveness is 0.00
Agent1 reward is 47.6
US agents hit = 0.0
THEM agents hit = 0.0
Agent2 of Vikings aggressiveness is 0.00
Agent2 reward is 64.6
US agents hit = 0.0
THEM agents hit = 0.0
Agent3 of Saxons aggressiveness is 0.00
Agent3 reward is 58.2
US agents hit = 0.0
THEM agents hit = 0.0
Agent4 of Saxons aggressiveness is 0.00
Agent4 reward is 68.4
US agents hit = 0.0
THEM agents hit = 0.1
Agent5 of Saxons aggressiveness is 0.00
Agent5 reward is 61.0
US agents hit = 0.0
THE

..............................
Average Statistics in Aggregate
Total rewards gathered = 545.5
60.60740740740741
Num laser fired = 0.1
Total US Hit (friendly fire) = 0.1
Total THEM Hit = 0.3
friendly fire (%) = 0.273

Average Statistics by Tribe
Tribe Vikings has total reward of 132.4
Tribe Saxons has total reward of 193.6
Tribe Franks has total reward of 219.4

Average Statistics by Agent
Agent0 of Vikings aggressiveness is 0.00
Agent0 reward is 29.5
US agents hit = 0.0
THEM agents hit = 0.0
Agent1 of Vikings aggressiveness is 0.00
Agent1 reward is 54.7
US agents hit = 0.0
THEM agents hit = 0.0
Agent2 of Vikings aggressiveness is 0.00
Agent2 reward is 48.2
US agents hit = 0.0
THEM agents hit = 0.0
Agent3 of Saxons aggressiveness is 0.00
Agent3 reward is 51.1
US agents hit = 0.0
THEM agents hit = 0.0
Agent4 of Saxons aggressiveness is 0.00
Agent4 reward is 81.6
US agents hit = 0.0
THEM agents hit = 0.0
Agent5 of Saxons aggressiveness is 0.00
Agent5 reward is 60.9
US agents hit = 0.0
THE

## Average Agent Reward - No Fratricide

In [19]:
# Show the table of average per-agent rewards: one printed row per model
# directory, one column per trained-episode checkpoint.
for reward in av_agent_reward:
    print(reward)

[47.05555555555556, 45.15555555555555, 48.58888888888889, 49.54814814814815, 50.096296296296295]
[43.733333333333334, 48.8962962962963, 51.88148148148148, 51.64074074074074, 53.91851851851852]
[48.17407407407407, 56.44814814814815, 56.66296296296296, 60.56296296296297, 60.84814814814815]
[58.959259259259255, 60.56666666666667, 62.70370370370371, 66.50740740740741, 62.17407407407408]
[60.60740740740741, 63.70370370370371, 63.29629629629629, 64.35925925925926, 62.68518518518518]


In [20]:
import pickle
import numpy as np

import torch
from torch.autograd import Variable

# ---- Evaluation sweep configuration: "cooperative" culture ----

# One saved-model directory per `cf` setting of the cooperative runs
dir_names = [
    "MA_models/3T-9L1R/cooperative/cf0.01/",
    "MA_models/3T-9L1R/cooperative/cf0.1/",
    "MA_models/3T-9L1R/cooperative/cf1.0/",
    "MA_models/3T-9L1R/cooperative/cf5.0/",
    "MA_models/3T-9L1R/cooperative/cf10/",
    "MA_models/3T-9L1R/cooperative/cf15/",
    "MA_models/3T-9L1R/cooperative/cf20/",
    "MA_models/3T-9L1R/cooperative/cf25/",
    "MA_models/3T-9L1R/cooperative/cf50/",
]

# Training checkpoints (number of trained episodes) whose model files are loaded
episodes = [1000, 2000, 3000, 4000, 5000]
culture = "cooperative"

# Result table: av_agent_reward[directory index][checkpoint index]
av_agent_reward = [[0] * len(episodes) for _ in dir_names]

# Population: 3 teams of 3 AI agents each, plus 1 random agent (10 in total)
num_ai_agents = 9
num_rdn_agents = 1
num_agents = num_ai_agents + num_rdn_agents

# Containers for the agents and their per-frame data
# (agents will form their own Class later on)
agents, actions, tags = [], [], []

# Environment settings
render = False
num_actions = 8    # Gathering defines 8 actions

# Run constants
num_frames = 4
max_episodes = 30
max_frames = 1000
verbose = False

def unpack_env_obs(env_obs):
    """
    Convert raw GatheringEnv observations into tensors the agents' Policy accepts.

    Gathering is a partially-observable Markov Game, so env_obs carries one flat
    observation per agent (described upstream as 800 values: a 10x20 pixel view box
    with 4 layers, stored as (viewbox_width, viewbox_depth, 4)).

    Every observation is reshaped to (1, 10, 20, layers) and then permuted into the
    stacked-frame layout (batch_idx, in_channel, width, height).

    Returns a list holding one such tensor per agent, in the order of env_obs.
    """
    return [
        torch.Tensor(agent_view)     # flat observation -> tensor
        .view(1, 10, 20, -1)         # environment layout: (1, width, depth, layers)
        .permute(0, 3, 1, 2)         # Policy (3-layer CNN) layout: (1, layers, width, depth)
        for agent_view in env_obs
    ]


"""
For now, we do not implement LSTM            
# LSTM Change: Need to cycle hx and cx thru function
def select_action(model, state, lstm_hc, cuda):
    hx , cx = lstm_hc 
    num_frames, height, width = state.shape
    state = torch.FloatTensor(state.reshape(-1, num_frames, height, width))

    if cuda:
        state = state.cuda()

    probs, value, (hx, cx) = model((Variable(state), (hx, cx)))

    m = torch.distributions.Categorical(probs)
    action = m.sample()
    log_prob = m.log_prob(action)
    # LSTM Change: Need to cycle hx and cx thru function
    return action.data[0], log_prob, value, (hx, cx)
"""

def select_action(model, obs, cuda):
    """
    Sample one action from an agent's policy.

    obs is expected to be a tensor of stacked frames laid out as
    (batch_idx, in_channel, width, height). It is moved to the GPU when `cuda`
    is truthy and then fed to `model` (the agent's Policy), whose output is used
    as the probabilities of a torch.distributions.Categorical over the actions.

    Returns a tuple (action, log_prob): the sampled action as a plain Python
    number (via .item()) and the log-probability of that sample under the
    distribution.
    """
    policy_input = obs.cuda() if cuda else obs

    # The policy output parameterises a categorical distribution over actions
    action_dist = torch.distributions.Categorical(model(policy_input))
    sampled = action_dist.sample()

    return sampled.item(), action_dist.log_prob(sampled)


def load_info(agents, narrate=False, info=None, frame=None):
    """
    Hand each agent its info record from the environment, optionally narrating events.

    For every agent in `agents`, calls agents[i].load_info(info[i]). With
    narrate=True it also prints a line when the agent's `tagged` flag is set and
    when its `laser_fired` flag is set (including its US_hit / THEM_hit values).

    Args:
        agents: sequence of agent objects exposing load_info(), tagged,
            laser_fired, US_hit and THEM_hit.
        narrate: when True, print tag / laser events for each agent.
        info: per-agent info records, indexed by the agent's position in
            `agents`. When omitted, the notebook-level `info` variable (assigned
            from env.step in the evaluation loop) is used, so existing calls of
            the form load_info(agents, narrate=...) keep working.
        frame: frame number shown in the narration. When omitted, the
            notebook-level `frame` loop variable is used if it exists.

    Returns:
        None
    """
    notebook_vars = globals()

    # Backward-compatible fallbacks: older call sites rely on notebook globals
    if info is None:
        info = notebook_vars['info']
    if frame is None:
        frame = notebook_vars.get('frame')

    # Iterate over the agents actually passed in rather than the global num_agents
    for i, agent in enumerate(agents):
        agent.load_info(info[i])
        if narrate:
            if agent.tagged:
                print('frame {}, agent{} is tagged'.format(frame,i))
            if agent.laser_fired:
                print('frame {}, agent{} fires its laser'.format(frame,i))
                print('and hit {} US and {} THEM'.format(agent.US_hit, agent.THEM_hit))
    return

# Evaluation sweep.
# For every model directory and every training checkpoint: load the saved
# policies of the AI agents plus one random agent, group them into tribes, play
# `max_episodes` episodes of at most `max_frames` frames each, and print the
# statistics averaged over those episodes. The average reward per AI agent is
# stored in av_agent_reward[dir_num][eps_num].
for dir_num, dir_name in enumerate(dir_names):
    print ("###### Dir = {} #######".format(dir_name))

    for eps_num, eps in enumerate(episodes):
        print ("###### Trained episodes = {} #######".format(eps))

        # Load models for AI agents (one placeholder slot per agent, filled below)
        agents= [[] for i in range(num_ai_agents)]
        for i in range(num_ai_agents):
            model_file = dir_name+'MA{}_Gather__ep{}.p'.format(i,eps)
            try:
                with open(model_file, 'rb') as f:
                    # Model File include both model and optim parameters
                    # NOTE(review): pickle.load executes arbitrary code from the
                    # file - only load model files from a trusted source.
                    saved_model = pickle.load(f)
                    agents[i], _ = saved_model
                    print("Load saved model for agent {}".format(i))
            except OSError:
                print('Model file not found.')
                raise

        # Load random agents
        for i in range(num_ai_agents,num_agents):
            print("Load random agent {}".format(i))
            agents.append(Rdn_Policy())

        # Establish tribal association
        tribes = []
        tribes.append(Tribe(name='Vikings',color='blue', culture=culture, \
                    agents=[agents[0], agents[1], agents[2]]))
        tribes.append(Tribe(name='Saxons', color='red', culture=culture, \
                    agents=[agents[3], agents[4], agents[5]]))
        tribes.append(Tribe(name='Franks', color='purple', culture=culture, \
                    agents=[agents[6], agents[7], agents[8]]))
        tribes.append(Tribe(name='Crazies', color='yellow', agents=[agents[9]]))   # random agents are crazy!!!


        # 10 agents in 4 tribes, used map defined in default.txt
        agent_colors = [agent.color for agent in agents]
        agent_tribes = [agent.tribe for agent in agents]

        env = GatheringEnv(n_agents=num_agents,agent_colors=agent_colors, agent_tribes=agent_tribes, \
                       map_name='default')

        # Used to accumulate episode stats for averaging
        cum_rewards = 0
        cum_tags = 0
        cum_US_hits = 0
        cum_THEM_hits = 0
        cum_agent_rewards = [0 for agent in agents]
        cum_agent_tags = [0 for agent in agents]
        cum_agent_US_hits = [0 for agent in agents]
        cum_agent_THEM_hits = [0 for agent in agents]
        # One slot per non-random tribe. It is indexed below by the tribe's
        # position in `tribes`, which lines up only because 'Crazies' is last.
        # Compare names by value (!=): `is not` on a string literal only works
        # through interning and raises SyntaxWarning on Python 3.8+.
        cum_tribe_rewards = [0 for t in tribes if t.name != 'Crazies']

        cuda = False
        start = time.time()

        for ep in range(max_episodes):

            print('.', end='')  # To show progress

            # Initialize AI and random agent data
            actions = [0 for i in range(num_agents)]
            tags = [0 for i in range(num_agents)]
            US_hits = [0 for i in range(num_agents)]
            THEM_hits = [0 for i in range(num_agents)]

            env_obs = env.reset()  # Environment return observations

            # Unpack observations into data structure compatible with agent Policy
            agents_obs = unpack_env_obs(env_obs)

            for i in range(num_ai_agents):    # Reset agent info - laser tag statistics
                agents[i].reset_info()

            if render:
                env.render()
                time.sleep(1/15)  # Change speed of video rendering

            # For debugging: print(len(agents_obs)); print(agents_obs[0].shape)
            # For now, we do not stack observations, and we do not implement LSTM
            # (no per-episode reset of LSTM hidden units hx/cx is needed).

            for frame in range(max_frames):

                for i in range(num_ai_agents):    # For AI agents
                    actions[i], _ = select_action(agents[i], agents_obs[i], cuda=cuda)
                    # select_action already returns a plain number; compare by
                    # value, not identity (`is 6` depends on small-int caching)
                    if actions[i] == 6:
                        tags[i] += 1   # record a tag for accessing aggressiveness

                for i in range(num_ai_agents, num_agents):   # For random agents
                    actions[i] = agents[i].select_action(agents_obs[i])
                    if actions[i] == 6:
                        tags[i] += 1   # record a tag for accessing aggressiveness

                # if frame % 10 == 0:
                #     print (actions)

                # Perform step
                env_obs, reward, done, info = env.step(actions)

                # For debugging: print(env_obs); print(reward); print(done)

                for i in range(num_ai_agents):
                    agents[i].rewards.append(reward[i])  # Stack rewards


                # Unpack observations into data structure compatible with agent Policy
                agents_obs = unpack_env_obs(env_obs)
                # Load agent info for every agent; load_info picks up the `info`
                # (and `frame`) assigned just above from the notebook namespace
                load_info(agents, narrate=False)

                for i in range(num_agents):
                    US_hits[i] += agents[i].US_hit
                    THEM_hits[i] += agents[i].THEM_hit

                # For now, we do not stack observation, may come in handy later on

                if render:
                    env.render()
                    time.sleep(1/15)  # Change speed of video rendering

                if any(done):
                    print("Done after {} frames".format(frame))
                    break

            # Print out statistics of AI agents
            ep_rewards = 0
            ep_tags = 0
            ep_US_hits = 0
            ep_THEM_hits = 0

            if verbose:
                print ('\nStatistics by Agent')
                print ('===================')
            for i in range(num_ai_agents):
                agent_tags = sum(agents[i].tag_hist)
                ep_tags += agent_tags
                cum_agent_tags[i] += agent_tags

                agent_reward = sum(agents[i].rewards)
                ep_rewards += agent_reward
                cum_agent_rewards[i] += agent_reward

                agent_US_hits = sum(agents[i].US_hits)
                agent_THEM_hits = sum(agents[i].THEM_hits)
                ep_US_hits += agent_US_hits
                ep_THEM_hits += agent_THEM_hits
                cum_agent_US_hits[i] += agent_US_hits
                cum_agent_THEM_hits[i] += agent_THEM_hits

                if verbose:
                    print ("Agent{} aggressiveness is {:.2f}".format(i, agent_tags/frame))
                    print ("Agent{} reward is {:d}".format(i, agent_reward))
                    print('US agents hit = {}'.format(agent_US_hits))
                    print('THEM agents hit = {}'.format(agent_THEM_hits ))

            cum_rewards += ep_rewards
            cum_tags += ep_tags
            cum_US_hits += ep_US_hits
            cum_THEM_hits += ep_THEM_hits

            if verbose:
                print ('\nStatistics in Aggregate')
                print ('=======================')
                print ('Total rewards gathered = {}'.format(ep_rewards))
                print ('Num laser fired = {}'.format(ep_tags))
                print ('Total US Hit (friendly fire) = {}'.format(ep_US_hits))
                print ('Total THEM Hit = {}'.format(ep_THEM_hits))
                # 1e-7 guards against division by zero when nothing was hit
                print ('friendly fire (%) = {0:.3f}'.format(ep_US_hits/(ep_US_hits+ep_THEM_hits+1e-7)))

            if verbose:
                print ('\nStatistics by Tribe')
                print ('===================')
            for i, t in enumerate(tribes):
                if t.name != 'Crazies':
                    ep_tribe_reward = sum(t.sum_rewards())
                    cum_tribe_rewards[i] += ep_tribe_reward
                    if verbose:
                        print ('Tribe {} has total reward of {}'.format(t.name, ep_tribe_reward))

            # Drop the per-episode history so the next episode starts clean
            for i in range(num_ai_agents):
                agents[i].clear_history()

        env.close()  # Close the rendering window
        end = time.time()

        print ('\nAverage Statistics in Aggregate')
        print ('=================================')
        print ('Total rewards gathered = {:.1f}'.format(cum_rewards/max_episodes))
        # Average reward per AI agent per episode for this directory/checkpoint
        av_agent_reward[dir_num][eps_num] = cum_rewards/max_episodes/num_ai_agents
        print (av_agent_reward[dir_num][eps_num])
        print ('Num laser fired = {:.1f}'.format(cum_tags/max_episodes))
        print ('Total US Hit (friendly fire) = {:.1f}'.format(cum_US_hits/max_episodes))
        print ('Total THEM Hit = {:.1f}'.format(cum_THEM_hits/max_episodes))
        print ('friendly fire (%) = {:.3f}'.format(cum_US_hits/(cum_US_hits+cum_THEM_hits+1e-7)))

        print ('\nAverage Statistics by Tribe')
        print ('=============================')
        for i, t in enumerate(tribes):
            if t.name != 'Crazies':
                print ('Tribe {} has total reward of {:.1f}'.format(t.name, cum_tribe_rewards[i]/max_episodes))

        print ('\nAverage Statistics by Agent')
        print ('=============================')
        for i in range(num_ai_agents):
            # Aggressiveness = tags per frame, assuming every episode ran max_frames
            print ("Agent{} of {} aggressiveness is {:.2f}".format(i, agents[i].tribe, \
                                                           cum_agent_tags[i]/(max_episodes*max_frames)))
            print ("Agent{} reward is {:.1f}".format(i, cum_agent_rewards[i]/max_episodes))
            print('US agents hit = {:.1f}'.format(cum_agent_US_hits[i]/max_episodes))
            print('THEM agents hit = {:.1f}'.format(cum_agent_THEM_hits[i]/max_episodes))

        print('Training time per epochs: {:.2f} sec'.format((end-start)/max_episodes))

print (av_agent_reward)

###### Dir = MA_models/3T-9L1R/cooperative/cf0.01/ #######
###### Trained episodes = 1000 #######
Load saved model for agent 0
Load saved model for agent 1
Load saved model for agent 2
Load saved model for agent 3
Load saved model for agent 4
Load saved model for agent 5
Load saved model for agent 6
Load saved model for agent 7
Load saved model for agent 8
Load random agent 9
..............................
Average Statistics in Aggregate
Total rewards gathered = 285.0
31.666666666666668
Num laser fired = 767.1
Total US Hit (friendly fire) = 51.9
Total THEM Hit = 145.3
friendly fire (%) = 0.263

Average Statistics by Tribe
Tribe Vikings has total reward of 61.5
Tribe Saxons has total reward of 58.4
Tribe Franks has total reward of 165.1

Average Statistics by Agent
Agent0 of Vikings aggressiveness is 0.07
Agent0 reward is 0.0
US agents hit = 6.0
THEM agents hit = 19.6
Agent1 of Vikings aggressiveness is 0.32
Agent1 reward is 0.0
US agents hit = 19.8
THEM agents hit = 1.1
Agent2 of Vikin

..............................
Average Statistics in Aggregate
Total rewards gathered = 284.1
31.56666666666667
Num laser fired = 807.2
Total US Hit (friendly fire) = 46.9
Total THEM Hit = 200.2
friendly fire (%) = 0.190

Average Statistics by Tribe
Tribe Vikings has total reward of 5.1
Tribe Saxons has total reward of 25.8
Tribe Franks has total reward of 253.1

Average Statistics by Agent
Agent0 of Vikings aggressiveness is 0.00
Agent0 reward is 0.0
US agents hit = 1.3
THEM agents hit = 0.8
Agent1 of Vikings aggressiveness is 0.00
Agent1 reward is 4.6
US agents hit = 0.1
THEM agents hit = 0.1
Agent2 of Vikings aggressiveness is 0.07
Agent2 reward is 0.5
US agents hit = 4.9
THEM agents hit = 8.0
Agent3 of Saxons aggressiveness is 0.07
Agent3 reward is 25.7
US agents hit = 13.8
THEM agents hit = 20.7
Agent4 of Saxons aggressiveness is 0.01
Agent4 reward is 0.1
US agents hit = 2.1
THEM agents hit = 4.3
Agent5 of Saxons aggressiveness is 0.02
Agent5 reward is 0.0
US agents hit = 1.2
THEM

..............................
Average Statistics in Aggregate
Total rewards gathered = 264.1
29.34444444444445
Num laser fired = 412.9
Total US Hit (friendly fire) = 44.7
Total THEM Hit = 172.6
friendly fire (%) = 0.206

Average Statistics by Tribe
Tribe Vikings has total reward of 0.5
Tribe Saxons has total reward of 110.8
Tribe Franks has total reward of 152.8

Average Statistics by Agent
Agent0 of Vikings aggressiveness is 0.04
Agent0 reward is 0.0
US agents hit = 7.7
THEM agents hit = 14.9
Agent1 of Vikings aggressiveness is 0.04
Agent1 reward is 0.0
US agents hit = 6.9
THEM agents hit = 14.3
Agent2 of Vikings aggressiveness is 0.02
Agent2 reward is 0.5
US agents hit = 6.9
THEM agents hit = 7.3
Agent3 of Saxons aggressiveness is 0.02
Agent3 reward is 1.2
US agents hit = 1.8
THEM agents hit = 7.0
Agent4 of Saxons aggressiveness is 0.01
Agent4 reward is 0.0
US agents hit = 2.2
THEM agents hit = 5.3
Agent5 of Saxons aggressiveness is 0.06
Agent5 reward is 109.5
US agents hit = 1.6
TH

..............................
Average Statistics in Aggregate
Total rewards gathered = 244.7
27.185185185185183
Num laser fired = 538.8
Total US Hit (friendly fire) = 40.9
Total THEM Hit = 163.2
friendly fire (%) = 0.201

Average Statistics by Tribe
Tribe Vikings has total reward of 34.5
Tribe Saxons has total reward of 47.4
Tribe Franks has total reward of 162.7

Average Statistics by Agent
Agent0 of Vikings aggressiveness is 0.20
Agent0 reward is 34.5
US agents hit = 20.4
THEM agents hit = 75.7
Agent1 of Vikings aggressiveness is 0.01
Agent1 reward is 0.0
US agents hit = 0.6
THEM agents hit = 0.9
Agent2 of Vikings aggressiveness is 0.01
Agent2 reward is 0.0
US agents hit = 1.7
THEM agents hit = 4.0
Agent3 of Saxons aggressiveness is 0.02
Agent3 reward is 18.8
US agents hit = 1.4
THEM agents hit = 6.4
Agent4 of Saxons aggressiveness is 0.00
Agent4 reward is 26.9
US agents hit = 0.1
THEM agents hit = 0.5
Agent5 of Saxons aggressiveness is 0.12
Agent5 reward is 1.8
US agents hit = 5.3


..............................
Average Statistics in Aggregate
Total rewards gathered = 288.8
32.08518518518518
Num laser fired = 288.5
Total US Hit (friendly fire) = 23.5
Total THEM Hit = 134.2
friendly fire (%) = 0.149

Average Statistics by Tribe
Tribe Vikings has total reward of 93.0
Tribe Saxons has total reward of 110.5
Tribe Franks has total reward of 85.2

Average Statistics by Agent
Agent0 of Vikings aggressiveness is 0.09
Agent0 reward is 71.5
US agents hit = 7.9
THEM agents hit = 26.1
Agent1 of Vikings aggressiveness is 0.09
Agent1 reward is 20.5
US agents hit = 4.1
THEM agents hit = 46.8
Agent2 of Vikings aggressiveness is 0.02
Agent2 reward is 1.0
US agents hit = 4.2
THEM agents hit = 9.1
Agent3 of Saxons aggressiveness is 0.00
Agent3 reward is 0.8
US agents hit = 0.9
THEM agents hit = 1.4
Agent4 of Saxons aggressiveness is 0.01
Agent4 reward is 0.3
US agents hit = 0.9
THEM agents hit = 2.0
Agent5 of Saxons aggressiveness is 0.07
Agent5 reward is 109.4
US agents hit = 4.0


..............................
Average Statistics in Aggregate
Total rewards gathered = 262.5
29.16296296296296
Num laser fired = 720.1
Total US Hit (friendly fire) = 35.5
Total THEM Hit = 191.5
friendly fire (%) = 0.157

Average Statistics by Tribe
Tribe Vikings has total reward of 20.1
Tribe Saxons has total reward of 130.7
Tribe Franks has total reward of 111.6

Average Statistics by Agent
Agent0 of Vikings aggressiveness is 0.27
Agent0 reward is 0.1
US agents hit = 15.2
THEM agents hit = 41.9
Agent1 of Vikings aggressiveness is 0.02
Agent1 reward is 19.9
US agents hit = 3.1
THEM agents hit = 16.5
Agent2 of Vikings aggressiveness is 0.00
Agent2 reward is 0.1
US agents hit = 0.1
THEM agents hit = 0.1
Agent3 of Saxons aggressiveness is 0.00
Agent3 reward is 0.0
US agents hit = 0.5
THEM agents hit = 1.2
Agent4 of Saxons aggressiveness is 0.02
Agent4 reward is 0.0
US agents hit = 4.5
THEM agents hit = 9.7
Agent5 of Saxons aggressiveness is 0.17
Agent5 reward is 130.7
US agents hit = 1.6

..............................
Average Statistics in Aggregate
Total rewards gathered = 266.1
29.562962962962963
Num laser fired = 273.4
Total US Hit (friendly fire) = 30.2
Total THEM Hit = 130.6
friendly fire (%) = 0.188

Average Statistics by Tribe
Tribe Vikings has total reward of 83.3
Tribe Saxons has total reward of 41.7
Tribe Franks has total reward of 141.0

Average Statistics by Agent
Agent0 of Vikings aggressiveness is 0.02
Agent0 reward is 0.0
US agents hit = 2.3
THEM agents hit = 7.1
Agent1 of Vikings aggressiveness is 0.15
Agent1 reward is 82.2
US agents hit = 11.1
THEM agents hit = 57.0
Agent2 of Vikings aggressiveness is 0.01
Agent2 reward is 1.2
US agents hit = 1.8
THEM agents hit = 6.1
Agent3 of Saxons aggressiveness is 0.01
Agent3 reward is 0.1
US agents hit = 2.7
THEM agents hit = 12.3
Agent4 of Saxons aggressiveness is 0.02
Agent4 reward is 0.0
US agents hit = 4.8
THEM agents hit = 14.8
Agent5 of Saxons aggressiveness is 0.00
Agent5 reward is 41.6
US agents hit = 0.0

..............................
Average Statistics in Aggregate
Total rewards gathered = 317.5
35.27407407407407
Num laser fired = 308.7
Total US Hit (friendly fire) = 31.8
Total THEM Hit = 139.7
friendly fire (%) = 0.185

Average Statistics by Tribe
Tribe Vikings has total reward of 88.1
Tribe Saxons has total reward of 61.0
Tribe Franks has total reward of 168.4

Average Statistics by Agent
Agent0 of Vikings aggressiveness is 0.04
Agent0 reward is 87.7
US agents hit = 6.4
THEM agents hit = 29.6
Agent1 of Vikings aggressiveness is 0.05
Agent1 reward is 0.4
US agents hit = 6.7
THEM agents hit = 26.4
Agent2 of Vikings aggressiveness is 0.01
Agent2 reward is 0.0
US agents hit = 1.7
THEM agents hit = 2.4
Agent3 of Saxons aggressiveness is 0.00
Agent3 reward is 0.0
US agents hit = 0.4
THEM agents hit = 1.3
Agent4 of Saxons aggressiveness is 0.00
Agent4 reward is 37.1
US agents hit = 0.1
THEM agents hit = 0.2
Agent5 of Saxons aggressiveness is 0.12
Agent5 reward is 23.9
US agents hit = 5.6
T

..............................
Average Statistics in Aggregate
Total rewards gathered = 291.6
32.400000000000006
Num laser fired = 394.5
Total US Hit (friendly fire) = 33.2
Total THEM Hit = 135.3
friendly fire (%) = 0.197

Average Statistics by Tribe
Tribe Vikings has total reward of 69.5
Tribe Saxons has total reward of 156.9
Tribe Franks has total reward of 65.3

Average Statistics by Agent
Agent0 of Vikings aggressiveness is 0.00
Agent0 reward is 0.0
US agents hit = 0.0
THEM agents hit = 0.0
Agent1 of Vikings aggressiveness is 0.06
Agent1 reward is 19.7
US agents hit = 4.1
THEM agents hit = 17.8
Agent2 of Vikings aggressiveness is 0.01
Agent2 reward is 49.8
US agents hit = 1.2
THEM agents hit = 1.8
Agent3 of Saxons aggressiveness is 0.00
Agent3 reward is 0.4
US agents hit = 0.0
THEM agents hit = 0.4
Agent4 of Saxons aggressiveness is 0.09
Agent4 reward is 0.7
US agents hit = 5.6
THEM agents hit = 17.3
Agent5 of Saxons aggressiveness is 0.13
Agent5 reward is 155.8
US agents hit = 6.3

## Average Agent Reward - Cooperative

In [21]:
# Show the table of average per-agent rewards: one printed row per model
# directory, one column per trained-episode checkpoint.
for reward in av_agent_reward:
    print(reward)

[31.666666666666668, 27.637037037037036, 35.455555555555556, 35.718518518518515, 38.63333333333333]
[31.56666666666667, 42.24814814814815, 35.925925925925924, 40.01481481481481, 40.733333333333334]
[29.34444444444445, 35.7, 39.77037037037037, 44.303703703703704, 45.955555555555556]
[27.185185185185183, 40.388888888888886, 48.51481481481481, 40.86296296296296, 49.737037037037034]
[32.08518518518518, 35.74814814814815, 38.77777777777778, 34.32962962962963, 39.15925925925926]
[29.16296296296296, 37.214814814814815, 44.403703703703705, 42.7962962962963, 39.98518518518519]
[29.562962962962963, 41.803703703703704, 43.34074074074074, 37.922222222222224, 36.737037037037034]
[35.27407407407407, 41.34444444444445, 49.714814814814815, 52.696296296296296, 43.64074074074074]
[32.400000000000006, 42.53703703703704, 42.87777777777777, 44.46666666666667, 38.86666666666667]


In [28]:
import pickle
import numpy as np

import torch
from torch.autograd import Variable

# ---- Evaluation sweep configuration: "warlike" culture ----

# One saved-model directory per `r` setting of the warlike runs (all with p-1.0)
dir_names = [
    "MA_models/3T-9L1R/warlike/p-1.0_r0.0001/",
    "MA_models/3T-9L1R/warlike/p-1.0_r0.001/",
    "MA_models/3T-9L1R/warlike/p-1.0_r0.005/",
    "MA_models/3T-9L1R/warlike/p-1.0_r0.01/",
    "MA_models/3T-9L1R/warlike/p-1.0_r0.05/",
    "MA_models/3T-9L1R/warlike/p-1.0_r0.1/",
    "MA_models/3T-9L1R/warlike/p-1.0_r0.5/",
    "MA_models/3T-9L1R/warlike/p-1.0_r1.0/",
]

# Training checkpoints (number of trained episodes) whose model files are loaded
episodes = [1000, 2000, 3000, 4000, 5000]
culture = "warlike"

# Result table: av_agent_reward[directory index][checkpoint index]
av_agent_reward = [[0] * len(episodes) for _ in dir_names]

# Population: 3 teams of 3 AI agents each, plus 1 random agent (10 in total)
num_ai_agents = 9
num_rdn_agents = 1
num_agents = num_ai_agents + num_rdn_agents

# Containers for the agents and their per-frame data
# (agents will form their own Class later on)
agents, actions, tags = [], [], []

# Environment settings
render = False
num_actions = 8    # Gathering defines 8 actions

# Run constants
num_frames = 4
max_episodes = 30
max_frames = 1000
verbose = False

def unpack_env_obs(env_obs):
    """
    Turn the raw observations returned by GatheringEnv into inputs for the agents' Policy.

    env_obs holds one flat observation per agent (800 values each in this game: a 10x20
    view box with 4 layers, laid out as (viewbox_width, viewbox_depth, 4)).

    Each observation is converted to a tensor, reshaped to (1, 10, 20, layers) and then
    permuted to the layout the CNN Policy accepts:
    (batch_idx, in_channel, width, height)

    Returns a list with one such tensor per agent, in the same order as env_obs.
    """

    def _to_policy_input(flat_obs):
        # environment layout first, then move the layer axis into the channel position
        stacked = torch.Tensor(flat_obs).view(1, 10, 20, -1)
        return stacked.permute(0, 3, 1, 2)

    return [_to_policy_input(agent_obs) for agent_obs in env_obs]


"""
For now, we do not implement LSTM            
# LSTM Change: Need to cycle hx and cx thru function
def select_action(model, state, lstm_hc, cuda):
    hx , cx = lstm_hc 
    num_frames, height, width = state.shape
    state = torch.FloatTensor(state.reshape(-1, num_frames, height, width))

    if cuda:
        state = state.cuda()

    probs, value, (hx, cx) = model((Variable(state), (hx, cx)))

    m = torch.distributions.Categorical(probs)
    action = m.sample()
    log_prob = m.log_prob(action)
    # LSTM Change: Need to cycle hx and cx thru function
    return action.data[0], log_prob, value, (hx, cx)
"""

def select_action(model, obs, cuda):
    """
    Sample one action from an agent's policy.

    obs is a tensor of stacked frames with dim (batch_idx, in_channel, width, height).
    It is moved to the GPU when cuda is true, then fed to model (the agent's Policy),
    whose output is used as the probabilities of a torch.distributions.Categorical.

    Returns a tuple (action, log_prob): the sampled action as a Python number
    (via .item()) and the log-probability tensor of that sample.
    """
    policy_input = obs.cuda() if cuda else obs

    action_probs = model(policy_input)
    dist = torch.distributions.Categorical(action_probs)
    sampled = dist.sample()

    return sampled.item(), dist.log_prob(sampled)


def load_info(agents, narrate=False):
    """
    Pass the latest per-agent info from the environment on to each agent.

    Reads three notebook globals rather than arguments: num_agents (how many agents to
    update), info (indexed per agent; set by the env.step() call in the evaluation loop)
    and frame (only used in the narration text).

    With narrate=True, a line is printed for every agent that is tagged or has fired
    its laser, together with its US/THEM hit counts.
    """
    for idx in range(num_agents):
        member = agents[idx]
        member.load_info(info[idx])

        if not narrate:
            continue

        if member.tagged:
            print('frame {}, agent{} is tagged'.format(frame,idx))
        if member.laser_fired:
            print('frame {}, agent{} fires its laser'.format(frame,idx))
            print('and hit {} US and {} THEM'.format(member.US_hit, member.THEM_hit))
    return

# Evaluation sweep.
# For every (model directory, training checkpoint) pair: load the saved policies of the AI
# agents, add the random agent(s), play `max_episodes` episodes of Gathering and print
# statistics averaged over those episodes. The average reward per AI agent is stored in
# av_agent_reward[dir_num][eps_num]. A missing model file aborts the whole sweep.
for dir_num, dir_name in enumerate(dir_names):
    print("###### Dir = {} #######".format(dir_name))

    for eps_num, eps in enumerate(episodes):
        print("###### Trained episodes = {} #######".format(eps))

        # Load the saved model of each AI agent for this checkpoint
        agents = [[] for i in range(num_ai_agents)]
        for i in range(num_ai_agents):
            model_file = dir_name+'MA{}_Gather__ep{}.p'.format(i,eps)
            try:
                with open(model_file, 'rb') as f:
                    # Model file includes both model and optim parameters.
                    # NOTE(review): pickle.load can execute arbitrary code contained in the
                    # file - only load checkpoints that come from a trusted source.
                    saved_model = pickle.load(f)
                    agents[i], _ = saved_model
                    print("Load saved model for agent {}".format(i))
            except OSError:
                print('Model file not found.')
                raise

        # Load random agents
        for i in range(num_ai_agents, num_agents):
            print("Load random agent {}".format(i))
            agents.append(Rdn_Policy())

        # Establish tribal association
        tribes = []
        tribes.append(Tribe(name='Vikings', color='blue', culture=culture,
                            agents=[agents[0], agents[1], agents[2]]))
        tribes.append(Tribe(name='Saxons', color='red', culture=culture,
                            agents=[agents[3], agents[4], agents[5]]))
        tribes.append(Tribe(name='Franks', color='purple', culture=culture,
                            agents=[agents[6], agents[7], agents[8]]))
        # Random agents are crazy!!! Every agent after the AI agents goes here
        # (this is [agents[9]] with the default 9 AI + 1 random agent).
        tribes.append(Tribe(name='Crazies', color='yellow', agents=agents[num_ai_agents:]))

        # Tribes whose rewards are reported. Compare names by value (!=): `is not` on a
        # string literal only works by accident of string interning.
        ai_tribes = [t for t in tribes if t.name != 'Crazies']

        # All agents in 4 tribes, using the map defined in default.txt
        agent_colors = [agent.color for agent in agents]
        agent_tribes = [agent.tribe for agent in agents]

        env = GatheringEnv(n_agents=num_agents, agent_colors=agent_colors,
                           agent_tribes=agent_tribes, map_name='default')

        # Used to accumulate episode stats for averaging
        cum_rewards = 0
        cum_tags = 0
        cum_US_hits = 0
        cum_THEM_hits = 0
        cum_frames = 0   # frames actually played, summed over all episodes
        cum_agent_rewards = [0 for agent in agents]
        cum_agent_tags = [0 for agent in agents]
        cum_agent_US_hits = [0 for agent in agents]
        cum_agent_THEM_hits = [0 for agent in agents]
        cum_tribe_rewards = [0 for t in ai_tribes]   # same order as ai_tribes

        cuda = False
        start = time.time()

        for ep in range(max_episodes):

            print('.', end='')  # To show progress

            # Initialize AI and random agent data
            actions = [0 for i in range(num_agents)]
            tags = [0 for i in range(num_agents)]
            US_hits = [0 for i in range(num_agents)]
            THEM_hits = [0 for i in range(num_agents)]

            env_obs = env.reset()  # Environment returns observations

            # Unpack observations into data structure compatible with agent Policy
            agents_obs = unpack_env_obs(env_obs)

            for i in range(num_ai_agents):    # Reset agent info - laser tag statistics
                agents[i].reset_info()

            if render:
                env.render()
                time.sleep(1/15)  # Change speed of video rendering

            # For now, we do not stack observations, and we do not implement LSTM
            # (no hidden state to reset when an episode begins).

            for frame in range(max_frames):

                for i in range(num_ai_agents):    # For AI agents
                    actions[i], _ = select_action(agents[i], agents_obs[i], cuda=cuda)
                    # select_action returns a plain number; compare by value, not identity
                    if actions[i] == 6:
                        tags[i] += 1   # record a tag for assessing aggressiveness

                for i in range(num_ai_agents, num_agents):   # For random agents
                    actions[i] = agents[i].select_action(agents_obs[i])
                    if actions[i] == 6:
                        tags[i] += 1   # record a tag for assessing aggressiveness

                # Perform step
                env_obs, reward, done, info = env.step(actions)

                for i in range(num_ai_agents):
                    agents[i].rewards.append(reward[i])  # Stack rewards

                # Unpack observations into data structure compatible with agent Policy
                agents_obs = unpack_env_obs(env_obs)
                load_info(agents, narrate=False)   # Load agent info (reads the global `info`)

                for i in range(num_agents):
                    US_hits[i] += agents[i].US_hit
                    THEM_hits[i] += agents[i].THEM_hit

                # For now, we do not stack observations; may come in handy later on.

                if render:
                    env.render()
                    time.sleep(1/15)  # Change speed of video rendering

                if any(done):
                    print("Done after {} frames".format(frame))
                    break

            # `frame` is the index of the last frame played, so the count is one more
            frames_played = frame + 1
            cum_frames += frames_played

            # Statistics of AI agents for this episode
            ep_rewards = 0
            ep_tags = 0
            ep_US_hits = 0
            ep_THEM_hits = 0

            if verbose:
                print('\nStatistics by Agent')
                print('===================')
            for i in range(num_ai_agents):
                agent_tags = sum(agents[i].tag_hist)
                ep_tags += agent_tags
                cum_agent_tags[i] += agent_tags

                agent_reward = sum(agents[i].rewards)
                ep_rewards += agent_reward
                cum_agent_rewards[i] += agent_reward

                agent_US_hits = sum(agents[i].US_hits)
                agent_THEM_hits = sum(agents[i].THEM_hits)
                ep_US_hits += agent_US_hits
                ep_THEM_hits += agent_THEM_hits
                cum_agent_US_hits[i] += agent_US_hits
                cum_agent_THEM_hits[i] += agent_THEM_hits

                if verbose:
                    print("Agent{} aggressiveness is {:.2f}".format(i, agent_tags/frames_played))
                    print("Agent{} reward is {:d}".format(i, agent_reward))
                    print('US agents hit = {}'.format(agent_US_hits))
                    print('THEM agents hit = {}'.format(agent_THEM_hits ))

            cum_rewards += ep_rewards
            cum_tags += ep_tags
            cum_US_hits += ep_US_hits
            cum_THEM_hits += ep_THEM_hits

            if verbose:
                print('\nStatistics in Aggregate')
                print('=======================')
                print('Total rewards gathered = {}'.format(ep_rewards))
                print('Num laser fired = {}'.format(ep_tags))
                print('Total US Hit (friendly fire) = {}'.format(ep_US_hits))
                print('Total THEM Hit = {}'.format(ep_THEM_hits))
                # 1e-7 avoids division by zero when nothing was hit
                print('friendly fire (%) = {0:.3f}'.format(ep_US_hits/(ep_US_hits+ep_THEM_hits+1e-7)))

            if verbose:
                print('\nStatistics by Tribe')
                print('===================')
            for i, t in enumerate(ai_tribes):
                ep_tribe_reward = sum(t.sum_rewards())
                cum_tribe_rewards[i] += ep_tribe_reward
                if verbose:
                    print('Tribe {} has total reward of {}'.format(t.name, ep_tribe_reward))

            for i in range(num_ai_agents):
                agents[i].clear_history()

        env.close()  # Close the rendering window
        end = time.time()

        print('\nAverage Statistics in Aggregate')
        print('=================================')
        print('Total rewards gathered = {:.1f}'.format(cum_rewards/max_episodes))
        av_agent_reward[dir_num][eps_num] = cum_rewards/max_episodes/num_ai_agents
        print(av_agent_reward[dir_num][eps_num])
        print('Num laser fired = {:.1f}'.format(cum_tags/max_episodes))
        print('Total US Hit (friendly fire) = {:.1f}'.format(cum_US_hits/max_episodes))
        print('Total THEM Hit = {:.1f}'.format(cum_THEM_hits/max_episodes))
        print('friendly fire (%) = {:.3f}'.format(cum_US_hits/(cum_US_hits+cum_THEM_hits+1e-7)))

        print('\nAverage Statistics by Tribe')
        print('=============================')
        for i, t in enumerate(ai_tribes):
            print('Tribe {} has total reward of {:.1f}'.format(t.name, cum_tribe_rewards[i]/max_episodes))

        print('\nAverage Statistics by Agent')
        print('=============================')
        for i in range(num_ai_agents):
            # Tags per frame actually played; equals max_episodes*max_frames when no
            # episode ends early.
            print("Agent{} of {} aggressiveness is {:.2f}".format(i, agents[i].tribe,
                                                                  cum_agent_tags[i]/cum_frames))
            print("Agent{} reward is {:.1f}".format(i, cum_agent_rewards[i]/max_episodes))
            print('US agents hit = {:.1f}'.format(cum_agent_US_hits[i]/max_episodes))
            print('THEM agents hit = {:.1f}'.format(cum_agent_THEM_hits[i]/max_episodes))

        print('Training time per epochs: {:.2f} sec'.format((end-start)/max_episodes))

print(av_agent_reward)

###### Dir = MA_models/3T-9L1R/warlike/p-1.0_r0.0001/ #######
###### Trained episodes = 1000 #######
Load saved model for agent 0
Load saved model for agent 1
Load saved model for agent 2
Load saved model for agent 3
Load saved model for agent 4
Load saved model for agent 5
Load saved model for agent 6
Load saved model for agent 7
Load saved model for agent 8
Load random agent 9
..............................
Average Statistics in Aggregate
Total rewards gathered = 377.9
41.992592592592594
Num laser fired = 1026.6
Total US Hit (friendly fire) = 32.0
Total THEM Hit = 179.3
friendly fire (%) = 0.151

Average Statistics by Tribe
Tribe Vikings has total reward of 38.4
Tribe Saxons has total reward of 124.7
Tribe Franks has total reward of 214.8

Average Statistics by Agent
Agent0 of Vikings aggressiveness is 0.00
Agent0 reward is 11.0
US agents hit = 0.0
THEM agents hit = 0.0
Agent1 of Vikings aggressiveness is 0.00
Agent1 reward is 7.4
US agents hit = 0.0
THEM agents hit = 0.0
Agent2 of V

..............................
Average Statistics in Aggregate
Total rewards gathered = 401.2
44.577777777777776
Num laser fired = 328.4
Total US Hit (friendly fire) = 36.1
Total THEM Hit = 142.6
friendly fire (%) = 0.202

Average Statistics by Tribe
Tribe Vikings has total reward of 166.6
Tribe Saxons has total reward of 83.1
Tribe Franks has total reward of 151.6

Average Statistics by Agent
Agent0 of Vikings aggressiveness is 0.00
Agent0 reward is 25.9
US agents hit = 0.0
THEM agents hit = 0.0
Agent1 of Vikings aggressiveness is 0.11
Agent1 reward is 118.9
US agents hit = 15.3
THEM agents hit = 65.9
Agent2 of Vikings aggressiveness is 0.00
Agent2 reward is 21.7
US agents hit = 0.0
THEM agents hit = 0.1
Agent3 of Saxons aggressiveness is 0.00
Agent3 reward is 22.9
US agents hit = 0.0
THEM agents hit = 0.1
Agent4 of Saxons aggressiveness is 0.02
Agent4 reward is 35.3
US agents hit = 2.5
THEM agents hit = 8.5
Agent5 of Saxons aggressiveness is 0.00
Agent5 reward is 24.8
US agents hit =

..............................
Average Statistics in Aggregate
Total rewards gathered = 427.8
47.53333333333333
Num laser fired = 255.7
Total US Hit (friendly fire) = 33.7
Total THEM Hit = 147.8
friendly fire (%) = 0.186

Average Statistics by Tribe
Tribe Vikings has total reward of 175.6
Tribe Saxons has total reward of 93.8
Tribe Franks has total reward of 158.4

Average Statistics by Agent
Agent0 of Vikings aggressiveness is 0.00
Agent0 reward is 21.5
US agents hit = 0.0
THEM agents hit = 0.0
Agent1 of Vikings aggressiveness is 0.00
Agent1 reward is 45.7
US agents hit = 0.6
THEM agents hit = 1.4
Agent2 of Vikings aggressiveness is 0.08
Agent2 reward is 108.4
US agents hit = 9.7
THEM agents hit = 67.1
Agent3 of Saxons aggressiveness is 0.00
Agent3 reward is 21.5
US agents hit = 0.0
THEM agents hit = 0.1
Agent4 of Saxons aggressiveness is 0.02
Agent4 reward is 41.3
US agents hit = 2.7
THEM agents hit = 14.5
Agent5 of Saxons aggressiveness is 0.00
Agent5 reward is 31.0
US agents hit = 

..............................
Average Statistics in Aggregate
Total rewards gathered = 371.9
41.32222222222222
Num laser fired = 1149.4
Total US Hit (friendly fire) = 30.8
Total THEM Hit = 183.8
friendly fire (%) = 0.143

Average Statistics by Tribe
Tribe Vikings has total reward of 15.3
Tribe Saxons has total reward of 135.1
Tribe Franks has total reward of 221.5

Average Statistics by Agent
Agent0 of Vikings aggressiveness is 0.00
Agent0 reward is 5.3
US agents hit = 0.0
THEM agents hit = 0.0
Agent1 of Vikings aggressiveness is 0.00
Agent1 reward is 6.5
US agents hit = 0.0
THEM agents hit = 0.0
Agent2 of Vikings aggressiveness is 0.00
Agent2 reward is 3.5
US agents hit = 0.0
THEM agents hit = 0.1
Agent3 of Saxons aggressiveness is 0.82
Agent3 reward is 0.0
US agents hit = 0.1
THEM agents hit = 92.9
Agent4 of Saxons aggressiveness is 0.00
Agent4 reward is 36.7
US agents hit = 0.0
THEM agents hit = 0.0
Agent5 of Saxons aggressiveness is 0.16
Agent5 reward is 98.4
US agents hit = 8.4
T

..............................
Average Statistics in Aggregate
Total rewards gathered = 400.4
44.48518518518519
Num laser fired = 401.5
Total US Hit (friendly fire) = 39.0
Total THEM Hit = 161.1
friendly fire (%) = 0.195

Average Statistics by Tribe
Tribe Vikings has total reward of 184.7
Tribe Saxons has total reward of 130.6
Tribe Franks has total reward of 85.1

Average Statistics by Agent
Agent0 of Vikings aggressiveness is 0.00
Agent0 reward is 30.0
US agents hit = 0.0
THEM agents hit = 0.0
Agent1 of Vikings aggressiveness is 0.13
Agent1 reward is 54.9
US agents hit = 10.1
THEM agents hit = 54.3
Agent2 of Vikings aggressiveness is 0.16
Agent2 reward is 99.8
US agents hit = 16.7
THEM agents hit = 49.3
Agent3 of Saxons aggressiveness is 0.00
Agent3 reward is 22.6
US agents hit = 0.1
THEM agents hit = 0.1
Agent4 of Saxons aggressiveness is 0.00
Agent4 reward is 10.2
US agents hit = 0.0
THEM agents hit = 0.0
Agent5 of Saxons aggressiveness is 0.11
Agent5 reward is 97.8
US agents hit =

..............................
Average Statistics in Aggregate
Total rewards gathered = 300.0
33.33703703703704
Num laser fired = 1096.8
Total US Hit (friendly fire) = 33.2
Total THEM Hit = 202.1
friendly fire (%) = 0.141

Average Statistics by Tribe
Tribe Vikings has total reward of 43.8
Tribe Saxons has total reward of 104.2
Tribe Franks has total reward of 152.1

Average Statistics by Agent
Agent0 of Vikings aggressiveness is 0.00
Agent0 reward is 0.0
US agents hit = 0.0
THEM agents hit = 0.2
Agent1 of Vikings aggressiveness is 0.02
Agent1 reward is 37.9
US agents hit = 0.9
THEM agents hit = 5.5
Agent2 of Vikings aggressiveness is 0.11
Agent2 reward is 5.9
US agents hit = 1.7
THEM agents hit = 34.9
Agent3 of Saxons aggressiveness is 0.72
Agent3 reward is 0.0
US agents hit = 0.0
THEM agents hit = 84.0
Agent4 of Saxons aggressiveness is 0.08
Agent4 reward is 46.6
US agents hit = 5.4
THEM agents hit = 20.4
Agent5 of Saxons aggressiveness is 0.06
Agent5 reward is 57.6
US agents hit = 3.

..............................
Average Statistics in Aggregate
Total rewards gathered = 0.0
0.0
Num laser fired = 350.7
Total US Hit (friendly fire) = 272.8
Total THEM Hit = 1948.0
friendly fire (%) = 0.123

Average Statistics by Tribe
Tribe Vikings has total reward of 0.0
Tribe Saxons has total reward of 0.0
Tribe Franks has total reward of 0.0

Average Statistics by Agent
Agent0 of Vikings aggressiveness is 0.04
Agent0 reward is 0.0
US agents hit = 77.9
THEM agents hit = 272.8
Agent1 of Vikings aggressiveness is 0.04
Agent1 reward is 0.0
US agents hit = 39.0
THEM agents hit = 273.0
Agent2 of Vikings aggressiveness is 0.04
Agent2 reward is 0.0
US agents hit = 0.0
THEM agents hit = 272.5
Agent3 of Saxons aggressiveness is 0.04
Agent3 reward is 0.0
US agents hit = 0.0
THEM agents hit = 116.8
Agent4 of Saxons aggressiveness is 0.04
Agent4 reward is 0.0
US agents hit = 39.0
THEM agents hit = 155.9
Agent5 of Saxons aggressiveness is 0.04
Agent5 reward is 0.0
US agents hit = 0.0
THEM agents

FileNotFoundError: [Errno 2] No such file or directory: 'MA_models/3T-9L1R/warlike/p-1.0_r0.5/MA0_Gather__ep3000.p'

## Average Agent Reward - Warlike

In [29]:
# Print the average reward per AI agent: one row per model directory, one value per
# training checkpoint. The loop variable is deliberately not named `reward`, so it does
# not overwrite the global `reward` that the evaluation cell received from env.step().
for dir_rewards in av_agent_reward:
    print(dir_rewards)

[41.992592592592594, 49.529629629629625, 47.166666666666664, 45.55555555555556, 46.82592592592593]
[44.577777777777776, 44.84814814814815, 43.24814814814815, 47.65925925925926, 38.77037037037037]
[47.53333333333333, 46.37037037037037, 50.714814814814815, 47.13703703703704, 44.01111111111111]
[41.32222222222222, 46.737037037037034, 43.31111111111111, 42.55555555555556, 39.87407407407407]
[44.48518518518519, 37.903703703703705, 34.940740740740736, 40.91111111111111, 60.51481481481481]
[33.33703703703704, 71.04444444444444, 71.74814814814815, 69.07407407407408, 71.81481481481482]
[0.0, 0.0, 0, 0, 0]
[0, 0, 0, 0, 0]
