# Team Performance Assessment

We run multiple evaluation episodes in order to better assess how good a team or a culture is. The reported statistics are averages over these episodes.


In [58]:
import os
import random
import time
import platform
import torch
import gym
import numpy as np
import pickle

# This is the Gathering Game Environment based on Tribal Organization of agents
from tribes_env import GatheringEnv
from tribes_model import *

import matplotlib.pyplot as plt

# Render matplotlib figures inline in the notebook output
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
# (mode 2 reloads all modules before each cell runs, so edits to the local
# tribes_env / tribes_model modules are picked up without restarting the kernel)
%load_ext autoreload
%autoreload 2

# Record the interpreter and library versions next to the results so the
# numbers reported below can be tied to a concrete environment
print("Python version: ", platform.python_version())
print("Pytorch version: {}".format(torch.__version__))
print("OpenAI Gym version: {}".format(gym.__version__))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Python version:  3.6.4
Pytorch version: 0.4.1.post2
OpenAI Gym version: 0.9.2


In [76]:
import pickle
import numpy as np

import torch
from torch.autograd import Variable

# Directory prefix of the saved models. It is concatenated directly with the
# file name when models are loaded, so it must end with a path separator.
dir_name = 'MA_models/no_fragging/p-1.0/'
# Selects the checkpoint 'MA{agent}_Gather__ep{episodes}.p' for each AI agent.
# A value of 0 skips loading and starts from freshly initialized Policy models.
episodes = 2000  # This is used to recall a model file trained to a # of episodes

# There will be 9 agents - 8 AI agents (grouped below into tribes of 3, 2 and 3)
# and 1 random agent that forms its own tribe
num_ai_agents = 8
num_rdn_agents = 1
num_agents = num_ai_agents+num_rdn_agents  # just the sum of the two


# Data structure for AI agents (agents will form their own Class later on)
# `agents` is filled by the model-loading code below; `actions` and `tags`
# are re-created at the start of every episode.
agents = []
actions = []
tags = []

# Initialize environment
render = False                        # True: call env.render() every frame, throttled by a 1/15 s sleep
num_actions = 8                       # There are 8 actions defined in Gathering

# Initialize constants
num_frames = 4        # first argument passed to Policy(...) when fresh models are created
max_episodes = 30     # number of evaluation episodes the statistics are averaged over
max_frames = 1000     # upper bound on frames played per episode
verbose = False       # True: also print per-episode statistics

def unpack_env_obs(env_obs):
    """
    Turn the raw observations returned by GatheringEnv into Policy inputs.

    Gathering is a partially-observable Markov Game, so every agent gets its
    own observation. env_obs is indexed by agent; each entry is a flat vector
    (800 elements for a 10x20 view box with 4 layers) laid out as
    (viewbox_width, viewbox_depth, layer).

    Each entry is reshaped to (1, 10, 20, layers) and then permuted to the
    channel-first layout accepted by the Policy class:
    (batch_idx, in_channel, width, height)

    Returns a list with one such tensor per agent, in the same order as env_obs.
    """

    def _to_policy_input(flat_obs):
        # Rebuild the environment-defined stacked frames, then move the layer
        # axis in front of the two spatial axes (Policy is a 3-layer CNN).
        stacked = torch.Tensor(flat_obs).view(1, 10, 20, -1)
        return stacked.permute(0, 3, 1, 2)

    return [_to_policy_input(env_obs[k]) for k in range(len(env_obs))]


"""
For now, we do not implement LSTM            
# LSTM Change: Need to cycle hx and cx thru function
def select_action(model, state, lstm_hc, cuda):
    hx , cx = lstm_hc 
    num_frames, height, width = state.shape
    state = torch.FloatTensor(state.reshape(-1, num_frames, height, width))

    if cuda:
        state = state.cuda()

    probs, value, (hx, cx) = model((Variable(state), (hx, cx)))

    m = torch.distributions.Categorical(probs)
    action = m.sample()
    log_prob = m.log_prob(action)
    # LSTM Change: Need to cycle hx and cx thru function
    return action.data[0], log_prob, value, (hx, cx)
"""

def select_action(model, obs, cuda):
    """
    Sample one action from an agent's policy.

    obs is expected to be a tensor of stacked frames with dimensions
    (batch_idx, in_channel, width, height). It is fed to model - the agent's
    Policy - whose output is used as the probabilities of a
    torch.distributions.Categorical over the available actions.

    When cuda is truthy, obs is moved to the GPU before the forward pass.

    Returns a tuple (action, log_prob): the sampled action as a Python number
    and the log-probability tensor of that sample.
    """
    policy_input = obs.cuda() if cuda else obs

    dist = torch.distributions.Categorical(model(policy_input))
    sampled = dist.sample()

    return sampled.item(), dist.log_prob(sampled)


def load_info(agents, narrate=False, info=None, frame=None):
    """
    Hand each agent its entry of the environment's step info.

    Parameters
    ----------
    agents : sequence of agent objects exposing load_info(), tagged,
        laser_fired, US_hit and THEM_hit.
    narrate : when True, print a line for every agent that is tagged or has
        fired its laser after loading the info.
    info : per-agent info, indexed like agents (info[i] goes to agents[i]).
        When omitted, the module-level `info` variable is used, which keeps
        existing calls of the form load_info(agents, narrate=...) working.
    frame : frame number shown in the narration. When omitted, the
        module-level `frame` variable is used if it exists.

    Returns None; the agents are updated in place.

    Raises NameError if info is neither passed nor defined at module level.
    """
    # Fall back to the notebook globals only when the caller did not pass the
    # values explicitly, so the function no longer relies on hidden state.
    if info is None:
        try:
            info = globals()['info']
        except KeyError:
            raise NameError(
                "load_info() needs `info`: pass it explicitly or define a global `info`"
            ) from None
    if narrate and frame is None:
        frame = globals().get('frame')

    for i, agent in enumerate(agents):
        agent.load_info(info[i])
        if narrate:
            if agent.tagged:
                print('frame {}, agent{} is tagged'.format(frame,i))
            if agent.laser_fired:
                print('frame {}, agent{} fires its laser'.format(frame,i))
                print('and hit {} US and {} THEM'.format(agent.US_hit, agent.THEM_hit))
    return


# Load models for AI agents
if episodes > 0:
    # If episodes is provided (not 0), load the saved model for each AI agent.
    # The list is pre-sized so that every agent keeps its own slot index.
    agents = [[] for i in range(num_ai_agents)]
    for i in range(num_ai_agents):
        # os.path.join copes with dir_name given with or without a trailing separator
        model_file = os.path.join(dir_name, 'MA{}_Gather__ep{}.p'.format(i,episodes))
        try:
            with open(model_file, 'rb') as f:
                # SECURITY NOTE: pickle.load can execute arbitrary code while
                # deserializing - only point dir_name at model files you trust.
                saved_model = pickle.load(f)
        except FileNotFoundError:
            # Name the missing file; any other OS error (permissions, ...) is
            # propagated untouched instead of being reported as "not found".
            print('Model file not found: {}'.format(model_file))
            raise
        # Model file includes both model and optim parameters; only the model is kept
        agents[i], _ = saved_model
        print("Load saved model for agent {}".format(i))
else:
    # If episodes=0, start with a freshly initialized model for each AI agent
    for i in range(num_ai_agents):
        print("Load AI agent {}".format(i))
        agents.append(Policy(num_frames, num_actions, i))

# Load random agents (they occupy the indices after the AI agents)
for i in range(num_ai_agents,num_agents):
    print("Load random agent {}".format(i))
    agents.append(Rdn_Policy())

# Establish tribal association: three tribes of AI agents plus one tribe
# holding the random agent
tribes = [
    Tribe(name='Vikings', color='blue', agents=[agents[0], agents[1], agents[2]]),
    Tribe(name='Saxons', color='red', agents=[agents[3], agents[4]]),
    Tribe(name='Franks', color='purple', agents=[agents[5], agents[6], agents[7]]),
    Tribe(name='Crazies', color='yellow', agents=[agents[8]]),   # random agents are crazy!!!
]

# 9 agents in 4 tribes, used map defined in default.txt
# The environment is told each agent's color and tribe, read back from the
# agent objects after the Tribe objects above have been created.
agent_colors = [member.color for member in agents]
agent_tribes = [member.tribe for member in agents]
env = GatheringEnv(
    n_agents=num_agents,
    agent_colors=agent_colors,
    agent_tribes=agent_tribes,
    map_name='default',
)

# Used to accumulate episode stats for averaging
# Totals over all AI agents
cum_rewards = 0
cum_tags = 0
cum_US_hits = 0
cum_THEM_hits = 0
# Per-agent totals, one slot per entry of `agents`
cum_agent_rewards = [0 for agent in agents]
cum_agent_tags = [0 for agent in agents]
cum_agent_US_hits = [0 for agent in agents]
cum_agent_THEM_hits = [0 for agent in agents]
# Per-tribe totals for every tribe except the random 'Crazies'.
# Names are compared by value (!=); `is not` on a string literal tests object
# identity and only works when the interpreter happens to intern both strings.
# NOTE: the episode loop indexes this list by the tribe's position in `tribes`,
# which is only valid while 'Crazies' is the last tribe.
cum_tribe_rewards = [0 for t in tribes if t.name != 'Crazies']

cuda = False
start = time.time()  # wall-clock start, used for the per-episode timing at the end

# Play max_episodes episodes and accumulate statistics for averaging.
# NOTE: observations are not stacked across frames and no LSTM hidden state
# is carried between steps; each action depends on the current observation only.
for ep in range(max_episodes):

    print('.', end='')  # To show progress

    # Initialize AI and random agent data (per-episode scratch counters)
    actions = [0 for i in range(num_agents)]
    tags = [0 for i in range(num_agents)]
    US_hits = [0 for i in range(num_agents)]
    THEM_hits = [0 for i in range(num_agents)]

    env_obs = env.reset()  # Environment return observations

    # Unpack observations into data structure compatible with agent Policy
    agents_obs = unpack_env_obs(env_obs)

    for i in range(num_ai_agents):    # Reset agent info - laser tag statistics
        agents[i].reset_info()

    if render:
        env.render()
        time.sleep(1/15)  # Change speed of video rendering

    for frame in range(max_frames):

        for i in range(num_ai_agents):    # For AI agents
            # select_action returns the action as a plain number, so it is
            # compared by value; `is 6` would test object identity instead.
            actions[i], _ = select_action(agents[i], agents_obs[i], cuda=cuda)
            if actions[i] == 6:
                tags[i] += 1   # record a tag for assessing aggressiveness

        for i in range(num_ai_agents, num_agents):   # For random agents
            actions[i] = agents[i].select_action(agents_obs[i])
            if actions[i] == 6:
                tags[i] += 1   # record a tag for assessing aggressiveness

        # Perform step
        env_obs, reward, done, info = env.step(actions)

        for i in range(num_ai_agents):
            agents[i].rewards.append(reward[i])  # Stack rewards

        # Unpack observations into data structure compatible with agent Policy
        agents_obs = unpack_env_obs(env_obs)
        load_info(agents, narrate=False)   # Load this step's info into the agents

        for i in range(num_agents):
            US_hits[i] += agents[i].US_hit
            THEM_hits[i] += agents[i].THEM_hit

        if render:
            env.render()
            time.sleep(1/15)  # Change speed of video rendering

        if any(done):
            print("Done after {} frames".format(frame))
            break

    # Collect this episode's statistics of the AI agents
    ep_rewards = 0
    ep_tags = 0
    ep_US_hits = 0
    ep_THEM_hits = 0

    if verbose:
        print ('\nStatistics by Agent')
        print ('===================')
    for i in range(num_ai_agents):
        agent_tags = sum(agents[i].tag_hist)
        ep_tags += agent_tags
        cum_agent_tags[i] += agent_tags

        agent_reward = sum(agents[i].rewards)
        ep_rewards += agent_reward
        cum_agent_rewards[i] += agent_reward

        agent_US_hits = sum(agents[i].US_hits)
        agent_THEM_hits = sum(agents[i].THEM_hits)
        ep_US_hits += agent_US_hits
        ep_THEM_hits += agent_THEM_hits
        cum_agent_US_hits[i] += agent_US_hits
        cum_agent_THEM_hits[i] += agent_THEM_hits

        if verbose:
            # `frame` is the zero-based index of the last frame played, so the
            # number of frames played is frame + 1. Dividing by `frame` was off
            # by one and raised ZeroDivisionError when an episode ended on its
            # first frame.
            print ("Agent{} aggressiveness is {:.2f}".format(i, agent_tags/(frame + 1)))
            print ("Agent{} reward is {:d}".format(i, agent_reward))
            print('US agents hit = {}'.format(agent_US_hits))
            print('THEM agents hit = {}'.format(agent_THEM_hits ))

    cum_rewards += ep_rewards
    cum_tags += ep_tags
    cum_US_hits += ep_US_hits
    cum_THEM_hits += ep_THEM_hits

    if verbose:
        print ('\nStatistics in Aggregate')
        print ('=======================')
        print ('Total rewards gathered = {}'.format(ep_rewards))
        print ('Num laser fired = {}'.format(ep_tags))
        print ('Total US Hit (friendly fire) = {}'.format(ep_US_hits))
        print ('Total THEM Hit = {}'.format(ep_THEM_hits))
        # 1e-7 guards against division by zero when nobody was hit
        print ('friendly fire (%) = {0:.3f}'.format(ep_US_hits/(ep_US_hits+ep_THEM_hits+1e-7)))

    if verbose:
        print ('\nStatistics by Tribe')
        print ('===================')
    for i, t in enumerate(tribes):
        # Compare the name by value; `is not` on a string literal tests identity.
        # cum_tribe_rewards is indexed by the tribe's position in `tribes`, which
        # relies on 'Crazies' being the last tribe.
        if t.name != 'Crazies':
            ep_tribe_reward = sum(t.sum_rewards())
            cum_tribe_rewards[i] += ep_tribe_reward
            if verbose:
                print ('Tribe {} has total reward of {}'.format(t.name, ep_tribe_reward))

    # Drop the per-episode histories so the next episode starts from empty lists
    for i in range(num_ai_agents):
        agents[i].clear_history()

env.close()  # Close the rendering window
end = time.time()

# Every figure below is the cumulative total divided by the number of episodes.
print ('\nAverage Statistics in Aggregate')
print ('=================================')
print ('Total rewards gathered = {:.1f}'.format(cum_rewards/max_episodes))
print ('Num laser fired = {:.1f}'.format(cum_tags/max_episodes))
print ('Total US Hit (friendly fire) = {:.1f}'.format(cum_US_hits/max_episodes))
print ('Total THEM Hit = {:.1f}'.format(cum_THEM_hits/max_episodes))
# 1e-7 guards against division by zero when nobody was hit
print ('friendly fire (%) = {:.3f}'.format(cum_US_hits/(cum_US_hits+cum_THEM_hits+1e-7)))

print ('\nAverage Statistics by Tribe')
print ('=============================')
for i, t in enumerate(tribes):
    # Compare the name by value; `is not` on a string literal tests identity.
    # cum_tribe_rewards is indexed by the tribe's position in `tribes`.
    if t.name != 'Crazies':
        print ('Tribe {} has total reward of {:.1f}'.format(t.name, cum_tribe_rewards[i]/max_episodes))

print ('\nAverage Statistics by Agent')
print ('=============================')
for i in range(num_ai_agents):
    # NOTE(review): the denominator assumes every episode ran for max_frames
    # frames; episodes that ended early make this an underestimate.
    print ("Agent{} of {} aggressiveness is {:.2f}".format(
        i, agents[i].tribe, cum_agent_tags[i]/(max_episodes*max_frames)))
    print ("Agent{} reward is {:.1f}".format(i, cum_agent_rewards[i]/max_episodes))
    print('US agents hit = {:.1f}'.format(cum_agent_US_hits[i]/max_episodes))
    print('THEM agents hit = {:.1f}'.format(cum_agent_THEM_hits[i]/max_episodes))

# This cell only evaluates saved models, so the timing is reported per
# evaluation episode (the label used to say "Training time per epochs").
print('Evaluation time per episode: {:.2f} sec'.format((end-start)/max_episodes))


Load saved model for agent 0
Load saved model for agent 1
Load saved model for agent 2
Load saved model for agent 3
Load saved model for agent 4
Load saved model for agent 5
Load saved model for agent 6
Load saved model for agent 7
Load random agent 8
..............................
Average Statistics in Aggregate
Total rewards gathered = 427.3
Num laser fired = 378.1
Total US Hit (friendly fire) = 20.0
Total THEM Hit = 131.0
friendly fire (%) = 0.133

Average Statistics by Tribe
Tribe Vikings has total reward of 96.0
Tribe Saxons has total reward of 219.6
Tribe Franks has total reward of 111.7

Average Statistics by Agent
Agent0 of Vikings aggressiveness is 0.00
Agent0 reward is 36.7
US agents hit = 0.0
THEM agents hit = 0.0
Agent1 of Vikings aggressiveness is 0.00
Agent1 reward is 28.2
US agents hit = 0.0
THEM agents hit = 0.0
Agent2 of Vikings aggressiveness is 0.00
Agent2 reward is 31.2
US agents hit = 0.0
THEM agents hit = 0.0
Agent3 of Saxons aggressiveness is 0.25
Agent3 reward i