## Setup

In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from unityagents import UnityEnvironment
from agents.utils import get_config, smooth
from agents.agents import DDPGAgent, TD3Agent
from tqdm.auto import tqdm

In [2]:
# Pretrained weight files, grouped per agent as (actor, critic)
FIRST_ACTOR_WEIGHTS_PATH, FIRST_CRITIC_WEIGHTS_PATH = (
    'agents/weights/pretrained_first_actor.pt',
    'agents/weights/pretrained_first_critic.pt',
)
SECOND_ACTOR_WEIGHTS_PATH, SECOND_CRITIC_WEIGHTS_PATH = (
    'agents/weights/pretrained_second_actor.pt',
    'agents/weights/pretrained_second_critic.pt',
)

# ---------------------------------------------------------------------------
# USER SETTING: point this at your local Tennis build, as explained in README.md
# ---------------------------------------------------------------------------
TENNIS_PATH = 'environment/tennis_windows/Tennis.exe'

In [3]:
# Build both TD3 agents from one shared configuration, then launch the Unity environment
agent_config = get_config()
first_agent, second_agent = TD3Agent(config=agent_config), TD3Agent(config=agent_config)
env = UnityEnvironment(file_name=TENNIS_PATH)

# Switch both agents out of train mode (sets exploration to zero)
first_agent.train_mode = second_agent.train_mode = False

# Restore the pretrained actor/critic weights of each agent
first_agent.load(
    FIRST_ACTOR_WEIGHTS_PATH,
    FIRST_CRITIC_WEIGHTS_PATH,
)
second_agent.load(
    SECOND_ACTOR_WEIGHTS_PATH,
    SECOND_CRITIC_WEIGHTS_PATH,
)

# Use the first brain the environment exposes as the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


## Play Time

In [4]:
# A function used to play one episode with given agents
def play_one(first_agent, second_agent, env, brain_name, train_mode=True):
    """Play a single episode with two agents and return the better agent's return.

    Args:
        first_agent: Object with an ``act(observation)`` method; it is fed row 0
            of the environment's vector observations.
        second_agent: Object with an ``act(observation)`` method; it is fed row 1
            of the environment's vector observations.
        env: Environment whose ``reset(train_mode=...)`` and ``step(action)``
            each return a mapping indexed by brain name.
        brain_name: Key used to select the brain info from that mapping.
        train_mode: Forwarded unchanged to ``env.reset``.

    Returns:
        The larger of the two agents' summed rewards over the episode.
    """
    # Initialize the return
    first_agent_episode_return = 0
    second_agent_episode_return = 0

    # Start episode
    env_info = env.reset(train_mode=train_mode)[brain_name]
    first_observation = env_info.vector_observations[0, :]
    second_observation = env_info.vector_observations[1, :]
    done = False

    # Play
    while not done:
        # Each agent acts on its own observation; the actions are joined into one array
        first_action = first_agent.act(first_observation)
        second_action = second_agent.act(second_observation)
        action = np.concatenate([first_action, second_action], axis=0)
        env_info = env.step(action)[brain_name]

        # Accumulate per-agent rewards
        first_agent_episode_return += env_info.rewards[0]
        second_agent_episode_return += env_info.rewards[1]

        # Stop as soon as ANY agent reports done. Checking only index 0 would
        # keep stepping if the second agent alone signalled the end of the episode.
        done = bool(np.any(env_info.local_done))

        # Perform transition
        first_observation = env_info.vector_observations[0, :]
        second_observation = env_info.vector_observations[1, :]

    return np.maximum(first_agent_episode_return, second_agent_episode_return)

## Evaluate

In [6]:
# Evaluation protocol: N_RUNS independent runs of N_EPISODES episodes each.
# For every run, print the mean of the per-episode scores, where an episode's
# score is whatever play_one returns (the larger of the two agents' returns).
N_RUNS = 10
N_EPISODES = 100

for i in tqdm(range(N_RUNS)):
    cache = np.zeros(N_EPISODES)
    for j in range(N_EPISODES):
        cache[j] = play_one(first_agent, second_agent, env, brain_name)
    # The episode count in the message is taken from N_EPISODES so it cannot drift from the loop
    print(f'In run {i+1} the better performing Agent received an average return of {np.mean(cache)} over {N_EPISODES} episodes.')

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

In run 1 the better performing Agent received an average return of 2.527000037655234 over 100 episodes.
In run 2 the better performing Agent received an average return of 2.4430000364035367 over 100 episodes.
In run 3 the better performing Agent received an average return of 2.475000036880374 over 100 episodes.
In run 4 the better performing Agent received an average return of 2.52300003759563 over 100 episodes.
In run 5 the better performing Agent received an average return of 2.5530000380426645 over 100 episodes.
In run 6 the better performing Agent received an average return of 2.3670000352710487 over 100 episodes.
In run 7 the better performing Agent received an average return of 2.4220000360906124 over 100 episodes.
In run 8 the better performing Agent received an average return of 2.4700000368058683 over 100 episodes.
In run 9 the better performing Agent received an average return of 2.3430000349134206 over 100 episodes.
In run 10 the better performing Agent received an average r

## Watch

In [5]:
# Play one episode with train_mode=False passed through to env.reset;
# the returned score is displayed as the cell's output
play_one(
    first_agent=first_agent,
    second_agent=second_agent,
    env=env,
    brain_name=brain_name,
    train_mode=False,
)

2.600000038743019

## Cleanup

In [7]:
# Shutdown: close the Unity environment opened in the initialization cell.
# Run this last, since the earlier cells use `env`.
env.close()