## Setup

In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from unityagents import UnityEnvironment
from agents.utils import get_config, smooth
from agents.agents import DDPGAgent, TD3Agent
from tqdm.auto import tqdm

In [2]:
# Global configuration: file locations used by the rest of the notebook.
# Paths of the pretrained actor and critic weights passed to agent.load().
ACTOR_WEIGHTS_PATH = 'agents/weights/pretrained_actor.pt'
CRITIC_WEIGHTS_PATH = 'agents/weights/pretrained_critic.pt'

######################################## Define this as explained in README.md ##########################################
# Path of the Reacher executable handed to UnityEnvironment(file_name=...).
# NOTE(review): the default looks like a Windows build; adjust it for your platform.
REACHER_PATH = 'environment/reacher_windows/Reacher.exe'
#########################################################################################################################

In [3]:
# Initialize agent and environment.
# The agent is built and its weights are restored *before* the Unity
# environment is launched, so that a failure here (e.g. a missing weights
# file) does not leave an environment process running in the background.
agent_config = get_config()
agent = TD3Agent(config=agent_config)

# Set exploration to zero
agent.train_mode = False

# Load agent's weights:
agent.load(ACTOR_WEIGHTS_PATH, CRITIC_WEIGHTS_PATH)

# Launch the Unity environment only once the agent is ready
env = UnityEnvironment(file_name=REACHER_PATH)

# Get the default brain (the first one exposed by the environment)
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


## Play Time

In [4]:
def play_one(agent, env, brain_name, train_mode=True):
    """Play a single episode and return the accumulated reward.

    Only the entry at index 0 of the observations, rewards and done flags
    reported for ``brain_name`` is used.

    Args:
        agent: Object exposing ``act(observation)``; its result is passed
            unchanged to ``env.step``.
        env: Environment exposing ``reset(train_mode=...)`` and
            ``step(action)``, each returning a mapping indexed by brain name.
        brain_name: Key used to select the brain's info from that mapping.
        train_mode: Forwarded as-is to ``env.reset``.

    Returns:
        The sum of ``rewards[0]`` over every step of the episode.
    """
    total_reward = 0

    # Reset the environment and read the starting observation
    info = env.reset(train_mode=train_mode)[brain_name]
    state = info.vector_observations[0]

    # At least one step is always taken; stop once the done flag is raised
    while True:
        info = env.step(agent.act(state))[brain_name]
        state = info.vector_observations[0]
        total_reward += info.rewards[0]
        if info.local_done[0]:
            break

    return total_reward

## Evaluate

In [6]:
# Evaluation protocol: NUM_RUNS independent runs of EPISODES_PER_RUN episodes.
# Named once here so the array size, the loop bound and the printed report
# cannot drift apart.
NUM_RUNS = 10
EPISODES_PER_RUN = 100

# Set exploration to zero:
agent.epsilon = 0

# Play EPISODES_PER_RUN episodes and print the average return (repeated for NUM_RUNS runs):
for i in tqdm(range(NUM_RUNS)):
    # Per-episode returns of the current run
    cache = np.zeros(EPISODES_PER_RUN)
    for j in range(EPISODES_PER_RUN):
        cache[j] = play_one(agent, env, brain_name)
    print(f'In run {i+1} the Agent received an average return of {np.mean(cache)} over {EPISODES_PER_RUN} episodes.')

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

In run 1 the Agent received an average return of 38.8969991305843 over 100 episodes.
In run 2 the Agent received an average return of 39.02369912775234 over 100 episodes.
In run 3 the Agent received an average return of 39.07129912668839 over 100 episodes.
In run 4 the Agent received an average return of 38.84899913165718 over 100 episodes.
In run 5 the Agent received an average return of 39.1340991252847 over 100 episodes.
In run 6 the Agent received an average return of 39.06709912678227 over 100 episodes.
In run 7 the Agent received an average return of 38.98389912864193 over 100 episodes.
In run 8 the Agent received an average return of 39.15019912492484 over 100 episodes.
In run 9 the Agent received an average return of 39.075299126598985 over 100 episodes.
In run 10 the Agent received an average return of 38.82129913227632 over 100 episodes.



## Watch

In [5]:
# Watch Time: play a single episode with train_mode=False (forwarded to env.reset).
# NOTE(review): presumably this makes Unity render at normal speed — confirm against the unityagents docs.
play_one(agent, env, brain_name, train_mode=False)

39.35999912023544

## Cleanup

In [6]:
# Shutdown: close the Unity environment once all episodes have been played.
env.close()