## Setup

In [6]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from unityagents import UnityEnvironment
from agents.utils import get_config, smooth
from agents.agents import DeepQAgent
from tqdm.auto import tqdm

In [7]:
# Global configuration used by the cells below.

# Checkpoint file passed to agent.load() when the agent is initialized.
WEIGHTS_PATH = 'agents/weights/pretrained_network.pt'

####################################### Define this as explained in README.md ###########################################
# File name handed to UnityEnvironment(file_name=...). The value below points at a
# Windows executable, so adjust it to match your local environment build.
BANANA_PATH = 'environment/banana_windows/Banana.exe'
#########################################################################################################################

In [8]:
# Build the agent from the project configuration and open the Unity environment.
# NOTE(review): constructing UnityEnvironment presumably launches the build at
# BANANA_PATH as an external process; it is released by env.close() in the Cleanup cell.
agent_config = get_config()
agent = DeepQAgent(config=agent_config)
env = UnityEnvironment(file_name=BANANA_PATH)

# Set exploration to zero
# NOTE(review): presumably epsilon is the agent's random-exploration rate — confirm in DeepQAgent.
# NOTE(review): this is assigned BEFORE agent.load(); if load() also restores epsilon,
# the value set here would be overwritten — verify against DeepQAgent.load.
agent.epsilon = 0

# Load the agent's weights from the checkpoint at WEIGHTS_PATH.
agent.load(WEIGHTS_PATH)

# Use the first brain the environment exposes. brain_name is the key used to index
# the results of env.reset() / env.step() in play_one; `brain` itself is not
# referenced by any other cell shown in this notebook.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: BananaBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 37
        Number of stacked Vector Observation: 1
        Vector Action space type: discrete
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


## Play Time

In [9]:
# A function used to play one episode with a given agent
def play_one(agent, env, brain_name, train_mode=True):
    """Run a single episode and return the sum of the rewards collected.

    Only ``agent.act`` is called; nothing is stored or learned from the
    transitions.

    Args:
        agent: Object whose ``act(observation)`` returns the action to take.
        env: Environment whose ``reset(train_mode=...)`` and ``step(action)``
            results are indexed by brain name.
        brain_name: Key selecting the brain's info from those results.
        train_mode: Forwarded unchanged to ``env.reset``.

    Returns:
        The accumulated reward of the first agent (index 0) over the episode.
    """
    # Begin a fresh episode and read the first agent's starting observation.
    brain_info = env.reset(train_mode=train_mode)[brain_name]
    state = brain_info.vector_observations[0]
    total_reward = 0

    while True:
        # Let the agent choose an action and advance the environment one step.
        brain_info = env.step(agent.act(state))[brain_name]

        # The new observation becomes the input for the next decision.
        state = brain_info.vector_observations[0]
        step_reward = brain_info.rewards[0]
        finished = brain_info.local_done[0]
        total_reward += step_reward

        # The step that reports "done" still contributes its reward.
        if finished:
            return total_reward

## Evaluate

In [5]:
# Evaluate with exploration switched off.
agent.epsilon = 0

# Ten runs; each one plays 100 episodes and reports the mean episode return.
for run_index in tqdm(range(10)):
    episode_returns = np.array(
        [play_one(agent, env, brain_name) for _ in range(100)],
        dtype=np.float64,
    )
    run_number = run_index + 1
    average_return = np.mean(episode_returns)
    print(f'In run {run_number} the Agent received an average return of {average_return} over 100 episodes.')

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

In run 1 the Agent received an average return of 14.02 over 100 episodes.
In run 2 the Agent received an average return of 14.46 over 100 episodes.
In run 3 the Agent received an average return of 14.57 over 100 episodes.
In run 4 the Agent received an average return of 14.56 over 100 episodes.
In run 5 the Agent received an average return of 14.27 over 100 episodes.
In run 6 the Agent received an average return of 14.72 over 100 episodes.
In run 7 the Agent received an average return of 14.27 over 100 episodes.
In run 8 the Agent received an average return of 14.81 over 100 episodes.
In run 9 the Agent received an average return of 14.29 over 100 episodes.
In run 10 the Agent received an average return of 14.55 over 100 episodes.



## Watch

In [10]:
# Watch a single episode: train_mode=False is forwarded to env.reset() by play_one.
# NOTE(review): presumably this makes the Unity window run at normal speed — confirm in the unityagents docs.
# The episode return is the cell's last expression, so the notebook displays it.
play_one(agent, env, brain_name, train_mode=False)

17.0

## Cleanup

In [11]:
# Shut down the Unity environment opened in the Setup section.
env.close()