## Setup

In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from unityagents import UnityEnvironment
from agents.utils import get_config, smooth
from agents.agents import DoubleQAgent
from tqdm.auto import tqdm

In [2]:
# Set global variables
WEIGHTS_PATH = 'agents/weights/pretrained_network.pt'  # target passed to agent.save()
BANANA_PATH = 'environment/banana_windows/Banana.exe'  # Unity build handed to UnityEnvironment
NUM_EPISODES = 10000                                   # number of training episodes
SEED = 42                                              # single seed shared by every RNG seeded below

# Set random seeds for reproducibility
np.random.seed(SEED)
# The trailing semicolon keeps the returned torch Generator from being echoed as cell output.
torch.manual_seed(SEED);

<torch._C.Generator at 0x1d5afd56e30>

In [3]:
# Initialize agent and environment
# The agent is built from the configuration returned by get_config() (defined in agents.utils).
agent_config = get_config()
agent = DoubleQAgent(config=agent_config)
# Start the Unity environment from the build located at BANANA_PATH.
env = UnityEnvironment(file_name=BANANA_PATH)

# Get the default brain
# brain_name is the key used to index the results of env.reset() / env.step() in later cells.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: BananaBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 37
        Number of stacked Vector Observation: 1
        Vector Action space type: discrete
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


## Training Day

In [4]:
# A function used to play one episode with a given agent
def play_one(agent, env, brain_name, train_mode=True, learn=True):
    """Play a single episode and return the sum of the rewards collected.

    Only the first agent slot (index 0) of the environment's response is used.

    Parameters
    ----------
    agent : object
        Must provide ``act(observation)``; when ``learn`` is true it must also
        provide ``memento(observation, action, reward, next_observation, done)``
        and ``learn()``.
    env : object
        Must provide ``reset(train_mode=...)`` and ``step(action)``, each
        returning a mapping indexed by ``brain_name`` whose values expose
        ``vector_observations``, ``rewards`` and ``local_done``.
    brain_name : hashable
        Key selecting the brain's entry in the mappings returned by ``env``.
    train_mode : bool, default True
        Forwarded unchanged to ``env.reset``.
    learn : bool, default True
        When True (the original behaviour) every transition is stored with
        ``agent.memento`` and ``agent.learn`` is called after each step.
        Pass False to roll out the current policy without touching the
        agent's memory or parameters, e.g. for evaluation.

    Returns
    -------
    The undiscounted sum of all rewards received during the episode.
    """
    # Initialize the return
    episode_return = 0

    # Start episode
    env_info = env.reset(train_mode=train_mode)[brain_name]
    observation = env_info.vector_observations[0]
    done = False

    # Play
    while not done:
        # Interact with environment
        action = agent.act(observation)
        env_info = env.step(action)[brain_name]
        next_observation = env_info.vector_observations[0]
        reward = env_info.rewards[0]
        done = env_info.local_done[0]
        episode_return += reward

        if learn:
            # Add memory
            agent.memento(observation, action, reward, next_observation, done)
            # Train from memory
            agent.learn()

        # Perform transition
        observation = next_observation

    return episode_return

In [5]:
# Populating replay buffer with transitions collected by a uniformly random policy:
env_info = env.reset(train_mode=True)[brain_name]
observation = env_info.vector_observations[0]
done = False

print('Filling Replay Buffer...')
while not agent.memory.is_ready():
    # Sample uniformly over the action space reported by the brain instead of a hard-coded 4
    action = np.random.choice(brain.vector_action_space_size)
    env_info = env.step(action)[brain_name]
    reward = env_info.rewards[0]
    next_observation = env_info.vector_observations[0]
    done = env_info.local_done[0]

    # Add memory
    agent.memento(observation, action, reward, next_observation, done)

    if done:
        # Episode finished: start a new one and take its first observation
        env_info = env.reset(train_mode=True)[brain_name]
        observation = env_info.vector_observations[0]
    else:
        # Advance the state; without this every stored transition of an episode
        # would start from the episode's very first observation.
        observation = next_observation
        
# Now we are able to train our agent
episode_returns = np.zeros(NUM_EPISODES)
episodes_completed = 0

print('Starting Training...')
try:
    for i in tqdm(range(NUM_EPISODES)):
        episode_returns[i] = play_one(agent, env, brain_name)
        episodes_completed = i + 1
        if i % 500 == 0 and i > 0:
            print(f'Average Return over the last 100 episodes: {smooth(episode_returns[:i+1])[-1]}')
finally:
    # If training is interrupted (e.g. KeyboardInterrupt) the exception still propagates,
    # but the zero-filled slots of episodes that were never played are dropped so that
    # later plots and averages only see real returns. After a full run this is a no-op.
    episode_returns = episode_returns[:episodes_completed]

Filling Replay Buffer...
Starting Training...


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))

Average Return over the last 100 episodes: 0.25
Average Return over the last 100 episodes: -0.11
Average Return over the last 100 episodes: 0.0
Average Return over the last 100 episodes: 0.0
Average Return over the last 100 episodes: 0.03
Average Return over the last 100 episodes: 0.13


KeyboardInterrupt: 

In [None]:
# Plot the results:
# Compute the smoothed curve once and reuse it for both the plot and the summary line.
returns_smoothed = smooth(episode_returns)

fig, ax = plt.subplots(figsize=(20, 7.5))
ax.plot(episode_returns, label='Episode return')
ax.plot(returns_smoothed, color='green', label='Smoothed return')
ax.set(xlabel='Episode', ylabel='Return', title='Training returns')
ax.legend()

# Print the return over the last 100 episodes:
print(f'Over the last 100 episodes, the agent received an average return of {returns_smoothed[-1]}')

In [None]:
# Save weights
# Delegates to the agent's own save() with WEIGHTS_PATH as target
# (presumably this persists the network parameters — see DoubleQAgent.save to confirm).
agent.save(WEIGHTS_PATH)

## Evaluation

In [None]:
# Set exploration to zero:
agent.epsilon = 0

EVAL_RUNS = 10       # number of independent evaluation runs
EVAL_EPISODES = 100  # episodes averaged per run


def evaluate_one(agent, env, brain_name):
    """Play one episode with the current policy and return the summed reward.

    Unlike the training rollout, nothing is written to the agent's memory and
    agent.learn() is never called, so the policy being measured stays fixed.
    """
    env_info = env.reset(train_mode=True)[brain_name]
    observation = env_info.vector_observations[0]
    episode_return = 0
    done = False
    while not done:
        env_info = env.step(agent.act(observation))[brain_name]
        observation = env_info.vector_observations[0]
        episode_return += env_info.rewards[0]
        done = env_info.local_done[0]
    return episode_return


# Play 100 episodes and print the average reward (repeated for 10 runs):
for i in tqdm(range(EVAL_RUNS)):
    cache = np.zeros(EVAL_EPISODES)
    for j in range(EVAL_EPISODES):
        cache[j] = evaluate_one(agent, env, brain_name)
    print(f'In run {i+1} the Agent received an average return of {np.mean(cache)} over 100 episodes.')

In [None]:
# Watch Time
# train_mode=False is forwarded by play_one to env.reset()
# (presumably this makes Unity run at normal speed so the episode can be watched — TODO confirm).
# NOTE(review): as called here, play_one still stores every transition and calls agent.learn()
# on each step, so the agent keeps training while it is being watched.
# The episode return is the cell's last expression and is therefore displayed as its output.
play_one(agent, env, brain_name, train_mode=False)

In [None]:
# Shutdown
# Close the Unity environment that was created in the setup section.
env.close()