The purpose of this notebook is to test that the Unity environment works as expected when it is driven and trained through the ML-Agents Python API.

In [1]:
from mlagents.envs.environment import UnityEnvironment
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import torch

In [3]:
# Print whether PyTorch reports a usable CUDA device.
print(torch.cuda.is_available())

True


In [4]:
from ddpg_agent import Agent

## Start the environment 

In [4]:
# Create the UnityEnvironment from the crawler1 build; the commented-out line
# below points at an alternative build (crawler_new).
# env = UnityEnvironment(file_name='../builds/crawler_new/Unity Environment.exe')
env = UnityEnvironment(file_name='../builds/crawler1/Unity Environment.exe')

INFO:mlagents.envs:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Training Brains : 0
        Reset Parameters :
		



Environments contain **_brains_** which are responsible for deciding the actions of their associated agents. Here we check for the first brain available, and set it as the default brain we will be controlling from Python.

In [9]:
# Get the default brain: the first external brain name is the key used to index
# every env.reset()/env.step() result in the cells below.

default_brain = env.external_brain_names[0]
brain = env.brains[default_brain]

### Examine the State and Action Spaces

Run the code cell below to print some information about the environment.

In [10]:
# Start a fresh episode in training mode and keep the info for the default brain.
env_info = env.reset(train_mode=True)[default_brain]

# Collect the quantities of interest first...
states = env_info.vector_observations
num_agents = len(env_info.agents)               # one entry per agent
action_size = brain.vector_action_space_size    # kept exactly as the brain reports it
state_size = states.shape[1]                    # length of one agent's observation vector

# ...then report them in a single place.
print('Number of agents:', num_agents)
print('Size of each action:', action_size)
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 10
Size of each action: [20]
There are 10 agents. Each observes a state with length: 126
The state for the first agent looks like: [ 2.19138432e+00 -7.09163305e-06 -1.32700126e-03  9.99998987e-01
 -5.73529069e-05  9.99999106e-01  1.32700091e-03  0.00000000e+00
 -6.33711927e-04 -9.31617022e-01 -1.29005651e-03 -3.32853175e-03
  1.04987113e-04  1.82529297e-02  0.00000000e+00 -2.56439894e-01
  4.45060998e-01  2.56952643e-01  2.08717847e+00 -2.79050646e-03
  2.10881615e+00  5.92227638e-01 -9.86313820e-03 -5.92277110e-01
  5.00000000e-01  5.00000000e-01  0.00000000e+00  5.00000000e-01
  0.00000000e+00 -1.57400322e+00 -2.47904801e+00  1.57905018e+00
 -3.33920741e+00  3.92939243e-03 -3.31237078e+00  1.25328040e+00
 -2.69466788e-01 -1.25354970e+00  5.00000000e-01  0.00000000e+00
  0.00000000e+00  5.00000000e-01  0.00000000e+00  2.56088346e-01
  4.12214011e-01  2.56530493e-01  2.01774788e+00  1.13008032e-03
 -1.99914312e+00 -5.92019737e-01 -8.28386750e-03 -5.91978252e-01
  5.00

### Take Random Actions in the Environment

Control the agents and receive feedback from the environment using the Python API.

Once this cell is executed, you can watch how the agents perform when they select an action at random at each time step. A window should pop up that allows you to observe the agents as they move through the environment.

In [11]:
# One episode of purely random actions with train_mode=False.
env_info = env.reset(train_mode=False)[default_brain]
states = env_info.vector_observations      # current observation of every agent
scores = np.zeros(num_agents)              # running return of every agent
while True:
    # Standard-normal samples, one row per agent, clipped into [-1, 1].
    actions = np.clip(np.random.randn(num_agents, int(action_size[0])), -1, 1)
    # Send the whole action batch to the environment and read back the result.
    env_info = env.step(actions)[default_brain]
    next_states = env_info.vector_observations
    rewards = env_info.rewards
    dones = env_info.local_done
    # Accumulate the per-agent rewards and advance to the next time step.
    scores += rewards
    states = next_states
    # The episode is over as soon as any single agent reports done.
    if np.any(dones):
        break
print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))

Total score (averaged over agents) this episode: 0.051601926982402804


In [71]:
def validate(agent, max_t, train_mode=True):
    """Run one noise-free evaluation episode and return the per-agent scores.

    The environment is reset, then the agent acts with ``add_noise=False`` for
    at most ``max_t`` steps, stopping early as soon as any agent reports done.
    Nothing is passed to ``agent.step``, so this function itself does not
    trigger any learning update.

    Relies on the notebook-level ``env`` and ``default_brain``.

    Args:
        agent: object exposing ``reset()`` and ``act(states, add_noise=...)``.
        max_t: maximum number of environment steps in the episode.
        train_mode: forwarded to ``env.reset``.

    Returns:
        numpy array holding the summed reward of each agent for the episode.
    """
    env_info = env.reset(train_mode=train_mode)[default_brain]
    agent.reset()
    states = env_info.vector_observations
    scores_one_episode = np.zeros(states.shape[0])
    for _ in range(max_t):
        actions = agent.act(states, add_noise=False)     # select an action (for each agent)
        env_info = env.step(actions)[default_brain]      # send all actions to the environment
        scores_one_episode += env_info.rewards           # update the score (for each agent)
        states = env_info.vector_observations            # roll over states to next time step
        if np.any(env_info.local_done):                  # exit loop if episode finished
            break
    return scores_one_episode

## With training mode

In [12]:
# One episode of purely random actions with train_mode=True.
env_info = env.reset(train_mode=True)[default_brain]
states = env_info.vector_observations      # current observation of every agent
scores = np.zeros(num_agents)              # running return of every agent
while True:
    # Standard-normal samples, one row per agent, clipped into [-1, 1].
    actions = np.clip(np.random.randn(num_agents, int(action_size[0])), -1, 1)
    # Send the whole action batch to the environment and read back the result.
    env_info = env.step(actions)[default_brain]
    next_states = env_info.vector_observations
    rewards = env_info.rewards
    dones = env_info.local_done
    # Accumulate the per-agent rewards and advance to the next time step.
    scores += rewards
    states = next_states
    # The episode is over as soon as any single agent reports done.
    if np.any(dones):
        break
print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))

Total score (averaged over agents) this episode: -0.5285558249306632


In [19]:

# Build the agent from the sizes measured when the environment was examined.
# NOTE(review): action_size is whatever brain.vector_action_space_size returned
# (the recorded output shows the list [20]) — confirm Agent accepts that form.
agent = Agent(state_size=state_size, action_size=action_size, random_seed=15)
# Window length (in episodes) of the running average kept by ddpg().
avg_over = 100
# ddpg() saves checkpoints and prints a summary line every `print_every` episodes.
print_every = 10

def ddpg(n_episodes=200):
    """Train the notebook-level ``agent`` on ``env`` and return the score history.

    Every episode resets the environment in training mode, lets the agent act
    with exploration noise until any agent reports done, and hands each
    transition (plus the step index) to ``agent.step``. Actor and critic
    weights are checkpointed every ``print_every`` episodes, and saved once as
    the "best" checkpoint the first time the running average reaches 30.0.

    Relies on these notebook globals: ``env``, ``agent``, ``default_brain``,
    ``num_agents``, ``avg_over`` and ``print_every``.

    Args:
        n_episodes: number of training episodes to run.

    Returns:
        Tuple ``(scores_global, average_global)``: the mean score over agents
        for each episode, and the running mean of those scores over the last
        ``avg_over`` episodes.
    """
    # Imported locally because no other cell of the notebook imports these
    # names; without them the function fails with NameError on a fresh kernel.
    import time
    from collections import deque
    from itertools import count

    scores_deque = deque(maxlen=avg_over)
    scores_global = []
    average_global = []
    best_avg = -np.inf
    solved = False
    tic = time.time()

    for i_episode in range(1, n_episodes+1):
        # `default_brain` is the brain key defined by the notebook.
        env_info = env.reset(train_mode=True)[default_brain]   # reset the environment
        states = env_info.vector_observations                  # get the current state (for each agent)
        scores = np.zeros(num_agents)                          # initialize the score (for each agent)
        agent.reset()

        timestep = time.time()
        for t in count():
            actions = agent.act(states, add_noise=True)
            env_info = env.step(actions)[default_brain]        # send all actions to the environment
            next_states = env_info.vector_observations         # get next state (for each agent)
            rewards = env_info.rewards                         # get reward (for each agent)
            dones = env_info.local_done                        # see if episode finished
            agent.step(states, actions, rewards, next_states, dones, t)
            states = next_states                               # roll over states to next time step
            scores += rewards                                  # update the score (for each agent)
            if np.any(dones):                                  # exit loop if episode finished
                break

        score = np.mean(scores)
        scores_deque.append(score)
        score_average = np.mean(scores_deque)
        scores_global.append(score)
        average_global.append(score_average)

        print('\rEpisode {}, Average Score: {:.2f}, Max Score: {:.2f}, Min Score: {:.2f}, Time per Episode: {:.2f}'\
              .format(i_episode, score_average, np.max(scores), np.min(scores), time.time() - timestep), end="\n")

        # Periodic checkpoint of the local actor/critic networks.
        if i_episode % print_every == 0:
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))

        # Report "solved" (and save the best-checkpoint files) only once.
        if score_average >= 30.0:
            if not solved:
                toc = time.time()
                print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}, training time: {}'.format(i_episode, score_average, toc-tic))
                solved = True
                torch.save(agent.actor_local.state_dict(), 'best_checkpoint_actor.pth')
                torch.save(agent.critic_local.state_dict(), 'best_checkpoint_critic.pth')
        # Announce every new best running average above 37.0.
        if score_average >=37.0 and score_average > best_avg:
            best_avg = score_average
            print('\nEnvironment best average (above 37.0) in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, score_average))

    return scores_global, average_global

# Train with the default episode budget, then chart the outcome.
scores, averages = ddpg()

# Both panels share the same 1-based episode axis.
episode_numbers = np.arange(1, len(scores)+1)
fig, (score_ax, average_ax) = plt.subplots(2, 1)

# Top panel: raw score of every episode.
score_ax.plot(episode_numbers, scores)
score_ax.set_ylabel('Score')
score_ax.set_xlabel('Episode #')

# Bottom panel: running average returned by ddpg().
average_ax.plot(episode_numbers, averages)
average_ax.set_ylabel('Average Score')
average_ax.set_xlabel('Episode #')
plt.show()

RuntimeError: CUDA error (10): invalid device ordinal

In [12]:
def train(agent, n_episodes=500, max_t=500, train_mode=True):
    """Train ``agent`` and score it with a separate evaluation run after each episode.

    Each training episode runs for at most ``max_t`` steps with exploration
    noise, feeding every transition to ``agent.step``. The score recorded for
    the episode is the average of what ``validate`` returns afterwards, not
    the reward collected while exploring. The agent is saved every 100
    episodes; training stops early (after one more save) once at least 100
    scores have been collected and their mean is 30.0 or higher.

    Relies on the notebook-level ``env`` and ``default_brain``, and on a
    notebook-level ``validate(agent, max_t, train_mode=...)`` helper being
    defined before this function is called.

    Args:
        agent: object exposing ``reset``, ``act``, ``step`` and ``save``.
        n_episodes: maximum number of training episodes.
        max_t: maximum number of steps per episode (training and evaluation).
        train_mode: forwarded to ``env.reset`` and to ``validate``.

    Returns:
        List with the average evaluation score of each completed episode.
    """
    # Imported locally because no other cell of the notebook imports deque;
    # without it the function fails with NameError on a fresh kernel.
    from collections import deque

    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    for i_episode in range(1, n_episodes+1):
        # `default_brain` is the brain key defined by the notebook.
        env_info = env.reset(train_mode=train_mode)[default_brain]
        states = env_info.vector_observations
        agent.reset()
        for t in range(max_t):
            actions = agent.act(states, add_noise=True)              # select an action (for each agent)
            env_info = env.step(actions)[default_brain]              # send all actions to the environment
            next_states = env_info.vector_observations               # get next state (for each agent)
            rewards = env_info.rewards                               # get reward (for each agent)
            dones = env_info.local_done                              # see if episode finished
            agent.step(states, actions, rewards, next_states, dones) # learn
            states = next_states                                     # roll over states to next time step
            if np.any(dones):                                        # exit loop if episode finished
                break
        # The reported score comes from a separate run of validate().
        scores_one_episode = validate(agent, max_t, train_mode=train_mode)
        score = np.average(scores_one_episode)
        scores.append(score)
        scores_window.append(score)
        mean_100 = np.mean(scores_window)
        print('\rEpisode {}\tAverage Score: {:.3f}\tLast Score: {:.3f}\tMax Score: {:.3f}'.format(i_episode, 
                                                                                          mean_100, 
                                                                                          score,
                                                                                         np.max(scores_one_episode)), end="")
        if i_episode % 100 == 0:
            # Blank out the in-place progress line before printing the summary.
            print(' '*300, end="")
            print('\rEpisode {}\tAverage Score: {:.3f}\tMax Score: {:.3f}'.format(i_episode, mean_100, np.max(scores_window)))
            agent.save()
        if len(scores_window) >= 100 and np.mean(scores_window)>=30.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.3f}'.format(i_episode, mean_100))
            agent.save()
            break
    return scores

In [13]:
from agents.utils import ReplayBuffer, ParameterNoise, ActionNoise
from agents.parallel_ddpg import ParallelDDPG
from agents.models_success import Actor, Critic
import random

# Use the first CUDA device when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
#device = "cpu"
torch.set_num_threads(3)
random_seed = 0

# Observation batch and action size, taken from the most recent env_info / brain.
states = env_info.vector_observations
action_size = brain.vector_action_space_size

# Replay memory shared by the agent; positional arguments kept in the original order.
memory = ReplayBuffer(action_size, device, int(1e5), 64, random_seed)
agent = ParallelDDPG(
    states.shape[1], action_size, random_seed, states.shape[0],
    memory, Actor, Critic, device,
    TAU=1e-3,
    UPDATE_EVERY=20,
    TRANSFER_EVERY=1,
    UPDATE_LOOP=10,
    ADD_NOISE_EVERY=1,
    BOOTSTRAP_SIZE=4,
    LR_CRITIC = 1e-4,
    LR_ACTOR = 1e-3,
)
# noise = ParameterNoise(agent.actor_target, device, random_seed, mu=0., theta=0.3, sigma=0.05)

# Seed Python's RNG so the randomly drawn theta/sigma values are repeatable.
random.seed(random_seed)
# NOTE(review): one noise process is created per state dimension (states.shape[1]);
# confirm this is intended rather than one per agent (states.shape[0]).
noise = [
    ActionNoise(action_size, device, random_seed + i*10, mu=-0.5,
                theta=0.1*random.random(), sigma=0.1*random.random())
    for i in range(int(states.shape[1]))
]
agent.set_noise(noise)
# Check that saving and loading are working
# agent.save()
# agent.load()

RuntimeError: CUDA error (10): invalid device ordinal

In [20]:
# Close the Unity environment once the experiments are finished.
env.close()

INFO:mlagents.envs:Environment shut down with return code 0 (CTRL_C_EVENT).
