In [1]:
import time
from unityagents import UnityEnvironment
import numpy as np
import torch
import random
from agent import Agent
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
env = UnityEnvironment(file_name='Env_20_Agents/Reacher_Linux/Reacher.x86_64', no_graphics = True)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [3]:
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [4]:
# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space 
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 20
Size of each action: 4
There are 20 agents. Each observes a state with length: 33
The state for the first agent looks like: [ 0.00000000e+00 -4.00000000e+00  0.00000000e+00  1.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -1.00000000e+01  0.00000000e+00
  1.00000000e+00 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  5.75471878e+00 -1.00000000e+00
  5.55726624e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
 -1.68164849e-01]


In [5]:
#definition hyperparameters and trainingsconditions
n_episodes = 300
max_noice= 1.0
noice_decay = 0.995
training_mode = True
random_seed = 0

In [None]:
# Initialize Feed-forward DNNs for Actor and Critic models. 
agent = Agent(state_size, action_size, random_seed, max_noice, noice_decay)

#train the agent
def ddpg_train(n_episodes):
    #list containing scores from each episode
    scores = []
    #list containing means over last 100 episodes
    means = []
    #last 100 scores
    scores_window = deque(maxlen = 100)
    # messure time for achieving a mean score over 30
    start_time = time.time()
    for episode in range(n_episodes):
        # messure time for one episode
        start_episode = time.time()
        # Reset the enviroment
        env_info = env.reset(train_mode=training_mode)[brain_name] 
        cur_states = env_info.vector_observations
        score = np.zeros(num_agents)
        # initialize timestep
        timestep = 0
        # reset noise
        agent.reset()
        while True:
            # Choose best action for given network
            actions = agent.act(cur_states, add_noise = True)
            # Action is performed and new state, reward, info are received. 
            env_info = env.step(actions)[brain_name]
            # get next state 
            next_states = env_info.vector_observations
            # see if episode is finished
            dones = env_info.local_done
            # get reward
            rewards = env_info.rewards
            # save experience to replay buffer, perform learning step at defined interval"
            for cur_state, action, reward, next_state, done in zip(cur_states, 
                                                                   actions, 
                                                                   rewards, 
                                                                   next_states, 
                                                                   dones):
                # current state, action, reward, new state are stored in the experience replay"
                agent.step(cur_state, action, reward, next_state, done, timestep)
            # roll over new state
            cur_states = next_states      
            #add reward to score
            score += rewards
            # count timestep
            timestep+=1
            
            if np.any(dones):
                break
        
        # calculate time
        time_episode = time.time() - start_episode
        time_entire = time.time() - start_time
        #save most recent score
        scores_window.append(score)
        scores.append(score)
        mean = np.mean(scores_window)
        means.append(mean)
        print("Episode:" + str(episode) + " Score:" + str(np.mean(score)) + 
              " Mean Score(last 100 episodes):" + str(np.mean(scores_window)) +
              " Duration episode:" + time.strftime('%Mm%Ss', time.gmtime(time_episode)) + 
              " Duration training:" + time.strftime('%Hh%Mm%Ss', time.gmtime(time_entire)))
        
        # save model weights
        if (episode+1) % 10 == 0 or np.mean(scores_window) >= 30:
            torch.save(agent.actor_local.state_dict(), 'checkpoint_Actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_Critic.pth')
            if np.mean(scores_window) >= 30:
                print("Environment solved in " + str(episode) + " episodes. Mean score over all 20 agents " +
                      str(np.mean(scores_window)) + " for the last 100 episodes")
    
    return scores, means

scores, means = ddpg_train(n_episodes)

episode_scores = []
for record in scores:
    episode_scores.append(np.mean(record))

#plot the scores
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(episode_scores)), episode_scores, label = "score")
plt.plot(np.arange(len(means)), means, label = "mean")
plt.axhline(y=30, color='r', linestyle='--', label="target")
plt.ylabel('Score')
plt.xlabel('Episode')
plt.legend()
plt.show()

Episode:0 Score:1.1864999734796584 Mean Score(last 100 episodes):1.1864999734796584 Duration episode:04m20s Duration training:00h04m20s
Episode:1 Score:2.2639999493956564 Mean Score(last 100 episodes):1.7252499614376575 Duration episode:04m33s Duration training:00h08m54s
Episode:2 Score:3.910499912593514 Mean Score(last 100 episodes):2.453666611822943 Duration episode:04m33s Duration training:00h13m27s
Episode:3 Score:4.706499894801527 Mean Score(last 100 episodes):3.016874932567589 Duration episode:04m34s Duration training:00h18m02s
Episode:4 Score:5.271499882172793 Mean Score(last 100 episodes):3.46779992248863 Duration episode:04m35s Duration training:00h22m38s
Episode:5 Score:4.630499896500259 Mean Score(last 100 episodes):3.661583251490568 Duration episode:04m42s Duration training:00h27m21s
Episode:6 Score:5.78449987070635 Mean Score(last 100 episodes):3.96485705423568 Duration episode:04m51s Duration training:00h32m13s
Episode:7 Score:6.3944998570717875 Mean Score(last 100 episod

Episode:61 Score:15.342499657068402 Mean Score(last 100 episodes):9.793023974657238 Duration episode:06m41s Duration training:05h59m40s
Episode:62 Score:15.01549966437742 Mean Score(last 100 episodes):9.875920414176607 Duration episode:06m43s Duration training:06h06m23s
Episode:63 Score:16.011499642115087 Mean Score(last 100 episodes):9.971788839613145 Duration episode:06m39s Duration training:06h13m03s
Episode:64 Score:16.467499631922692 Mean Score(last 100 episodes):10.071722851802523 Duration episode:06m41s Duration training:06h19m45s
Episode:65 Score:15.850499645713716 Mean Score(last 100 episodes):10.159280075952694 Duration episode:07m00s Duration training:06h26m45s
Episode:66 Score:16.995999620109796 Mean Score(last 100 episodes):10.261320666163993 Duration episode:06m40s Duration training:06h33m26s
Episode:67 Score:12.177999727800488 Mean Score(last 100 episodes):10.289507122952765 Duration episode:06m43s Duration training:06h40m10s
Episode:68 Score:15.05149966357276 Mean Score

Episode:121 Score:31.609999293461442 Mean Score(last 100 episodes):19.454699565153568 Duration episode:06m55s Duration training:12h44m15s
Episode:122 Score:30.7449993127957 Mean Score(last 100 episodes):19.66081956054643 Duration episode:06m42s Duration training:12h50m57s
Episode:123 Score:30.23999932408333 Mean Score(last 100 episodes):19.862174556045794 Duration episode:07m03s Duration training:12h58m00s
Episode:124 Score:32.20849928008393 Mean Score(last 100 episodes):20.09402455086354 Duration episode:06m47s Duration training:13h04m47s
Episode:125 Score:31.594999293796718 Mean Score(last 100 episodes):20.323054545744323 Duration episode:06m45s Duration training:13h11m33s
Episode:126 Score:32.88949926486239 Mean Score(last 100 episodes):20.546044540760107 Duration episode:06m42s Duration training:13h18m15s
Episode:127 Score:33.43249925272539 Mean Score(last 100 episodes):20.775159535638988 Duration episode:06m40s Duration training:13h24m56s
Episode:128 Score:35.14149921452626 Mean S

Episode:181 Score:34.18499923590571 Mean Score(last 100 episodes):28.59978436074499 Duration episode:06m45s Duration training:19h33m05s
Episode:182 Score:26.508999407477678 Mean Score(last 100 episodes):28.6352443599524 Duration episode:06m45s Duration training:19h39m50s
Episode:183 Score:33.49099925141782 Mean Score(last 100 episodes):28.739839357614517 Duration episode:07m13s Duration training:19h47m03s
Episode:184 Score:31.891499287169427 Mean Score(last 100 episodes):28.788259356532247 Duration episode:06m47s Duration training:19h53m50s
Episode:185 Score:21.955499509256335 Mean Score(last 100 episodes):28.741459357578307 Duration episode:07m07s Duration training:20h00m57s
Episode:186 Score:13.364999701268971 Mean Score(last 100 episodes):28.59432936086692 Duration episode:06m46s Duration training:20h07m44s
Episode:187 Score:23.236999480612575 Mean Score(last 100 episodes):28.55643436171394 Duration episode:07m00s Duration training:20h14m44s
Episode:188 Score:26.129999415948987 Mean

Episode:241 Score:22.25299950260669 Mean Score(last 100 episodes):27.048509395418687 Duration episode:07m00s Duration training:02h27m17s
Episode:242 Score:21.017499530222267 Mean Score(last 100 episodes):27.002624396444297 Duration episode:07m02s Duration training:02h34m19s
Episode:243 Score:27.34749938873574 Mean Score(last 100 episodes):27.029219395849854 Duration episode:07m00s Duration training:02h41m19s
Episode:244 Score:33.829999243840575 Mean Score(last 100 episodes):27.096159394353627 Duration episode:07m10s Duration training:02h48m30s
Episode:245 Score:29.064999350346625 Mean Score(last 100 episodes):27.117864393868484 Duration episode:07m01s Duration training:02h55m31s
Episode:246 Score:19.72549955910072 Mean Score(last 100 episodes):27.081519394680857 Duration episode:07m05s Duration training:03h02m37s
Episode:247 Score:35.74999920092523 Mean Score(last 100 episodes):27.136089393461123 Duration episode:07m02s Duration training:03h09m39s
Episode:248 Score:34.81149922190234 Me

In [None]:
env.close()