In [1]:
import time
from unityagents import UnityEnvironment
import numpy as np
import torch
import random
from agent import Agent
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Launch the Reacher build from the 20-agent folder with rendering disabled.
env = UnityEnvironment(
    file_name='Env_20_Agents/Reacher_Linux/Reacher.x86_64',
    no_graphics=True,
)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [3]:
# Get the default brain: the first name listed by the environment is used
# as the key for every env.reset()/env.step() result later in the notebook.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [4]:
# Start a fresh episode in training mode and keep the info for our brain.
env_info = env.reset(train_mode=True)[brain_name]

# Collect the dimensions needed further down before reporting them.
num_agents = len(env_info.agents)              # one entry per agent
action_size = brain.vector_action_space_size   # size of each action
states = env_info.vector_observations          # one observation row per agent
state_size = states.shape[1]

# Report what the environment looks like.
print('Number of agents:', num_agents)
print('Size of each action:', action_size)
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 20
Size of each action: 4
There are 20 agents. Each observes a state with length: 33
The state for the first agent looks like: [ 0.00000000e+00 -4.00000000e+00  0.00000000e+00  1.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -1.00000000e+01  0.00000000e+00
  1.00000000e+00 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  5.75471878e+00 -1.00000000e+00
  5.55726624e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
 -1.68164849e-01]


In [5]:
# Hyperparameters and training conditions
random_seed = 0        # forwarded to the Agent constructor
training_mode = True   # used as train_mode for env.reset() inside ddpg_train
n_episodes = 250       # number of episodes handed to ddpg_train
max_eps = 1.0          # forwarded to the Agent constructor
eps_decay = 0.995      # forwarded to the Agent constructor
min_eps = 0.01         # NOTE(review): not referenced by any cell shown here

In [6]:
# Build the agent, which sets up the feed-forward actor and critic networks.
agent = Agent(
    state_size,
    action_size,
    random_seed,
    max_eps,
    eps_decay,
)

#train the agent
def ddpg_train(n_episodes):
    """Train the notebook's global ``agent`` for ``n_episodes`` episodes.

    Every episode resets the environment, then steps all agents in parallel
    until any of them reports ``done``. Each per-agent transition is handed
    to ``agent.step``. Actor and critic weights are saved every 10 episodes
    and whenever the mean over the score window (last 100 episodes) is at
    least 30.

    Relies on the notebook globals ``env``, ``brain_name``, ``agent``,
    ``num_agents`` and ``training_mode``.

    Args:
        n_episodes: Number of training episodes to run.

    Returns:
        list: One entry per episode; each entry is a NumPy array of length
        ``num_agents`` holding the reward accumulated by every agent.
    """
    # scores of every episode
    scores = []
    # scores of the last 100 episodes
    scores_window = deque(maxlen=100)
    # reference point for the total training duration
    start_time = time.time()
    for episode in range(n_episodes):
        # reference point for the duration of this episode
        start_episode = time.time()
        # reset the environment
        env_info = env.reset(train_mode=training_mode)[brain_name]
        cur_states = env_info.vector_observations
        score = np.zeros(num_agents)
        # initialize timestep
        timestep = 0
        # reset the agent before the episode starts (original note: resets the noise)
        agent.reset()
        while True:
            # let the agent choose actions for the current states
            actions = agent.act(cur_states, add_noise=True)
            # perform the actions and read back the environment info
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            dones = env_info.local_done
            rewards = env_info.rewards
            # hand each agent's transition to the learner individually
            for cur_state, action, reward, next_state, done in zip(
                    cur_states, actions, rewards, next_states, dones):
                agent.step(cur_state, action, reward, next_state, done, timestep)
            # roll over to the new states
            cur_states = next_states
            # add rewards to the per-agent scores
            score += rewards
            # count timestep
            timestep += 1

            # the episode ends as soon as any agent is done
            if np.any(dones):
                break

        # elapsed times; must be numbers, because time.gmtime(None) would
        # format the current wall-clock time instead of a duration
        time_episode = time.time() - start_episode
        time_entire = time.time() - start_time
        # save most recent score
        scores_window.append(score)
        scores.append(score)
        mean_score = np.mean(scores_window)
        print("Episode:" + str(episode) + " Score:" + str(score) +
              " Mean Score(100 episodes):" + str(mean_score) +
              " Duration episode:" + time.strftime('%Mm%Ss', time.gmtime(time_episode)) +
              " Duration training:" + time.strftime('%Mm%Ss', time.gmtime(time_entire)))

        # save model weights
        if (episode+1) % 10 == 0 or mean_score >= 30:
            torch.save(agent.actor_local.state_dict(), 'checkpoint_Actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_Critic.pth')
            if mean_score >= 30:
                print("Environment solved in " + str(episode) + " episodes. Mean score over all 20 agents " +
                      str(mean_score) + " for the last 100 episodes")

    return scores

# Run the training loop and keep the per-episode scores.
scores = ddpg_train(n_episodes)

# Plot the scores. Each episode entry holds one value per agent, so this
# draws one curve per agent.
fig, ax = plt.subplots()
episode_axis = np.arange(len(scores))
ax.plot(episode_axis, scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode')
plt.show()

RuntimeError: The size of tensor a (2560) must match the size of tensor b (128) at non-singleton dimension 0

In [None]:
# Close the environment now that the notebook is finished with it.
env.close()