# Training a DDPG agent

In [1]:
import math
import os
import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal
from unityagents import UnityEnvironment

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Location of the Reacher executable handed to UnityEnvironment.
# Set the REACHER_ENV_PATH environment variable to point at a different build;
# when it is unset, the original hard-coded location is used unchanged.
PATH = os.environ.get(
    'REACHER_ENV_PATH',
    'C:/Users/Gebruiker/Documents/Coding testr/Reacher/multiAgent/Reacher.exe',
)

In [4]:
# Start the Unity executable found at PATH, with graphics left enabled.
env = UnityEnvironment(
    file_name=PATH,
    no_graphics=False,
)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [5]:
# Select the first brain listed by the environment. Its name is used to index
# the results of env.reset()/env.step(); the brain object itself is queried
# below for the action-space size.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [6]:
# Restart the simulation with train_mode=False and keep the info for our brain.
env_info = env.reset(train_mode=False)[brain_name]

# How many agents are being simulated at once.
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# Length of the action vector each agent has to supply.
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# Observations come back as one row per agent; the column count is the state length.
states = env_info.vector_observations
state_size = states.shape[1]
print(
    'There are {} agents. Each observes a state with length: {}'.format(
        len(states), state_size
    )
)
print('The state for the first agent looks like:', states[0])

Number of agents: 20
Size of each action: 4
There are 20 agents. Each observes a state with length: 33
The state for the first agent looks like: [ 0.00000000e+00 -4.00000000e+00  0.00000000e+00  1.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -1.00000000e+01  0.00000000e+00
  1.00000000e+00 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  5.75471878e+00 -1.00000000e+00
  5.55726624e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
 -1.68164849e-01]


In [7]:
from agent import Agent

In [8]:
# Build the learning agent from the per-agent observation length, the action
# length and the number of parallel agents reported by the environment.
# NOTE(review): the training loop feeds two stacked frames (width 2 * state_size);
# presumably Agent accounts for that internally - confirm against agent.py.
agent = Agent(state_size, action_size, num_agents)

In [None]:
# Training schedule.
epocs = 250          # how many episodes to train for
epoc_length = 1000   # environment steps taken in each episode

# Per-episode reward history, filled in by the training loop.
rewards = list()

In [None]:
# Train for `epocs` episodes of at most `epoc_length` environment steps each.
# The agent never sees a single observation: it is fed a two-frame stack made of
# the newest observation concatenated with the previous one along axis 1.
for epoc in range(epocs):
    env_info = env.reset(train_mode=True)[brain_name]
    state = env_info.vector_observations
    # There is no previous frame at the start of an episode, so the first
    # observation is stacked with itself.
    cumulative_state = np.concatenate((state, state), axis = 1)
    epoc_reward = 0

    for step in range(epoc_length):
        action = agent.get_action(cumulative_state)
        # Keep every action component inside [-1, 1] before sending it to the environment.
        action = np.clip(action, -1, 1)
        env_info = env.step(action)[brain_name]
        next_state = env_info.vector_observations
        reward = env_info.rewards
        done = env_info.local_done

        # Newest frame first, previous frame second.
        next_cumulative_state = np.concatenate((next_state, state), axis = 1)
        agent.add_replay(cumulative_state, action, reward, next_cumulative_state, done)

        agent.learning_step()

        state = next_state
        cumulative_state = next_cumulative_state
        # Accumulate this step's reward averaged over all agents.
        epoc_reward += sum(reward)/num_agents

        # Stop the rollout as soon as any agent reports a terminal step. The
        # terminal transition has already been stored and learned from above.
        # NOTE(review): presumably the environment begins a new episode after a
        # terminal step, in which case continuing would stack frames from two
        # different episodes - confirm against the environment's behaviour.
        if np.any(done):
            break

    print("epoc: {0:3} reward: {1:2.2f}".format(epoc, epoc_reward))

    rewards.append(epoc_reward)



epoc:   0 reward: 0.74
epoc:   1 reward: 1.09
epoc:   2 reward: 2.50
epoc:   3 reward: 3.65
epoc:   4 reward: 4.26
epoc:   5 reward: 5.57
epoc:   6 reward: 5.54
epoc:   7 reward: 5.61
epoc:   8 reward: 6.49
epoc:   9 reward: 6.31
epoc:  10 reward: 7.16
epoc:  11 reward: 9.18
epoc:  12 reward: 7.13
epoc:  13 reward: 6.26
epoc:  14 reward: 6.79
epoc:  15 reward: 7.10
epoc:  16 reward: 7.63
epoc:  17 reward: 8.52
epoc:  18 reward: 6.96
epoc:  19 reward: 7.48
epoc:  20 reward: 6.86
epoc:  21 reward: 7.14
epoc:  22 reward: 7.64
epoc:  23 reward: 6.52
epoc:  24 reward: 7.15
epoc:  25 reward: 7.83
epoc:  26 reward: 7.16
epoc:  27 reward: 7.40
epoc:  28 reward: 7.30
epoc:  29 reward: 6.66
epoc:  30 reward: 6.76
epoc:  31 reward: 5.92
epoc:  32 reward: 7.98
epoc:  33 reward: 8.32
epoc:  34 reward: 9.12
epoc:  35 reward: 8.47
epoc:  36 reward: 8.29
epoc:  37 reward: 7.49
epoc:  38 reward: 8.95
epoc:  39 reward: 9.84
epoc:  40 reward: 9.68
epoc:  41 reward: 9.90
epoc:  42 reward: 9.97
epoc:  43 r

In [None]:
# Persist the trained agent under the name "model2"; what exactly is written
# is defined by Agent.save in agent.py.
agent.save("model2")

In [None]:
# Learning curve: one point per episode, taken from the `rewards` history.
fig, ax = plt.subplots()
ax.plot(rewards)
ax.set_ylabel('reward')
ax.set_xlabel('epoc')
plt.show()

In [None]:
# Close the Unity environment now that training and plotting are finished.
env.close()