In [1]:
import numpy as np
import torch as T
from models import DDPGActor, DDPGCritic
from utils import DDPGExperienceBuffer, EnvWrapper
from torch.nn.utils.clip_grad import clip_grad_norm_
import torch.nn.functional as F

In [2]:
# Relative path to the Unity Reacher executable (Linux x86_64 build).
PATH = 'Reacher_Linux/Reacher.x86_64'
# Start the environment through the project's EnvWrapper; the log printed
# below this cell reports a 33-dimensional continuous observation and a
# 4-dimensional continuous action per agent.
env = EnvWrapper(PATH)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [3]:
# Hyper-parameters and run configuration read by every cell below.
# NOTE: the training loop multiplies args['noise_factor'] by
# args['noise_decay'] in place, so re-run this cell before restarting training.
args = {'obs_space': env.observation_space,     # length of state vector
        'action_space': env.action_space,       # number of actions
        'n_hidden': 256,                        # number of hidden neurons per layer
        'bs': 64,                               # number of samples per batch
        'lr_actor': 1e-4,                       # learning rate for actor network
        'lr_critic': 1e-3,                      # learning rate for critic network
        # device to use for computations; fall back to the CPU when no GPU is visible
        'device': 'cuda:0' if T.cuda.is_available() else 'cpu',
        'gamma': .99,                           # discount factor
        'noise_factor': 1.,                     # noise factor for action noise
        'noise_decay': 0.999,                   # decay of noise factor applied after every update
        'noise_minimum': 0.0001,                # minimum of noise factor
        'buffer_size': 500_000,                 # buffer size/length of experience buffer
        'episodes': 5000,                       # maximum number of episodes to train for
        'buffer_threshold': .10,                # buffer threshold before starting training
        'train_every_n': 4,                     # train on one batch every n steps of the env
        'model_update_every_n': 8,              # soft update of target models every n steps
        'tau': 0.01,                            # value for soft updating the target networks tau*local+(1-tau)*target
        'clip_grad': 1.,                        # clipping value for gradients
        'exploration_steps': 50000              # number of random exploration steps before using network predictions
        }

In [4]:
# Build the local and target actor/critic networks.
# The first three constructor arguments are the same for all four networks.
_net_dims = (args['obs_space'], args['action_space'], args['n_hidden'])
actor = DDPGActor(*_net_dims, args['lr_actor'], args['device'])
critic = DDPGCritic(*_net_dims, args['lr_critic'], args['device'])
actor_target = DDPGActor(*_net_dims, args['lr_actor'], args['device'])
critic_target = DDPGCritic(*_net_dims, args['lr_critic'], args['device'])
del _net_dims

# Start each target network as an exact copy of its local counterpart.
for _target, _local in ((actor_target, actor), (critic_target, critic)):
    _target.load_state_dict(_local.state_dict())
del _target, _local

In [5]:
# Create the experience (replay) buffer from the configured buffer size,
# batch size, fill threshold and device.
exp = DDPGExperienceBuffer(
    args['buffer_size'],
    args['bs'],
    args['buffer_threshold'],
    args['device'],
)

In [6]:
# Run-wide counters that persist across episodes.
mean_rewards = []   # per-episode reward, appended at the end of each episode
step = 0            # environment steps taken so far, over all episodes

In [7]:
def soft_update(target_net, local_net, tau):
    """Blend the weights of ``local_net`` into ``target_net`` in place.

    Every target parameter becomes ``tau * local + (1 - tau) * target``.
    """
    for target_param, local_param in zip(target_net.parameters(), local_net.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)


# DDPG training loop: interact with the environment, store transitions in the
# experience buffer, and periodically update critic, actor and target networks.
for episode in range(args['episodes']):

    # episodic stats to keep track of (kept as plain Python floats)
    stats = {
        'rewards': 0.,
        'actor_loss': 0.,
        'critic_loss': 0.,
        'loss': 0.
    }
    done = False
    state = env.reset()
    # one row of the state batch per parallel agent
    n_agents = len(state)
    while not done:
        # NOTE(review): actor.train() is never called, so after this line the
        # actor also stays in eval mode during the gradient updates below -
        # confirm this is intended if DDPGActor uses dropout or batch-norm.
        actor.eval()

        # begin with exploration steps and sample from uniform distribution
        if step < args['exploration_steps']:
            action = np.random.uniform(-1, 1, size=(n_agents, args['action_space']))
        else:
            # acting needs no autograd graph
            with T.no_grad():
                action = actor(state)
            action = action.squeeze().cpu().numpy()
            # add independent noise to every agent's action (a single
            # action_space-sized sample would be broadcast to all agents)
            noise = np.random.randn(*action.shape) * max(args['noise_factor'], args['noise_minimum'])
            action = np.clip(action + noise, -1., 1.)

        # take step in the environment
        next_state, reward, done_flags, _ = env.step(action)

        # episode score is the reward averaged over all agents
        stats['rewards'] += (sum(reward) / len(reward))

        step += 1

        # store transitions (state, action, reward, done, next_state)
        exp.add(
            T.Tensor(state),
            T.Tensor(action),
            T.Tensor(reward).unsqueeze(1),
            T.Tensor(done_flags).unsqueeze(1),
            T.Tensor(next_state),
        )

        if np.any(done_flags):
            done = True

        state = next_state.copy()

        # only train the network every n steps AND when threshold is reached
        if (step % args['train_every_n'] == 0) and exp.threshold:
            # sample from experience buffer
            exp_states, exp_actions, exp_rewards, exp_dones, exp_next_states = exp.draw()
            # train the critic, compute the state-action values
            q_vals = critic(exp_states, exp_actions)
            # bootstrap target from the target networks; no gradient must flow into them
            with T.no_grad():
                next_q_vals = critic_target(exp_next_states, actor_target(exp_next_states))
            next_state_v = exp_rewards.squeeze() + (args['gamma'] * next_q_vals.squeeze() * (1 - exp_dones.squeeze()))
            critic_loss = F.mse_loss(q_vals.squeeze(), next_state_v)
            # minimize critic_loss
            critic.optimizer.zero_grad()
            critic_loss.backward()
            clip_grad_norm_(critic.parameters(), args['clip_grad'])
            critic.optimizer.step()
            # train the actor network by minimizing the negative state-action values
            # of the actions the *current policy* proposes; using the replayed
            # exp_actions here would leave the actor without any gradient
            actions = actor(exp_states)
            actor_loss = -critic(exp_states, actions).mean()
            actor.optimizer.zero_grad()
            actor_loss.backward()
            clip_grad_norm_(actor.parameters(), args['clip_grad'])
            actor.optimizer.step()

            # update the stats; .item() drops the autograd graph so the
            # accumulated values do not keep every batch's graph alive
            actor_loss_value = actor_loss.item()
            critic_loss_value = critic_loss.item()
            stats['actor_loss'] += actor_loss_value
            stats['critic_loss'] += critic_loss_value
            stats['loss'] += (actor_loss_value + critic_loss_value)
            # update the noise factor with decay
            args['noise_factor'] *= args['noise_decay']
        # do a soft update of the target networks every n steps with value tau
        if step % args['model_update_every_n'] == 0:
            soft_update(actor_target, actor, args['tau'])
            soft_update(critic_target, critic, args['tau'])

    # append the episode rewards
    mean_rewards.append(stats['rewards'])

    print(f'episode {episode}:')
    print(f'rewards: {stats["rewards"]:.2f}')
    print(f'actor loss: {stats["actor_loss"]:.3f}')
    print(f'critic loss: {stats["critic_loss"]:.3f}')
    print(f'noise factor: {max(args["noise_factor"], args["noise_minimum"]):.4f}')
    print(f'buffer size: {len(exp)}\n')

    # save models every 50 episodes
    if episode % 50 == 0:
        T.save(actor.state_dict(), f'actor_eps_{episode}_rew_{stats["rewards"]:.2f}.h5')
        T.save(critic.state_dict(), f'critic_eps_{episode}_rew_{stats["rewards"]:.2f}.h5')
    # env is considered solved after mean rewards of +30, save models
    if np.mean(np.array(mean_rewards[-100:])) > 30:
        print(f'SOLVED ENV AFTER {episode} EPISODES')
        T.save(actor.state_dict(), f'solved_actor_eps_{episode}_rew_{stats["rewards"]:.2f}.h5')
        T.save(critic.state_dict(), f'solved_critic_eps_{episode}_rew_{stats["rewards"]:.2f}.h5')
        break

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 
episode 0:
rewards: 0.16
actor loss: 0.000
critic loss: 0.000
noise factor: 1.0000
buffer size: 20020

episode 1:
rewards: 0.17
actor loss: 0.000
critic loss: 0.000
noise factor: 1.0000
buffer size: 40040

episode 2:
rewards: 0.24
actor loss: -17.735
critic loss: 0.321
noise factor: 0.8824
buffer size: 60060

episode 3:
rewards: 0.17
actor loss: -55.441
critic loss: 0.168
noise factor: 0.6

In [9]:
# evaluate the model: run one episode with the trained actor
# The training cell stores `actor.state_dict()` in '*.h5' files, so the weights
# must be loaded into a freshly built network; T.load alone returns the state
# dict, which cannot be called as a policy.
checkpoint_path = 'solved_actor_eps_265_rew_36.75.h5'
actor = DDPGActor(args['obs_space'], args['action_space'], args['n_hidden'], args['lr_actor'], args['device'])
actor.load_state_dict(T.load(checkpoint_path, map_location=args['device']))
actor.eval()

state = env.reset()
rewards = []
steps = 0
while True:
    # inference only, no autograd graph needed
    with T.no_grad():
        action = actor(state)
    action = action.squeeze().cpu().numpy()
    # add (minimal) noise to network actions
    noise = np.random.randn(args['action_space']) * args['noise_minimum']
    action = np.clip(action + noise, -1., 1.)
    next_state, reward, done, _ = env.step(action)
    # per-step reward averaged over all agents
    rewards.append(sum(reward) / len(reward))
    steps += 1
    state = next_state.copy()
    if np.any(done):
        break
print(f'rewards: {sum(rewards):.2f} after {steps} steps')

rewards: 37.91 after 1001 steps
