In [1]:
%matplotlib inline

from unityagents import UnityEnvironment
import numpy as np
import random
import sys
from collections import deque

import ptan
import torch
import torch.optim as optim
import torch.nn.functional as F

from tensorboardX import SummaryWriter

import matplotlib.pyplot as plt

# `IPython.display` is the public import location; importing `display` from
# `IPython.core.display` is deprecated in recent IPython releases.
from IPython.display import display, HTML
# Stretch the notebook container to the full browser width so the long
# training log lines below are not wrapped.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
# Put the notebook's directory on the import path (presumably so the local
# `ddpg_model_ptan` and `utils` modules imported in the next cell resolve).
# Guarded so that re-running this cell does not append duplicate entries.
if './' not in sys.path:
    sys.path.append('./')

In [3]:
from ddpg_model_ptan import DDPGActor,DDPGCritic,Config
from utils.experience_unity import UnityExperienceSourceFirstLast

In [4]:
# Start the Unity build 'Reacher-2.app' with rendering disabled (no_graphics=True).
# The startup log printed below reports one external brain, `ReacherBrain`.
env = UnityEnvironment(file_name='Reacher-2.app',no_graphics=True)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_size -> 5.0
		goal_speed -> 1.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [5]:
# Select the first brain exposed by the environment (the startup log above
# lists exactly one external brain).
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
# Per-agent observation and action vector sizes; the startup log above
# reports 33 and 4 respectively.
state_size = brain.vector_observation_space_size
action_size = brain.vector_action_space_size

In [6]:
# TensorBoard writer shared by the reward tracker and the mean tracker in the
# training cell; `comment` tags this run as "-reacherddpg_ptan".
writer = SummaryWriter(comment="-reacherddpg_ptan")

In [7]:
# Hyper-parameter container imported from ddpg_model_ptan. Some fields are
# overridden in the next cell; others (e.g. `GAMMA`, `device`) are read later
# in this notebook with whatever values Config() provides.
config = Config()

In [8]:
# Hidden-layer widths handed to the DDPGActor / DDPGCritic constructors.
config.ACTOR_FC1_UNITS = config.ACTOR_FC2_UNITS = 64
config.CRITIC_FC1_UNITS = config.CRITIC_FC2_UNITS = 64

# Exploration-noise parameters, forwarded to AgentDDPG as ou_teta / ou_sigma.
config.NOISE_THETA, config.NOISE_SIGMA = 0.15, 0.2

# Adam learning rates for the actor and critic optimizers.
config.LR_ACTOR = config.LR_CRITIC = 1e-4

# Soft-update rate: the training loop syncs targets with alpha = 1 - TAU.
config.TAU = 1e-4

# REPLAY BUFFER
config.BUFFER_SIZE = 10_000_000  # capacity passed to ExperienceReplayBuffer
config.BATCH_SIZE = 256          # size passed to buffer.sample()
config.STEPS = 5                 # count passed to buffer.populate() each iteration

In [9]:
# Online networks: the actor is built first, then the critic.
actor_net = DDPGActor(
    state_size,
    action_size,
    config.ACTOR_FC1_UNITS,
    config.ACTOR_FC2_UNITS,
)
critic_net = DDPGCritic(
    state_size,
    action_size,
    config.CRITIC_FC1_UNITS,
    config.CRITIC_FC2_UNITS,
)

# Target-network wrappers around each online network; they are blended toward
# the online weights in the training loop via alpha_sync().
actor_target, critic_target = map(ptan.agent.TargetNet, (actor_net, critic_net))

In [10]:
class AgentDDPG(ptan.agent.BaseAgent):
    """Deterministic policy agent with optional Ornstein-Uhlenbeck style noise.

    The wrapped network maps a batch of observations to a batch of actions.
    When noise is enabled, each environment keeps its own noise vector (the
    "agent state"), which is updated on every call and added to the action
    before clipping to [-1, 1].
    """

    def __init__(self, net, device="cpu", ou_enabled=True, ou_mu=0.0, ou_teta=0.15, ou_sigma=0.2, ou_epsilon=1.0):
        """Store the policy network and the noise parameters.

        Args:
            net: callable mapping a float32 state tensor to an action tensor.
            device: device the state tensor is moved to before calling ``net``.
            ou_enabled: add exploration noise when True (and ``ou_epsilon > 0``).
            ou_mu: value the noise state is pulled toward.
            ou_teta: pull strength toward ``ou_mu`` (theta; the spelling is kept
                because it is part of the keyword interface).
            ou_sigma: scale of the Gaussian increment added to the noise state.
            ou_epsilon: multiplier applied to the noise before adding it to the
                action; a value <= 0 disables the noise.
        """
        self.net = net
        self.device = device
        self.ou_enabled = ou_enabled
        self.ou_mu = ou_mu
        self.ou_teta = ou_teta
        self.ou_sigma = ou_sigma
        self.ou_epsilon = ou_epsilon

    def initial_state(self):
        """Return the initial noise state; ``None`` means "not started yet"."""
        return None

    def __call__(self, states, agent_states):
        """Compute clipped actions for ``states`` and the updated noise states.

        Args:
            states: batch of observations accepted by
                ``ptan.agent.float32_preprocessor``.
            agent_states: one noise state per observation (``None`` or an array
                shaped like a single action).

        Returns:
            ``(actions, new_agent_states)`` where ``actions`` is a numpy array
            clipped to [-1, 1].
        """
        states_v = ptan.agent.float32_preprocessor(states).to(self.device)
        # Action selection needs no gradients: skip building the autograd graph
        # instead of running with grad enabled and stripping it with `.data`.
        with torch.no_grad():
            mu_v = self.net(states_v)
        actions = mu_v.cpu().numpy()

        if self.ou_enabled and self.ou_epsilon > 0:
            new_a_states = []
            for a_state, action in zip(agent_states, actions):
                if a_state is None:
                    a_state = np.zeros(shape=action.shape, dtype=np.float32)
                # Pull the noise state toward ou_mu, then add a Gaussian increment.
                a_state += self.ou_teta * (self.ou_mu - a_state)
                a_state += self.ou_sigma * np.random.normal(size=action.shape)

                # `action` is a row view of `actions`, so this edits it in place.
                action += self.ou_epsilon * a_state
                new_a_states.append(a_state)
        else:
            new_a_states = agent_states

        actions = np.clip(actions, -1, 1)
        return actions, new_a_states

In [11]:
def unpack_batch(batch, device="cpu"):
    """Turn a batch of first/last experience records into tensors on ``device``.

    Each record must expose ``state``, ``action``, ``reward`` and
    ``last_state``; a ``last_state`` of ``None`` marks a finished episode.

    Returns:
        ``(states, actions, rewards, dones, last_states)``. ``dones`` is a
        boolean tensor; the others come from
        ``ptan.agent.float32_preprocessor``.
    """
    transitions = list(batch)
    to_float_tensor = ptan.agent.float32_preprocessor

    ended_flags = [tr.last_state is None for tr in transitions]
    # Finished transitions have no next state. The start state is used as a
    # stand-in so the tensor keeps a regular shape; the `dones` mask tells the
    # caller which rows are stand-ins.
    next_states = [
        tr.state if ended else tr.last_state
        for tr, ended in zip(transitions, ended_flags)
    ]

    states_v = to_float_tensor([tr.state for tr in transitions]).to(device)
    actions_v = to_float_tensor([tr.action for tr in transitions]).to(device)
    rewards_v = to_float_tensor([tr.reward for tr in transitions]).to(device)
    last_states_v = to_float_tensor(next_states).to(device)
    dones_t = torch.BoolTensor(ended_flags).to(device)
    return states_v, actions_v, rewards_v, dones_t, last_states_v


In [12]:
# Exploration agent around the online actor; the noise theta/sigma come from
# the config overrides above, all other noise settings keep AgentDDPG defaults.
agent = AgentDDPG(actor_net, device=config.device,ou_teta=config.NOISE_THETA,ou_sigma=config.NOISE_SIGMA)

In [13]:
# Experience source driven by the noisy agent, constructed with steps_count=1.
exp_source = UnityExperienceSourceFirstLast(
    env,
    agent,
    gamma=config.GAMMA,
    steps_count=1,
)
# Replay buffer that pulls new transitions from the source on populate().
buffer = ptan.experience.ExperienceReplayBuffer(
    exp_source,
    buffer_size=config.BUFFER_SIZE,
)
# One Adam optimizer per online network, actor first and critic second.
act_opt, crt_opt = (
    optim.Adam(net.parameters(), lr=rate)
    for net, rate in (
        (actor_net, config.LR_ACTOR),
        (critic_net, config.LR_CRITIC),
    )
)

In [None]:
# frame_idx counts loop iterations; each iteration asks the buffer for
# config.STEPS new transitions.
frame_idx = 0
best_reward = None
# Training stops once the tracker's mean reward exceeds this value.
stop_score = 30
# Number of stored transitions required before the first gradient step.
replay_warmup = 10000
# Critic loss of every update as plain floats. The plotting cells further down
# read `critic_losses`, which no other visible cell defines.
# NOTE(review): grows by one float per update; subsample if memory matters.
critic_losses = []

with ptan.common.utils.RewardTracker(writer,min_ts_diff=30) as tracker:
    with ptan.common.utils.TBMeanTracker(writer, batch_size=10) as tb_tracker:
        while True:

            frame_idx += 1
            buffer.populate(config.STEPS)

            # Report finished episodes (if any) and check the stop condition.
            rewards_steps = exp_source.pop_rewards_steps()
            if rewards_steps:
                rewards, steps = zip(*rewards_steps)
                tb_tracker.track("episode_steps", steps[0], frame_idx)
                mean_reward = tracker.reward(rewards[0], frame_idx)

                if mean_reward and mean_reward > stop_score:
                    print('Solved!')
                    break

            # Keep collecting until the buffer holds enough transitions.
            if len(buffer) < replay_warmup:
                continue

            batch = buffer.sample(config.BATCH_SIZE)
            states_v, actions_v, rewards_v, dones_mask, last_states_v = unpack_batch(batch, config.device)

            #Critic Training
            crt_opt.zero_grad()
            q_v = critic_net(states_v, actions_v)
            # The bootstrap target is a constant for the critic loss, so compute
            # it without building an autograd graph through the target networks.
            with torch.no_grad():
                last_act_v = actor_target.target_model(last_states_v)
                q_last_v = critic_target.target_model(last_states_v, last_act_v)
                # No bootstrapping past the end of an episode.
                q_last_v[dones_mask] = 0.0
                q_ref_v = rewards_v.unsqueeze(dim=-1) + q_last_v * config.GAMMA
            critic_loss_v = F.mse_loss(q_v, q_ref_v)
            critic_loss_v.backward()
            crt_opt.step()
            critic_losses.append(critic_loss_v.item())
            tb_tracker.track("loss_critic", critic_loss_v, frame_idx)
            tb_tracker.track("critic_ref", q_ref_v.mean(), frame_idx)

            #Actor Training
            act_opt.zero_grad()
            cur_actions_v = actor_net(states_v)
            # Maximise the critic's value of the actor's own actions.
            actor_loss_v = -critic_net(states_v, cur_actions_v)
            actor_loss_v = actor_loss_v.mean()
            actor_loss_v.backward()
            act_opt.step()
            tb_tracker.track("loss_actor", actor_loss_v, frame_idx)

            # Soft-update both target networks toward the online weights.
            actor_target.alpha_sync(alpha=1 - config.TAU)
            critic_target.alpha_sync(alpha=1 - config.TAU)
            
            
                
            

7997: done 10 episodes, mean reward 0.534, speed 181.04 f/s
11997: done 19 episodes, mean reward 0.677, speed 81.53 f/s
15997: done 28 episodes, mean reward 0.633, speed 83.32 f/s
19997: done 37 episodes, mean reward 0.596, speed 88.44 f/s
23997: done 46 episodes, mean reward 0.674, speed 83.07 f/s
27997: done 55 episodes, mean reward 0.644, speed 78.69 f/s
31997: done 64 episodes, mean reward 0.637, speed 79.18 f/s
35997: done 73 episodes, mean reward 0.650, speed 79.69 f/s
39997: done 82 episodes, mean reward 0.649, speed 81.86 f/s
43997: done 91 episodes, mean reward 0.663, speed 78.03 f/s
47997: done 100 episodes, mean reward 0.661, speed 76.26 f/s
51997: done 109 episodes, mean reward 0.698, speed 80.74 f/s
55997: done 118 episodes, mean reward 0.679, speed 84.72 f/s
59997: done 127 episodes, mean reward 0.681, speed 85.81 f/s
63997: done 136 episodes, mean reward 0.726, speed 79.72 f/s
67997: done 145 episodes, mean reward 0.719, speed 82.49 f/s
71997: done 154 episodes, mean rew

631997: done 1414 episodes, mean reward 12.041, speed 105.11 f/s
635997: done 1423 episodes, mean reward 12.317, speed 89.48 f/s
639997: done 1432 episodes, mean reward 12.644, speed 44.05 f/s
643997: done 1441 episodes, mean reward 12.570, speed 46.76 f/s
647997: done 1450 episodes, mean reward 12.688, speed 53.70 f/s
651997: done 1459 episodes, mean reward 12.734, speed 49.74 f/s
655997: done 1468 episodes, mean reward 12.806, speed 47.24 f/s
659997: done 1477 episodes, mean reward 12.998, speed 55.67 f/s
663997: done 1486 episodes, mean reward 12.748, speed 36.70 f/s
667997: done 1495 episodes, mean reward 12.737, speed 62.23 f/s
671997: done 1504 episodes, mean reward 12.821, speed 68.91 f/s
675997: done 1513 episodes, mean reward 13.082, speed 41.15 f/s
679997: done 1522 episodes, mean reward 12.940, speed 44.08 f/s
683997: done 1531 episodes, mean reward 12.802, speed 58.14 f/s
687997: done 1540 episodes, mean reward 12.827, speed 67.52 f/s
691997: done 1549 episodes, mean reward

1183997: done 2656 episodes, mean reward 15.762, speed 89.91 f/s
1187997: done 2665 episodes, mean reward 15.591, speed 85.19 f/s
1191997: done 2674 episodes, mean reward 15.635, speed 83.72 f/s
1195997: done 2683 episodes, mean reward 15.870, speed 82.90 f/s
1199997: done 2692 episodes, mean reward 15.985, speed 80.01 f/s
1203997: done 2701 episodes, mean reward 15.987, speed 80.18 f/s
1207997: done 2710 episodes, mean reward 16.270, speed 78.35 f/s
1211997: done 2719 episodes, mean reward 16.585, speed 79.30 f/s
1215997: done 2728 episodes, mean reward 16.610, speed 80.81 f/s
1219997: done 2737 episodes, mean reward 16.594, speed 75.47 f/s
1223997: done 2746 episodes, mean reward 16.842, speed 73.30 f/s
1227997: done 2755 episodes, mean reward 16.862, speed 78.16 f/s
1231997: done 2764 episodes, mean reward 16.983, speed 80.85 f/s
1235997: done 2773 episodes, mean reward 16.642, speed 78.11 f/s
1243997: done 2791 episodes, mean reward 16.255, speed 161.48 f/s
1251997: done 2809 episo

1779997: done 3997 episodes, mean reward 16.200, speed 72.90 f/s
1783997: done 4006 episodes, mean reward 16.519, speed 76.83 f/s
1787997: done 4015 episodes, mean reward 16.610, speed 78.90 f/s
1791997: done 4024 episodes, mean reward 16.421, speed 77.49 f/s
1795997: done 4033 episodes, mean reward 16.460, speed 73.45 f/s
1799997: done 4042 episodes, mean reward 16.012, speed 76.28 f/s
1803997: done 4051 episodes, mean reward 15.973, speed 80.58 f/s
1807997: done 4060 episodes, mean reward 15.817, speed 77.32 f/s
1811997: done 4069 episodes, mean reward 16.129, speed 73.53 f/s
1815997: done 4078 episodes, mean reward 16.067, speed 73.76 f/s
1819997: done 4087 episodes, mean reward 16.257, speed 72.16 f/s
1823997: done 4096 episodes, mean reward 16.191, speed 74.99 f/s
1827997: done 4105 episodes, mean reward 16.133, speed 77.88 f/s
1831997: done 4114 episodes, mean reward 15.949, speed 76.58 f/s
1835997: done 4123 episodes, mean reward 16.375, speed 75.56 f/s
1839997: done 4132 episod

2375997: done 5338 episodes, mean reward 20.508, speed 64.63 f/s
2379997: done 5347 episodes, mean reward 20.209, speed 50.93 f/s
2383997: done 5356 episodes, mean reward 19.964, speed 72.35 f/s
2387997: done 5365 episodes, mean reward 20.361, speed 73.71 f/s
2391997: done 5374 episodes, mean reward 19.856, speed 71.85 f/s
2395997: done 5383 episodes, mean reward 19.964, speed 72.04 f/s
2399997: done 5392 episodes, mean reward 19.818, speed 74.29 f/s
2403997: done 5401 episodes, mean reward 19.734, speed 71.38 f/s
2407997: done 5410 episodes, mean reward 19.457, speed 73.49 f/s
2411997: done 5419 episodes, mean reward 19.232, speed 70.41 f/s
2415997: done 5428 episodes, mean reward 19.588, speed 72.11 f/s
2419997: done 5437 episodes, mean reward 19.671, speed 77.35 f/s
2423997: done 5446 episodes, mean reward 19.977, speed 72.91 f/s
2427997: done 5455 episodes, mean reward 20.259, speed 69.65 f/s
2431997: done 5464 episodes, mean reward 19.952, speed 75.98 f/s
2435997: done 5473 episod

2967997: done 6670 episodes, mean reward 21.333, speed 76.22 f/s
2971997: done 6679 episodes, mean reward 21.858, speed 79.00 f/s
2975997: done 6688 episodes, mean reward 21.809, speed 81.18 f/s
2979997: done 6697 episodes, mean reward 21.492, speed 78.04 f/s
2983997: done 6706 episodes, mean reward 21.734, speed 77.82 f/s
2987997: done 6715 episodes, mean reward 21.250, speed 82.50 f/s
2991997: done 6724 episodes, mean reward 21.256, speed 80.00 f/s
2995997: done 6733 episodes, mean reward 21.142, speed 77.15 f/s
2999997: done 6742 episodes, mean reward 21.034, speed 79.31 f/s
3003997: done 6751 episodes, mean reward 21.607, speed 80.88 f/s
3007997: done 6760 episodes, mean reward 21.668, speed 77.99 f/s
3011997: done 6769 episodes, mean reward 21.896, speed 81.16 f/s
3015997: done 6778 episodes, mean reward 21.714, speed 81.02 f/s
3019997: done 6787 episodes, mean reward 21.869, speed 77.26 f/s
3023997: done 6796 episodes, mean reward 22.029, speed 82.28 f/s
3027997: done 6805 episod

3559997: done 8002 episodes, mean reward 22.088, speed 143.14 f/s
3567997: done 8020 episodes, mean reward 22.227, speed 140.86 f/s
3575997: done 8038 episodes, mean reward 22.377, speed 126.00 f/s
3579997: done 8047 episodes, mean reward 22.592, speed 106.47 f/s
3583997: done 8056 episodes, mean reward 22.104, speed 102.02 f/s
3587997: done 8065 episodes, mean reward 22.439, speed 89.51 f/s
3591997: done 8074 episodes, mean reward 22.620, speed 81.72 f/s
3595997: done 8083 episodes, mean reward 22.683, speed 78.86 f/s
3599997: done 8092 episodes, mean reward 22.457, speed 76.51 f/s
3603997: done 8101 episodes, mean reward 22.325, speed 80.39 f/s
3607997: done 8110 episodes, mean reward 21.860, speed 79.03 f/s
3611997: done 8119 episodes, mean reward 21.868, speed 77.09 f/s
3615997: done 8128 episodes, mean reward 21.912, speed 74.78 f/s
3619997: done 8137 episodes, mean reward 22.141, speed 80.41 f/s
3623997: done 8146 episodes, mean reward 21.916, speed 82.10 f/s
3627997: done 8155 e

In [None]:
# Histogram of the values stored in `critic_losses`.
# NOTE(review): `critic_losses` must already exist in the kernel namespace.
fig, ax = plt.subplots()
ax.hist(np.asanyarray(critic_losses))
ax.set(title="Critic loss distribution", xlabel="critic loss", ylabel="count");

In [None]:
# Line plot of the values stored in `critic_losses`, in list order.
# NOTE(review): `critic_losses` must already exist in the kernel namespace.
fig, ax = plt.subplots()
ax.plot(np.asanyarray(critic_losses))
ax.set(title="Critic loss by position in critic_losses", xlabel="index", ylabel="critic loss");

In [None]:
# Keep only the entries of `log_data` that contain the "max_score: " marker.
# NOTE(review): `log_data` is not assigned in any visible cell - confirm where it comes from.
max_score_line = list(filter(lambda entry: "max_score: " in entry, log_data))

In [None]:
# Parse the last whitespace-separated token of every selected line as a float.
max_scores = list(
    map(lambda entry: float(entry.strip().split()[-1]), max_score_line)
)

In [None]:
# Show how many score values were parsed.
len(max_scores)

In [None]:
# Repeats the brain lookup from the setup cell near the top of the notebook.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [None]:
# Reset the environment and keep the entry returned for the selected brain.
env_info = env.reset()[brain_name]

In [None]:
# Inspect the `previous_vector_actions` field of the reset result.
env_info.previous_vector_actions

In [None]:
# Number of observation entries returned by the reset
# (presumably one per agent - TODO confirm).
len(env_info.vector_observations)

In [None]:
# Show the soft-update rate currently stored on the config object.
config.TAU