In [1]:
%matplotlib inline

from unityagents import UnityEnvironment
import numpy as np
import random
import sys
from collections import deque


import torch
import torch.optim as optim
import torch.nn.functional as F


import matplotlib.pyplot as plt

from IPython.core.display import display, HTML
# Inject a CSS rule that forces the page's `.container` element to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
# Add the current working directory to the import search path.
# NOTE(review): presumably so the local `utils` and `networks` packages imported
# in the next cell resolve — confirm the notebook is started from the project root.
sys.path.append('./')

In [3]:
from utils.replay_buffer import ReplayBuffer
from utils.ou_noise import OUNoise
from networks.ddpg_actor import DDPGActor
from networks.ddpg_critic import DDPGCritic

from utils.config import *

In [None]:
class DDPGAgent():
    """DDPG agent bundling actor/critic networks, exploration noise and replay memory.

    Holds a local and a target copy of both the actor and the critic, an
    ``OUNoise`` process used for exploration and a ``ReplayBuffer`` of
    experiences. Hyperparameters (``device``, ``BATCH_SIZE``, ``GAMMA``,
    ``TAU``, learning rates, layer sizes, ...) are module-level names that are
    looked up when the methods run.
    """

    def __init__(self, state_size, action_size, random_seed):
        """Build the networks, optimizers, noise process and replay buffer.

        Args:
            state_size: forwarded to the actor/critic constructors.
            action_size: forwarded to the networks, the noise process and the buffer.
            random_seed: seeds Python's ``random`` module and is forwarded to
                every component that accepts a seed.
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed first and store the value itself.
        random.seed(random_seed)
        self.seed = random_seed

        # Actor Networks
        self.actor_local = DDPGActor(state_size, action_size, random_seed, ACTOR_FC1_UNITS, ACTOR_FC2_UNITS).to(device)
        self.actor_target = DDPGActor(state_size, action_size, random_seed, ACTOR_FC1_UNITS, ACTOR_FC2_UNITS).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Networks
        self.critic_local = DDPGCritic(state_size, action_size, random_seed, CRITIC_FC1_UNITS, CRITIC_FC2_UNITS).to(device)
        self.critic_target = DDPGCritic(state_size, action_size, random_seed, CRITIC_FC1_UNITS, CRITIC_FC2_UNITS).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Start the targets as exact copies of the local networks (tau=1.0 is a
        # hard copy). This is a no-op when both were already initialised
        # identically, and guarantees the invariant when they were not.
        self.soft_update(self.actor_local, self.actor_target, 1.0)
        self.soft_update(self.critic_local, self.critic_target, 1.0)

        self.noise = OUNoise(action_size, random_seed, NOISE_THETA, NOISE_SIGMA)

        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def add_to_memory(self, state, action, reward, next_state, done):
        """Store a single transition in the replay buffer."""
        self.memory.add(state, action, reward, next_state, done)

    def learning_step(self):
        """Run one learning update, provided the buffer holds more than BATCH_SIZE items."""
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Return the local actor's action for ``state``, clipped to [-1, 1].

        Args:
            state: numpy array of observations (converted with ``torch.from_numpy``).
            add_noise: when True, a sample from the noise process is added
                before clipping.

        Returns:
            numpy array of actions with every entry in [-1, 1].
        """
        state = torch.from_numpy(state).float().to(device)
        # Evaluate in eval mode without tracking gradients, then restore train mode.
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().numpy()
        self.actor_local.train()
        if add_noise:
            # NOTE(review): if `action` holds one row per agent and the noise
            # sample is a single action-sized vector, broadcasting applies the
            # same noise to every agent — confirm this is intended.
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        """Reset the exploration noise process."""
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update critic and actor from a batch, then soft-update both targets.

        Args:
            experiences: tuple ``(states, actions, rewards, next_states, dones)``;
                presumably tensors on ``device`` as produced by
                ``ReplayBuffer.sample()`` — TODO confirm.
            gamma: discount factor applied to the next-state Q value.
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # The TD target is a constant for the critic loss: compute it without
        # autograd so backward() does not traverse (or accumulate gradients in)
        # the target networks.
        with torch.no_grad():
            # Get predicted next-state actions and Q values from target models
            actions_next = self.actor_target(next_states)
            Q_targets_next = self.critic_target(next_states, actions_next)
            # Compute Q targets for current states (y_i); terminal transitions
            # (dones == 1) contribute the reward only.
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions.float())
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Clip the critic gradient norm to 1 before stepping.
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss: maximise the critic's value of the actor's actions.
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Blend local parameters into the target: target = tau*local + (1 - tau)*target.

        Args:
            local_model: model whose parameters are read.
            target_model: model whose parameters are overwritten in place.
            tau: interpolation factor; 1.0 copies the local parameters exactly.
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
    
    

In [None]:
# Start the Unity environment from the local 'Reacher-2.app' build with
# rendering disabled (no_graphics=True).
env = UnityEnvironment(file_name='Reacher-2.app',no_graphics=True)

In [None]:
# Use the environment's first brain and read the state/action sizes from it.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
state_size = brain.vector_observation_space_size
action_size = brain.vector_action_space_size
# Reset in training mode to obtain the initial observations.
env_info = env.reset(train_mode=True)[brain_name]
# One observation row per agent: derive the agent count from the environment
# instead of hard-coding 20, so a build with a different agent count still works.
num_agents = len(env_info.vector_observations)

In [None]:
# Settings for the training run.
seed = 1234             # random seed handed to DDPGAgent
n_epochs = 5000         # upper bound on the number of training epochs
avg_score_target = 10   # NOTE(review): presumably the average score at which training should stop — confirm the training loop actually checks it
avg_score_runs = 100    # NOTE(review): presumably the number of runs the average is taken over — confirm against the training loop
max_steps = 750         # environment steps per epoch
print_every = 10        # progress is printed every this many epochs


In [None]:
# Agent hyperparameters. DDPGAgent reads these module-level names when it runs.
# NOTE(review): names of the same spelling may also arrive via
# `from utils.config import *`; the assignments here take precedence — confirm.
BUFFER_SIZE = int(1e6)  # passed to ReplayBuffer as the buffer size
BATCH_SIZE = 128        # passed to ReplayBuffer; learning starts once more than this many items are stored
GAMMA = 0.99            # discount factor used for the Q targets
TAU = 1e-3              # interpolation factor for the target-network soft update

NOISE_THETA = 0.15      # passed to OUNoise
NOISE_SIGMA = 0.2       # passed to OUNoise

LR_ACTOR = 1e-4         # Adam learning rate for the actor
LR_CRITIC = 1e-4        # Adam learning rate for the critic

In [None]:
# Build the agent from the environment's state/action sizes and the run seed.
agent = DDPGAgent(state_size,action_size,seed)

In [None]:
# Rolling windows over *epochs*: each entry is one epoch's score averaged over
# all agents. Extending a fixed-size deque with every agent's score would make
# the 10-entry window hold only half of the latest epoch.
scores_deque_10 = deque(maxlen=10)
scores_deque_50 = deque(maxlen=50)
scores_deque_100 = deque(maxlen=avg_score_runs)  # window used for the solve check
scores = []  # raw per-agent scores of every epoch, in order

learn_every = 20    # trigger learning every this many environment steps
learn_passes = 10   # number of sampled batches per learning trigger

for epoch in range(1, n_epochs+1):

    # Start every epoch from a freshly reset environment so an epoch never
    # straddles an episode boundary and scores are comparable between epochs.
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations
    agent.reset()

    total_scores = np.zeros(num_agents)

    for t in range(max_steps):
        actions = agent.act(states)
        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done

        total_scores += rewards

        # Store one transition per agent.
        for i in range(num_agents):
            agent.add_to_memory(states[i], actions[i], rewards[i], next_states[i], dones[i])

        if t % learn_every == 0:
            for _ in range(learn_passes):
                agent.learning_step()

        states = next_states

    epoch_score = total_scores.mean()
    scores_deque_10.append(epoch_score)
    scores_deque_50.append(epoch_score)
    scores_deque_100.append(epoch_score)
    scores.extend(total_scores)

    if epoch % print_every == 0:

        avg_scores_10 = np.asanyarray(scores_deque_10).mean()
        avg_scores_50 = np.asanyarray(scores_deque_50).mean()
        avg_scores_100 = np.asanyarray(scores_deque_100).mean()

        print(f"Epoch: {epoch} \tavg_score_10: {avg_scores_10}\tavg_score_50: {avg_scores_50}\tavg_score_100: {avg_scores_100}")
        # Solved when the full window's average exceeds the target score
        # (compare against avg_score_target, not the window length).
        if len(scores_deque_100) >= avg_score_runs and avg_scores_100 > avg_score_target:
            print("Enviroment Solved!")
            break

env.close()

In [68]:
# Candidate values for a hyperparameter search: every key maps to three options.
config_grid = {

        #NETWORK
        'FC1_UNITS' : [32,64,128],
        'FC2_UNITS': [32, 64, 128],
        'ACTOR_LR': [1e-2, 1e-3,1e-4],
        'CRITIC_LR': [1e-2, 1e-3, 1e-4],
        'TAU': [1e-2, 1e-3, 1e-4],
        'NOISE_SIGMA': [0.1, 0.3, 0.5],

        #REPLAY BUFF
        # Sizes must be integers (same form as BUFFER_SIZE = int(1e6) in the
        # hyperparameter cell); a float such as 1e4 cannot be used as a length.
        'BUFFER_SIZE': [int(1e4), int(1e5), int(1e6)],
        'BATCH_SIZE': [64, 128, 256],

        'MAX_STEPS': [ 200,300,400],
        'TRAIN_STEP': [10,50,100],
        'TRAIN_TIME': [10, 50, 100],

    }

In [158]:
# Draw 200 random candidates: one column per candidate, one row per
# config_grid key, each entry an option index in {0, 1, 2}.
experiments = np.random.choice([0, 1, 2], size=(len(config_grid), 200))
# Drop duplicate candidates. np.unique returns the columns in lexicographic
# order, so simply keeping the first 100 would select the 100 *smallest*
# index vectors and heavily favour option 0 for the first keys.
experiments = np.unique(experiments, axis=1)
# Shuffle the unique columns before keeping (at most) 100, then put one
# experiment per row: shape (n_experiments, n_keys).
experiments = experiments[:, np.random.permutation(experiments.shape[1])[:100]].T

In [189]:
# Parameter names of the grid, in the dict's own order.
keys = [*config_grid]

(3, 11)

In [166]:
# Scratch: take the first row of `experiments` for inspection.
e = experiments[0]

In [192]:
# Turn every experiment (a row of option indices, one per key) into a concrete
# {parameter name: value} dict. Iterating over the rows themselves avoids the
# hard-coded count of 100, which would raise IndexError whenever fewer unique
# experiments were drawn.
configs = [
    {key: config_grid[key][option] for key, option in zip(keys, experiment)}
    for experiment in experiments
]

100

In [168]:
# Scratch check: np.unique with axis=0 collapses repeated rows.
a = np.unique(
    np.array([
        [1, 1, 1, 4],
        [1, 1, 1, 4],
        [2, 1, 1, 1],
        [2, 2, 2, 2],
        [2, 2, 2, 2],
    ]),
    axis=0,
)

In [195]:
# Two scratch constants. NOTE(review): their meaning is not shown anywhere in this notebook.
adv, sumv = 10000, 5000


In [197]:
# Scratch arithmetic. NOTE(review): 5000 equals `sumv` from the previous cell;
# what the 0.3 represents is not shown — confirm or remove.
5000/0.3

16666.666666666668

In [198]:
# Read the training log into a list of lines. The context manager closes the
# file handle as soon as reading finishes instead of leaving it open.
with open('./reacher.log') as log_file:
    log_data = log_file.readlines()

In [200]:
# Number of lines read from the log file.
len(log_data)

992

In [204]:
# Keep only the log lines that contain the "max_score: " marker.
max_score_line = list(filter(lambda entry: "max_score: " in entry, log_data))

In [211]:
# Parse the last whitespace-separated token of every matching line as a float.
max_scores = list(map(lambda entry: float(entry.split()[-1]), max_score_line))

In [213]:
# Number of score values parsed from the log.
len(max_scores)

27

##### 