In [1]:
%matplotlib inline

from unityagents import UnityEnvironment
import numpy as np
import random
import sys
from collections import deque


import torch
import torch.optim as optim
import torch.nn.functional as F


import matplotlib.pyplot as plt

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
sys.path.append('./')

In [3]:
from ddpg_model import *

In [4]:
env = UnityEnvironment(file_name='Reacher-2.app',no_graphics=True)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_size -> 5.0
		goal_speed -> 1.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [5]:
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
state_size = brain.vector_observation_space_size
action_size = brain.vector_action_space_size
num_agents = 20
env_info = env.reset(train_mode=True)[brain_name]

In [6]:
seed = 1234
n_epochs = 5000
avg_score_target = 10
avg_score_runs = 100
print_every = 10


In [7]:
config = Config()

In [8]:
agent = DDPGAgent(state_size,action_size,seed,config)

In [None]:
scores_deque_10 = deque(maxlen=10)
scores_deque_50 = deque(maxlen=50)
scores_deque_100 = deque(maxlen=100)
scores = []

for epoch in range(1, n_epochs+1):
    
    states = env_info.vector_observations
    agent.reset()
    
    total_scores = np.zeros(num_agents)
    
    for t in range(config.MAX_STEPS):
        actions = agent.act(states)
        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
               
        total_scores += rewards
        
        _ = [agent.add_to_memory(states[i],actions[i],rewards[i],next_states[i],dones[i]) for i in range(num_agents)]        
        
        if t % config.TRAIN_STEPS == 0:
            for _ in range(config.TRAIN_TIMES):
                agent.learning_step()
        
        states = next_states
    
    
    
    scores_deque_10.extend(total_scores)
    scores_deque_50.extend(total_scores)
    scores_deque_100.extend(total_scores)
    scores.extend(total_scores)
        
    if epoch % print_every == 0:
        
        avg_scores_10 = np.asanyarray(scores_deque_10).mean()
        avg_scores_50 = np.asanyarray(scores_deque_50).mean()
        avg_scores_100 = np.asanyarray(scores_deque_100).mean()
        
        print(f"Epoch: {epoch} \tavg_score_10: {avg_scores_10}\tavg_score_50: {avg_scores_50}\tavg_score_100: {avg_scores_100}")
        if avg_scores_100 > avg_score_runs:
            print("Enviroment Solved!")
            break

env.close()

Epoch: 10 	avg_score_10: 0.0	avg_score_50: 0.011599999740719796	avg_score_100: 0.005799999870359898
Epoch: 20 	avg_score_10: 0.4959999889135361	avg_score_50: 0.3745999916270375	avg_score_100: 0.32509999273344875
Epoch: 30 	avg_score_10: 0.35999999195337296	avg_score_50: 0.274999993853271	avg_score_100: 0.31539999295026067
Epoch: 40 	avg_score_10: 0.19299999568611384	avg_score_50: 0.28379999365657566	avg_score_100: 0.26549999406561253
Epoch: 50 	avg_score_10: 0.2559999942779541	avg_score_50: 0.24639999449253083	avg_score_100: 0.281899993699044
Epoch: 60 	avg_score_10: 0.07699999827891588	avg_score_50: 0.17799999602138997	avg_score_100: 0.23759999468922616
Epoch: 70 	avg_score_10: 0.1839999958872795	avg_score_50: 0.23839999467134476	avg_score_100: 0.2397999946400523
Epoch: 80 	avg_score_10: 0.15499999653548002	avg_score_50: 0.24259999457746745	avg_score_100: 0.21939999509602784
Epoch: 90 	avg_score_10: 0.27799999378621576	avg_score_50: 0.2825999936833978	avg_score_100: 0.2930999934487044

In [214]:
log_data = open('./reacher.log').readlines()

In [215]:
len(log_data)

1281

In [216]:
max_score_line = [s for s in log_data if "max_score: " in s]

In [217]:
max_scores = [float(s.strip().split()[-1]) for s in max_score_line]

In [219]:
len(max_scores)

33

In [220]:
max(max_scores)

0.6968999844230711

##### 