In [1]:
import time
from collections import deque

from unityagents import UnityEnvironment
import numpy as np
import matplotlib.pyplot as plt
import torch

from maddpg import MADDPGAgent

In [2]:
def plot_scores(scores):
    """Plot the score obtained in each episode as a line chart.

    Args:
        scores: Sequence of per-episode scores; element ``k`` is drawn at
            x-position ``k``.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    # Draw on the explicit Axes instead of creating one and then falling
    # back to the implicit pyplot "current axes" state.
    fig, ax = plt.subplots()
    ax.plot(np.arange(len(scores)), scores)
    ax.set_ylabel('Score')
    ax.set_xlabel('Episode #')
    plt.show()


In [3]:
def run_episode(env, brain_name, agents, n_episodes=2000, max_steps=1000, update_every=2, update_count=4, save_checkpoint=True):
    """Run the training loop and return the per-episode scores.

    Each episode resets the environment and every agent, then steps the
    environment until any agent reports ``done`` or ``max_steps`` is reached.
    The episode score is the maximum accumulated reward over all agents.
    Training stops early once the mean score of the last 100 episodes reaches
    0.5 (and at least 100 episodes have been played).

    Args:
        env: Environment exposing ``reset(train_mode=...)`` and
            ``step(actions)``, both returning a mapping indexed by brain name.
        brain_name: Key used to select the brain info from ``env`` results.
        agents: Sequence of agents providing ``reset``, ``act``, ``step``,
            ``start_learn`` and ``save``.
        n_episodes: Maximum number of episodes to run.
        max_steps: Maximum number of environment steps per episode.
        update_every: Learning is triggered on steps where
            ``t % update_every == 0``.
        update_count: Number of ``start_learn`` calls per agent each time
            learning is triggered.
        save_checkpoint: When true, every agent is saved every 10 episodes.
            The final ``save("solved")`` on success happens regardless.

    Returns:
        list: One score (max over agents) per completed episode.
    """
    score_total_means = deque(maxlen=100)
    scores = []

    for i_episode in range(1, n_episodes+1):
        # Reset Env and Agent
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations

        # accumulated reward of each agent for this episode
        total_rewards = np.zeros(len(agents))

        for agent in agents:
            agent.reset()

        start_time = time.time()

        # Keep ``t`` defined for the progress line even if max_steps is 0.
        # Inside the loop it is the zero-based index of the current step.
        t = 0
        for t in range(max_steps):
            actions = [agent.act(state) for agent, state in zip(agents, states)]
            env_info = env.step(actions)[brain_name]

            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done

            for i, agent in enumerate(agents):
                # NOTE(review): ``actions[1-i]`` picks "the other agent's"
                # action, which assumes exactly two agents - confirm before
                # using this loop with a different agent count.
                agent.step(states[i], actions[i], actions[1-i], rewards[i], next_states[i], dones[i])

            states = next_states
            total_rewards += rewards

            if t % update_every == 0:
                for _ in range(update_count):
                    for agent in agents:
                        agent.start_learn()

            if np.any(dones):
                break

        duration = time.time() - start_time

        min_score = np.min(total_rewards)
        max_score = np.max(total_rewards)
        scores.append(max_score)
        score_total_means.append(max_score)
        total_average = np.mean(score_total_means)

        print('\rEpisode {}({} steps)\tTotal Average Score: {:.2f}\tMin: {:.2f}\tMax: {:.2f}\tDuration: {:.2f}'
              .format(i_episode, t, total_average, min_score, max_score, duration))

        # Periodic checkpoint; honours the caller's save_checkpoint flag.
        if save_checkpoint and i_episode % 10 == 0:
            for agent in agents:
                agent.save()

        if total_average >= 0.5 and i_episode >= 100:
            print('Problem Solved after {} epsisodes!! Total Average score: {:.2f}'.format(i_episode, total_average))
            for agent in agents:
                agent.save("solved")
            break

    return scores

In [4]:
# Launch the Tennis environment from the local build.
env = UnityEnvironment(file_name="Tennis_Windows_x86_64/Tennis.exe")

# Use the first brain listed by the environment as the default brain.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset once in training mode and keep the initial observations around
# so the problem dimensions can be read off them.
env_info = env.reset(train_mode=True)[brain_name]
states = env_info.vector_observations

# Problem dimensions: agent count, per-agent action size, per-agent state size.
num_agents = len(env_info.agents)
action_size = brain.vector_action_space_size
state_size = states.shape[1]

# Report what was found.
print('Number of agents:', num_agents)
print('Size of each action:', action_size)
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

# Prefer the first CUDA device when one is available, otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("Device:", device)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


Number of agents: 2
Size of each action: 2
There are 2 agents. Each observes a state with length: 24
The state for the first agent looks like: [ 0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.         -6.65278625 -1.5
 -0.          0.          6.83172083  6.         -0.          0.        ]
Device: cuda:0


In [None]:
# Checkpoint switches for this run.
# load_checkpoint is forwarded to every MADDPGAgent (presumably to restore
# previously saved weights - confirm in maddpg.py); save_checkpoint is
# forwarded to run_episode.
load_checkpoint = True
save_checkpoint = False

# One agent per player reported by the environment, all built with the same seed.
agents = [
    MADDPGAgent(i, state_size, action_size, num_agents, random_seed=7, device=device, load_checkpoint=load_checkpoint)
    for i in range(num_agents)
]

# Pass the flag variable rather than a repeated literal so that editing
# save_checkpoint above actually takes effect.
scores = run_episode(env, brain_name, agents, save_checkpoint=save_checkpoint)



Episode 1(31 steps)	Total Average Score: 0.10	Min: -0.01	Max: 0.10	Duration: 1.58
Episode 2(51 steps)	Total Average Score: 0.10	Min: -0.01	Max: 0.10	Duration: 0.31
Episode 3(170 steps)	Total Average Score: 0.20	Min: 0.30	Max: 0.39	Duration: 0.86
Episode 4(64 steps)	Total Average Score: 0.17	Min: 0.09	Max: 0.10	Duration: 0.31
Episode 5(33 steps)	Total Average Score: 0.16	Min: -0.01	Max: 0.10	Duration: 0.15
Episode 6(85 steps)	Total Average Score: 0.17	Min: 0.19	Max: 0.20	Duration: 0.38
Episode 7(13 steps)	Total Average Score: 0.14	Min: -0.01	Max: 0.00	Duration: 0.06
Episode 8(18 steps)	Total Average Score: 0.12	Min: -0.01	Max: 0.00	Duration: 0.08
Episode 9(101 steps)	Total Average Score: 0.13	Min: 0.19	Max: 0.20	Duration: 9.02
Episode 10(13 steps)	Total Average Score: 0.12	Min: -0.01	Max: 0.00	Duration: 2.04
Episode 11(31 steps)	Total Average Score: 0.11	Min: -0.01	Max: 0.00	Duration: 4.58
Episode 12(13 steps)	Total Average Score: 0.10	Min: -0.01	Max: 0.00	Duration: 1.98
Episode 13(13 s

Episode 100(14 steps)	Total Average Score: 0.08	Min: -0.01	Max: 0.00	Duration: 2.30
Episode 101(31 steps)	Total Average Score: 0.08	Min: -0.01	Max: 0.10	Duration: 4.82
Episode 102(113 steps)	Total Average Score: 0.08	Min: 0.19	Max: 0.30	Duration: 16.66
Episode 103(31 steps)	Total Average Score: 0.08	Min: -0.01	Max: 0.10	Duration: 4.62
Episode 104(51 steps)	Total Average Score: 0.08	Min: 0.09	Max: 0.10	Duration: 7.41
Episode 105(50 steps)	Total Average Score: 0.08	Min: -0.01	Max: 0.10	Duration: 7.38
Episode 106(49 steps)	Total Average Score: 0.08	Min: -0.01	Max: 0.10	Duration: 7.16
Episode 107(32 steps)	Total Average Score: 0.08	Min: -0.01	Max: 0.10	Duration: 4.92
Episode 108(138 steps)	Total Average Score: 0.08	Min: 0.29	Max: 0.30	Duration: 20.02
Episode 109(13 steps)	Total Average Score: 0.08	Min: -0.01	Max: 0.00	Duration: 2.03
Episode 110(15 steps)	Total Average Score: 0.08	Min: -0.01	Max: 0.00	Duration: 2.47
Episode 111(32 steps)	Total Average Score: 0.08	Min: -0.01	Max: 0.10	Durati

Episode 198(13 steps)	Total Average Score: 0.07	Min: -0.01	Max: 0.00	Duration: 2.02
Episode 199(71 steps)	Total Average Score: 0.07	Min: 0.09	Max: 0.10	Duration: 10.39
Episode 200(43 steps)	Total Average Score: 0.07	Min: -0.01	Max: 0.10	Duration: 6.30
Episode 201(69 steps)	Total Average Score: 0.07	Min: 0.09	Max: 0.10	Duration: 10.23
Episode 202(71 steps)	Total Average Score: 0.07	Min: 0.09	Max: 0.20	Duration: 10.25
Episode 203(13 steps)	Total Average Score: 0.07	Min: -0.01	Max: 0.00	Duration: 2.01
Episode 204(13 steps)	Total Average Score: 0.07	Min: -0.01	Max: 0.00	Duration: 2.05
Episode 205(18 steps)	Total Average Score: 0.06	Min: -0.01	Max: 0.00	Duration: 2.87
Episode 206(13 steps)	Total Average Score: 0.06	Min: -0.01	Max: 0.00	Duration: 1.98
Episode 207(32 steps)	Total Average Score: 0.06	Min: -0.01	Max: 0.10	Duration: 4.93
Episode 208(30 steps)	Total Average Score: 0.06	Min: -0.01	Max: 0.10	Duration: 4.60
Episode 209(13 steps)	Total Average Score: 0.06	Min: -0.01	Max: 0.00	Duratio

Episode 296(69 steps)	Total Average Score: 0.09	Min: 0.09	Max: 0.20	Duration: 10.00
Episode 297(88 steps)	Total Average Score: 0.09	Min: 0.19	Max: 0.20	Duration: 12.90
Episode 298(14 steps)	Total Average Score: 0.09	Min: -0.01	Max: 0.00	Duration: 2.28
Episode 299(16 steps)	Total Average Score: 0.09	Min: -0.01	Max: 0.00	Duration: 2.57
Episode 300(31 steps)	Total Average Score: 0.09	Min: -0.01	Max: 0.10	Duration: 4.58
Episode 301(14 steps)	Total Average Score: 0.09	Min: -0.01	Max: 0.00	Duration: 2.53
Episode 302(50 steps)	Total Average Score: 0.09	Min: -0.01	Max: 0.10	Duration: 7.42
Episode 303(13 steps)	Total Average Score: 0.09	Min: -0.01	Max: 0.00	Duration: 2.01
Episode 304(14 steps)	Total Average Score: 0.09	Min: -0.01	Max: 0.00	Duration: 2.29
Episode 305(33 steps)	Total Average Score: 0.09	Min: -0.01	Max: 0.10	Duration: 4.81
Episode 306(141 steps)	Total Average Score: 0.09	Min: 0.30	Max: 0.39	Duration: 20.54
Episode 307(146 steps)	Total Average Score: 0.10	Min: 0.29	Max: 0.30	Durati

Episode 394(85 steps)	Total Average Score: 0.13	Min: 0.19	Max: 0.20	Duration: 12.40
Episode 395(51 steps)	Total Average Score: 0.13	Min: 0.09	Max: 0.10	Duration: 7.68
Episode 396(13 steps)	Total Average Score: 0.13	Min: -0.01	Max: 0.00	Duration: 2.03
Episode 397(13 steps)	Total Average Score: 0.13	Min: -0.01	Max: 0.00	Duration: 2.03
Episode 398(144 steps)	Total Average Score: 0.13	Min: 0.29	Max: 0.30	Duration: 20.97
Episode 399(131 steps)	Total Average Score: 0.14	Min: 0.29	Max: 0.30	Duration: 18.93
Episode 400(13 steps)	Total Average Score: 0.13	Min: -0.01	Max: 0.00	Duration: 2.04
Episode 401(89 steps)	Total Average Score: 0.14	Min: 0.09	Max: 0.20	Duration: 13.68
Episode 402(14 steps)	Total Average Score: 0.14	Min: -0.01	Max: 0.00	Duration: 2.74
Episode 403(13 steps)	Total Average Score: 0.14	Min: -0.01	Max: 0.00	Duration: 2.39
Episode 404(41 steps)	Total Average Score: 0.14	Min: -0.01	Max: 0.10	Duration: 7.33
Episode 405(52 steps)	Total Average Score: 0.14	Min: 0.09	Max: 0.10	Duratio

In [None]:
# Plot the per-episode scores returned by the training run above.
plot_scores(scores)