In [1]:
from unityagents import UnityEnvironment
from buffer import ReplayBuffer
from maddpg3 import MADDPG
import torch
import numpy as np
import os
from utilities import transpose_list, transpose_to_tensor

In [2]:
from tensorboardX import SummaryWriter

In [3]:
import torch

In [4]:
env = UnityEnvironment(file_name="Tennis.app")

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


In [5]:
# keep training awake
from workspace_utils import keep_awake
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents 
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space 
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])


Number of agents: 2
Size of each action: 2
There are 2 agents. Each observes a state with length: 24
The state for the first agent looks like: [ 0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.         -6.65278625 -1.5
 -0.          0.          6.83172083  6.         -0.          0.        ]


In [6]:
def seeding(seed=1):
    np.random.seed(seed)
    torch.manual_seed(seed)

In [7]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [8]:
samples2 = []
def mainA():
    seeding()
    parallel_envs = 1
    number_of_episodes = 100
    episode_length = 80
    batchsize = 240
    save_interval = 100
    noise = 2
    averageR = []
    noise_reduction = 0.9999
    episode_per_update = 2 * parallel_envs
    log_path = os.getcwd()+"/log"
    model_dir= os.getcwd()+"/model_dir"
    os.makedirs(model_dir, exist_ok=True)
    torch.set_num_threads(parallel_envs)
    env_info = env.reset(train_mode=True)[brain_name]
    states_all = env_info.vector_observations
    buffer = ReplayBuffer(int(5000*episode_length))
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    for episode in (range(0, 10000)):
        env_info = env.reset(train_mode=True)[brain_name]
        reward_this_episode = np.zeros((parallel_envs, 2))
        save_info = ((episode) % save_interval < parallel_envs or episode==number_of_episodes-parallel_envs)
        
        for ag in range(2):
            obs_full = env_info.vector_observations #
            obs = np.array(obs_full[ag])
            actions = maddpg.act(torch.from_numpy(obs_full).float().to(device), noise=noise)
            #print(obs_full)
            #print(obs)
            #for episode_t in range(2):
            actions = maddpg.act(torch.from_numpy(obs_full).float().to(device), noise=noise)

            noise *= noise_reduction

            actions_array = torch.stack(actions).detach().numpy()
            actions_for_env = np.rollaxis(actions_array,1)
            env_info = env.step(actions_for_env)[brain_name]
            next_obs_full = env_info.vector_observations
            next_obs = np.array(next_obs_full[ag])
            rewards = env_info.rewards
            dones = env_info.local_done
            transition = (obs, obs_full, actions_for_env, rewards, next_obs, next_obs_full, dones)
            buffer.push(transition)
            reward_this_episode += rewards
            obs, obs_full = next_obs, next_obs_full


    
        if len(buffer) % batchsize == 0 :
                for a_i in range(2):
                    samples = buffer.sample(24)
                    maddpg.update([samples], a_i, logger)
                maddpg.update_targets() #soft update the target network towards the actual networks
        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i,0])
            agent1_reward.append(reward_this_episode[i,1])
        
        if episode % 100 == 0 or episode == number_of_episodes-1:
            avg_rewards = [np.mean(agent0_reward), np.mean(agent1_reward)]
            max_rewards = [reward_this_episode[i,0], reward_this_episode[i,1]]
            averageR.append(max(max_rewards))
            agent0_reward = []
            agent1_reward = []
            
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i, avg_rew, episode)
                print('agent%i/mean_episode_rewards' % a_i, np.sum(averageR), episode)
                
        save_dict_list =[]
        if save_info:
            for i in range(2):

                save_dict = {'actor_params' : maddpg.maddpg_agent[i].actor.state_dict(),
                             'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                             'critic_params' : maddpg.maddpg_agent[i].critic.state_dict(),
                             'critic_optim_params' : maddpg.maddpg_agent[i].critic_optimizer.state_dict()}
                save_dict_list.append(save_dict)

                torch.save(save_dict_list, 
                           os.path.join(model_dir, 'episode-{}.pt'.format(episode)))
            




    #env.close()
    logger.close()
    #samples = buffer.sample(24)
    #maddpg.update([samples], 0, logger)
    #maddpg.update_targets()


In [9]:
tester = mainA()



agent0/mean_episode_rewards 0.0 0
agent1/mean_episode_rewards 0.0 0
agent0/mean_episode_rewards 0.0 99
agent1/mean_episode_rewards 0.0 99
agent0/mean_episode_rewards 0.0 100
agent1/mean_episode_rewards 0.0 100
agent0/mean_episode_rewards 0.0 200
agent1/mean_episode_rewards 0.0 200
agent0/mean_episode_rewards 0.0 300
agent1/mean_episode_rewards 0.0 300
agent0/mean_episode_rewards 0.0 400
agent1/mean_episode_rewards 0.0 400
agent0/mean_episode_rewards 0.0 500
agent1/mean_episode_rewards 0.0 500
agent0/mean_episode_rewards 0.0 600
agent1/mean_episode_rewards 0.0 600
agent0/mean_episode_rewards 0.0 700
agent1/mean_episode_rewards 0.0 700
agent0/mean_episode_rewards 0.0 800
agent1/mean_episode_rewards 0.0 800
agent0/mean_episode_rewards 0.0 900
agent1/mean_episode_rewards 0.0 900
agent0/mean_episode_rewards 0.0 1000
agent1/mean_episode_rewards 0.0 1000
agent0/mean_episode_rewards 0.0 1100
agent1/mean_episode_rewards 0.0 1100
agent0/mean_episode_rewards 0.0 1200
agent1/mean_episode_rewards 0

agent0/mean_episode_rewards 0.20000000298023224 9600
agent1/mean_episode_rewards 0.20000000298023224 9600
agent0/mean_episode_rewards 0.20000000298023224 9700
agent1/mean_episode_rewards 0.20000000298023224 9700
agent0/mean_episode_rewards 0.20000000298023224 9800
agent1/mean_episode_rewards 0.20000000298023224 9800
agent0/mean_episode_rewards 0.20000000298023224 9900
agent1/mean_episode_rewards 0.20000000298023224 9900


In [None]:
env.close()