In [2]:
from unityagents import UnityEnvironment
import numpy as np
import random
import sys
from collections import deque,namedtuple
import copy
import ptan

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from tensorboardX import SummaryWriter

import matplotlib.pyplot as plt

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
sys.path.append('./')


In [4]:
from maddpg_model import Config

In [5]:

class ReplayBuffer(object):
    """Bounded FIFO store of experience tuples with uniform random minibatch sampling."""

    def __init__(self, buffer_size, batch_size,device):
        """Create an empty buffer.

        Params
        ======
            buffer_size (int): maximum number of experiences kept; the oldest
                entries are dropped once the deque is full
            batch_size (int): number of experiences returned by `sample`
            device: torch device (or device string) the sampled tensors are moved to
        """
        self.device = device
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"],
        )
        self.memory = deque(maxlen=buffer_size)  # bounded storage

    def add(self, state, action, reward, next_state, done):
        """Wrap one transition in an Experience tuple and append it to memory."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def _stack(self, batch, field):
        """Collect `field` from every non-None experience in `batch` into one ndarray."""
        return np.array([getattr(item, field) for item in batch if item is not None])

    def sample(self):
        """Draw `batch_size` experiences without replacement and return them as tensors.

        Returns a tuple (states, actions, rewards, next_states, dones); the first
        four are float tensors, `dones` is a uint8 tensor, all on `self.device`.
        """
        batch = random.sample(self.memory, k=self.batch_size)

        states, actions, rewards, next_states = (
            torch.from_numpy(self._stack(batch, field)).float().to(self.device)
            for field in ("state", "action", "reward", "next_state")
        )
        done_flags = self._stack(batch, "done").astype('uint8')
        dones = torch.from_numpy(done_flags).to(self.device)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Number of experiences currently stored."""
        return len(self.memory)

In [6]:
# Start the Unity Tennis environment without rendering (no_graphics=True).
# NOTE(review): 'Tennis.app' is resolved relative to the working directory and
# looks like a macOS build — adjust the file name on other platforms.
env = UnityEnvironment(file_name='Tennis.app',no_graphics=True)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


In [7]:
# Use the first brain exposed by the environment; its name keys the dicts
# returned by env.reset() / env.step() in the cells below.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [8]:
# Reset once to read the problem dimensions from the environment.
env_info = env.reset(train_mode=True)[brain_name]
n_agents = len(env_info.agents)
# Per-agent observation width (columns of the observation matrix).
state_size = env_info.vector_observations.shape[1]
# Per-agent action width comes from the brain spec. The previous
# `previous_vector_actions.shape[0]` indexed the agent axis and only matched
# because Tennis happens to have 2 agents and 2 actions.
action_size = brain.vector_action_space_size

In [9]:
# Hyperparameter container imported from maddpg_model; fields are filled in below.
config = Config()

# TensorBoard writer for this run, tagged with the "-tennis_maddpg" comment.
writer = SummaryWriter(comment="-tennis_maddpg")

In [18]:
# Hidden-layer widths handed to DDPGActor / DDPGCritic when the networks are built.
config.ACTOR_FC1_UNITS = 128
config.ACTOR_FC2_UNITS = 64
config.CRITIC_FC1_UNITS = 128
config.CRITIC_FC2_UNITS = 64
# Exploration-noise settings (presumably theta/sigma of the agent's OU noise — TODO confirm where they are consumed).
config.NOISE_THETA = 0.15
config.NOISE_SIGMA = 0.2
# Adam learning rates for the actor and critic optimizers.
config.LR_ACTOR = 1e-4
config.LR_CRITIC = 3e-4
# Soft-update rate: the training loop syncs the target nets with alpha = 1 - TAU.
config.TAU = 1e-4

# Replay buffer capacity, minibatch size, and the discount factor used in the Q target.
config.BUFFER_SIZE = int(1e6)
config.BATCH_SIZE = 512
config.GAMMA = 0.99

# Device string passed to the agents and to the replay buffer.
config.device = 'cpu'

In [24]:
class AgentDDPG(ptan.agent.BaseAgent):
    """Deterministic-policy agent that optionally perturbs actions with OU-style noise.

    The per-agent noise vector is carried between calls as the "agent state":
    callers pass the list returned by the previous call back in.
    """

    def __init__(self, net, device="cpu", ou_enabled=True, ou_mu=0.0, ou_teta=0.15, ou_sigma=0.2, ou_epsilon=1.0):
        """Store the policy network and the noise parameters.

        net: callable mapping a batch of observations to a batch of actions
        device: device the observations are moved to before the forward pass
        ou_enabled: when False, actions are returned without added noise
        ou_mu / ou_teta / ou_sigma: mean, pull-back rate and random-step scale
            of the noise recursion
        ou_epsilon: multiplier applied to the noise before it is added; a value
            <= 0 disables exploration
        """
        self.net, self.device = net, device
        self.ou_enabled = ou_enabled
        self.ou_mu, self.ou_teta, self.ou_sigma = ou_mu, ou_teta, ou_sigma
        self.ou_epsilon = ou_epsilon

    def initial_state(self):
        """No noise state exists before the first call."""
        return None

    def __call__(self, states, agent_states):
        """Return (actions clipped to [-1, 1], updated per-agent noise states)."""
        obs_v = ptan.agent.float32_preprocessor(states).to(self.device)
        actions = self.net(obs_v).data.cpu().numpy()

        explore = self.ou_enabled and self.ou_epsilon > 0
        if not explore:
            # Noise disabled: pass the incoming states straight through.
            return np.clip(actions, -1, 1), agent_states

        new_a_states = []
        for noise, action in zip(agent_states, actions):
            if noise is None:
                # First call for this agent: start the noise at zero.
                noise = np.zeros(shape=action.shape, dtype=np.float32)
            # In-place on purpose: the same array object keeps evolving across calls.
            noise += self.ou_teta * (self.ou_mu - noise)
            noise += self.ou_sigma * np.random.normal(size=action.shape)

            # `action` is a row of `actions`, so this edits the batch in place.
            action += self.ou_epsilon * noise
            new_a_states.append(noise)

        return np.clip(actions, -1, 1), new_a_states

In [25]:
class DDPGActor(nn.Module):
    """Policy network: three BatchNorm -> Linear stages, ReLU between them, Tanh output."""

    def __init__(self, state_size, action_size, fc1_units=400, fc2_units=300):
        """Build the stack.

        state_size: width of one observation vector
        action_size: width of the produced action vector
        fc1_units / fc2_units: widths of the two hidden layers
        """
        super().__init__()

        # Hidden stages share the BatchNorm -> Linear -> ReLU pattern.
        widths = [state_size, fc1_units, fc2_units]
        layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers += [nn.BatchNorm1d(n_in), nn.Linear(n_in, n_out), nn.ReLU()]

        # Output stage squashes each action component into [-1, 1].
        layers += [nn.BatchNorm1d(fc2_units), nn.Linear(fc2_units, action_size), nn.Tanh()]

        self.net = nn.Sequential(*layers)

    def forward(self, state):
        """Map a batch of observations to a batch of actions."""
        action = self.net(state)
        return action
    
class DDPGCritic(nn.Module):
    """Centralised Q-network over the concatenated observations and actions of all agents."""

    def __init__(self,n_agents, state_size, action_size,fc1_units=400, fc2_units=300):
        """Build the observation encoder and the Q head.

        n_agents: number of agents whose observations/actions are concatenated
        state_size / action_size: per-agent observation and action widths
        fc1_units: width of the observation embedding
        fc2_units: width of the hidden layer in the Q head
        """
        super().__init__()

        joint_obs_dim = n_agents * state_size
        joint_act_dim = n_agents * action_size
        merged_dim = fc1_units + joint_act_dim

        # Encodes the joint observation before the actions are mixed in.
        obs_layers = [
            nn.BatchNorm1d(joint_obs_dim),
            nn.Linear(joint_obs_dim, fc1_units),
            nn.ReLU(),
        ]
        self.obs_net = nn.Sequential(*obs_layers)

        # Scores the (embedding, joint action) pair with a single Q value.
        head_layers = [
            nn.BatchNorm1d(merged_dim),
            nn.Linear(merged_dim, fc2_units),
            nn.ReLU(),
            nn.BatchNorm1d(fc2_units),
            nn.Linear(fc2_units, 1),
        ]
        self.out_net = nn.Sequential(*head_layers)

    def forward(self, x, a):
        """Return one Q value per row for joint observations `x` and joint actions `a`."""
        features = self.obs_net(x)
        joined = torch.cat((features, a), dim=1)
        return self.out_net(joined)

In [26]:
# Online actor plus a ptan TargetNet wrapper around it; the wrapper's model is
# what the training loop queries for next-state actions and soft-syncs.
actor_net = DDPGActor(state_size,action_size,config.ACTOR_FC1_UNITS,config.ACTOR_FC2_UNITS)
actor_target = ptan.agent.TargetNet(actor_net)

In [27]:
# Online critic over the joint observations/actions of all agents, plus its
# ptan TargetNet wrapper used for the bootstrapped Q target.
critic_net = DDPGCritic(n_agents, state_size,action_size,config.CRITIC_FC1_UNITS,config.CRITIC_FC2_UNITS)
critic_target = ptan.agent.TargetNet(critic_net)

In [28]:
# Separate Adam optimizers so the actor and critic can use different learning rates.
actor_optimizer = optim.Adam(actor_net.parameters(),lr=config.LR_ACTOR)
critic_optimizer = optim.Adam(critic_net.parameters(),lr=config.LR_CRITIC)

In [29]:
# Exploring agent used for data collection. The noise parameters are taken from
# the config cell so that NOISE_THETA / NOISE_SIGMA actually take effect
# (previously they were set but never passed on).
agent = AgentDDPG(
    actor_net,
    device=config.device,
    ou_teta=config.NOISE_THETA,
    ou_sigma=config.NOISE_SIGMA,
)
# Noise-free agent sharing the same actor network, for evaluation runs.
test_agent = AgentDDPG(actor_net,device=config.device,ou_enabled=False)

In [126]:
# Shared experience store; sample() returns BATCH_SIZE transitions as tensors on config.device.
replay_buffer = ReplayBuffer(config.BUFFER_SIZE,config.BATCH_SIZE,config.device)

In [37]:
def populate_replay_buffer(n_traj, max_steps=300):
    """Roll out the exploring agent and store every transition in `replay_buffer`.

    Uses the notebook-level `env`, `brain_name`, `agent`, `n_agents` and
    `replay_buffer`.

    n_traj (int): number of episodes to run
    max_steps (int): cap on the number of steps per episode (default 300, the
        value that used to be hard-coded); an episode also ends as soon as any
        agent reports done
    """
    # Noise state is carried across episodes, one entry per agent.
    agent_states = [None for _ in range(n_agents)]

    for _ in range(n_traj):

        env_info = env.reset(train_mode=True)[brain_name]

        for _step in range(max_steps):
            states = env_info.vector_observations
            actions, agent_states = agent(states, agent_states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            dones = env_info.local_done
            rewards = env_info.rewards

            replay_buffer.add(states, actions, rewards, next_states, dones)

            if np.any(dones):
                break

In [127]:
#replay_buffer = ReplayBuffer(config.BUFFER_SIZE,5,config.device)
#populate_replay_buffer(5)
#states,actions,rewards,next_states,dones = replay_buffer.sample()

In [None]:
rewards_history = []
stop_score = 0.5      # mean-of-100 score at which training stops

num_epochs = 50000    # upper bound on episodes
max_steps = 300       # step cap per episode
WAIT_EPOCHS = 300     # episodes of pure data collection before learning starts


def _update_networks(batch):
    """Run one critic step and one actor step on `batch`, then soft-sync the targets.

    `batch` is the 5-tuple returned by `replay_buffer.sample()`; each tensor has
    one row per sampled transition and one column (or slice) per agent.
    Returns (critic_loss, actor_loss) as tensors.
    """
    b_states, b_actions, b_rewards, b_next_states, b_dones = batch

    # Actor view: one row per (transition, agent).
    s_v = b_states.view(-1, state_size)
    ns_v = b_next_states.view(-1, state_size)

    # Critic view: one row per transition, all agents concatenated.
    fs_v = b_states.view(-1, n_agents * state_size)
    fa_v = b_actions.view(-1, n_agents * action_size)
    fns_v = b_next_states.view(-1, n_agents * state_size)
    # Shared reward = best agent reward; computed in torch so it stays on the
    # batch's device instead of round-tripping through NumPy.
    fr_v = b_rewards.max(dim=1, keepdim=True)[0]
    # 1.0 where any agent finished on this transition, else 0.0.
    fd_v = b_dones.float().max(dim=1, keepdim=True)[0]

    # Q target: the bootstrap term is dropped on terminal transitions. The done
    # mask belongs here, not on the critic's action input. Detached so no
    # gradient flows into the target networks.
    fna_v = actor_target.model(ns_v).view(-1, n_agents * action_size)
    q_next = critic_target.model(fns_v, fna_v)
    q_target = (fr_v + config.GAMMA * (1.0 - fd_v) * q_next).detach()

    q_expected = critic_net(fs_v, fa_v)
    critic_loss = F.mse_loss(q_expected, q_target)

    # Critic training step with gradient-norm clipping.
    critic_optimizer.zero_grad()
    critic_loss.backward()
    torch.nn.utils.clip_grad_norm_(critic_net.parameters(), 1)
    critic_optimizer.step()

    # Actor training step: ascend the critic's value of the actor's own actions.
    actions_pred = actor_net(s_v).view(-1, n_agents * action_size)
    actor_loss = -critic_net(fs_v, actions_pred).mean()

    actor_optimizer.zero_grad()
    actor_loss.backward()
    actor_optimizer.step()

    actor_target.alpha_sync(alpha=1 - config.TAU)
    critic_target.alpha_sync(alpha=1 - config.TAU)

    return critic_loss, actor_loss


with ptan.common.utils.TBMeanTracker(writer, batch_size=100) as tb_tracker:

    agent_states = [None for _ in range(n_agents)]
    epoch = 0          # defined up front so the checkpoint below never hits a NameError
    reward_100 = 0.0

    try:
        for epoch in range(1, num_epochs):

            env_info = env.reset(train_mode=True)[brain_name]

            tot_reward = np.zeros(n_agents)

            for step in range(max_steps):
                states = env_info.vector_observations
                # Act in eval mode: with only n_agents rows, train-mode BatchNorm
                # would normalise with the statistics of that tiny batch and
                # also pollute the running statistics.
                actor_net.eval()
                actions, agent_states = agent(states, agent_states)
                actor_net.train()
                env_info = env.step(actions)[brain_name]
                next_states = env_info.vector_observations
                dones = env_info.local_done
                rewards = env_info.rewards
                tot_reward += rewards

                replay_buffer.add(states, actions, rewards, next_states, dones)

                if np.any(dones):
                    # Episode score is the better of the agents' returns.
                    max_reward = tot_reward.max()
                    tb_tracker.track("reward", max_reward, epoch)
                    rewards_history.append(max_reward)

                    reward_100 = np.mean(rewards_history[-100:])
                    tb_tracker.track("reward_100", reward_100, epoch)

                    print(f'Epoch: {epoch} reward: {max_reward}',end='\r')

                    if epoch % 100 == 0:
                        print(f'Epoch: {epoch} reward_100: {reward_100}')

                    break

                if len(replay_buffer) > config.BATCH_SIZE and epoch > WAIT_EPOCHS:
                    # The sampled batch gets its own names so it cannot clobber
                    # the rollout's states / actions / rewards / dones.
                    critic_loss, actor_loss = _update_networks(replay_buffer.sample())

                    tb_tracker.track("loss_critic", critic_loss, epoch)
                    tb_tracker.track("loss_actor", actor_loss, epoch)

            if len(rewards_history) > 100 and reward_100 > stop_score:
                print(f'Solved. Episode {epoch}, mean reward {reward_100}')
                break
    finally:
        # Always write a checkpoint, including when the run is interrupted.
        learning_state = {
                'actor': actor_net.state_dict(),
                'critic': critic_net.state_dict(),
                'act_opt': actor_optimizer.state_dict(),
                'crt_opt': critic_optimizer.state_dict(),
                'epoch': epoch,
                'rewards_history': rewards_history
        }

        torch.save(learning_state,'./partial_state.ckp')

    # Only reached on normal completion; an interrupted run keeps the
    # environment open.
    writer.close()
    env.close()

Epoch: 100 reward_100: 0.008900000136345626
Epoch: 200 reward_100: 0.007800000142306089
Epoch: 300 reward_100: 0.010900000166147948
Epoch: 400 reward_100: 0.010000000149011612
Epoch: 500 reward_100: 0.0020000000298023225
Epoch: 594 reward: 0.00000000149011612

In [None]:
# Manual checkpoint: snapshot both networks, both optimizers and the progress
# counters, then write them to disk (useful after interrupting the training cell).
learning_state = dict(
    actor=actor_net.state_dict(),
    critic=critic_net.state_dict(),
    act_opt=actor_optimizer.state_dict(),
    crt_opt=critic_optimizer.state_dict(),
    epoch=epoch,
    rewards_history=rewards_history,
)

torch.save(learning_state,'./partial_state.ckp')