In [1]:
from unityagents import UnityEnvironment
import numpy as np
import random
import sys
from collections import deque,namedtuple
import copy
import ptan

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from tensorboardX import SummaryWriter

import matplotlib.pyplot as plt

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
sys.path.append('./')


In [3]:
from maddpg_model import Config

In [4]:

class ReplayBuffer(object):
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, buffer_size, batch_size,device):
        """Initialize a ReplayBuffer object.
        Params
        ======
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
        """
        self.memory = deque(maxlen=buffer_size)  # internal memory (deque)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        self.device = device

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        e = self.experience( state, action, reward, next_state, done)
        self.memory.append(e)
    
    def sample(self):
        """Randomly sample a batch of experiences from memory."""
        experiences = random.sample(self.memory, k=self.batch_size)
        
        states = torch.from_numpy(np.array([e.state for e in experiences if e is not None])).float().to(self.device)
        actions = torch.from_numpy(np.array([e.action for e in experiences if e is not None])).float().to(self.device)
        rewards = torch.from_numpy(np.array([e.reward for e in experiences if e is not None])).float().to(self.device)
        next_states = torch.from_numpy(np.array([e.next_state for e in experiences if e is not None])).float().to(self.device)
        dones = torch.from_numpy(np.array([e.done for e in experiences if e is not None]).astype('uint8')).to(self.device)
        
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)

In [5]:
#env = UnityEnvironment(file_name='Tennis.app',no_graphics=True)
env = UnityEnvironment(file_name='Tennis_Linux/Tennis.x86_64',no_graphics=True)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


In [6]:
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [7]:
env_info = env.reset(train_mode=True)[brain_name]
n_agents = len(env_info.agents)
state_size = env_info.vector_observations.shape[1]
action_size = env_info.previous_vector_actions.shape[0]

In [8]:
config = Config()

writer = SummaryWriter(comment="-tennis_maddpg")

In [9]:
config.ACTOR_FC1_UNITS = 128
config.ACTOR_FC2_UNITS = 64
config.CRITIC_FC1_UNITS = 128
config.CRITIC_FC2_UNITS = 64
config.NOISE_THETA = 0.15
config.NOISE_SIGMA = 0.2
config.LR_ACTOR = 1e-4
config.LR_CRITIC = 3e-4
config.TAU = 1e-4

#REPLAY BUFFER
config.BUFFER_SIZE = int(1e6)
config.BATCH_SIZE = 512
config.GAMMA = 0.99
config.WEIGHT_DECAY = 0
config.device = 'cpu'

seed = 43

In [10]:
def hidden_init(layer):
    fan_in = layer.weight.data.size()[0]
    lim = 1. / np.sqrt(fan_in)
    return (-lim, lim)

class Actor(nn.Module):
    """Actor (Policy) Model."""

    def __init__(self, state_size, action_size, seed, fc1_units=400, fc2_units=300):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            fc1_units (int): Number of nodes in first hidden layer
            fc2_units (int): Number of nodes in second hidden layer
        """
        super(Actor, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)
        self.reset_parameters()

    def reset_parameters(self):
        self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
        self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
        self.fc3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state):
        """Build an actor (policy) network that maps states -> actions."""
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return torch.tanh(self.fc3(x))


class Critic(nn.Module):
    """Critic (Value) Model."""

    def __init__(self, n_agents, state_size, action_size, seed, fcs1_units=400, fc2_units=300):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            fcs1_units (int): Number of nodes in the first hidden layer
            fc2_units (int): Number of nodes in the second hidden layer
        """
        super(Critic, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fcs1 = nn.Linear(n_agents*state_size, fcs1_units)
        self.fc2 = nn.Linear(fcs1_units+n_agents*action_size, fc2_units)
        self.fc3 = nn.Linear(fc2_units, 1)
        self.reset_parameters()

    def reset_parameters(self):
        self.fcs1.weight.data.uniform_(*hidden_init(self.fcs1))
        self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
        self.fc3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state, action):
        """Build a critic (value) network that maps (state, action) pairs -> Q-values."""
        xs = F.relu(self.fcs1(state))
        x = torch.cat((xs, action), dim=1)
        x = F.relu(self.fc2(x))
        return self.fc3(x)
    
class DDPGAgent():
    """Interacts with and learns from the environment."""
    
    def __init__(self, n_agents, state_size, action_size, random_seed,config):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.config = config
        self.device = config.device
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed,config.ACTOR_FC1_UNITS,config.ACTOR_FC2_UNITS).to(self.device)
        self.actor_target = Actor(state_size, action_size, random_seed,config.ACTOR_FC1_UNITS,config.ACTOR_FC2_UNITS).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=config.LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(n_agents,state_size, action_size, random_seed,config.CRITIC_FC1_UNITS,config.CRITIC_FC2_UNITS).to(self.device)
        self.critic_target = Critic(n_agents, state_size, action_size, random_seed,config.CRITIC_FC1_UNITS,config.CRITIC_FC2_UNITS).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=config.LR_CRITIC, weight_decay=config.WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)
    
    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()
        
    
    def critic_training_step(self,full_states,full_actions, full_next_states, full_next_actions, rewards,dones):        
        q_target_next = self.critic_target(full_next_states, full_next_actions)
        q_targets = rewards + (self.config.GAMMA * q_target_next * (1 - dones))
        q_expected = self.critic_local(full_states, full_actions)
        
        critic_loss = F.mse_loss(q_expected, q_targets)
        
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        
        return critic_loss
        
     
    def actor_training_step(self,full_states,full_actions):
        
        actor_loss = -self.critic_local(full_states, full_actions).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        
        return actor_loss

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

class OUNoise:
    """Ornstein-Uhlenbeck process."""

    def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.2):
        """Initialize parameters and noise process."""
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state (= noise) to mean (mu)."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.array([random.random() for i in range(len(x))])
        self.state = x + dx
        return self.state



In [11]:
agents = [DDPGAgent(n_agents,state_size,action_size,seed,config) for _ in range(n_agents)]

In [12]:
replay_buffer = ReplayBuffer(config.BUFFER_SIZE,config.BATCH_SIZE,config.device)

In [13]:
def populate_replay_buffer(agents,n_traj):
    
    for _ in range(n_traj):
        
        env_info = env.reset(train_mode=True)[brain_name]
        
        for step in range(300):
            states = env_info.vector_observations
            
            actions = [agents[i].act(states[i]) for i in range(len(agents))]
            env_info= env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            dones = env_info.local_done
            rewards = env_info.rewards            

            replay_buffer.add(states,actions,rewards,next_states,dones)
            
            
            
            if np.any(dones):                                
                break

In [14]:
#replay_buffer = ReplayBuffer(config.BUFFER_SIZE,5,config.device)
#populate_replay_buffer(agents,5)
#states,actions,rewards,next_states,dones = replay_buffer.sample()

In [15]:
#fs_v = states.view(-1,n_agents*state_size)
#fa_v = actions.view(-1,n_agents*action_size)
#fns_v = next_states.view(-1,n_agents*state_size)
#r_v = rewards.view(-1,n_agents,1)
#d_v = dones.view(-1,n_agents,1)
#next_actions = [agents[i].act(next_states[:,i,:].detach().numpy()) for i in range(len(agents))]
#fna_v = torch.from_numpy(np.concatenate(next_actions,axis=1)).to(config.device)

In [16]:
#_ = [agents[i].critic_training_step(fs_v,fa_v, fns_v, fna_v, r_v[:,i,:], d_v[:,i,:]) for i in range(n_agents)]

In [17]:
#_ = [agents[i].actor_training_step(fs_v,fa_v) for i in range(n_agents)]

In [None]:
rewards_history = []
stop_score = 0.5

num_epochs = 50000
max_steps = 300
WAIT_EPOCHS = 300

with ptan.common.utils.TBMeanTracker(writer, batch_size=100) as tb_tracker:
        
    for epoch in range(1,num_epochs):

        env_info = env.reset(train_mode=True)[brain_name]
        
        tot_reward = np.zeros(n_agents)
        
        for step in range(max_steps):
            states = env_info.vector_observations
            actions = [agents[i].act(states[i]) for i in range(len(agents))]
            env_info= env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            dones = env_info.local_done
            rewards = env_info.rewards
            tot_reward += rewards

            replay_buffer.add(states,actions,rewards,next_states,dones)

            if np.any(dones):
                max_reward = tot_reward.max()
                tb_tracker.track("reward", max_reward, epoch)               
                rewards_history.append(max_reward)
                
                reward_100 = np.mean(rewards_history[-100:])
                tb_tracker.track("reward_100", reward_100, epoch)
                
                print(f'Epoch: {epoch} reward: {max_reward}',end='\r')

                if epoch % 100 == 0:
                    print(f'Epoch: {epoch} reward_100: {reward_100}')
                
                break

            if len(replay_buffer) > config.BATCH_SIZE and epoch > WAIT_EPOCHS:
            
                states,actions,rewards,next_states,dones = replay_buffer.sample()
                
                fs_v = states.view(-1,n_agents*state_size)
                fa_v = actions.view(-1,n_agents*action_size)
                fns_v = next_states.view(-1,n_agents*state_size)
                r_v = rewards.view(-1,n_agents,1)
                d_v = dones.view(-1,n_agents,1)
                next_actions = [agents[i].act(next_states[:,i,:].detach().numpy()) for i in range(len(agents))]
                fna_v = torch.from_numpy(np.concatenate(next_actions,axis=1)).to(config.device)
                
                critics_loss = [agents[i].critic_training_step(fs_v,fa_v, fns_v, fna_v, r_v[:,i,:], d_v[:,i,:]) for i in range(n_agents)]
                actors_loss = [agents[i].actor_training_step(fs_v,fa_v) for i in range(n_agents)]
                                
                _ = [tb_tracker.track(f"loss_critic_{i}", critics_loss[i], epoch) for i in range(n_agents)]
                _ = [tb_tracker.track(f"loss_actor_{i}", actors_loss[i], epoch)for i in range(n_agents)]
        
        if len(rewards_history) > 100 and reward_100 > stop_score:
            print(f'Solved. Episode {epoch}, mean reward {reward_100}')
            break


    learning_state = {
            'actor': actor_net.state_dict(),
            'critic': critic_net.state_dict(),
            'act_opt': actor_optimizer.state_dict(),
            'crt_opt': critic_optimizer.state_dict(),
            'epoch': epoch,
            'rewards_history': rewards_history
    }

    torch.save(learning_state,'./partial_state.ckp')

    writer.close()
    env.close()

Epoch: 100 reward_100: 0.0
Epoch: 200 reward_100: 0.0
Epoch: 300 reward_100: 0.0
Epoch: 400 reward_100: 0.0
Epoch: 500 reward_100: 0.0
Epoch: 600 reward_100: 0.0
Epoch: 700 reward_100: 0.0
Epoch: 800 reward_100: 0.0
Epoch: 897 reward: 0.0

In [None]:
learning_state = {
        'actor': actor_net.state_dict(),
        'critic': critic_net.state_dict(),
        'act_opt': actor_optimizer.state_dict(),
        'crt_opt': critic_optimizer.state_dict(),
        'epoch': epoch,
        'rewards_history': rewards_history
}
    
torch.save(learning_state,'./partial_state.ckp')