# PPO solution for the Reacher environment from Unity ML



In [4]:


# Agent and models
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

# Agent hyperparameters (module-level constants read by Agent.learn and calculate_advantages)
BATCH_SIZE = 32         # number of transitions sampled for each minibatch update
GAMMA = 0.99            # discount factor applied to future rewards / bootstrapped values
TAU = 0.95              # GAE decay parameter multiplying the running advantage
BETA = 0.01             # weight of the entropy bonus subtracted from the policy loss
PPO_CLIP_EPSILON = 0.2  # probability ratio is clamped to [1 - eps, 1 + eps]
GRADIENT_CLIP = 5       # max gradient norm passed to nn.utils.clip_grad_norm_


class Actor(nn.Module):
    """Policy network mapping a state to the mean of each action dimension.

    Two ReLU hidden layers followed by a tanh output layer, so every
    component of the returned mean lies in [-1, 1].
    """

    def __init__(self, state_size, action_size, hidden_size=64):
        """Build the three linear layers.

        Args:
            state_size: number of input features per state.
            action_size: number of action dimensions produced.
            hidden_size: width of both hidden layers.
        """
        super().__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_size)

    def forward(self, state):
        """Return the tanh-squashed action means for ``state``."""
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        # torch.tanh replaces the deprecated torch.nn.functional.tanh alias.
        return torch.tanh(self.fc3(x))
    
class Critic(nn.Module):
    """State-value network: two ReLU hidden layers and a linear output head."""

    def __init__(self, state_size, value_size=1, hidden_size=64):
        """Create the layers.

        Args:
            state_size: number of input features per state.
            value_size: number of outputs of the value head.
            hidden_size: width of both hidden layers.
        """
        super().__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, value_size)

    def forward(self, states):
        """Return the (unbounded) value estimate for each row of ``states``."""
        hidden = states
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        return self.fc3(hidden)
    
class ActorCritic(nn.Module):
    """Gaussian policy plus state-value estimator.

    Wraps an ``Actor`` (action means) and a ``Critic`` (state values) and
    holds a learnable, state-independent log standard deviation shared by
    all inputs.
    """

    def __init__(self, state_size, action_size, hidden_size=64, std=0.0):
        """Build both sub-networks and the log-std parameter.

        Args:
            state_size: number of input features per state.
            action_size: number of action dimensions.
            hidden_size: hidden width passed to both sub-networks.
            std: initial value of every entry of ``log_std`` (the value is
                exponentiated in ``forward``, so 0.0 means a std of 1).
        """
        super().__init__()
        self.actor = Actor(state_size, action_size, hidden_size)
        self.critic = Critic(state_size, 1, hidden_size)

        self.log_std = nn.Parameter(torch.ones(1, action_size) * std)

    def forward(self, states):
        """Return the action distribution and value estimates for ``states``.

        Args:
            states: array-like or tensor of states; converted to float32.

        Returns:
            Tuple ``(dist, values)`` where ``dist`` is a
            ``torch.distributions.Normal`` centred on the actor output and
            ``values`` is the critic output.
        """
        # as_tensor accepts numpy arrays, lists and tensors alike and avoids
        # the legacy torch.FloatTensor constructor.
        obs = torch.as_tensor(states, dtype=torch.float32)

        # Actor gives the mean of the Gaussian, critic the state value.
        mu = self.actor(obs)
        values = self.critic(obs)

        # Broadcast the single learnable std row across the whole batch.
        std = self.log_std.exp().expand_as(mu)
        dist = torch.distributions.Normal(mu, std)

        return dist, values
    

class Agent:
    """PPO agent owning an ``ActorCritic`` model and its Adam optimizer.

    Relies on module-level names defined outside this class:
    ``ActorCritic``, the optimizer settings ``LR`` and ``EPSILON``, the
    constants ``BATCH_SIZE``, ``BETA``, ``PPO_CLIP_EPSILON`` and
    ``GRADIENT_CLIP``, and ``np`` (numpy).
    """

    def __init__(self, num_agents, state_size, action_size):
        """Create the model and optimizer and put the model in train mode."""
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.model = ActorCritic(state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=LR, eps=EPSILON)
        self.model.train()

    def act(self, states):
        """Sample actions for ``states`` while collecting trajectories.

        Args:
            states: state vectors, one row per agent.

        Returns:
            Tuple ``(actions, log_probs, values)``. ``log_probs`` is the sum
            of the per-dimension log probabilities with a trailing axis of
            size 1. All three tensors are detached from autograd.
        """
        # Rollout data is only ever stored detached, so skip building the
        # autograd graph altogether.
        with torch.no_grad():
            dist, values = self.model(states)
            actions = dist.sample()
            # Joint log-probability of the whole action vector.
            log_probs = dist.log_prob(actions).sum(-1).unsqueeze(-1)
        return actions, log_probs, values

    def batcher(self, BATCH_SIZE, states, actions, log_probs_old, returns, advantages):
        """Yield one random minibatch drawn from the trajectory tensors.

        ``BATCH_SIZE`` row indices are drawn uniformly with replacement and
        the same indices are applied to every tensor so rows stay aligned.

        NOTE(review): only a single minibatch is yielded per call; the loop
        over ``states.size(0) // BATCH_SIZE`` batches was disabled in the
        original code. Confirm this is intended.
        """
        rand_ids = np.random.randint(0, states.size(0), BATCH_SIZE)
        yield (
            states[rand_ids, :],
            actions[rand_ids, :],
            log_probs_old[rand_ids, :],
            returns[rand_ids, :],
            advantages[rand_ids, :],
        )

    def loss(self):
        """Placeholder kept for interface compatibility; does nothing."""
        pass

    def learn(self, states, actions, log_probs_old, returns, advantages, sgd_epochs=4):
        """Run ``sgd_epochs`` rounds of PPO updates on the given trajectories.

        Each round draws the minibatch(es) produced by ``batcher`` and takes
        one optimizer step per minibatch on the clipped surrogate policy
        loss (with an entropy bonus) plus the value loss.

        Args:
            states, actions, log_probs_old, returns, advantages: trajectory
                tensors indexed along their first axis.
            sgd_epochs: number of passes over ``batcher``.
        """
        for _ in range(sgd_epochs):
            for (sampled_states, sampled_actions, sampled_log_probs_old,
                 sampled_returns, sampled_advantages) in self.batcher(
                     BATCH_SIZE, states, actions, log_probs_old, returns, advantages):

                dist, values = self.model(sampled_states)

                log_probs = dist.log_prob(sampled_actions)
                log_probs = torch.sum(log_probs, dim=1, keepdim=True)
                entropy = dist.entropy().mean()

                # r(θ) = π(a|s) / π_old(a|s), computed in log space
                ratio = (log_probs - sampled_log_probs_old).exp()

                # Surrogate objective: L_CPI(θ) = r(θ) * A
                obj = ratio * sampled_advantages

                # clip(r(θ), 1-Ɛ, 1+Ɛ) * A
                obj_clipped = ratio.clamp(1.0 - PPO_CLIP_EPSILON, 1.0 + PPO_CLIP_EPSILON) * sampled_advantages

                # L_CLIP(θ) = E{ min[ r(θ)A, clip(r(θ), 1-Ɛ, 1+Ɛ)A ] } + β * H
                # (negated because the optimizer minimises; H is the entropy)
                policy_loss = -torch.min(obj, obj_clipped).mean() - BETA * entropy

                # L_VF(θ) = 0.5 * (R - V(s))^2
                value_loss = 0.5 * (sampled_returns - values).pow(2).mean()

                self.optimizer.zero_grad()
                (policy_loss + value_loss).backward()
                nn.utils.clip_grad_norm_(self.model.parameters(), GRADIENT_CLIP)
                self.optimizer.step()


In [5]:
# train the agent
import numpy as np
from collections import deque
import torch



def test_agent(env, agent, brain_name):
    """Run the environment until any agent reports done and score the agent.

    The environment is reset with ``train_mode=True`` and actions are
    sampled through ``agent.act`` at every step.

    Args:
        env: Unity environment exposing ``reset`` and ``step``.
        agent: object whose ``act(states)`` returns the actions first.
        brain_name: key selecting the brain in the environment's results.

    Returns:
        Mean over agents of the reward accumulated during the run.
    """
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations
    # Size the score vector from the environment itself instead of relying
    # on a module-level ``num_agents`` global.
    scores = np.zeros(len(env_info.agents))
    while True:
        actions, _, _ = agent.act(states)
        env_info = env.step(actions.cpu().detach().numpy())[brain_name]
        scores += env_info.rewards
        states = env_info.vector_observations
        if np.any(env_info.local_done):
            break
    return np.mean(scores)


def collect_trajectories(env, brain_name, agent, max_t):
    """Roll the current policy out for ``max_t`` steps in every agent.

    Args:
        env: Unity environment exposing ``reset`` and ``step``.
        brain_name: key selecting the brain in the environment's results.
        agent: object providing ``act(states)`` and ``model(states)``.
        max_t: number of environment steps to collect.

    Returns:
        Tuple ``(rollout, returns, episode_rewards, mean_episode_reward)``:
        * ``rollout``: one ``[states, values, actions, log_probs, rewards,
          not_done_mask]`` entry per step, followed by a final
          ``[states, pending_value, None, None, None, None]`` entry used for
          bootstrapping.
        * ``returns``: the detached value estimate of the last states.
        * ``episode_rewards``: total reward of every episode that finished
          during the rollout.
        * ``mean_episode_reward``: their mean, or NaN if none finished.
    """
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations

    rollout = []
    # One running total per agent; sized from the environment rather than
    # from a module-level ``num_agents`` global.
    running_rewards = np.zeros(len(env_info.agents))
    episode_rewards = []

    for _ in range(max_t):
        actions, log_probs, values = agent.act(states)
        env_info = env.step(actions.cpu().detach().numpy())[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = np.array(env_info.local_done, dtype=int)
        running_rewards += rewards

        # Bank the total of every agent whose episode just ended.
        for j in np.flatnonzero(dones):
            episode_rewards.append(running_rewards[j])
            running_rewards[j] = 0

        # The last element is a not-done mask (1 while the episode continues).
        rollout.append([states, values.detach(), actions.detach(), log_probs.detach(), rewards, 1 - dones])

        states = next_states

    # Value of the states after the final step, used to bootstrap returns.
    pending_value = agent.model(states)[-1]
    returns = pending_value.detach()
    rollout.append([states, pending_value, None, None, None, None])

    # np.mean([]) would emit a RuntimeWarning; return NaN explicitly instead.
    mean_episode_reward = np.mean(episode_rewards) if episode_rewards else float("nan")
    return rollout, returns, episode_rewards, mean_episode_reward


def calculate_advantages(rollout, returns, num_agents, gamma=None, tau=None):
    """Compute discounted returns and normalised GAE advantages for a rollout.

    Args:
        rollout: list of ``[states, value, actions, log_probs, rewards,
            not_done_mask]`` entries, one per step, followed by one final
            entry whose second element is the bootstrap value.
        returns: value estimate of the states after the last step; used as
            the starting point of the backward return recursion.
        num_agents: number of parallel agents (rows per step).
        gamma: discount factor; defaults to the module-level ``GAMMA``.
        tau: GAE decay parameter; defaults to the module-level ``TAU``.

    Returns:
        Tuple ``(states, actions, log_probs_old, returns, advantages)`` with
        all steps concatenated along the first axis. ``advantages`` is
        normalised to zero mean and unit standard deviation.
    """
    gamma = GAMMA if gamma is None else gamma
    tau = TAU if tau is None else tau

    n_steps = len(rollout) - 1
    processed_rollout = [None] * n_steps
    advantages = torch.zeros(num_agents, 1)

    # Walk backwards so each step can reuse the return/advantage of its successor.
    for i in reversed(range(n_steps)):
        states, value, actions, log_probs, rewards, not_done = rollout[i]
        not_done = torch.as_tensor(not_done, dtype=torch.float32).unsqueeze(1)
        rewards = torch.as_tensor(rewards, dtype=torch.float32).unsqueeze(1)
        actions = torch.as_tensor(actions, dtype=torch.float32)
        states = torch.as_tensor(states, dtype=torch.float32)
        next_value = rollout[i + 1][1].detach()

        # R_t = r_t + γ * mask * R_{t+1}
        returns = rewards + gamma * not_done * returns

        # δ_t = r_t + γ * mask * V(s_{t+1}) - V(s_t)
        td_error = rewards + gamma * not_done * next_value - value.detach()

        # A_t = δ_t + γ * τ * mask * A_{t+1}
        advantages = advantages * tau * gamma * not_done + td_error
        processed_rollout[i] = [states, actions, log_probs, returns, advantages]

    states, actions, log_probs_old, returns, advantages = map(
        lambda x: torch.cat(x, dim=0), zip(*processed_rollout)
    )
    # The small constant avoids a division by zero when all advantages are equal.
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    return states, actions, log_probs_old, returns, advantages


def train(env, brain_name, agent, num_agents, n_episodes, max_t):
    """Train ``agent`` with PPO until the score target is reached.

    Each iteration collects ``max_t`` steps of experience, computes returns
    and advantages, updates the agent, then evaluates it with
    ``test_agent``. Training stops early, saving the model weights to
    ``ppo_checkpoint.pth``, once the mean of the last (up to) 100
    evaluation scores exceeds 30.0.

    NOTE(review): the stop condition is checked even when fewer than 100
    scores have been recorded — confirm this is the intended criterion.

    Args:
        env: Unity environment.
        brain_name: key selecting the brain in the environment's results.
        agent: agent providing ``act``, ``model`` and ``learn``.
        num_agents: number of parallel agents in the environment.
        n_episodes: maximum number of training iterations.
        max_t: number of environment steps collected per iteration.

    Returns:
        List with the evaluation score of every completed iteration.
    """
    # Start from a freshly reset environment. The original code stored the
    # result on ``env.info``, an attribute nothing reads.
    env.reset(train_mode=True)
    all_scores = []
    all_scores_window = deque(maxlen=100)

    for i_episode in range(n_episodes):
        # Each iteration, N parallel actors collect T time steps of data.
        rollout, returns, _, _ = collect_trajectories(env, brain_name, agent, max_t)

        states, actions, log_probs_old, returns, advantages = calculate_advantages(rollout, returns, num_agents)
        agent.learn(states, actions, log_probs_old, returns, advantages)

        test_mean_reward = test_agent(env, agent, brain_name)

        all_scores.append(test_mean_reward)
        all_scores_window.append(test_mean_reward)
        window_mean = np.mean(all_scores_window)

        if window_mean > 30.0:
            torch.save(agent.model.state_dict(), "ppo_checkpoint.pth")
            # i_episode is zero-based; report the count like the progress line does.
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode + 1, window_mean))
            break

        print('Episode {}, Total score this episode: {}, Last {} average: {}'.format(i_episode + 1, test_mean_reward, min(i_episode + 1, 100), window_mean))

    return all_scores

In [3]:
# Load the environment
from unityagents import UnityEnvironment
import time
# env = UnityEnvironment(file_name='../../unity_ml_envs/Reacher_Windows_x86_64/Reacher.exe')
env = UnityEnvironment(file_name='../../PPO-Reacher_UnityML/Reacher_Windows_x86_64/Reacher.exe')
try:
    # The first brain exposed by the environment is the one being trained.
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    time.sleep(2)

    # Environment variables
    num_agents = len(env_info.agents)
    state_size = env_info.vector_observations.shape[1]
    action_size = brain.vector_action_space_size

    # Training Hyperparameters
    EPISODES = 1000
    MAX_T = 2048
    SGD_EPOCHS = 4          # NOTE: not passed on; Agent.learn uses its own default of 4
    # optimizer parameters (read by Agent.__init__ when the agent is created)
    LR = 3e-4
    EPSILON = 1e-5
    GAMMA = 0.99            # Discount factor (same value as in the agent hyperparameters)
    TAU = 0.95              # GAE parameter (same value as in the agent hyperparameters)

    # Instantiate the agent
    agent = Agent(num_agents, state_size, action_size)

    # Train the agent
    train(env, brain_name, agent, num_agents, EPISODES, MAX_T)
finally:
    # Always release the Unity process, even if training fails or is interrupted.
    env.close()

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


Episode 1, Total score this episode: 0.3729999916628003, Last 1 average: 0.3729999916628003
Episode 2, Total score this episode: 0.5009999888017773, Last 2 average: 0.4369999902322888
Episode 3, Total score this episode: 0.4004999910481274, Last 3 average: 0.42483332383756833
Episode 4, Total score this episode: 0.5214999883435667, Last 4 average: 0.4489999899640679
Episode 5, Total score this episode: 0.49849998885765673, Last 5 average: 0.4588999897427857
Episode 6, Total score this episode: 0.4484999899752438, Last 6 average: 0.4571666564481954
Episode 7, Total score this episode: 0.4179999906569719, Last 7 average: 0.4515714184780206
Episode 8, Total score this episode: 0.6884999846108257, Last 8 average: 0.4811874892446212
Episode 9, Total score this episode: 0.4814999892376363, Last 9 average: 0.4812222114660673
Episode 10, Total score this episode: 0.49349998896941544, Last 10 average: 0.48244998921640214
Episode 11, Total score this episode: 0.43849999019876124, Last 11 average

In [5]:
env.close()