# PPO solution for the Reacher environment from Unity ML



In [1]:
from pprint import pprint
def debug(item, name, print_e=False, only_shape=False):
    print("New item: {}".format(name))
    if only_shape:
        print(f"Shape: {item.shape}")
        return
    print(f"Type: {type(item)}")
    if print_e:
        pprint(item)
    try:
        print(f"Length: {len(item)}")
        pprint(f"First element: {item[0]}")
        pprint(f"Last element: {item[-1]}")
        try:
            print(f"Shape: {item.shape}")
        except:

            pass
        try:
            pprint("First element shape: {}".format(item[0].shape))
        except Exception as e:
            pprint(e)
    except:
        print("Object has no length")
    print("")

In [7]:


# Agent and models
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# Agent hyperparameters
BATCH_SIZE = 32         # minibatch size
GAMMA = 0.99            # Discount factor
TAU = 0.95              # GAE parameter
BETA = 0.01             # entropy regularization parameter
PPO_CLIP_EPSILON = 0.2  # ppo clip parameter
GRADIENT_CLIP = 5       # gradient clipping parameter


class Actor(nn.Module):
    def __init__(self, state_size, action_size, hidden_size=64):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_size)
        
    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return F.tanh(self.fc3(x))
    
    
class Critic(nn.Module):
    def __init__(self, state_size, value_size=1, hidden_size=64):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, value_size)
        
    def forward(self, states):
        x = F.relu(self.fc1(states))
        x = F.relu(self.fc2(x))
        return self.fc3(x)
    
    
class ActorCritic(nn.Module):
    def __init__(self, state_size, action_size, value_size=1, hidden_size=64, std=0.0):
        super(ActorCritic, self).__init__()
        self.actor = Actor(state_size, action_size, hidden_size)
        self.critic = Critic(state_size, value_size, hidden_size)
        
        self.log_std = nn.Parameter(torch.ones(1, action_size)*std)
        
    def forward(self, states): # TODO: LEARN WHAT THE FUCK THIS DOES
        obs = torch.FloatTensor(states)
        
        # Critic
        values = self.critic(obs)
        
        # Actor
        mu = self.actor(obs)
        std = self.log_std.exp().expand_as(mu)
        dist = torch.distributions.Normal(mu, std)
        
        return dist, values
    
    
class Agent():
    def __init__(self, num_agents, state_size, action_size):
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.model = ActorCritic(state_size, action_size, value_size=1)
        self.optimizer = optim.Adam(self.model.parameters(), lr=LR, eps=EPSILON)
        self.model.train()
        
    def act(self, states): # TODO: IS THIS CORRECT? WE SHOULD USE MU AS THE ACTIONS, AND NOT SAMPLE FROM THE DISTRIBUTION
        """Remember: states are state vectors for each agent
        It is used when collecting trajectories
        """
        dist, values = self.model(states) # pass the state trough the network and get a distribution over actions and the value of the state
        actions = dist.sample() # sample an action from the distribution
        log_probs = dist.log_prob(actions) # calculate the log probability of that action
        log_probs = log_probs.sum(-1).unsqueeze(-1) # sum the log probabilities of all actions taken (in case of multiple actions) and reshape to (batch_size, 1)
        
        return actions, log_probs, values

    # def batcher(self, BATCH_SIZE, states, actions, log_probs_old, returns, advantages):
    #     """Convert trajectories into learning batches."""
    #     # for _ in range(states.size(0) // BATCH_SIZE):
    #     rand_ids = np.random.randint(0, states.size(0), BATCH_SIZE)
    #     yield states[rand_ids, :], actions[rand_ids, :], log_probs_old[rand_ids, :], returns[rand_ids, :], advantages[rand_ids, :]

    
    def learn(self, states, actions, log_probs_old, returns, advantages, sgd_epochs=4):
        """ Performs a learning step given a batch of experiences
        
        Remmeber: in the PPO algorithm, we perform SGD_episodes (usually 4) weights update steps per batch
        using the proximal policy ratio clipped objective function
        """        

        num_batches = states.size(0) // BATCH_SIZE
        for i in range(sgd_epochs):
            # batch_count = 0
            # batch_ind = 0
            # for i in range(num_batches):
            #     sampled_states = states[batch_ind:batch_ind+BATCH_SIZE, :]
            #     sampled_actions = actions[batch_ind:batch_ind+BATCH_SIZE, :]
            #     sampled_log_probs_old = log_probs_old[batch_ind:batch_ind+BATCH_SIZE, :]
            #     sampled_returns = returns[batch_ind:batch_ind+BATCH_SIZE, :]
            #     sampled_advantages = advantages[batch_ind:batch_ind+BATCH_SIZE, :]
                
            #     L = ppo_loss(self.model, sampled_states, sampled_actions, sampled_log_probs_old, sampled_returns, sampled_advantages)
                
            #     self.optimizer.zero_grad()
            #     (L).backward()
            #     nn.utils.clip_grad_norm_(self.model.parameters(), GRADIENT_CLIP)
            #     self.optimizer.step()
                
            #     batch_ind += BATCH_SIZE
            #     batch_count += 1
            
            
            batch_count = 0
            batch_ind = 0
            for i in range(num_batches):
                sampled_states = states[batch_ind:batch_ind+BATCH_SIZE, :]
                sampled_actions = actions[batch_ind:batch_ind+BATCH_SIZE, :]
                sampled_log_probs_old = log_probs_old[batch_ind:batch_ind+BATCH_SIZE, :]
                sampled_returns = returns[batch_ind:batch_ind+BATCH_SIZE, :]
                sampled_advantages = advantages[batch_ind:batch_ind+BATCH_SIZE, :]
                
                dist, values = self.model(sampled_states)
                
                log_probs = dist.log_prob(sampled_actions)
                log_probs = torch.sum(log_probs, dim=1, keepdim=True)
                entropy = dist.entropy().mean()
                
                # r(θ) =  π(a|s) / π_old(a|s)
                ratio = (log_probs - sampled_log_probs_old).exp()
                
                # Surrogate Objctive : L_CPI(θ) = r(θ) * A
                obj = ratio * sampled_advantages
                
                # clip ( r(θ), 1-Ɛ, 1+Ɛ )*A
                obj_clipped = ratio.clamp(1.0 - PPO_CLIP_EPSILON, 1.0 + PPO_CLIP_EPSILON) * sampled_advantages
                
                # L_CLIP(θ) = E { min[ r(θ)A, clip ( r(θ), 1-Ɛ, 1+Ɛ )*A ] - β * KL }
                policy_loss = -torch.min(obj, obj_clipped).mean(0) - BETA * entropy.mean()
                
                # L_VF(θ) = ( V(s) - V_t )^2
                value_loss = 0.5 * (sampled_returns - values).pow(2).mean()
               

                self.optimizer.zero_grad()
                (policy_loss + value_loss).backward()
                nn.utils.clip_grad_norm_(self.model.parameters(), GRADIENT_CLIP)
                self.optimizer.step()
                
                batch_ind += BATCH_SIZE
                batch_count += 1
            


In [8]:
# Loss function. NOT INTEGRATED YET

def ppo_loss(model, states, actions, log_probs_old, returns, advantages):
    dist, values = model(states)
    
    log_probs = dist.log_prob(actions)
    log_probs = torch.sum(log_probs, dim=1, keepdim=True)
    entropy = dist.entropy().mean()
    
    # r(θ) =  π(a|s) / π_old(a|s)
    ratio = (log_probs - log_probs_old).exp()
    
    # Surrogate Objctive : L_CPI(θ) = r(θ) * A
    obj = ratio * advantages
    
    # clip ( r(θ), 1-Ɛ, 1+Ɛ )*A
    obj_clipped = ratio.clamp(1.0 - PPO_CLIP_EPSILON, 1.0 + PPO_CLIP_EPSILON) * advantages
    
    # L_CLIP(θ) = E { min[ r(θ)A, clip ( r(θ), 1-Ɛ, 1+Ɛ )*A ] - β * KL }
    policy_loss = -torch.min(obj, obj_clipped).mean(0) - BETA * entropy.mean()
    
    # L_VF(θ) = ( V(s) - V_t )^2
    value_loss = 0.5 * (returns - values).pow(2).mean()
    
    return policy_loss + value_loss

In [3]:
# train the agent
import numpy as np
from collections import deque
import torch


def test_agent(env, agent, brain_name):
    env_info = env.reset(train_mode = True)[brain_name]
    num_agents = len(env_info.agents)
    states = env_info.vector_observations
    scores = np.zeros(num_agents)
    while True:
        actions, _, _= agent.act(states)
        env_info = env.step(actions.cpu().detach().numpy())[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        scores += env_info.rewards
        states = next_states
        if np.any(dones):
            break
    return np.mean(scores)


def collect_trajectories(env, brain_name, agent, max_t):
    env_info = env.reset(train_mode=True)[brain_name]
    num_agents = len(env_info.agents)
    states = env_info.vector_observations
    # debug(states, "states")
        
    rollout = []
    agents_rewards = np.zeros(num_agents)
    episode_rewards = []

    # Collecting trajectories
    for _ in range(max_t):
    # for _ in range(5):
        actions, log_probs, values = agent.act(states)
        env_info = env.step(actions.cpu().detach().numpy())[brain_name]
        next_states = env_info.vector_observations
        # debug(next_states, "next_states")
        rewards = env_info.rewards 
        dones = np.array([1 if t else 0 for t in env_info.local_done])
        agents_rewards += rewards

        for j, done in enumerate(dones):
            if dones[j]:
                episode_rewards.append(agents_rewards[j])
                agents_rewards[j] = 0

        rollout.append([states, values.detach(), actions.detach(), log_probs.detach(), rewards, 1 - dones])

        states = next_states

    pending_value = agent.model(states)[-1]
    returns = pending_value.detach() # Why is this called retuns? It's the value of the last state
    rollout.append([states, pending_value, None, None, None, None])
    
    return rollout, returns, episode_rewards, np.mean(episode_rewards)


def calculate_advantages(rollout, returns, num_agents):
    """ Given a rollout, calculates the advantages for each state
    """
    processed_rollout = [None] * (len(rollout) - 1)
    advantages = torch.Tensor(np.zeros((num_agents, 1))) # advantages is a single value for each state

    for i in reversed(range(len(rollout) - 1)):
        states, value, actions, log_probs, rewards, dones = rollout[i]
        dones = torch.Tensor(dones).unsqueeze(1)
        rewards = torch.Tensor(rewards).unsqueeze(1)
        actions = torch.Tensor(actions)
        states = torch.Tensor(states)
        next_value = rollout[i + 1][1]
        
        # V(s) = r + γ * V(s')
        returns = rewards + GAMMA * dones * returns
        
        # L = r + γ*V(s') - V(s)
        td_error = (rewards + GAMMA * dones * next_value.detach()) - value.detach() # targer - current
        
        advantages = advantages * TAU * GAMMA * dones + td_error
        processed_rollout[i] = [states, actions, log_probs, returns, advantages]
    
    states, actions, log_probs_old, returns, advantages = map(lambda x: torch.cat(x, dim=0), zip(*processed_rollout))
    advantages = (advantages - advantages.mean()) / advantages.std()
    
    return states, actions, log_probs_old, returns, advantages


def train(env, brain_name, agent, num_agents, n_episodes, max_t, run_name="testing_01"):
    print(f"Starting training...")
    env.info = env.reset(train_mode = True)[brain_name]
    all_scores = []
    all_scores_window = deque(maxlen=100)
    best_so_far = 1.0
        
    for i_episode in range(n_episodes):
        # Each iteration, N parallel actors collect T time steps of data
        rollout, returns, _, _ = collect_trajectories(env, brain_name, agent, max_t)
        
        states, actions, log_probs_old, returns, advantages = calculate_advantages(rollout, returns, num_agents)
        # print(f"States: {states.shape}. Actions: {actions.shape}. Log_probs_old: {log_probs_old.shape}. Returns: {returns.shape}. Advantages: {advantages.shape}")
        agent.learn(states, actions, log_probs_old, returns, advantages)
        
        test_mean_reward = test_agent(env, agent, brain_name)

        all_scores.append(test_mean_reward)
        all_scores_window.append(test_mean_reward)

        if np.mean(all_scores_window) > best_so_far:
            torch.save(agent.model.state_dict(), f"ppo_checkpoint_{np.mean(all_scores_window)}.pth")
            best_so_far = np.mean(all_scores_window)
            if np.mean(all_scores_window) > 30:
                
                print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, np.mean(all_scores_window)))
                # break       
        
        print('Episode {}, Total score this episode: {}, Last {} average: {}'.format(i_episode + 1, test_mean_reward, min(i_episode + 1, 100), np.mean(all_scores_window)) )

In [4]:
# Load the environment
from unityagents import UnityEnvironment
import time
# env = UnityEnvironment(file_name='../../unity_ml_envs/Reacher_Windows_x86_64/Reacher.exe')
env = UnityEnvironment(file_name='../../PPO-Reacher_UnityML/Reacher_Windows_x86_64/Reacher.exe')
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [5]:

env_info = env.reset(train_mode=True)[brain_name]
time.sleep(2)

# Environment variables
num_agents = len(env_info.agents)
state_size = env_info.vector_observations.shape[1]
action_size = brain.vector_action_space_size

# Training Hyperparameterso
EPISODES = 200
# MAX_T = 2048
MAX_T = 1000
SGD_EPOCHS = 4
# optimizer parameters
LR = 3e-4
EPSILON = 1e-5
GAMMA = 0.99            # Discount factor
TAU = 0.95              # GAE parameter
 
# Instantiate the agent
agent = Agent(num_agents, state_size, action_size)


In [6]:
# Train the agent
train(env, brain_name, agent, num_agents, EPISODES, MAX_T)
env.close()

Starting training...


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Episode 1, Total score this episode: 0.243999994546175, Last 1 average: 0.243999994546175
Episode 2, Total score this episode: 0.18749999580904841, Last 2 average: 0.2157499951776117
Episode 3, Total score this episode: 0.2924999934621155, Last 3 average: 0.241333327939113
Episode 4, Total score this episode: 0.6644999851472676, Last 4 average: 0.3471249922411517
Episode 5, Total score this episode: 0.5759999871253967, Last 5 average: 0.3928999912180007
Episode 6, Total score this episode: 0.7054999842308461, Last 6 average: 0.4449999900534749
Episode 7, Total score this episode: 0.9089999796822668, Last 7 average: 0.511285702857588
Episode 8, Total score this episode: 1.2019999731332063, Last 8 average: 0.5976249866420403
Episode 9, Total score this episode: 1.0934999755583703, Last 9 average: 0.6527222076327437
Episode 10, Total score this episode: 1.3989999687299133, Last 10 average: 0.7273499837424606
Episode 11, Total score this episode: 1.367499969433993, Last 11 average: 0.78554

KeyboardInterrupt: 

In [9]:
# Train the agent
train(env, brain_name, agent, num_agents, EPISODES, MAX_T)
env.close()

Starting training...

Environment solved in 0 episodes!	Average Score: 39.05
Episode 1, Total score this episode: 39.050999127142134, Last 1 average: 39.050999127142134
Episode 2, Total score this episode: 38.54099913854152, Last 2 average: 38.79599913284183
Episode 3, Total score this episode: 39.19849912384525, Last 3 average: 38.930165796509634
Episode 4, Total score this episode: 39.00149912824854, Last 4 average: 38.94799912944436


KeyboardInterrupt: 

In [None]:
env.close()