In [None]:
# Rendering problems in Colab 
# https://stackoverflow.com/questions/63250935/nameerror-name-base-is-not-defined-while-running-open-ai-gym-in-google-colab
!apt install xvfb -y
!pip install pyvirtualdisplay
!pip install piglet

!pip3 install box2d-py
!pip3 install gym[Box_2D]

from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()
#=========================================================#

import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

# `.unwrapped` returns the base environment without gym's wrappers; see
# https://stackoverflow.com/questions/53836136/why-unwrap-an-openai-gym
# NOTE(review): `env` is rebound to LunarLanderContinuous-v2 in the run cell
# before training starts -- confirm this CartPole instance is still needed.
env = gym.make('CartPole-v0').unwrapped

# set up matplotlib: detect whether an inline (notebook) backend is active
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    # NOTE: this binds the module-level name `display` to the IPython.display
    # module, replacing anything previously bound to that name.
    from IPython import display

# switch matplotlib to interactive mode
plt.ion()

# use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
The following NEW packages will be installed:
  xvfb
0 upgraded, 1 newly installed, 0 to remove and 49 not upgraded.
Need to get 784 kB of archives.
After this operation, 2,271 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 xvfb amd64 2:1.19.6-1ubuntu4.10 [784 kB]
Fetched 784 kB in 1s (699 kB/s)
Selecting previously unselected package xvfb.
(Reading database ... 155639 files and directories currently installed.)
Preparing to unpack .../xvfb_2%3a1.19.6-1ubuntu4.10_amd64.deb ...
Unpacking xvfb (2:1.19.6-1ubuntu4.10) ...
Setting up xvfb (2:1.19.6-1ubuntu4.10) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/

In [None]:
class RolloutMemory():
  """Flat, list-backed storage for one on-policy rollout.

  The five lists (`states`, `actions`, `rewards`, `log_probs`, `dones`) are
  appended to by the caller, one entry per environment step.

  Args:
    batch_size: size of the index mini-batches produced by `generate_batches`.
  """

  def __init__(self, batch_size=32):
    self.batch_size = batch_size
    self.states, self.actions, self.rewards = [], [], []
    self.log_probs, self.dones = [], []

  def clear_memory(self):
    """Empty every buffer in place, keeping the same list objects alive."""
    for buffer in (self.states, self.actions, self.rewards,
                   self.log_probs, self.dones):
      del buffer[:]

  def generate_batches(self):
    """Return a list of shuffled index arrays covering every stored step.

    Each array holds at most `batch_size` indices; the last one is shorter
    when the number of stored states is not a multiple of `batch_size`.
    Shuffling uses NumPy's global random state.
    """
    num_samples = len(self.states)
    chunk_starts = np.arange(0, num_samples, self.batch_size)
    shuffled = np.arange(num_samples, dtype=np.int64)
    np.random.shuffle(shuffled)
    return [shuffled[start : start + self.batch_size] for start in chunk_starts]

In [None]:
from torch.distributions import MultivariateNormal

def init_orthogonal_weights(m):
  """Initialise an `nn.Linear` layer; meant to be passed to `Module.apply`.

  The weight is filled by `orthogonal_init` and the bias (when the layer has
  one) is set to the constant 0.1. Modules that are not `nn.Linear` are left
  untouched.

  Args:
    m: any `nn.Module`; only `nn.Linear` instances are modified.
  """
  if not isinstance(m, nn.Linear):
    return
  orthogonal_init(m.weight)
  # Layers built with `bias=False` carry `bias is None`; skip them instead of failing.
  if m.bias is not None:
    nn.init.constant_(m.bias, 0.1)

def orthogonal_init(tensor, gain=1):
  r'''
  https://github.com/implementation-matters/code-for-paper/blob/094994f2bfd154d565c34f5d24a7ade00e0c5bdb/src/policy_gradients/torch_utils.py#L494
  Fills the input `Tensor` in place using the orthogonal initialization scheme from OpenAI.

  The tensor is treated as a (rows, cols) matrix with rows = size(0) and
  cols = the number of elements in one row. A standard-normal matrix of that
  shape is drawn and the left singular vectors of its reduced SVD are copied
  into the tensor, then scaled by `gain`.

  Args:
      tensor: an n-dimensional `torch.Tensor`, where :math:`n \geq 2`
      gain: optional scaling factor
  Returns:
      The same `tensor` object, modified in place.
  Raises:
      ValueError: if `tensor` has fewer than 2 dimensions.
  Examples:
      >>> w = torch.empty(3, 5)
      >>> orthogonal_init(w)
  '''
  if tensor.ndimension() < 2:
    raise ValueError("Only tensors with 2 or more dimensions are supported")

  rows = tensor.size(0)
  cols = tensor[0].numel()
  flattened = tensor.new_empty((rows, cols)).normal_(0, 1)

  # Always decompose the "tall" orientation so the reduced U has orthonormal columns.
  if rows < cols:
    flattened.t_()

  # Reduced SVD (torch.svd is deprecated in favour of torch.linalg.svd).
  u = torch.linalg.svd(flattened, full_matrices=False).U
  # Undo the transpose so q is (rows, cols) in both cases.
  q = u.t() if rows < cols else u
  with torch.no_grad():
    tensor.view_as(q).copy_(q)
    tensor.mul_(gain)
  return tensor


class ActorCritic(nn.Module):
  """Separate actor and critic MLPs for a continuous-action Gaussian policy.

  The actor maps a state to a tanh-squashed action mean; the critic maps a
  state to a scalar value. Exploration noise is a fixed diagonal covariance
  whose variances are kept in `action_var` (not a learnable parameter).

  Args:
    state_dim: size of the state vector fed to both networks.
    action_dim: number of action components.
    action_std_init: initial standard deviation of the action noise.
  """

  def __init__(self, state_dim, action_dim, action_std_init):
    super().__init__()
    self.action_dim = action_dim
    # Creates `self.action_var` filled with action_std_init ** 2 on `device`.
    self.set_action_std(action_std_init)

    hidden = 64
    self.actor = nn.Sequential(
        nn.Linear(state_dim, hidden), nn.Tanh(),
        nn.Linear(hidden, hidden), nn.Tanh(),
        nn.Linear(hidden, action_dim), nn.Tanh(),
    )
    self.critic = nn.Sequential(
        nn.Linear(state_dim, hidden), nn.Tanh(),
        nn.Linear(hidden, hidden), nn.Tanh(),
        nn.Linear(hidden, 1),
    )
    # Actor first, then critic, each re-initialised layer by layer.
    for network in (self.actor, self.critic):
      network.apply(init_orthogonal_weights)

  def set_action_std(self, new_action_std):
    """Replace `action_var` with a vector filled with `new_action_std ** 2`."""
    # torch.full: https://pytorch.org/docs/stable/generated/torch.full.html
    variance = torch.full((self.action_dim,), new_action_std ** 2)
    self.action_var = variance.to(device)

  def forward(self, state):
    """Sample an action for `state`.

    Returns:
      (action, log_prob), both detached from the autograd graph.
    """
    mean = self.actor(state)
    covariance = torch.diag(self.action_var).unsqueeze(0)
    policy_dist = MultivariateNormal(mean, covariance)

    sampled = policy_dist.sample()
    sampled_log_prob = policy_dist.log_prob(sampled)
    return sampled.detach(), sampled_log_prob.detach()

In [None]:
class PPOAgent:
  """PPO agent with a clipped surrogate objective and Monte Carlo returns.

  Keeps two `ActorCritic` networks: `policy`, which is optimised, and
  `policy_old`, which collects rollouts and is synchronised with `policy`
  at the end of every `update()`. Transitions live in `self.memory`.

  Args:
    state_dim: size of the state vector.
    action_dim: number of action components.
    action_std_init: initial standard deviation of the action noise.
    actor_lr: Adam learning rate for the actor parameters.
    critic_lr: Adam learning rate for the critic parameters.
    batch_size: forwarded to `RolloutMemory`; `update()` itself optimises on
      the whole stored rollout at once.
    num_epochs: number of optimisation passes per `update()`.
    gamma: discount factor used by `discounted_rewards()`.
    gae_lambda: stored on the instance; not read by any method of this class.
    policy_clip: clipping range epsilon of the probability ratio.
  """

  def __init__(self, state_dim, action_dim, action_std_init=0.6,
      actor_lr=5e-4, critic_lr=1e-3, batch_size=32, num_epochs=40,
      gamma=0.99, gae_lambda=0.95, policy_clip=0.2):

    self.action_std = action_std_init
    self.gamma = gamma
    self.gae_lambda = gae_lambda
    self.policy_clip = policy_clip
    self.num_epochs = num_epochs

    self.memory = RolloutMemory(batch_size)

    self.policy = ActorCritic(state_dim, action_dim, action_std_init).to(device)
    # One optimiser, two parameter groups so actor and critic get their own learning rate.
    self.optimizer = torch.optim.Adam([
        {'params': self.policy.actor.parameters(), 'lr': actor_lr},
        {'params': self.policy.critic.parameters(), 'lr': critic_lr}
    ])
    # Behaviour policy used for acting; starts as an exact copy of `policy`.
    self.policy_old = ActorCritic(state_dim, action_dim, action_std_init).to(device)
    self.policy_old.load_state_dict(self.policy.state_dict())

    self.mse_loss = nn.MSELoss()

  def set_action_std(self, new_action_std):
    """Set the action noise standard deviation on the agent and both policies."""
    self.action_std = new_action_std
    self.policy.set_action_std(new_action_std)
    self.policy_old.set_action_std(new_action_std)

  def decay_action_std(self, action_std_decay_rate, min_action_std):
    """Subtract `action_std_decay_rate` from the std, clamped at `min_action_std`.

    The new value is rounded to 4 decimals, printed, and pushed to both policies.
    """
    self.action_std -= action_std_decay_rate
    self.action_std = round(self.action_std, 4)
    if self.action_std <= min_action_std:
      self.action_std = min_action_std
      print("setting actor output action_std to min_action_std : ", self.action_std)
    else:
      print("setting actor output action_std to : ", self.action_std)
    self.set_action_std(self.action_std)

  def select_action(self, state):
    """Sample an action from `policy_old` and record it in memory.

    Appends the state tensor, the sampled action and its log-probability to
    `self.memory`; the caller is responsible for appending the matching
    reward and done flag.

    Returns:
      The action as a flattened NumPy array.
    """
    with torch.no_grad():
      state = torch.FloatTensor(state).to(device)
      action, log_prob = self.policy_old(state)

    self.memory.states.append(state)
    self.memory.actions.append(action)
    self.memory.log_probs.append(log_prob)

    return action.detach().cpu().numpy().flatten()

  def discounted_rewards(self):
    """Monte Carlo estimate of returns

    Walks the stored rewards backwards, restarting the running return at
    every step flagged `done`, and returns the resulting per-step returns as
    a float32 tensor normalised to zero mean and unit standard deviation.
    """
    returns = []
    discounted_reward = 0
    for reward, done in zip(reversed(self.memory.rewards), reversed(self.memory.dones)):
      if done:
        discounted_reward = 0
      discounted_reward = reward + (self.gamma * discounted_reward)
      # Append and reverse once at the end: O(n) instead of O(n^2) with insert(0, ...).
      returns.append(discounted_reward)
    returns.reverse()

    rewards = torch.tensor(returns, dtype=torch.float32).to(device)
    rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-8)

    return rewards

  def evaluate(self, state, action):
    """Score `action` under the current `policy` for the given `state` batch.

    Returns:
      (log_probs, state_values, dist_entropy) computed with gradients enabled.
    """
    action_mean = self.policy.actor(state)
    action_var = self.policy.action_var.expand_as(action_mean)
    # torch.expand(): https://pytorch.org/docs/stable/generated/torch.Tensor.expand.html#torch.Tensor.expand

    cov_matrix = torch.diag_embed(action_var).to(device)
    # torch.diag_embed(): https://pytorch.org/docs/stable/generated/torch.diag_embed.html

    dist = MultivariateNormal(action_mean, cov_matrix)

    # With a single action component the squeezed actions lost their last axis; restore it.
    if self.policy.action_dim == 1:
      action = action.reshape(-1, self.policy.action_dim)

    log_probs = dist.log_prob(action)
    dist_entropy = dist.entropy()
    state_values = self.policy.critic(state)

    return log_probs, state_values, dist_entropy

  def update(self):
    """Run `num_epochs` full-batch PPO steps on the stored rollout, then reset.

    Afterwards `policy_old` is synchronised with `policy` and the memory is cleared.
    """
    rewards = self.discounted_rewards()
    old_states = torch.squeeze(torch.stack(self.memory.states, dim=0)).detach().to(device)
    old_actions = torch.squeeze(torch.stack(self.memory.actions, dim=0)).detach().to(device)
    old_log_probs = torch.squeeze(torch.stack(self.memory.log_probs, dim=0)).detach().to(device)

    for _ in range(self.num_epochs):
      log_probs, state_values, dist_entropy = self.evaluate(old_states, old_actions)
      state_values = torch.squeeze(state_values)
      # Probability ratio pi_theta / pi_theta_old, computed in log space.
      ratio = torch.exp(log_probs - old_log_probs.detach())

      advantages = rewards - state_values.detach()
      surr1 = ratio * advantages
      surr2 = torch.clamp(ratio, 1 - self.policy_clip, 1 + self.policy_clip) * advantages

      # Clipped policy loss + 0.5 * value loss - 0.01 * entropy bonus.
      loss = -torch.min(surr1, surr2) + 0.5 * self.mse_loss(state_values, rewards) - 0.01 * dist_entropy

      self.optimizer.zero_grad()
      loss.mean().backward()
      self.optimizer.step()

    self.policy_old.load_state_dict(self.policy.state_dict())
    self.memory.clear_memory()

In [None]:
from collections import deque
from datetime import datetime

def train(agent, num_episodes=2000, max_episode_length=400, update_every=2048):
  """Run the PPO training loop and return the per-episode scores.

  Reads the module-level names `env`, `action_std_decay_freq`,
  `action_std_decay_rate` and `min_action_std`, which must be defined before
  this function is called.

  Args:
    agent: object exposing `select_action`, `update`, `decay_action_std` and
      a `memory` with `rewards` and `dones` lists.
    num_episodes: maximum number of episodes to run.
    max_episode_length: step cap per episode.
    update_every: number of environment steps between calls to `agent.update()`.

  Returns:
    List with the undiscounted score of every episode that was run.

  Training stops early once the mean score over the last 100 episodes
  reaches 200.
  """
  scores = []
  scores_window = deque(maxlen=100)
  time_step = 0

  start_time = datetime.now().replace(microsecond=0)

  for i_episode in range(1, num_episodes + 1):
    state = env.reset()
    score = 0
    for t in range(1, max_episode_length + 1):
      action = agent.select_action(state)
      state, reward, done, _ = env.step(action)

      # state, action, log_prob are stored in agent.select_action()
      agent.memory.rewards.append(reward)
      # An episode cut off by the step cap is also an episode boundary: flag it
      # so the return computation does not carry the next episode's rewards
      # back into this one.
      agent.memory.dones.append(bool(done) or t == max_episode_length)

      score += reward
      time_step += 1

      if time_step % update_every == 0:
        agent.update()

      if time_step % action_std_decay_freq == 0:
        agent.decay_action_std(action_std_decay_rate, min_action_std)

      if done:
        break

    scores_window.append(score)
    scores.append(score)

    print('\rEpisode {}\tScore: {:.2f}\tAverage Score: {:.2f}'.format(i_episode, score, np.mean(scores_window)), end='')
    if i_episode % 100 == 0:
      print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))

    # Only declare success on a full 100-episode window; otherwise a few lucky
    # early episodes would end training and report a negative episode count.
    window_full = len(scores_window) == scores_window.maxlen
    if window_full and np.mean(scores_window) >= 200.0:
      print('\nEnvironment solved in {:d} episodes!\tAverage Score {:.2f}'.format(i_episode-100, np.mean(scores_window)))
      break

  end_time = datetime.now().replace(microsecond=0)
  print("Total training time  : ", end_time - start_time)

  return scores

In [None]:
# --- Environment ---
env = gym.make('LunarLanderContinuous-v2')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

# --- Optimiser and rollout hyperparameters ---
actor_lr, critic_lr = 1.5e-3, 2.5e-3
batch_size, num_epochs = 256, 10

# --- Action-noise schedule (the three decay names are read as globals by train()) ---
action_std_init = 0.6
action_std_decay_rate = 0.05
min_action_std = 0.1
action_std_decay_freq = 50_000 # timesteps
#action_std_decay_freq = 150 # episodes

agent = PPOAgent(
    state_dim=state_dim,
    action_dim=action_dim,
    action_std_init=action_std_init,
    actor_lr=actor_lr,
    critic_lr=critic_lr,
    batch_size=batch_size,
    num_epochs=num_epochs,
)

ppo_scores = train(agent, num_episodes=3000)

Episode 100	Average Score: -314.58
Episode 200	Average Score: -174.40
Episode 300	Average Score: -65.13
Episode 400	Average Score: -30.08
setting actor output action_std to :  0.55
Episode 500	Average Score: 56.95
Episode 547	Score: 78.82	Average Score: 67.06setting actor output action_std to :  0.5
Episode 600	Average Score: 91.82
Episode 675	Score: 13.16	Average Score: 114.27setting actor output action_std to :  0.45
Episode 700	Average Score: 112.42
Episode 800	Average Score: 124.14
Episode 802	Score: 123.16	Average Score: 124.67setting actor output action_std to :  0.4
Episode 900	Average Score: 140.52
Episode 936	Score: 26.97	Average Score: 115.37setting actor output action_std to :  0.35
Episode 1000	Average Score: 106.22
Episode 1079	Score: 6.13	Average Score: 183.61setting actor output action_std to :  0.3
Episode 1097	Score: 187.29	Average Score: 200.03
Environment solved in 997 episodes!	Average Score 200.03
Total training time  :  0:09:44
