Soft Actor Critic (SAC) Model #627

Merged (43 commits, Sep 8, 2021)
Commits
8f1bf23
finish soft actor critic
blahBlahhhJ Apr 28, 2021
8c2145f
added tests
blahBlahhhJ Apr 29, 2021
0c872a1
finish document and init
blahBlahhhJ May 1, 2021
742943e
fix style 1
blahBlahhhJ May 1, 2021
700cdbb
fix style 2
blahBlahhhJ May 1, 2021
08ce087
fix style 3
blahBlahhhJ May 7, 2021
26ccf1c
Merge branch 'master' into feature/596-sac
Borda Jun 24, 2021
a544901
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 24, 2021
71e0dec
formt
Borda Jun 24, 2021
d4abe63
Merge branch 'feature/596-sac' of https://github.com/blahBlahhhJ/ligh…
Borda Jun 24, 2021
557ea57
Apply suggestions from code review
Borda Jun 24, 2021
ad47e34
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 24, 2021
8c44f3e
Merge branch 'master' into feature/596-sac
mergify[bot] Jun 25, 2021
c26a88b
Merge branch 'master' into feature/596-sac
Borda Jul 4, 2021
d0e60d3
Merge branch 'master' into feature/596-sac
mergify[bot] Jul 4, 2021
3254dbd
Merge branch 'master' into feature/596-sac
mergify[bot] Jul 4, 2021
d81e8e0
use hyperparameters in hparams
blahBlahhhJ Jul 7, 2021
1a8e73f
Merge branch 'feature/596-sac' of https://github.com/blahBlahhhJ/ligh…
blahBlahhhJ Jul 7, 2021
d101d50
Add CHANGELOG
blahBlahhhJ Jul 7, 2021
c52ea1a
Merge branch 'master' into feature/596-sac
mergify[bot] Jul 7, 2021
48800c9
Merge branch 'master' into feature/596-sac
mergify[bot] Jul 13, 2021
47bb401
Merge branch 'master' into feature/596-sac
mergify[bot] Jul 13, 2021
43daba3
fix test
blahBlahhhJ Jul 20, 2021
bfc7028
Merge branch 'master' into feature/596-sac
mergify[bot] Jul 26, 2021
fd0964b
Merge branch 'master' into feature/596-sac
mergify[bot] Jul 28, 2021
2576333
fix format
blahBlahhhJ Aug 1, 2021
a1ec703
Merge branch 'feature/596-sac' of https://github.com/blahBlahhhJ/ligh…
blahBlahhhJ Aug 1, 2021
4723212
Merge branch 'master' into feature/596-sac
mergify[bot] Aug 9, 2021
05b1084
Merge branch 'master' into feature/596-sac
mergify[bot] Aug 13, 2021
c1660af
Merge branch 'master' into feature/596-sac
mergify[bot] Aug 13, 2021
b207d3c
Merge branch 'master' into feature/596-sac
blahBlahhhJ Aug 13, 2021
73a13d1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 13, 2021
be19c64
fix __init__
blahBlahhhJ Aug 13, 2021
25aa7e0
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 13, 2021
c6104c0
Merge branch 'master' into feature/596-sac
mergify[bot] Aug 19, 2021
4486569
Merge branch 'master' into feature/596-sac
mergify[bot] Aug 27, 2021
427d5ab
Merge branch 'master' into feature/596-sac
mergify[bot] Aug 27, 2021
cbcc5c0
Merge branch 'master' into feature/596-sac
mergify[bot] Aug 29, 2021
41d7365
Merge branch 'master' into feature/596-sac
mergify[bot] Aug 29, 2021
cccd10d
Merge branch 'master' into feature/596-sac
Sep 7, 2021
bfbae6b
Fix tests
Sep 8, 2021
c0d16fd
Fix reference
Sep 8, 2021
7a0e944
Fix duplication
Sep 8, 2021
86 changes: 86 additions & 0 deletions docs/source/reinforce_learn.rst
@@ -666,3 +666,89 @@ Example::

.. autoclass:: pl_bolts.models.rl.vanilla_policy_gradient_model.VanillaPolicyGradient
    :noindex:

--------------

Actor-Critic Models
-------------------
The following models are based on Actor-Critic. Actor-Critic combines the approaches of value-based learning (the DQN family)
and policy-based learning (the PG family) by learning the value function as well as the policy distribution. This approach
updates the policy network according to the policy gradient, and updates the value network to fit the discounted rewards
(a minimal sketch follows the key points below).

Actor-Critic key points:

- The actor outputs a distribution over actions for controlling the agent
- The critic outputs the value of the current state, which guides the policy update
- Adding a critic lets the model train on n-step rollouts instead of generating an entire trajectory
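
A minimal, illustrative sketch of this update (the ``actor``, ``critic``, and discounted ``returns`` below are assumptions, not part of Bolts)::

    import torch
    from torch import nn

    # Illustrative actor (action logits) and critic (state value) for a 4-dimensional observation space.
    actor = nn.Sequential(nn.Linear(4, 64), nn.ReLU(), nn.Linear(64, 2))
    critic = nn.Sequential(nn.Linear(4, 64), nn.ReLU(), nn.Linear(64, 1))

    def actor_critic_losses(states, actions, returns):
        """Policy-gradient and value losses for one batch of n-step transitions."""
        log_probs = torch.distributions.Categorical(logits=actor(states)).log_prob(actions)
        values = critic(states).squeeze(-1)
        advantage = returns - values.detach()           # the critic's value estimate scores each action
        actor_loss = -(log_probs * advantage).mean()    # policy gradient
        critic_loss = (returns - values).pow(2).mean()  # fit the discounted returns
        return actor_loss, critic_loss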

Soft Actor Critic (SAC)
^^^^^^^^^^^^^^^^^^^^^^^

Soft Actor Critic model introduced in `Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor <https://arxiv.org/abs/1801.01290>`_
Paper authors: Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, Sergey Levine

Original implementation by: `Jason Wang <https://github.com/blahBlahhhJ>`_

Soft Actor Critic (SAC) is a powerful actor-critic algorithm in reinforcement learning. Unlike A2C, SAC's policy outputs a
special continuous distribution for actions, and its critic estimates the Q value instead of the state value, which
means it takes in not only states but also actions. The new actor allows SAC to support continuous-action tasks such
as controlling robots, and the new critic allows SAC to support off-policy learning, which is more sample-efficient.

The actor's new objective is to maximize entropy, encouraging exploration, while still maximizing the expected reward.
The critic uses two separate Q functions to "mitigate positive bias" during training by taking the minimum of the
two as the predicted Q value.
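
A minimal sketch of these two ideas, a tanh-squashed Gaussian actor and a twin-Q critic over the concatenated
``(state, action)`` pair, using illustrative names (``trunk``, ``mu_head``, ``q1``, ``q2`` are assumptions, not the Bolts API)::

    import torch
    from torch import nn

    obs_dim, act_dim, hidden = 3, 1, 128

    # Actor: state -> mean and log-std of a Gaussian, squashed into [-1, 1] by tanh.
    trunk = nn.Sequential(nn.Linear(obs_dim, hidden), nn.ReLU())
    mu_head, log_std_head = nn.Linear(hidden, act_dim), nn.Linear(hidden, act_dim)

    def sample_action(state):
        h = trunk(state)
        mu, log_std = mu_head(h), torch.clamp(log_std_head(h), -20, 2)
        z = mu + log_std.exp() * torch.randn_like(mu)   # reparameterized Gaussian sample
        return torch.tanh(z)                            # bounded continuous action

    # Critic: twin Q networks over the (state, action) pair; take their minimum.
    q1 = nn.Sequential(nn.Linear(obs_dim + act_dim, hidden), nn.ReLU(), nn.Linear(hidden, 1))
    q2 = nn.Sequential(nn.Linear(obs_dim + act_dim, hidden), nn.ReLU(), nn.Linear(hidden, 1))

    def q_min(state, action):
        sa = torch.cat([state, action], dim=-1)
        return torch.min(q1(sa), q2(sa))                # "mitigate positive bias"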

Since SAC is off-policy, its training step is quite similar to DQN's (a condensed sketch of steps 3-7 follows this list):

1. Initialize one policy network, two Q networks, and two corresponding target Q networks.
2. Run one environment step using an action sampled from the policy and store the transition in the replay buffer.

.. math::
    a \sim \tanh(N(\mu_\pi(s), \sigma_\pi(s)))

3. Sample a batch of transitions (states, actions, rewards, dones, next states) from the replay buffer.

.. math::
    s, a, r, d, s' \sim B

4. Compute the actor loss and update the policy network.

.. math::
    J_\pi = \frac{1}{n}\sum_i\big(\log\pi(\pi(a | s_i) | s_i) - Q_{\min}(s_i, \pi(a | s_i))\big)

5. Compute the Q target.

.. math::
    target_i = r_i + (1 - d_i) \gamma \big(\min_{j=1,2} Q_{target,j}(s'_i, \pi(a' | s'_i)) - \log\pi(\pi(a' | s'_i) | s'_i)\big)

6. Compute the critic loss and update both Q networks.

.. math::
    J_{Q_j} = \frac{1}{n} \sum_i\big(Q_j(s_i, a_i) - target_i\big)^2, \quad j = 1, 2

7. Soft update each target Q network using a weighted sum of itself and its corresponding Q network.

.. math::
    Q_{target,j} := \tau Q_{target,j} + (1-\tau) Q_j
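
Putting steps 3-7 together, a condensed sketch of one training step (here ``policy`` is assumed to return a distribution
with ``rsample_and_log_prob``, like the ``TanhMultivariateNormal`` used by Bolts' ``ContinuousMLP``; the Q networks,
optimizers, and ``batch`` layout are illustrative assumptions)::

    import torch
    import torch.nn.functional as F

    gamma, tau = 0.99, 0.995  # discount factor; tau close to 1 keeps the target networks slowly moving

    def q_val(q_net, states, actions):
        """Evaluate a Q network that takes the concatenated (state, action) pair."""
        return q_net(torch.cat([states, actions], dim=-1)).squeeze(-1)

    def sac_training_step(policy, q1, q2, q1_target, q2_target, batch, policy_opt, q1_opt, q2_opt):
        # Step 3: a sampled replay batch; dones is a 0/1 float tensor.
        states, actions, rewards, dones, next_states = batch

        # Step 4: actor loss (log-probability of fresh actions minus the minimum Q value).
        new_actions, log_probs = policy(states).rsample_and_log_prob()
        q_min = torch.min(q_val(q1, states, new_actions), q_val(q2, states, new_actions))
        actor_loss = (log_probs - q_min).mean()
        policy_opt.zero_grad()
        actor_loss.backward()
        policy_opt.step()

        # Step 5: Q target from the target networks and fresh next actions (no gradient).
        with torch.no_grad():
            next_actions, next_log_probs = policy(next_states).rsample_and_log_prob()
            q_next = torch.min(q_val(q1_target, next_states, next_actions),
                               q_val(q2_target, next_states, next_actions))
            target = rewards + (1 - dones) * gamma * (q_next - next_log_probs)

        # Step 6: regress each Q network onto the shared target.
        for q_net, opt in ((q1, q1_opt), (q2, q2_opt)):
            q_loss = F.mse_loss(q_val(q_net, states, actions), target)
            opt.zero_grad()
            q_loss.backward()
            opt.step()

        # Step 7: soft update, Q_target := tau * Q_target + (1 - tau) * Q.
        for q_net, q_tgt in ((q1, q1_target), (q2, q2_target)):
            for p, tp in zip(q_net.parameters(), q_tgt.parameters()):
                tp.data.copy_(tau * tp.data + (1 - tau) * p.data)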

SAC Benefits
~~~~~~~~~~~~~~~~~~~

- More sample efficient due to off-policy training

- Supports continuous action space

SAC Results
~~~~~~~~~~~~~~~~

.. image:: _images/rl_benchmark/pendulum_sac_results.jpg
    :width: 300
    :alt: SAC Results

Example::

    from pl_bolts.models.rl import SAC
    from pytorch_lightning import Trainer

    sac = SAC("Pendulum-v0")
    trainer = Trainer()
    trainer.fit(sac)

.. autoclass:: pl_bolts.models.rl.SAC
    :noindex:
2 changes: 2 additions & 0 deletions pl_bolts/models/rl/__init__.py
@@ -4,6 +4,7 @@
from pl_bolts.models.rl.noisy_dqn_model import NoisyDQN # noqa: F401
from pl_bolts.models.rl.per_dqn_model import PERDQN # noqa: F401
from pl_bolts.models.rl.reinforce_model import Reinforce # noqa: F401
from pl_bolts.models.rl.sac_model import SAC # noqa: F401
from pl_bolts.models.rl.vanilla_policy_gradient_model import VanillaPolicyGradient # noqa: F401

__all__ = [
@@ -13,5 +14,6 @@
"NoisyDQN",
"PERDQN",
"Reinforce",
"SAC"
"VanillaPolicyGradient",
]
47 changes: 47 additions & 0 deletions pl_bolts/models/rl/common/agents.py
@@ -138,3 +138,50 @@ def __call__(self, states: Tensor, device: str) -> List[int]:
        actions = [np.random.choice(len(prob), p=prob) for prob in prob_np]

        return actions


class SoftActorCriticAgent(Agent):
    """Actor-Critic based agent that returns a continuous action based on the policy"""

    def __call__(self, states: Tensor, device: str) -> List[float]:
        """
        Takes in the current state and returns the action based on the agent's policy

        Args:
            states: current state of the environment
            device: the device used for the current batch

        Returns:
            action defined by policy
        """
        if not isinstance(states, list):
            states = [states]

        if not isinstance(states, Tensor):
            states = torch.tensor(states, device=device)

        dist = self.net(states)
        actions = [a for a in dist.sample().cpu().numpy()]

        return actions

    def get_action(self, states: Tensor, device: str) -> List[float]:
        """
        Get the action greedily (without sampling)

        Args:
            states: current state of the environment
            device: the device used for the current batch

        Returns:
            action defined by policy
        """
        if not isinstance(states, list):
            states = [states]

        if not isinstance(states, Tensor):
            states = torch.tensor(states, device=device)

        actions = [self.net.get_action(states).cpu().numpy()]

        return actions
67 changes: 67 additions & 0 deletions pl_bolts/models/rl/common/distributions.py
@@ -0,0 +1,67 @@
"""
Distributions used in some continuous RL algorithms
"""
import torch


class TanhMultivariateNormal(torch.distributions.MultivariateNormal):
"""
The distribution of X is an affine of tanh applied on a normal distribution
X = action_scale * tanh(Z) + action_bias
Z ~ Normal(mean, variance)
"""

def __init__(self, action_bias, action_scale, **kwargs):
super().__init__(**kwargs)

self.action_bias = action_bias
self.action_scale = action_scale

def rsample_with_z(self, sample_shape=torch.Size()):
"""
Samples X using reparametrization trick with the intermediate variable Z

Returns:
Sampled X and Z
"""
z = super().rsample()
return self.action_scale * torch.tanh(z) + self.action_bias, z

def log_prob_with_z(self, value, z):
"""
Computes the log probability of a sampled X

Refer to the original paper of SAC for more details in equation (20), (21)

Args:
value: the value of X
z: the value of Z
Returns:
Log probability of the sample
"""
value = (value - self.action_bias) / self.action_scale
z_logprob = super().log_prob(z)
correction = torch.log(self.action_scale * (1 - value**2) + 1e-7).sum(1)
return z_logprob - correction

def rsample_and_log_prob(self, sample_shape=torch.Size()):
"""
Samples X and computes the log probability of the sample

Returns:
Sampled X and log probability
"""
z = super().rsample()
z_logprob = super().log_prob(z)
value = torch.tanh(z)
correction = torch.log(self.action_scale * (1 - value**2) + 1e-7).sum(1)
return self.action_scale * value + self.action_bias, z_logprob - correction

def rsample(self, sample_shape=torch.Size()):
fz, z = self.rsample_with_z(sample_shape)
return fz

def log_prob(self, value):
value = (value - self.action_bias) / self.action_scale
z = torch.log(1 + value) / 2 - torch.log(1 - value) / 2
return self.log_prob_with_z(value, z)
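
For illustration, a minimal usage sketch of the distribution above (the batch size, dimensions, and bias/scale values are assumptions, not taken from the Bolts tests):

import torch
from pl_bolts.models.rl.common.distributions import TanhMultivariateNormal

# A batch of 4 two-dimensional Gaussians, squashed into [-1, 1] by tanh (bias 0, scale 1).
mean = torch.zeros(4, 2)
scale_tril = torch.diag_embed(torch.ones(4, 2))
dist = TanhMultivariateNormal(action_bias=0.0, action_scale=1.0, loc=mean, scale_tril=scale_tril)

actions, log_probs = dist.rsample_and_log_prob()  # bounded actions plus tanh-corrected log-probabilities
print(actions.shape, log_probs.shape)             # torch.Size([4, 2]) torch.Size([4])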
66 changes: 65 additions & 1 deletion pl_bolts/models/rl/common/networks.py
@@ -6,9 +6,11 @@

import numpy as np
import torch
from torch import nn, Tensor
from torch import FloatTensor, nn, Tensor
from torch.nn import functional as F

from pl_bolts.models.rl.common.distributions import TanhMultivariateNormal


class CNN(nn.Module):
"""
@@ -92,6 +94,68 @@ def forward(self, input_x):
        return self.net(input_x.float())


class ContinuousMLP(nn.Module):
    """
    MLP network that outputs a continuous value via a Gaussian distribution
    """

    def __init__(
        self,
        input_shape: Tuple[int],
        n_actions: int,
        hidden_size: int = 128,
        action_bias: int = 0,
        action_scale: int = 1
    ):
        """
        Args:
            input_shape: observation shape of the environment
            n_actions: dimension of actions in the environment
            hidden_size: size of hidden layers
            action_bias: the center of the action space
            action_scale: the scale of the action space
        """
        super().__init__()
        self.action_bias = action_bias
        self.action_scale = action_scale

        self.shared_net = nn.Sequential(
            nn.Linear(input_shape[0], hidden_size), nn.ReLU(), nn.Linear(hidden_size, hidden_size), nn.ReLU()
        )
        self.mean_layer = nn.Linear(hidden_size, n_actions)
        self.logstd_layer = nn.Linear(hidden_size, n_actions)

    def forward(self, x: FloatTensor) -> TanhMultivariateNormal:
        """
        Forward pass through the network. Calculates the action distribution

        Args:
            x: input to network
        Returns:
            action distribution
        """
        x = self.shared_net(x.float())
        batch_mean = self.mean_layer(x)
        logstd = torch.clamp(self.logstd_layer(x), -20, 2)  # clamp log-std for numerical stability
        batch_scale_tril = torch.diag_embed(torch.exp(logstd))
        return TanhMultivariateNormal(
            action_bias=self.action_bias, action_scale=self.action_scale, loc=batch_mean, scale_tril=batch_scale_tril
        )

    def get_action(self, x: FloatTensor) -> Tensor:
        """
        Get the action greedily (without sampling)

        Args:
            x: input to network
        Returns:
            mean action
        """
        x = self.shared_net(x.float())
        batch_mean = self.mean_layer(x)
        return self.action_scale * torch.tanh(batch_mean) + self.action_bias


class DuelingMLP(nn.Module):
"""
MLP network with duel heads for val and advantage