This repository has been archived by the owner on May 6, 2021. It is now read-only.

PG policy #87

Merged · 27 commits · Sep 25, 2020
5 changes: 4 additions & 1 deletion README.md
@@ -15,7 +15,7 @@
This project aims to provide some implementations of the most typical reinforcement learning algorithms.

# Algorithms Implemented

- VPG (Vanilla Policy Gradient, with a baseline)
- DQN
- Prioritized DQN
- Rainbow
@@ -43,6 +43,9 @@ Some built-in experiments are exported to help new users easily run benchmarks.
- ``E`JuliaRL_A2C_CartPole` ``
- ``E`JuliaRL_A2CGAE_CartPole` `` (Thanks to [@sriram13m](https://github.com/sriram13m))
- ``E`JuliaRL_PPO_CartPole` ``
- ``E`JuliaRL_VPG_CartPole` ``
- ``E`JuliaRL_VPG_Pendulum` `` (continuous action space)
- ``E`JuliaRL_VPG_PendulumD` `` (discrete action space)
- ``E`JuliaRL_DDPG_Pendulum` ``
- ``E`JuliaRL_SAC_Pendulum` `` (Thanks to [@rbange](https://github.com/rbange))
- ``E`JuliaRL_BasicDQN_MountainCar` `` (Thanks to [@felixchalumeau](https://github.com/felixchalumeau))
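The newly exported experiments above can be run like any of the existing ones. A minimal sketch, assuming `ReinforcementLearningZoo` and its `E` experiment macro are available as in the rest of the README:

```julia
using ReinforcementLearningZoo

# Run the newly exported VPG experiments (names as listed above).
run(E`JuliaRL_VPG_CartPole`)   # discrete action space
run(E`JuliaRL_VPG_Pendulum`)   # continuous action space
```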
1 change: 1 addition & 0 deletions src/algorithms/policy_gradient/policy_gradient.jl
@@ -3,3 +3,4 @@ include("ppo.jl")
include("A2CGAE.jl")
include("ddpg.jl")
include("sac.jl")
include("vpg.jl")
140 changes: 140 additions & 0 deletions src/algorithms/policy_gradient/vpg.jl
@@ -0,0 +1,140 @@
using Flux
using Flux: normalise
using Random
using Random: shuffle
# Extra imports so this file also loads on its own: `mean`, `logpdf`, and `ignore`
# are otherwise expected to come from the surrounding package module.
using Statistics: mean
using Distributions: logpdf
using Zygote: ignore

using ReinforcementLearningBase
using ReinforcementLearningCore

export VPGPolicy, GaussianNetwork

struct GaussianNetwork
    pre::Chain
    μ::Chain
    σ::Chain
end
Flux.@functor GaussianNetwork
function (m::GaussianNetwork)(S)
    x = m.pre(S)
    m.μ(x), m.σ(x) .|> exp # the σ head outputs log σ; exponentiate to keep it positive
end
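As a quick illustration of the forward pass above (a sketch with a hypothetical toy network; only `Flux` is assumed):

```julia
using Flux

# Toy network: 4-dimensional state, 1-dimensional action.
net = GaussianNetwork(
    Chain(Dense(4, 16, relu)), # shared trunk
    Chain(Dense(16, 1)),       # μ head
    Chain(Dense(16, 1)),       # log σ head; the forward pass exponentiates it
)

s = rand(Float32, 4, 32) # batch of 32 states
μ, σ = net(s)            # both are 1×32; σ is strictly positive
```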

"""
Vanilla Policy Gradient

VPGPolicy(;kwargs)

# Keyword arguments
- `approximator`,
- `baseline`,
- `dist`, distribution function of the action
- `γ`, discount factor
- `α_θ`, step size of policy parameter
- `α_w`, step size of baseline parameter
- `batch_size`,
- `rng`,
- `loss`,
- `baseline_loss`,


if the action space is continuous,
then the env should transform the action value, (such as using tanh),
in order to make sure low ≤ value ≤ high
"""
Base.@kwdef mutable struct VPGPolicy{
    A<:NeuralNetworkApproximator,
    B<:Union{NeuralNetworkApproximator,Nothing},
    S<:AbstractSpace,
    R<:AbstractRNG,
} <: AbstractPolicy
    approximator::A
    baseline::B = nothing
    action_space::S
    dist::Any
    γ::Float32 = 0.99f0 # discount factor
    α_θ = 1.0f0 # step size of policy
    α_w = 1.0f0 # step size of baseline
    batch_size::Int = 1024
    rng::R = Random.GLOBAL_RNG
    loss::Float32 = 0.0f0
    baseline_loss::Float32 = 0.0f0
end
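A rough construction sketch for a discrete CartPole-style task (hypothetical network sizes; the `NeuralNetworkApproximator` and `DiscreteSpace` constructors are assumed from ReinforcementLearningCore/Base as used elsewhere in this package):

```julia
using Flux, Distributions

policy = VPGPolicy(
    approximator = NeuralNetworkApproximator(
        model = Chain(Dense(4, 128, relu), Dense(128, 2)),
        optimizer = ADAM(),
    ),
    baseline = NeuralNetworkApproximator(
        model = Chain(Dense(4, 128, relu), Dense(128, 1)),
        optimizer = ADAM(),
    ),
    action_space = DiscreteSpace(2),
    dist = Categorical, # the discrete branch below does logits |> softmax |> dist
    batch_size = 512,
)
```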

"""
About continuous action space, see
* [Diagonal Gaussian Policies](https://spinningup.openai.com/en/latest/spinningup/rl_intro.html#stochastic-policies
* [Clipped Action Policy Gradient](https://arxiv.org/pdf/1802.07564.pdf)
"""

function (π::VPGPolicy)(env::AbstractEnv)
    to_dev(x) = send_to_device(device(π.approximator), x)

    logits = env |> get_state |> to_dev |> π.approximator

    if π.action_space isa DiscreteSpace
        dist = logits |> softmax |> π.dist
        action = π.action_space[rand(π.rng, dist)]
    elseif π.action_space isa ContinuousSpace
        dist = π.dist.(logits...)
        action = rand.(π.rng, dist)[1]
    else
        error("not implemented")
    end
    action
end

function (π::VPGPolicy)(env::MultiThreadEnv)
    error("not implemented")
    # TODO: can PG support multiple environments? The policy is only updated at the end of an episode.
end

function RLBase.update!(π::VPGPolicy, traj::ElasticCompactSARTSATrajectory)
    (length(traj[:terminal]) > 0 && traj[:terminal][end]) || return

    model = π.approximator
    to_dev(x) = send_to_device(device(model), x)

    states = traj[:state]
    actions = traj[:action] |> Array # convert the ElasticArray to an Array, or `log_prob[CartesianIndex.(A, 1:length(A))]` fails on the GPU
    gains = traj[:reward] |> x -> discount_rewards(x, π.γ)

    for idx in Iterators.partition(shuffle(1:length(traj[:terminal])), π.batch_size)
        S = select_last_dim(states, idx) |> to_dev
        A = actions[idx]
        G = gains[idx] |> x -> Flux.unsqueeze(x, 1) |> to_dev
        # `gains` is a 1-column array, but the output of the Flux model is a
        # 1-row, n_batch-column array, so unsqueeze it.

        if π.baseline isa NeuralNetworkApproximator
            gs = gradient(Flux.params(π.baseline)) do
                δ = G - π.baseline(S) # advantage estimate; also used in the policy gradient below
                loss = mean(δ .^ 2) * π.α_w # mse
                ignore() do
                    π.baseline_loss = loss
                end
                loss
            end
            update!(π.baseline, gs)
        elseif π.baseline isa Nothing
            # Normalization. See
            # (http://rail.eecs.berkeley.edu/deeprlcourse-fa17/f17docs/hw2_final.pdf)
            # (https://web.stanford.edu/class/cs234/assignment3/solution.pdf)
            # `normalise` should not be used together with a baseline, or the policy loss becomes too small.
            δ = G |> x -> normalise(x; dims = 2)
        end

        gs = gradient(Flux.params(model)) do
            if π.action_space isa DiscreteSpace
                log_prob = S |> model |> logsoftmax
                log_probₐ = log_prob[CartesianIndex.(A, 1:length(A))]
            elseif π.action_space isa ContinuousSpace
                dist = π.dist.(model(S)...) # TODO: this part does not work on the GPU. See: https://github.com/JuliaStats/Distributions.jl/issues/1183
                log_probₐ = logpdf.(dist, A)
            end
            loss = -mean(log_probₐ .* δ) * π.α_θ
            ignore() do
                π.loss = loss
            end
            loss
        end
        update!(model, gs)
    end
    empty!(traj)
end
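`RLBase.update!` above relies on `discount_rewards` from the surrounding package. For readers following along, here is a minimal reference sketch of such a discounted-return computation (a hypothetical helper, not the package's implementation):

```julia
# Hypothetical reference: Gₜ = rₜ + γ * Gₜ₊₁, computed backwards over one episode.
function discounted_returns(rewards::AbstractVector{T}, γ) where {T}
    G = similar(rewards)
    running = zero(T)
    for t in length(rewards):-1:1
        running = rewards[t] + γ * running
        G[t] = running
    end
    G
end

discounted_returns(Float32[1, 1, 1], 0.9f0) # ≈ [2.71, 1.9, 1.0]
```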