This repository has been archived by the owner on May 6, 2021. It is now read-only.

PG policy #87

Merged · 27 commits · Sep 25, 2020
5 changes: 4 additions & 1 deletion README.md
@@ -15,7 +15,7 @@
This project aims to provide some implementations of the most typical reinforcement learning algorithms.

# Algorithms Implemented

- VPG (Vanilla Policy Gradient, with a baseline)
- DQN
- Prioritized DQN
- Rainbow
@@ -43,6 +43,9 @@ Some built-in experiments are exported to help new users easily run benchmarks.
- ``E`JuliaRL_A2C_CartPole` ``
- ``E`JuliaRL_A2CGAE_CartPole` `` (Thanks to [@sriram13m](https://github.com/sriram13m))
- ``E`JuliaRL_PPO_CartPole` ``
- ``E`JuliaRL_VPG_CartPole` ``
- ``E`JuliaRL_VPG_Pendulum` `` (continuous action space)
- ``E`JuliaRL_VPG_PendulumD` `` (discrete action space)
- ``E`JuliaRL_DDPG_Pendulum` ``
- ``E`JuliaRL_SAC_Pendulum` `` (Thanks to [@rbange](https://github.com/rbange))
- ``E`JuliaRL_BasicDQN_MountainCar` `` (Thanks to [@felixchalumeau](https://github.com/felixchalumeau))
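The newly exported experiments above can be run like any of the existing ones. A minimal sketch, assuming `ReinforcementLearningZoo` and its `E` experiment macro are available as in the rest of the README:

```julia
using ReinforcementLearningZoo

# Run the newly exported VPG experiments (names as listed above).
run(E`JuliaRL_VPG_CartPole`)   # discrete action space
run(E`JuliaRL_VPG_Pendulum`)   # continuous action space
```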
1 change: 1 addition & 0 deletions src/algorithms/policy_gradient/policy_gradient.jl
@@ -3,3 +3,4 @@ include("ppo.jl")
include("A2CGAE.jl")
include("ddpg.jl")
include("sac.jl")
include("vpg.jl")
140 changes: 140 additions & 0 deletions src/algorithms/policy_gradient/vpg.jl
@@ -0,0 +1,140 @@
using Flux
using Flux: normalise
using Random
using Random: shuffle
# Extra imports so this file also loads on its own: `mean`, `logpdf`, and `ignore`
# are otherwise expected to come from the surrounding package module.
using Statistics: mean
using Distributions: logpdf
using Zygote: ignore

using ReinforcementLearningBase
using ReinforcementLearningCore

export VPGPolicy, GaussianNetwork

struct GaussianNetwork
    pre::Chain
    μ::Chain
    σ::Chain
end
Flux.@functor GaussianNetwork
function (m::GaussianNetwork)(S)
    x = m.pre(S)
    m.μ(x), m.σ(x) .|> exp # the σ head outputs log σ; exponentiate to keep it positive
end
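As a quick illustration of the forward pass above (a sketch with a hypothetical toy network; only `Flux` is assumed):

```julia
using Flux

# Toy network: 4-dimensional state, 1-dimensional action.
net = GaussianNetwork(
    Chain(Dense(4, 16, relu)), # shared trunk
    Chain(Dense(16, 1)),       # μ head
    Chain(Dense(16, 1)),       # log σ head; the forward pass exponentiates it
)

s = rand(Float32, 4, 32) # batch of 32 states
μ, σ = net(s)            # both are 1×32; σ is strictly positive
```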

"""
Vanilla Policy Gradient

VPGPolicy(;kwargs)

# Keyword arguments
- `approximator`,
- `baseline`,
- `dist`, distribution function of the action
- `γ`, discount factor
- `α_θ`, step size of policy parameter
- `α_w`, step size of baseline parameter
- `batch_size`,
- `rng`,
- `loss`,
- `baseline_loss`,


if the action space is continuous,
then the env should transform the action value, (such as using tanh),
in order to make sure low ≤ value ≤ high
"""
Base.@kwdef mutable struct VPGPolicy{
    A<:NeuralNetworkApproximator,
    B<:Union{NeuralNetworkApproximator,Nothing},
    S<:AbstractSpace,
    R<:AbstractRNG,
} <: AbstractPolicy
    approximator::A
    baseline::B = nothing
    action_space::S
    dist::Any
    γ::Float32 = 0.99f0 # discount factor
    α_θ = 1.0f0 # step size of policy
    α_w = 1.0f0 # step size of baseline
    batch_size::Int = 1024
    rng::R = Random.GLOBAL_RNG
    loss::Float32 = 0.0f0
    baseline_loss::Float32 = 0.0f0
end
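A rough construction sketch for a discrete CartPole-style task (hypothetical network sizes; the `NeuralNetworkApproximator` and `DiscreteSpace` constructors are assumed from ReinforcementLearningCore/Base as used elsewhere in this package):

```julia
using Flux, Distributions

policy = VPGPolicy(
    approximator = NeuralNetworkApproximator(
        model = Chain(Dense(4, 128, relu), Dense(128, 2)),
        optimizer = ADAM(),
    ),
    baseline = NeuralNetworkApproximator(
        model = Chain(Dense(4, 128, relu), Dense(128, 1)),
        optimizer = ADAM(),
    ),
    action_space = DiscreteSpace(2),
    dist = Categorical, # the discrete branch below does logits |> softmax |> dist
    batch_size = 512,
)
```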

"""
About continuous action space, see
* [Diagonal Gaussian Policies](https://spinningup.openai.com/en/latest/spinningup/rl_intro.html#stochastic-policies
* [Clipped Action Policy Gradient](https://arxiv.org/pdf/1802.07564.pdf)
"""

function (π::VPGPolicy)(env::AbstractEnv)
    to_dev(x) = send_to_device(device(π.approximator), x)

    logits = env |> get_state |> to_dev |> π.approximator

    if π.action_space isa DiscreteSpace
        dist = logits |> softmax |> π.dist
        action = π.action_space[rand(π.rng, dist)]
    elseif π.action_space isa ContinuousSpace
        dist = π.dist.(logits...)
        action = rand.(π.rng, dist)[1]
    else
        error("not implemented")
    end
    action
end

function (π::VPGPolicy)(env::MultiThreadEnv)
    error("not implemented")
    # TODO: can PG support multiple environments? The policy is only updated at the end of an episode.
end

function RLBase.update!(π::VPGPolicy, traj::ElasticCompactSARTSATrajectory)
    (length(traj[:terminal]) > 0 && traj[:terminal][end]) || return

    model = π.approximator
    to_dev(x) = send_to_device(device(model), x)

    states = traj[:state]
    actions = traj[:action] |> Array # convert the ElasticArray to an Array, or `log_prob[CartesianIndex.(A, 1:length(A))]` fails on the GPU
    gains = traj[:reward] |> x -> discount_rewards(x, π.γ)

    for idx in Iterators.partition(shuffle(1:length(traj[:terminal])), π.batch_size)
        S = select_last_dim(states, idx) |> to_dev
        A = actions[idx]
        G = gains[idx] |> x -> Flux.unsqueeze(x, 1) |> to_dev
        # `gains` is a 1-column array, but the output of the Flux model is a
        # 1-row, n_batch-column array, so unsqueeze it.

        if π.baseline isa NeuralNetworkApproximator
            gs = gradient(Flux.params(π.baseline)) do
                δ = G - π.baseline(S) # advantage estimate; also used in the policy gradient below
                loss = mean(δ .^ 2) * π.α_w # mse
                ignore() do
                    π.baseline_loss = loss
                end
                loss
            end
            update!(π.baseline, gs)
        elseif π.baseline isa Nothing
            # Normalization. See
            # (http://rail.eecs.berkeley.edu/deeprlcourse-fa17/f17docs/hw2_final.pdf)
            # (https://web.stanford.edu/class/cs234/assignment3/solution.pdf)
            # `normalise` should not be used together with a baseline, or the policy loss becomes too small.
            δ = G |> x -> normalise(x; dims = 2)
        end

        gs = gradient(Flux.params(model)) do
            if π.action_space isa DiscreteSpace
                log_prob = S |> model |> logsoftmax
                log_probₐ = log_prob[CartesianIndex.(A, 1:length(A))]
            elseif π.action_space isa ContinuousSpace
                dist = π.dist.(model(S)...) # TODO: this part does not work on the GPU. See: https://github.com/JuliaStats/Distributions.jl/issues/1183
                log_probₐ = logpdf.(dist, A)
            end
            loss = -mean(log_probₐ .* δ) * π.α_θ
            ignore() do
                π.loss = loss
            end
            loss
        end
        update!(model, gs)
    end
    empty!(traj)
end
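`RLBase.update!` above relies on `discount_rewards` from the surrounding package. For readers following along, here is a minimal reference sketch of such a discounted-return computation (a hypothetical helper, not the package's implementation):

```julia
# Hypothetical reference: Gₜ = rₜ + γ * Gₜ₊₁, computed backwards over one episode.
function discounted_returns(rewards::AbstractVector{T}, γ) where {T}
    G = similar(rewards)
    running = zero(T)
    for t in length(rewards):-1:1
        running = rewards[t] + γ * running
        G[t] = running
    end
    G
end

discounted_returns(Float32[1, 1, 1], 0.9f0) # ≈ [2.71, 1.9, 1.0]
```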