Exploration Policies #20

Merged
merged 10 commits
Mar 19, 2020
2 changes: 1 addition & 1 deletion .travis.yml
@@ -2,7 +2,7 @@ language: julia

julia:
- 1.0
- 1.2
- 1

os:
- linux
5 changes: 3 additions & 2 deletions Project.toml
@@ -1,19 +1,20 @@
name = "POMDPPolicies"
uuid = "182e52fb-cfd0-5e46-8c26-fd0667c990f4"
version = "0.2.1"
version = "0.3.0"

[deps]
BeliefUpdaters = "8bb6e9a1-7d73-552c-a44a-e5dc5634aac4"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
POMDPModelTools = "08074719-1b2a-587c-a292-00f91cc44415"
POMDPs = "a93abf59-7444-517b-a68a-c42f96afdd7d"
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"

[compat]
POMDPModelTools = "0.2"
BeliefUpdaters = "0.1"
POMDPModelTools = "0.2"
POMDPs = "0.7.3, 0.8"
StatsBase = "0.26,0.27,0.28,0.29,0.30,0.31,0.32"
julia = "1"
2 changes: 1 addition & 1 deletion README.md
@@ -11,4 +11,4 @@ A collection of default policy types for [POMDPs.jl](https://github.com/JuliaPOMDP/POMDPs.jl)
```julia
using Pkg
Pkg.add("POMDPPolicies")
```
```
36 changes: 36 additions & 0 deletions docs/src/exploration_policies.md
@@ -0,0 +1,36 @@
# Exploration Policies

Exploration policies are often useful in reinforcement learning algorithms for choosing an action that is different from the action given by the policy being learned.

This package provides two exploration policies: `EpsGreedyPolicy` and `SoftmaxPolicy`.

```@docs
EpsGreedyPolicy
SoftmaxPolicy
```
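
For instance, both policies can be constructed directly from a problem. A minimal sketch; `SimpleGridWorld` from POMDPModels.jl is used purely for illustration and the parameter values are arbitrary:

```julia
using POMDPPolicies
using POMDPModels # provides SimpleGridWorld, used here only as an example problem

problem = SimpleGridWorld()

# epsilon-greedy exploration with a constant epsilon of 0.1
eps_policy = EpsGreedyPolicy(problem, 0.1)

# softmax exploration with a constant temperature of 1.0
soft_policy = SoftmaxPolicy(problem, 1.0)
```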

## Interface

Exploration policies are subtypes of the abstract `ExplorationPolicy` type and follow the interface
`action(exploration_policy::ExplorationPolicy, on_policy::Policy, k, s)`, where `k` is the training step used to compute the exploration parameter (see Schedules below) and `s` is the current state or observation.

The `action` method is exported by [POMDPs.jl](https://github.com/JuliaPOMDP/POMDPs.jl).
To use exploration policies in a solver, you must use this four-argument version of `action`, where `on_policy` is the policy being learned (e.g. a tabular policy or a neural network policy).
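
As an illustration, here is a rough sketch of how an exploration policy might be driven inside a learning loop; the loop body and the choice of `ValuePolicy` as the on-policy are assumptions made for the example, not part of the package interface:

```julia
using POMDPs, POMDPPolicies, POMDPModels

problem = SimpleGridWorld()
exploration = EpsGreedyPolicy(problem, 0.1)
on_policy = ValuePolicy(problem) # the policy being learned, here a tabular value policy

s = GWPos(1, 1)
for k in 1:1000
    a = action(exploration, on_policy, k, s) # k is the training step, consumed by the schedule
    # ... apply `a`, observe the transition, and update `on_policy` (not shown) ...
end
```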

## Schedules

Exploration policies often rely on a key parameter: for example, $\epsilon$ in $\epsilon$-greedy and the temperature in softmax.
Reinforcement learning algorithms often require a decay schedule for these parameters.
`POMDPPolicies.jl` exports an interface for implementing decay schedules as well as a few convenient schedules.

```@docs
LinearDecaySchedule
ConstantSchedule
```
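
For example, an epsilon-greedy policy whose epsilon decays linearly from 1.0 to 0.01 over the first 10,000 steps might be constructed as follows (a sketch mirroring the package tests; the numbers are arbitrary):

```julia
using POMDPPolicies
using POMDPModels # SimpleGridWorld, used only for illustration

problem = SimpleGridWorld()
schedule = LinearDecaySchedule(start=1.0, stop=0.01, steps=10_000)
policy = EpsGreedyPolicy(problem, schedule)

schedule(1)      # close to 1.0 at the beginning of training
schedule(10_000) # 0.01 once the decay has finished
```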

To implement your own schedule, define a schedule type that is a subtype of `ExplorationSchedule`, as well as the function `update_value`, which returns the parameter value updated according to your schedule.

```@docs
ExplorationSchedule
update_value
```
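
Note that `EpsGreedyPolicy` and `SoftmaxPolicy` accept any `Function` of the training step `k` for their parameter, so a plain anonymous function can also serve as a schedule. A minimal sketch, reusing the `problem` defined above and with arbitrary illustrative values:

```julia
# an exponential-style decay floored at 0.05; my_eps is just an ordinary function of k
my_eps = k -> max(0.05, 0.5 * 0.99^k)
policy = EpsGreedyPolicy(problem, my_eps)
```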
1 change: 1 addition & 0 deletions docs/src/index.md
@@ -6,6 +6,7 @@ It currently provides:
- an alpha vector policy type
- a random policy
- a stochastic policy type
- exploration policies
- a vector policy type
- a wrapper to collect statistics and errors about policies

5 changes: 0 additions & 5 deletions docs/src/stochastic.md
@@ -5,7 +5,6 @@ Types for representing randomized policies:
- `StochasticPolicy` samples actions from an arbitrary distribution.
- `UniformRandomPolicy` samples actions uniformly (see `RandomPolicy` for a similar use)
- `CategoricalTabularPolicy` samples actions from a categorical distribution with weights given by a `ValuePolicy`.
- `EpsGreedyPolicy` uses epsilon-greedy action selection.

```@docs
StochasticPolicy
@@ -14,7 +13,3 @@
```@docs
CategoricalTabularPolicy
```

```@docs
EpsGreedyPolicy
```
14 changes: 12 additions & 2 deletions src/POMDPPolicies.jl
@@ -4,6 +4,7 @@ using LinearAlgebra
using Random
using StatsBase # for Weights
using SparseArrays # for sparse vectors in alpha_vector.jl
using Parameters

using POMDPs
import POMDPs: action, value, solve, updater
@@ -52,11 +53,20 @@ include("vector.jl")
export
StochasticPolicy,
UniformRandomPolicy,
CategoricalTabularPolicy,
EpsGreedyPolicy
CategoricalTabularPolicy

include("stochastic.jl")

export ExplorationSchedule,
EpsGreedyPolicy,
SoftmaxPolicy,
ExplorationPolicy,
exploration_parameter,
LinearDecaySchedule,
ConstantSchedule

include("exploration_policies.jl")

export
PolicyWrapper,
payload
122 changes: 122 additions & 0 deletions src/exploration_policies.jl
@@ -0,0 +1,122 @@


# exploration schedule
"""
ExplorationSchedule
Abstract type for exploration schedule.
It is useful to define the schedule of a parameter of an exploration policy.
The effect of a schedule is defined by the `update_value` function.
"""
abstract type ExplorationSchedule <: Function end
Review comment (Member): Do we need this abstract type? I don't see what purpose it serves and I am afraid someone will see it and think they need to use it. I think the schedule should just be a function, so people can write `eps = k->max(0, 0.1*(10000-k)/10000)` for instance.

Reply (Contributor Author): yep, not really needed here since we don't have an interface for schedules anymore.


"""
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aren't we getting rid of update_value?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have not updated the docstrings yet ;)

update_value(::ExplorationSchedule, value)
Returns an updated value according to the schedule.
"""
function update_value(::ExplorationSchedule, value) end
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should be

function update_value end

so the standard method error will be thrown.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given my other comments, this function may cease to exist entirely though.



"""
LinearDecaySchedule
A schedule that linearly decreases a value from `start` to `stop` in `steps` steps.
After `steps` steps, the value stays constant at `stop`.

# Constructor

`LinearDecaySchedule(;start, stop, steps)`
"""
@with_kw struct LinearDecaySchedule{R<:Real} <: ExplorationSchedule
start::R
stop::R
steps::Int
end

function (schedule::LinearDecaySchedule)(k)
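# evaluate the schedule at step k: decrease linearly from `start`, clamped below at `stop`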
rate = (schedule.start - schedule.stop) / schedule.steps
val = schedule.start - k*rate
val = max(schedule.stop, val)
end


"""
ExplorationPolicy <: Policy
An abstract type for exploration policies.
Sampling from an exploration policy is done using `action(exploration_policy, on_policy, k, s)`, where `k` is the training step used by the schedule.
"""
abstract type ExplorationPolicy <: Policy end

# """
# exploration_parameter(::ExplorationPolicy)
# returns the exploration parameter of an exploration policy, e.g. epsilon for e-greedy or temperature for softmax
# """
# function exploration_parameter end

"""
EpsGreedyPolicy <: ExplorationPolicy

represents an epsilon-greedy policy, sampling a random action with probability `eps` or returning an action from a given policy otherwise.
The evolution of epsilon can be controlled using a schedule: `eps` can be a constant or a function of the training step `k`. This feature is useful for using these policies in reinforcement learning algorithms.

constructor:

`EpsGreedyPolicy(problem::Union{MDP, POMDP}, eps::Union{Function, Real}; rng=Random.GLOBAL_RNG)`
"""
struct EpsGreedyPolicy{T<:Function, R<:AbstractRNG, A} <: ExplorationPolicy
eps::T
rng::R
actions::A
end

function EpsGreedyPolicy(problem::Union{MDP, POMDP}, eps::Function;
rng::AbstractRNG=Random.GLOBAL_RNG)
return EpsGreedyPolicy(eps, rng, actions(problem))
end
function EpsGreedyPolicy(problem::Union{MDP, POMDP}, eps::Real;
rng::AbstractRNG=Random.GLOBAL_RNG)
return EpsGreedyPolicy(x->eps, rng, actions(problem))
end


function POMDPs.action(p::EpsGreedyPolicy, on_policy::Policy, k, s)
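# with probability eps(k) take a uniformly random action, otherwise defer to the on-policy action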
if rand(p.rng) < p.eps(k)
return rand(p.rng, p.actions)
else
return action(on_policy, s)
end
end

# exploration_parameter(p::EpsGreedyPolicy, k) = p.eps(k)

# softmax
"""
SoftmaxPolicy <: ExplorationPolicy

represents a softmax policy, sampling a random action according to a softmax function.
The softmax function converts the action values of the on-policy into probabilities that are used for sampling.
A temperature parameter controls how wide or peaked the resulting distribution is; like `eps` in `EpsGreedyPolicy`, it can be a constant or a function of the training step `k`.

constructor:

`SoftmaxPolicy(problem, temperature::Union{Function, Real}; rng=Random.GLOBAL_RNG)`
"""
struct SoftmaxPolicy{T<:Function, R<:AbstractRNG, A} <: ExplorationPolicy
temperature::T
rng::R
actions::A
end

function SoftmaxPolicy(problem, temperature::Function;
rng::AbstractRNG=Random.GLOBAL_RNG)
return SoftmaxPolicy(temperature, rng, actions(problem))
end
function SoftmaxPolicy(problem, temperature::Real;
rng::AbstractRNG=Random.GLOBAL_RNG)
return SoftmaxPolicy(x->temperature, rng, actions(problem))
end

function POMDPs.action(p::SoftmaxPolicy, on_policy::Policy, k, s)
vals = actionvalues(on_policy, s)
vals ./= p.temperature(k)
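# subtract the maximum before exponentiating for numerical stability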
maxval = maximum(vals)
exp_vals = exp.(vals .- maxval)
exp_vals /= sum(exp_vals)
return p.actions[sample(p.rng, Weights(exp_vals))]
end

# exploration_parameter(p::SoftmaxPolicy, k) = p.temperature(k)
26 changes: 0 additions & 26 deletions src/stochastic.jl
@@ -57,29 +57,3 @@ function action(policy::CategoricalTabularPolicy, s)
policy.stochastic.distribution = Weights(policy.value.value_table[stateindex(policy.value.mdp, s),:])
return policy.value.act[sample(policy.stochastic.rng, policy.stochastic.distribution)]
end

"""
EpsGreedyPolicy

represents an epsilon greedy policy, sampling a random action with a probability `eps` or sampling from a given stochastic policy otherwise.

constructor:

`EpsGreedyPolicy(mdp::Union{MDP,POMDP}, eps::Float64; rng=Random.GLOBAL_RNG)`
"""
mutable struct EpsGreedyPolicy <: Policy
eps::Float64
val::ValuePolicy
uni::StochasticPolicy
end

EpsGreedyPolicy(mdp::Union{MDP,POMDP}, eps::Float64;
rng=Random.GLOBAL_RNG) = EpsGreedyPolicy(eps, ValuePolicy(mdp), UniformRandomPolicy(mdp, rng))

function action(policy::EpsGreedyPolicy, s)
if rand(policy.uni.rng) > policy.eps
return action(policy.val, s)
else
return action(policy.uni, s)
end
end
3 changes: 3 additions & 0 deletions test/runtests.jl
@@ -28,3 +28,6 @@
@testset "pretty_printing" begin
include("test_pretty_printing.jl")
end
@testset "exploration policies" begin
include("test_exploration_policies.jl")
end
22 changes: 22 additions & 0 deletions test/test_exploration_policies.jl
@@ -0,0 +1,22 @@
using POMDPModels

problem = SimpleGridWorld()
# e greedy
policy = EpsGreedyPolicy(problem, 0.5)
a = first(actions(problem))
@inferred action(policy, FunctionPolicy(s->a::Symbol), 1, GWPos(1,1))
policy = EpsGreedyPolicy(problem, 0.0)
@test action(policy, FunctionPolicy(s->a), 1, GWPos(1,1)) == a

# softmax
policy = SoftmaxPolicy(problem, 0.5)
on_policy = ValuePolicy(problem)
@inferred action(policy, on_policy, 1, GWPos(1,1))

# test linear schedule
policy = EpsGreedyPolicy(problem, LinearDecaySchedule(start=1.0, stop=0.0, steps=10))
for i=1:11
action(policy, FunctionPolicy(s->a), i, GWPos(1,1))
@test policy.eps(i) < 1.0
end
@test policy.eps(11) ≈ 0.0
4 changes: 0 additions & 4 deletions test/test_stochastic_policy.jl
@@ -12,8 +12,4 @@ policy = CategoricalTabularPolicy(problem)
sim = RolloutSimulator(max_steps=10)
simulate(sim, problem, policy)

policy = EpsGreedyPolicy(problem, 0.5)
sim = RolloutSimulator(max_steps=10)
simulate(sim, problem, policy)

end