In [1]:
# Start 7 additional worker processes; the `@parallel` loop in est_reward spreads its
# simulations over the available workers.
# Larger alternative, left disabled:
# addprocs(30);
addprocs(7);

In [2]:
using POMDPModels
using BasicPOMCP
using POMDPs
using POMDPToolbox

@everywhere begin
    using POMDPModels
    using BasicPOMCP
    using POMDPs
    using POMDPToolbox
    import POMDPs: action, updater
end

In [3]:
# N is passed to est_reward as the number of rollout simulations to average.
N = 1000;
# N = 100;
# Problem instance shared by every experiment in this notebook.
# NOTE(review): the two positional arguments are presumably the feeding and hunger
# rewards — confirm against the POMDPModels BabyPOMDP constructor.
problem = BabyPOMDP(-5, -10);

In [4]:
# est_reward(problem, policy, belief, N; eps=0.01)
#
# Average the results of N rollout simulations of `policy` on `problem`.
#
# - Simulation i uses its own RolloutSimulator whose rng is MersenneTwister(i), with
#   initial_state=false and the given `eps` forwarded as the simulator's `eps` keyword.
# - `belief` is handed to initialize_belief together with the selected updater to build
#   the belief each simulation starts from.
# - The simulations run inside an `@parallel (+)` reduction, so their results are summed
#   across the worker processes before being divided by N.
function est_reward(problem, policy, belief, N; eps=0.01)
    total = @parallel (+) for seed in 1:N
        rollout = POMDPToolbox.RolloutSimulator(rng=MersenneTwister(seed),
                                                initial_state=false,
                                                eps=eps)
        # FeedWhenCrying brings its own updater; every other policy uses the problem's.
        belief_updater = isa(policy, FeedWhenCrying) ? updater(policy) : updater(problem)
        start_belief = initialize_belief(belief_updater, belief)
        # Last expression of the loop body: the value fed into the (+) reduction.
        POMDPs.simulate(rollout, problem, policy, belief_updater, start_belief)
    end
    return total / N
end

est_reward (generic function with 1 method)

In [5]:
# FeedWhenCrying
# The initial-belief argument here is the plain Bool `false`; est_reward passes it to
# initialize_belief together with the updater obtained from the policy itself.
@time er = est_reward(problem, FeedWhenCrying(), false, N)
println(er)

  5.693364 seconds (712.34 k allocations: 28.889 MB, 0.11% gc time)
-15.778715013004313


This is better than in the crying babies test because epsilon is large and, more importantly, because it gets a not-crying observation on the first step every time.

In [6]:
# Random policy, evaluated from a BoolDistribution(0.5) initial belief.
# NOTE(review): est_reward runs its simulations under @parallel, so this policy and its
# rng are presumably copied to the workers — confirm the action streams are independent.
pol_rng = MersenneTwister(7)
@time er = est_reward(problem, RandomPolicy(problem, rng=pol_rng), BoolDistribution(0.5), N)
println(er)

  1.724261 seconds (199.96 k allocations: 8.096 MB)
-32.396648905123115


In [7]:
# POMCP with FWC rollout policy
# The solver's value estimates come from rollouts of FeedWhenCrying.
rng = MersenneTwister(3)

# NOTE(review): max_depth=44 presumably pairs with est_reward's default eps=0.01
# (0.9^44 ≈ 0.0097 if the problem's discount is 0.9) — confirm.
solver = POMCPSolver(estimate_value=RolloutEstimator(FeedWhenCrying()),
                    max_depth=44, # eps = 0.01
                    c=20.0,
                    tree_queries=300, 
                    rng=rng)
                    

policy = solve(solver, problem)

# Evaluate the resulting planner from a BoolDistribution(0.0) initial belief.
@time er = est_reward(problem, policy, BoolDistribution(0.0), N)
println(er)

 46.605113 seconds (77.22 k allocations: 3.253 MB)
-15.801811194693038


In [8]:
# POMCP with Random rollout policy
# Both generators below are seeded with 2, so they start from the same state.
# NOTE(review): confirm that sharing the seed between solver and rollout policy is intended.
rng = MersenneTwister(2)
rollout_pol_rng = MersenneTwister(2)

# Same solver settings as the FWC-rollout experiment; only the rollout policy differs.
solver = POMCPSolver(estimate_value=RolloutEstimator(RandomPolicy(problem, rng=rollout_pol_rng)),
                     max_depth=44, # eps = 0.01
                     c=20.0,
                     tree_queries=300, 
                     rng=rng)

policy = solve(solver, problem)

# Evaluate the resulting planner from a BoolDistribution(0.0) initial belief.
@time er = est_reward(problem, policy, BoolDistribution(0.0), N)
println(er)

 61.995754 seconds (51.56 k allocations: 2.262 MB)
-15.992283417459882


In [9]:
# Optimal policy for these particular problem parameters (see DMU book):
# feed exactly when the belief that the baby is hungry is over .28206.
@everywhere begin
    type OptBabyPolicy <: POMDPs.Policy end

    # The optional third positional argument is accepted but never read; the returned
    # action depends only on the belief.
    action(p::OptBabyPolicy, b::BoolDistribution, a=false) = b.p > 0.28206

    # Reuse the updater of a BabyPOMDP built with the same parameters as `problem`.
    function updater(::OptBabyPolicy)
        return updater(BabyPOMDP(-5, -10))
    end
end
# Evaluate the threshold policy from a BoolDistribution(0.0) initial belief.
# est_reward picks updater(problem) for every policy that is not a FeedWhenCrying, so
# that is the updater driving this run.
@time er = est_reward(problem, OptBabyPolicy(), BoolDistribution(0.0), N)
println(er)

  0.732722 seconds (34.79 k allocations: 1.487 MB)
-15.525869513040346
