In [1]:
# Launch the worker processes that the distributed reward estimates run on.
# `Distributed` has to be loaded before `addprocs` is available.
# The trailing `;` keeps the notebook from echoing the call's return value.
# addprocs(30);   # alternative worker count, left disabled
using Distributed
addprocs(7);

In [2]:
using POMDPModels
using BasicPOMCP
using POMDPs
using POMDPPolicies

@everywhere begin
    using POMDPModels
    using BasicPOMCP
    using POMDPs
    using BeliefUpdaters
    using Random
    using POMDPSimulators
    using POMDPModelTools
end

In [3]:
# Number of simulated episodes averaged for every reward estimate
# (passed as `N` to est_reward).
N = 1000;
# N = 100;   # smaller sample count, left disabled
# Problem instance shared by all experiments.
# NOTE(review): the two arguments are presumably the feeding and hunger rewards --
# confirm against the BabyPOMDP constructor.
problem = BabyPOMDP(-5, -10);

In [4]:
# est_reward(problem, policy, belief, N; eps=0.01)
#
# Average the result of `N` rollout simulations of `policy` on `problem`, with the
# iterations spread over the available worker processes by `@distributed`.
#
# - `belief` is handed to `initialize_belief` to build the starting belief of each run.
# - `eps` is forwarded to `RolloutSimulator`.
# - Run `i` uses its own `MersenneTwister(i)`, so each run has a fixed simulator seed.
#
# Returns the sum of the per-run simulation results divided by `N`.
function est_reward(problem, policy, belief, N; eps=0.01)
    total = @distributed (+) for seed in 1:N
        sim = RolloutSimulator(rng=MersenneTwister(seed), eps=eps)
        # FeedWhenCrying supplies its own updater; every other policy is paired with a
        # DiscreteUpdater built from the problem.
        up = policy isa FeedWhenCrying ? updater(policy) : DiscreteUpdater(problem)
        b0 = initialize_belief(up, belief)
        # NOTE(review): the trailing `false` is presumably the initial state
        # (baby not hungry) -- confirm against the RolloutSimulator `simulate` method.
        POMDPs.simulate(sim, problem, policy, up, b0, false)
    end
    return total / N
end

est_reward (generic function with 1 method)

In [5]:
# Baseline: the FeedWhenCrying heuristic policy.
# `false` is the value est_reward passes to `initialize_belief`
# (presumably an initial "not crying" observation for this policy's updater -- confirm).
@time er = est_reward(problem, FeedWhenCrying(), false, N)
println(er)

  3.528358 seconds (3.23 M allocations: 163.269 MiB, 1.60% gc time)
-15.978715013004313


This is better than in the crying babies test because epsilon is large and, more importantly, because it receives a not-crying observation on the first step of every run.

In [6]:
# Random
# Baseline: a RandomPolicy drawing from its own seeded RNG, evaluated from the
# starting belief BoolDistribution(0.5).
# NOTE(review): 0.5 is presumably the probability that the baby is hungry -- confirm.
pol_rng = MersenneTwister(7)
@time er = est_reward(problem, RandomPolicy(problem, rng=pol_rng), BoolDistribution(0.5), N)
println(er)

  1.274271 seconds (1.57 M allocations: 78.204 MiB, 1.72% gc time)
-32.396648905123115


In [7]:
# POMCP with FWC rollout policy
rng = MersenneTwister(3)

# Planner whose value estimates come from rolling out the FeedWhenCrying heuristic.
solver = POMCPSolver(;
    estimate_value=RolloutEstimator(FeedWhenCrying()),
    max_depth=44,  # eps = 0.01 (same value as est_reward's default `eps`)
    c=20.0,
    tree_queries=300,
    rng=rng,
)

policy = solve(solver, problem)

# Evaluate the planner from the starting belief BoolDistribution(0.0).
@time er = est_reward(problem, policy, BoolDistribution(0.0), N)
println(er)

  7.938545 seconds (1.19 M allocations: 58.761 MiB, 0.45% gc time)
-15.746646379594688


In [8]:
# POMCP with Random rollout policy
# NOTE(review): the planner RNG and the rollout-policy RNG are distinct objects created
# with the same seed (2) -- confirm that identical starting streams are intended.
rng = MersenneTwister(2)
rollout_pol_rng = MersenneTwister(2)

# Planner whose value estimates come from rolling out a RandomPolicy.
solver = POMCPSolver(;
    estimate_value=RolloutEstimator(RandomPolicy(problem; rng=rollout_pol_rng)),
    max_depth=44,  # eps = 0.01 (same value as est_reward's default `eps`)
    c=20.0,
    tree_queries=300,
    rng=rng,
)

policy = solve(solver, problem)

# Evaluate the planner from the starting belief BoolDistribution(0.0).
@time er = est_reward(problem, policy, BoolDistribution(0.0), N)
println(er)

  9.203904 seconds (315.30 k allocations: 16.039 MiB, 0.86% gc time)
-16.023154545048516


In [9]:
# Optimal policy for these particular problem parameters:
# feed whenever the belief that the baby is hungry exceeds .28206 (see DMU book).
# Defined under @everywhere so the type and its methods exist on every process.
@everywhere begin
    struct OptBabyPolicy <: POMDPs.Policy end

    # The returned Bool is the action: `true` (feed) exactly when the belief's
    # probability of `true` is above the threshold.
    POMDPs.action(::OptBabyPolicy, b) = pdf(b, true) > 0.28206

    # Use whatever updater a BabyPOMDP built with the same two parameters provides.
    POMDPs.updater(::OptBabyPolicy) = updater(BabyPOMDP(-5, -10))
end
# Evaluate the threshold policy from the starting belief BoolDistribution(0.0).
# est_reward pairs every non-FeedWhenCrying policy with DiscreteUpdater(problem), so
# the `updater` method defined for OptBabyPolicy is not consulted by this call.
@time er = est_reward(problem, OptBabyPolicy(), BoolDistribution(0.0), N)
println(er)

  0.168666 seconds (162.41 k allocations: 8.239 MiB)
-15.525869513040346
