In [1]:
# Start 30 worker processes. The `@everywhere` blocks below load code onto them
# and the `@parallel` loop in `est_reward` distributes simulations across them.
addprocs(30);
# Alternative smaller worker count (presumably for a machine with fewer cores):
# addprocs(7);

In [2]:
@everywhere begin
    using POMDPModels
    using POMCP
    using POMDPs
    import POMDPToolbox.PreviousObservation

    import POMCP.belief_from_node
    import POMCP.init_V
    # import POMCP.POMCPPolicy
    import POMDPs.action
end

In [3]:
# Number of simulations averaged by each `est_reward` call below.
N = 100000;
# Alternative small value (presumably for a quick trial run):
# N = 100;
# Problem instance shared by every experiment below.
# NOTE(review): the two arguments are presumably the feeding reward and the
# hunger reward — confirm against the BabyPOMDP constructor in POMDPModels.
problem = BabyPOMDP(-5, -10);

In [4]:
@everywhere begin
    # Policy that ignores the belief and decides at random whether to feed,
    # drawing from the random number generator it carries.
    type RandomBabyPolicy <: POMDPs.Policy
        rng::AbstractRNG
    end

    # Overwrite `a.feed` with a coin flip drawn from `p.rng` and return `a`.
    # The problem and belief arguments are only part of the `action`
    # signature; neither is read here.
    function action(::BabyPOMDP, p::RandomBabyPolicy, b::POMDPs.Belief, a=BabyAction(false))
        draw = rand(p.rng)
        a.feed = draw > 0.5
        return a
    end
end

In [5]:
# Estimate the reward of `policy` on `problem` by averaging the values returned
# by `N` calls to `POMDPs.simulate`, spread across the worker processes.
#
# Arguments:
#   problem, policy, belief - forwarded unchanged to `POMDPs.simulate`
#   N                       - number of simulations to run and average over
#   eps                     - forwarded to `POMDPs.simulate` as its `eps` keyword
#
# Every simulation starts from `BabyState(false)` and uses its own
# `MersenneTwister` seeded with the simulation index, so a given `N` always
# uses the same set of simulation seeds.
function est_reward(problem, policy, belief, N; eps=0.01)
    total_reward = @parallel (+) for trial = 1:N
        POMDPs.simulate(problem, policy, belief,
                        rng=MersenneTwister(trial),
                        eps=eps,
                        initial_state=BabyState(false))
    end
    return total_reward / N
end

est_reward (generic function with 1 method)

In [6]:
# Feed when Crying (Expected Reward for this nearly-optimal policy is -17.14)
# Test from earlier this week with 5000 experiments: -16.72
# The initial belief is a PreviousObservation wrapping `BabyObservation(false)`
# (presumably a "not crying" observation — confirm against POMDPModels).
@time est_reward(problem, FeedWhenCrying(), PreviousObservation(BabyObservation(false)), N)

elapsed time: 2.739505561 seconds (14213344 bytes allocated)


-16.621810988816325

This is better than the result in the crying-babies test because epsilon is large and, more importantly, because the policy receives a not-crying observation on the first step of every simulation.

In [7]:
# Random: baseline policy that feeds on a coin flip, ignoring the belief.
# The policy draws from its own generator (seed 7), separate from the
# per-simulation generators created inside `est_reward`.
pol_rng = MersenneTwister(7)
# NOTE(review): the policy object is used inside the `@parallel` loop, so each
# worker presumably gets its own copy of `pol_rng` starting from the same
# state — confirm this does not correlate simulations across workers.
@time est_reward(problem, RandomBabyPolicy(pol_rng), PreviousObservation(BabyObservation(false)), N)

elapsed time: 0.848669433 seconds (8383672 bytes allocated, 4.92% gc time)


-32.17854039299618

In [8]:
# POMCP with FWC rollout policy
# test from earlier this week with 5000 experiments: -16.77
# Random number generator handed to the solver (seed 2).
rng = MersenneTwister(2)

# NOTE(review): the per-argument meanings below are assumptions based on the
# argument values — confirm against the POMCPSolver field order in POMCP.
solver = POMCPSolver(FeedWhenCrying(),  # rollout policy (per the cell title)
                     0.01,              # presumably the eps cutoff — TODO confirm
                     10,                # presumably the exploration constant — TODO confirm
                     300,               # presumably the number of tree queries — TODO confirm
                     rng,               # solver random number generator
                     false,             # presumably a particle-filter switch — TODO confirm
                     PreviousObservationConverter())  # presumably converts tree nodes to rollout beliefs — TODO confirm

# Build the POMCP policy for the shared problem instance.
policy = solve(solver, problem)

# Average N simulations starting from BabyStateDistribution(0.0) wrapped for POMCP.
@time est_reward(problem, policy, POMCPBeliefWrapper(BabyStateDistribution(0.0)), N)

elapsed time: 3635.754880173 seconds (12027384 bytes allocated, 0.00% gc time)


-16.263003916559974

In [9]:
# POMCP with Random rollout policy
# NOTE(review): `rng` and `rollout_pol_rng` are both seeded with 2, so the two
# generators start out producing identical streams — confirm whether the
# rollout policy was meant to have an independent seed.
rng = MersenneTwister(2)
rollout_pol_rng = MersenneTwister(2)

# NOTE(review): the per-argument meanings below are assumptions based on the
# argument values — confirm against the POMCPSolver field order in POMCP.
solver = POMCPSolver(RandomBabyPolicy(rollout_pol_rng),  # rollout policy (per the cell title)
                     0.01,              # presumably the eps cutoff — TODO confirm
                     10,                # presumably the exploration constant — TODO confirm
                     300,               # presumably the number of tree queries — TODO confirm
                     rng,               # solver random number generator
                     false,             # presumably a particle-filter switch — TODO confirm
                     EmptyConverter())  # presumably no belief conversion, since the random policy ignores the belief — TODO confirm

# Build the POMCP policy for the shared problem instance.
policy = solve(solver, problem)

# Average N simulations starting from BabyStateDistribution(0.0) wrapped for POMCP.
@time est_reward(problem, policy, POMCPBeliefWrapper(BabyStateDistribution(0.0)), N)

elapsed time: 2841.589159556 seconds (2453736 bytes allocated)


-17.209711843801013

In [10]:
# Optimal policy for these particular problem parameters (see DMU book):
# feed whenever the believed probability that the baby is hungry exceeds .28206.
@everywhere begin
    # Stateless threshold policy; all of its behaviour lives in `action` below.
    type OptBabyPolicy <: POMDPs.Policy end

    # Set `a.feed` to whether `b.p_hungry` is above the feeding threshold and
    # return `a`. The problem argument is not read.
    function action(::BabyPOMDP, p::OptBabyPolicy, b::BabyStateDistribution, a=BabyAction(false))
        feed_threshold = 0.28206
        a.feed = b.p_hungry > feed_threshold
        return a
    end
end
# Average N simulations of the threshold policy starting from BabyStateDistribution(0.0).
@time est_reward(problem, OptBabyPolicy(), BabyStateDistribution(0.0), N)

elapsed time: 0.410525487 seconds (1888656 bytes allocated)


-16.16069990155499