In [None]:
# Add worker processes; the `@everywhere` and `@parallel` code in this notebook runs on them.
# Alternative worker count (currently disabled): addprocs(30);
addprocs(7);

In [1]:
@everywhere begin
    using POMDPModels
    using POMCP
    using POMDPs
    import POMDPToolbox.PreviousObservation
    import POMCP.init_V
    # import POMCP.POMCPPolicy
    import POMDPs: action, updater
end

  likely near /home/zach/.julia/POMDPModels/src/GridWorlds.jl:30


In [2]:
# Number of simulations averaged by each est_reward call in this notebook.
# Larger setting (currently disabled): N = 100000;
N = 100;
# Problem instance passed to the est_reward calls and to the POMCP solver.
# NOTE(review): the two arguments are presumably the feeding and hunger rewards —
# confirm against the BabyPOMDP constructor in POMDPModels.
problem = BabyPOMDP(-5, -10);

In [3]:
@everywhere begin
    # Baseline policy whose action is decided by a random draw.
    type RandomBabyPolicy <: POMDPs.Policy
        rng::AbstractRNG  # generator sampled by `action`
    end

    # Pick an action for `p`: `a.feed` becomes true when a draw from `p.rng`
    # exceeds 0.5. The belief `b` is not consulted. The action object `a`
    # (a fresh BabyAction(false) unless the caller supplies one) is modified
    # in place and returned.
    function action(p::RandomBabyPolicy, b::POMDPs.Belief, a=BabyAction(false))
        draw = rand(p.rng)
        a.feed = draw > 0.5
        return a
    end

    # Belief updater paired with this policy: a POMDPToolbox.EmptyUpdater.
    function updater(::RandomBabyPolicy)
        return POMDPToolbox.EmptyUpdater()
    end
end

In [4]:
# Estimate the mean reward of `policy` on `problem` by averaging `N` rollouts.
#
# Arguments:
#   problem - problem handed to POMDPs.simulate
#   policy  - policy to evaluate; its belief updater is obtained from updater(policy)
#   belief  - initial belief, converted with convert_belief(up, belief) for every rollout
#   N       - number of rollouts; must be at least 1
#   eps     - forwarded to the RolloutSimulator `eps` keyword (default 0.01)
#
# Rollout i uses MersenneTwister(i) as the simulator RNG and always starts from
# BabyState(false). The rollouts are distributed with @parallel and the values
# returned by POMDPs.simulate are reduced with (+).
#
# Returns that sum divided by N.
# Throws an ArgumentError when N < 1, because an empty run has no mean.
function est_reward(problem, policy, belief, N; eps=0.01)
    N >= 1 || throw(ArgumentError("N must be at least 1, got $N"))
    # Named `total` rather than `sum` so Base.sum is not shadowed in this function.
    total = @parallel (+) for i in 1:N
        sim_rng = MersenneTwister(i)
        sim = POMDPToolbox.RolloutSimulator(rng=sim_rng, initial_state=BabyState(false), eps=eps)
        up = updater(policy)
        POMDPs.simulate(sim, problem, policy, up, convert_belief(up, belief))
    end
    return total / N
end

est_reward (generic function with 1 method)

In [5]:
# Baseline: the FeedWhenCrying policy
# (the expected reward for this nearly-optimal policy is -17.14).
# A test run earlier this week with 5000 experiments gave -16.72.
# The belief handed to est_reward is a PreviousObservation wrapping BabyObservation(false).
@time est_reward(problem, FeedWhenCrying(), PreviousObservation(BabyObservation(false)), N)

  

-15.061634270536405

0.398786 seconds (406.49 k allocations: 17.684 MB, 1.23% gc time)


This is better than in the crying babies test because epsilon is large and, more importantly, it gets a not-crying observation on the first step every time.

In [6]:
# Baseline: RandomBabyPolicy, whose action is decided by a draw from its own RNG.
# Its `action` method never reads the belief, so an EmptyBelief is passed in.
pol_rng = MersenneTwister(7)
@time est_reward(problem, RandomBabyPolicy(pol_rng), POMDPToolbox.EmptyBelief(), N)

  

-32.318800571303875

0.091403 seconds (83.86 k allocations: 3.863 MB, 5.03% gc time)


In [7]:
# POMCP with FeedWhenCrying (FWC) as the rollout policy.
# A test run earlier this week with 5000 experiments gave -16.77.
# The solver RNG is given a fixed seed.
rng = MersenneTwister(2)

# eps=0.01 is the same value est_reward uses by default for its simulator.
# NOTE(review): `c` and `tree_queries` are presumably the exploration constant and
# the number of tree searches per decision — confirm against the POMCP documentation.
solver = POMCPSolver(rollout_policy=FeedWhenCrying(),
                    eps=0.01,
                    c=10.0,
                    tree_queries=300, 
                    rng=rng,
                    updater=updater(problem))
                    

policy = solve(solver, problem)

# Evaluate the planner starting from the belief BabyStateDistribution(0.0).
@time est_reward(problem, policy, BabyStateDistribution(0.0), N)

 97

-14.812470954151415

In [8]:
# POMCP with RandomBabyPolicy as the rollout policy.
# NOTE(review): `rng` and `rollout_pol_rng` are both seeded with 2, so the two
# generators start from the same state and yield the same sequence of draws.
# If independent streams are intended, use a different seed for one of them
# (the result recorded under this cell was produced with these seeds).
rng = MersenneTwister(2)
rollout_pol_rng = MersenneTwister(2)

# Same solver settings as the FWC cell; only the rollout policy differs.
solver = POMCPSolver(rollout_policy=RandomBabyPolicy(rollout_pol_rng),
                     eps=0.01,
                     c=10.0,
                     tree_queries=300, 
                     rng=rng,
                     updater=updater(problem))

policy = solve(solver, problem)

# Evaluate the planner starting from the belief BabyStateDistribution(0.0).
@time est_reward(problem, policy, BabyStateDistribution(0.0), N)

.459331 seconds (508.58 M allocations: 17.743 GB, 14.80% gc time)
 67

-15.938558824572759

In [9]:
@everywhere begin
    # Optimal policy for these particular problem parameters (see DMU book):
    # feed if the belief that the baby is hungry is over .28206.
    type OptBabyPolicy <: POMDPs.Policy
    end

    # Set `a.feed` by thresholding the belief's `p_hungry`, then return `a`.
    # `a` is a fresh BabyAction(false) unless the caller supplies one.
    function action(p::OptBabyPolicy, b::BabyStateDistribution, a=BabyAction(false))
        should_feed = b.p_hungry > 0.28206
        a.feed = should_feed
        return a
    end

    # Beliefs for this policy come from the updater of a BabyPOMDP(-5, -10).
    function updater(::OptBabyPolicy)
        return updater(BabyPOMDP(-5, -10))
    end
end
# Evaluate the threshold policy starting from the belief BabyStateDistribution(0.0).
@time est_reward(problem, OptBabyPolicy(), BabyStateDistribution(0.0), N)

-14.686851194997828

.725932 seconds (498.03 M allocations: 12.891 GB, 20.52% gc time)
  0.064344 seconds (54.30 k allocations: 2.653 MB)
