In [1]:
include("get_requirements.jl")

# Tiger Problem

There are two doors in front of you. One of these doors has a large reward behind it. The other has a tiger. At each time step, you can take one of three actions:

- Open the LEFT door
- Open the RIGHT door
- LISTEN - you will receive an indication of which door the tiger is behind with an 85% chance of being correct.

Once you open one of the doors, you receive a reward or penalty and the problem resets.

In [2]:
importall POMDPs
using Distributions
using POMDPToolbox
using QMDP

In [3]:
"""
    MyTigerPOMDP(p_correct_obs, reward, penalty)
    MyTigerPOMDP()

Tiger problem whose states, actions and observations are all `Int`s.

The zero-argument constructor uses `p_correct_obs = 0.85`, `reward = 10.0`
and `penalty = 100.0`.
"""
immutable MyTigerPOMDP <: POMDP{Int, Int, Int}
    # probability that a LISTEN observation matches the state
    p_correct_obs::Float64
    # value returned by `reward` when the opening action equals the state
    reward::Float64
    # magnitude of the negative value returned by `reward` otherwise
    penalty::Float64
end

function MyTigerPOMDP()
    return MyTigerPOMDP(0.85, 10.0, 100.0)
end

# states and observations
# (the same two integer codes are used both as state values and as observation values)
const TL = 1
const TR = 2

# actions
# (LEFT and RIGHT share the integer codes of TL and TR; `reward` compares them directly)
const LEFT = 1
const RIGHT = 2
const LISTEN = 3

"""
    n_states(t::MyTigerPOMDP)

Return the number of states in the problem, which is always 2.
"""
function n_states(t::MyTigerPOMDP)
    return 2
end
"""
    n_actions(t::MyTigerPOMDP)

Return the number of actions in the problem, which is always 3.
"""
function n_actions(t::MyTigerPOMDP)
    return 3
end

"""
    states(t::MyTigerPOMDP)

Return the state space as a vector, ordered `[TL, TR]`.
"""
function states(t::MyTigerPOMDP)
    return [TL, TR]
end

states (generic function with 4 methods)

In [4]:
"""
    initial_state_distribution(p::MyTigerPOMDP)

Return the initial state distribution: a `Categorical` that puts probability
0.5 on each of the two states.
"""
function initial_state_distribution(p::MyTigerPOMDP)
    return Categorical(fill(0.5, 2))
end

"""
    transition(t::MyTigerPOMDP, s::Int, a::Int)

Return the distribution over next states after taking action `a` in state `s`.

Listening leaves the state unchanged (all probability mass on `s`). Any other
action opens a door, which resets the problem: the next state is drawn from
`initial_state_distribution(t)`.
"""
function transition(t::MyTigerPOMDP, s::Int, a::Int)
    # a door was opened, so the problem resets
    a == LISTEN || return initial_state_distribution(t)

    # listening: stay in `s` with certainty
    probs = zeros(2)
    probs[s] = 1.0
    return Categorical(probs)
end

"""
    reward(t::MyTigerPOMDP, s::Int, a::Int)

Return the immediate reward for taking action `a` in state `s`:

- `0.0` for `LISTEN`,
- `t.reward` when the integer code of the opening action equals `s`,
- `-t.penalty` otherwise.
"""
function reward(t::MyTigerPOMDP, s::Int, a::Int)
    a == LISTEN && return 0.0
    # NOTE(review): the payoff goes to `a == s`, i.e. LEFT in state TL. If TL is
    # read as "tiger left" this rewards opening the tiger's door; confirm the
    # intended meaning of the state/action codes.
    return a == s ? t.reward : -t.penalty
end

"""
    observation(t::MyTigerPOMDP, a::Int, sp::Int)

Return the distribution over observations after taking action `a` and
arriving in state `sp`.

After `LISTEN`, the observation equals `sp` with probability
`t.p_correct_obs` and is the other value otherwise. After any other action
both observations are equally likely.
"""
function observation(t::MyTigerPOMDP, a::Int, sp::Int)
    # opening a door yields an uninformative observation
    a == LISTEN || return Categorical([0.5, 0.5])

    p_wrong = 1 - t.p_correct_obs
    probs = [p_wrong, p_wrong]
    probs[sp] = t.p_correct_obs
    return Categorical(probs)
end

"""
    discount(t::MyTigerPOMDP)

Return the discount factor of the problem, fixed at 0.95.
"""
function discount(t::MyTigerPOMDP)
    return 0.95
end

discount (generic function with 4 methods)

In [5]:
# Simulate the problem for up to 10 steps with a hand-written threshold policy
# that maintains a discrete belief over the two states.
problem = MyTigerPOMDP()
# NOTE(review): this global shares its name with Base's `filter`; the name is
# kept because a later cell passes `filter` to `simulate`.
filter = DiscreteUpdater(problem)
b = initialize_belief(filter, initial_state_distribution(problem))
a = LISTEN
hist = sim(problem, max_steps=10) do o
    # `b` and `a` persist between calls: the belief update needs the previous
    # belief and the action that produced the observation `o`.
    global b, a
    @show o
    # The first call receives `nothing` (no observation yet). Compare by
    # identity (`!==`) rather than with `!=`, which is the idiomatic check
    # against the `nothing` singleton.
    if o !== nothing
        b = update(filter, b, a, o)
    end
    @show b
    # Open a door only once the belief in the matching state exceeds 0.9;
    # otherwise keep listening.
    if b[TL] > 0.9
        a = LEFT
    elseif b[TR] > 0.9
        a = RIGHT
    else
        a = LISTEN
    end
    @show a
    return a
end
@show discounted_reward(hist)

o = nothing
b = POMDPToolbox.DiscreteBelief([0.5,0.5])
a = 3
o = 1
b = POMDPToolbox.DiscreteBelief([0.85,0.15])
a = 3
o = 1
b = POMDPToolbox.DiscreteBelief([0.969799,0.0302013])
a = 1
o = 2
b = POMDPToolbox.DiscreteBelief([0.5,0.5])
a = 3
o = 1
b = POMDPToolbox.DiscreteBelief([0.85,0.15])
a = 3
o = 1
b = POMDPToolbox.DiscreteBelief([0.969799,0.0302013])
a = 1
o = 2
b = POMDPToolbox.DiscreteBelief([0.5,0.5])
a = 3
o = 2
b = POMDPToolbox.DiscreteBelief([0.15,0.85])
a = 3
o = 1
b = POMDPToolbox.DiscreteBelief([0.5,0.5])
a = 3
o = 1
b = POMDPToolbox.DiscreteBelief([0.85,0.15])
a = 3
discounted_reward(hist) = 16.762809375


In [6]:
# List the POMDPs.jl interface methods that solving MyTigerPOMDP with QMDPSolver
# requires, marking each as implemented or missing.
@requirements_info QMDPSolver() MyTigerPOMDP()


INFO: POMDPs.jl requirements for solve(::QMDPSolver, ::POMDP) and dependencies. ([✔] = implemented correctly; [X] = missing)

For solve(::QMDPSolver, ::POMDP):
  [No additional requirements]
For solve(::ValueIterationSolver, ::Union{POMDP,MDP}) (in solve(::QMDPSolver, ::POMDP)):
  [✔] discount(::MyTigerPOMDP)
  [✔] n_states(::MyTigerPOMDP)
  [✔] n_actions(::MyTigerPOMDP)
  [✔] transition(::MyTigerPOMDP, ::Int64, ::Int64)
  [✔] reward(::MyTigerPOMDP, ::Int64, ::Int64, ::Int64)
  [✔] state_index(::MyTigerPOMDP, ::Int64)
  [✔] action_index(::MyTigerPOMDP, ::Int64)
  [X] actions(::MyTigerPOMDP, ::Int64)
For ordered_states(::Union{POMDP,MDP}) (in solve(::ValueIterationSolver, ::Union{POMDP,MDP})):
  [✔] states(::MyTigerPOMDP)
  [✔] iterator(::Array)
For ordered_actions(::Union{POMDP,MDP}) (in solve

LoadError: MethodError: no method matching actions(::MyTigerPOMDP)
Closest candidates are:
  actions{S,A,O,B}(::POMDPs.POMDP{S,A,O}, ::B) at /home/zach/.julia/v0.5/POMDPs/src/space.jl:51
  actions{S,A}(::Union{POMDPs.MDP{S,A},POMDPs.POMDP{S,A,O}}, ::S) at /home/zach/.julia/v0.5/POMDPs/src/space.jl:43
  actions{S,O}(::POMDPs.POMDP{S,Bool,O}) at /home/zach/.julia/v0.5/POMDPToolbox/src/convenience/implementations.jl:6
  ...

In [7]:
"""
    actions(t::MyTigerPOMDP)

Return the action space as a vector, ordered `[LEFT, RIGHT, LISTEN]`.
"""
function actions(t::MyTigerPOMDP)
    return [LEFT, RIGHT, LISTEN]
end

actions (generic function with 10 methods)

In [8]:
# Solve the tiger problem with QMDP, allowing at most 1000 iterations.
solver = QMDPSolver(max_iterations=1000)
problem = MyTigerPOMDP()
# verbose=true prints the residual and timing of each iteration
policy = solve(solver, problem, verbose=true)

Iteration : 1, residual: 14.75, iteration run-time: 8.085e-6, total run-time: 8.085e-6
Iteration : 2, residual: 12.59046875, iteration run-time: 6.472e-6, total run-time: 1.4557e-5
Iteration : 3, residual: 11.564691406249999, iteration run-time: 4.035e-6, total run-time: 1.8592e-5
Iteration : 4, residual: 10.943236428222654, iteration run-time: 3.092e-6, total run-time: 2.1684e-5
Iteration : 5, residual: 10.2558588273941, iteration run-time: 2.857e-6, total run-time: 2.4541e-5
Iteration : 6, residual: 9.587976314837448, iteration run-time: 2.479e-6, total run-time: 2.7020000000000002e-5
Iteration : 7, residual: 8.957886507199987, iteration run-time: 2.614e-6, total run-time: 2.9634e-5
Iteration : 8, residual: 8.367828168991792, iteration run-time: 2.872e-6, total run-time: 3.2506000000000004e-5
Iteration : 9, residual: 7.816304847983972, iteration run-time: 2.775e-6, total run-time: 3.5281e-5
Iteration : 10, residual: 7.301052156282381, iteration run-time: 3.551e-6, total run-time: 3.8

QMDP.QMDPPolicy{MyTigerPOMDP,Int64}([199.986 89.9864 189.986; 89.9868 199.987 189.987],[1,2,3],MyTigerPOMDP(0.85,10.0,100.0))

In [9]:
# Estimate the solved policy's performance: run 100 recorded simulations of at
# most 100 steps each and average their discounted rewards.
hr = HistoryRecorder(max_steps=100)
# Start the accumulator as a Float64 so `rsum` does not change type from Int to
# Float64 on the first `+=` (presumably `discounted_reward` returns a Float64,
# as the value printed by the earlier simulation cell suggests).
rsum = 0.0
for i in 1:100
    hist = simulate(hr, problem, policy, filter)
    rsum += discounted_reward(hist)
end
@show rsum/100

rsum / 100 = 32.30941394248515


32.30941394248515