## Belief State Reinforcement Learning

In [1]:
using Random
using Printf
using StaticArrays
using Flux
using FileIO
using BSON
using BSON: @load
using ProgressMeter
using POMDPs
using POMDPModelTools
using POMDPSimulators
using POMDPPolicies
using BeliefUpdaters
using DeepRL
using DeepQLearning
using LocalApproximationValueIteration
using DiscreteValueIteration
using AutomotiveDrivingModels
using AutoViz
using AutomotivePOMDPs
using MDPModelChecking
using PedCar
using AutomotiveSensors
using Reel

loaded


┌ Info: Recompiling stale cache file /mnt/c/Users/Maxime/wsl/.julia/compiled/v1.0/MDPModelChecking/XrC4q.ji for MDPModelChecking [abefb91b-a28c-5ab9-9bd9-026e532d7b0e]
└ @ Base loading.jl:1184
┌ Info: Recompiling stale cache file /mnt/c/Users/Maxime/wsl/.julia/compiled/v1.0/PedCar/NmDDZ.ji for PedCar [90cf7f26-d5c7-593d-a0e1-4a8367407571]
└ @ Base loading.jl:1184
│ - If you have PedCar checked out for development and have
│   added AutomotivePOMDPs as a dependency but haven't updated your primary
│   environment's manifest file, try `Pkg.resolve()`.
│ - Otherwise you may need to report an issue with PedCar


In [13]:
include("masking.jl")
include("util.jl")
include("masked_dqn.jl")
include("render_helpers.jl")
include("qmdp_approximation.jl")

In [3]:
include("training_scripts/RNNFiltering/RNNFiltering.jl")
using Main.RNNFiltering

In [4]:
# Random number generator seeded with 1; it is passed explicitly to the
# state/observation sampling and to the simulators in the cells below.
rng = MersenneTwister(1)
# Camera object for rendering.
# NOTE(review): the global `cam` is not referenced again in the visible cells
# (the rendering helpers receive their own `cam` keyword) — confirm it is needed.
cam = FitToContentCamera(0.);

## Environment

In [36]:
# PedCarMDP configured with pos_res/vel_res of 2.0 and ped_birth/car_birth of 0.7.
# Its environment (`mdp.env`) is shared with the POMDP built below, and the MDP
# itself is later handed to the safety mask and to the RNN belief updater.
mdp = PedCarMDP(pos_res=2.0, vel_res=2., ped_birth=0.7, car_birth=0.7)
# Simulation POMDP built on the same environment object. It starts with a
# PerfectSensor; later cells overwrite `pomdp.sensor` with the GaussianSensor
# configuration that is shown commented out here.
pomdp = UrbanPOMDP(env=mdp.env,
                   sensor = PerfectSensor(),
#                     sensor = GaussianSensor(false_positive_rate=0.0, 
#                                             pos_noise = LinearNoise(min_noise=0.5, increase_rate=0.05), 
#                                             vel_noise = LinearNoise(min_noise=0.5, increase_rate=0.05)),
                   ego_goal = LaneTag(2, 1),
                   obs_dist = ObstacleDistribution(mdp.env, 
                                                   upper_obs_pres_prob=0., 
                                                   left_obs_pres_prob=1.0, 
                                                   right_obs_pres_prob=1.0),
                   max_cars=1, 
                   max_peds=1, 
                   car_birth=0.1, 
                   ped_birth=0.1, 
                   max_obstacles=1, # NOTE(review): previously annotated "no fixed obstacles" although the value is 1 — confirm intent
                   lidar=false,
                   ego_start=20,
                   ΔT=0.1);

# Instantiate sub-problems as independent deep copies of `pomdp`:
# `dqn_pomdp` (max_obstacles = 0) is the problem handed to the DQN policy and to
# PerfectSensorUpdater in later cells.
dqn_pomdp = deepcopy(pomdp)
dqn_pomdp.max_obstacles = 0

# `rnn_pomdp` (max_obstacles = 1) is the problem handed to PedCarRNNUpdater.
rnn_pomdp = deepcopy(pomdp)
rnn_pomdp.max_obstacles = 1

1

## Load DRQN Policy

In [6]:
# Threshold given to the safety mask.
# NOTE(review): presumably a probability threshold on action safety — confirm in masking.jl.
threshold = 0.99
# BSON's @load binds the stored entries `qmat`, `util` and `pol` as variables in this scope.
@load "pc_processed.bson" qmat util pol
# Wrap the loaded data in a ValueIterationPolicy defined on `mdp`, then build the
# mask from that policy and the threshold.
safe_policy = ValueIterationPolicy(mdp, qmat, util, pol)
mask = SafetyMask(mdp, safe_policy, threshold);

In [7]:
# Paths of the saved problem description and network weights (run "log8").
problem_file="training_scripts/drqn-log/log8/problem.bson"
weights_file="training_scripts/drqn-log/log8/weights.bson"
# The policy is restored against an environment wrapping `dqn_pomdp`
# (the copy of the problem with max_obstacles = 0).
env_ = POMDPEnvironment(dqn_pomdp)
dqn_policy = DeepQLearning.restore(env_, problem_file=problem_file, weights_file=weights_file)
# Combine the restored network policy with the safety mask; this `policy` is the
# one simulated and evaluated in the cells below.
policy = MaskedNNPolicy(dqn_pomdp, dqn_policy, mask);

2018-10-11 09:58:05.754442: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA


## Load RNN Belief Updater

In [15]:
# Size of the RNN ensemble; also read as a global by later cells when they
# allocate the initial belief's prediction vector.
n_models = 5
# Load model i from "model_<i>0.bson" (model_10.bson, model_20.bson, ...) and
# collect the `:model` entries into a Vector{Chain}.
models = Chain[
    BSON.load("training_scripts/RNNFiltering/model_$(i)0.bson")[:model]
    for i in 1:n_models
]

## Visualize Policy

In [16]:
include("qmdp_approximation.jl")

**Simulation with RNN Belief Updater**

In [39]:
# Replace the sensor of the simulation POMDP in place with a GaussianSensor that
# uses LinearNoise on position and velocity and a false-positive rate of 0.
# `dqn_pomdp` and `rnn_pomdp` were deep-copied from `pomdp` in the environment
# cell, so this assignment does not touch their sensor.
pomdp.sensor = GaussianSensor(false_positive_rate=0.0, 
                            pos_noise = LinearNoise(min_noise=0.5, increase_rate=0.05), 
                            vel_noise = LinearNoise(min_noise=0.5, increase_rate=0.05));

In [46]:
# Single episode with the RNN ensemble as belief updater.
up = PedCarRNNUpdater(models, mdp, rnn_pomdp)
# Reset the updater and the policy's hidden state before starting the episode.
reset_updater!(up)
DeepQLearning.reset_hidden_state!(policy)
# Sample an initial state and generate a first observation from it, using
# UrbanAction(1.0) as the action argument.
s0 = initialstate(pomdp, rng)
a0 = UrbanAction(1.0)
o0 = generate_o(pomdp, s0, a0, s0, rng)
# The initial belief is obtained by running one update on a PedCarRNNBelief whose
# prediction vector is allocated with `undef` (n_models entries) and whose
# observation is o0.
b0 = update(up, PedCarRNNBelief(Vector{Vector{Float64}}(undef, n_models), o0), a0, o0);
# NOTE(review): `singleaction_policy` is defined but not used in this cell.
singleaction_policy = FunctionPolicy(s -> UrbanAction(0.))
# Record the episode history for at most 400 steps and time the simulation.
hr = HistoryRecorder(rng=rng, max_steps=400)
@time hist = simulate(hr, pomdp, policy, up, b0, s0);

  2.096438 seconds (4.52 M allocations: 328.362 MiB, 9.06% gc time)


In [43]:
# visualize the simulation
state_history = state_hist(hist)
action_history = action_hist(hist)
safe_actions_hist = ainfo_hist(hist)
observation_history = observation_hist(hist)
insert!(observation_history, 1, o0)
belief_history = belief_hist(hist)
push!(action_history, UrbanAction(NaN))
push!(safe_actions_hist, UrbanAction[])
duration, fps, render_hist = animate_hist(pomdp, state_history, observation_history, belief_history, action_history, safe_actions_hist)
speed_factor = 1
film = roll(render_hist, fps = speed_factor*fps, duration = duration/speed_factor)

In [47]:
"""
    evaluation_loop(pomdp, policy, up; n_ep=1000, max_steps=500, rng=Base.GLOBAL_RNG)

Simulate `n_ep` episodes of `policy` on `pomdp` using the RNN belief updater `up`
and collect per-episode statistics.

Before every episode the updater and the policy's hidden state are reset, an
initial state is sampled, an initial observation is generated from it, and the
initial belief is obtained by one `update` call on a `PedCarRNNBelief` with an
uninitialized prediction vector.

# Keywords
- `n_ep`: number of episodes to simulate.
- `max_steps`: maximum number of steps per episode, forwarded to `HistoryRecorder`.
  (This keyword used to be ignored in favour of a hard-coded limit of 400.)
- `rng`: random number generator used for sampling and by the recorder.

# Returns
A tuple `(rewards, steps, violations)` of `Float64` vectors of length `n_ep`:
the discounted reward, the number of steps, and the result of `is_crash` on the
final recorded state of each episode.

NOTE(review): the size of the initial prediction vector is read from the global
`n_models`; it is assumed to match the ensemble inside `up` — confirm.
"""
function evaluation_loop(pomdp::UrbanPOMDP, policy::Policy, up::PedCarRNNUpdater; n_ep::Int64 = 1000, max_steps::Int64 = 500, rng::AbstractRNG = Base.GLOBAL_RNG)
    rewards = zeros(n_ep)
    steps = zeros(n_ep)
    violations = zeros(n_ep)
    @showprogress for ep=1:n_ep
        # Start every episode from a clean updater / recurrent policy state.
        reset_updater!(up)
        DeepQLearning.reset_hidden_state!(policy)
        s0 = initialstate(pomdp, rng)
        a0 = UrbanAction(1.0)
        o0 = generate_o(pomdp, s0, a0, s0, rng)
        b0 = update(up, PedCarRNNBelief(Vector{Vector{Float64}}(undef, n_models), o0), a0, o0)
        # Honour the caller's step limit instead of a fixed 400.
        hr = HistoryRecorder(rng=rng, max_steps=max_steps)
        hist = simulate(hr, pomdp, policy, up, b0, s0)
        rewards[ep] = discounted_reward(hist)
        steps[ep] = n_steps(hist)
        # A violation is counted when the final recorded state is a crash.
        violations[ep] = is_crash(hist.state_hist[end])
    end
    return rewards, steps, violations
end

evaluation_loop (generic function with 3 methods)

In [48]:
# Evaluate the masked policy with a freshly constructed RNN belief updater over
# 1000 episodes, timing the whole run.
up = PedCarRNNUpdater(models, mdp, rnn_pomdp)
@time rewards_mask, steps_mask, violations_mask = evaluation_loop(pomdp, policy, up, n_ep=1000, max_steps=400, rng=rng);
# NOTE(review): `print_summary` is not defined in the visible cells — presumably
# it comes from one of the included helper files.
print_summary(rewards_mask, steps_mask, violations_mask)

[32mProgress: 100%|█████████████████████████████████████████|  ETA: 0:00:01[39m

1046.236362 seconds (2.40 G allocations: 172.741 GiB, 12.58% gc time)
Summary for 

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:17:26[39m


1000 episodes: 
Average reward: 0.018 
Average # of steps: 86.592 
Average # of violations: 7.800 


**Simulation with perfect observation**

In [90]:
# Switch the simulation POMDP back to the PerfectSensor for the cells below.
pomdp.sensor = PerfectSensor();

PerfectSensor()

In [91]:
# Single episode using PerfectSensorUpdater (built on `dqn_pomdp`) in place of the
# RNN belief updater.
up = PerfectSensorUpdater(dqn_pomdp)
DeepQLearning.reset_hidden_state!(policy)
s0 = initialstate(pomdp, rng)
a0 = UrbanAction(1.0)
o0 = generate_o(pomdp, s0, a0, s0, rng)
# The initial "belief" is produced by one update call with the initial
# observation o0 passed both as the previous belief and as the observation.
b0 = update(up, o0, a0, o0)
# NOTE(review): `singleaction_policy` is defined but not used in this cell.
singleaction_policy = FunctionPolicy(s -> UrbanAction(0.))
# Record the episode history for at most 400 steps and time the simulation.
hr = HistoryRecorder(rng=rng, max_steps=400)
@time hist = simulate(hr, pomdp, policy, up, b0, s0);

  0.191908 seconds (857.43 k allocations: 33.055 MiB, 14.02% gc time)


In [93]:
# Extract the per-step histories of the perfect-observation episode for rendering.
state_history = state_hist(hist)
action_history = action_hist(hist)
safe_actions_hist = ainfo_hist(hist)
observation_history = observation_hist(hist)
# Put the initial observation o0 at the front of the observation history.
insert!(observation_history, 1, o0)
# NOTE(review): `belief_history` is computed but not passed to animate_hist below.
belief_history = belief_hist(hist)
# Append a placeholder action (NaN acceleration) and an empty safe-action list.
push!(action_history, UrbanAction(NaN))
push!(safe_actions_hist, UrbanAction[])
# Five positional arguments (no beliefs): this calls a different animate_hist
# method from the belief-rendering one defined at the end of this notebook.
duration, fps, render_hist = animate_hist(pomdp, state_history, observation_history, action_history, safe_actions_hist)
# speed_factor scales fps up and duration down by the same factor.
speed_factor = 1
film = roll(render_hist, fps = speed_factor*fps, duration = duration/speed_factor)

**Simulation with multiple cars and pedestrians**

In [109]:
# Raise the limits on cars and pedestrians to 5 each. This mutates `pomdp` in
# place, so the change persists for any cell executed afterwards.
pomdp.max_cars = 5
pomdp.max_peds = 5
# Policy that returns UrbanAction(0.) whatever its input.
still_policy = FunctionPolicy(s -> UrbanAction(0.))
up = NothingUpdater()
s0 = initialstate(pomdp, rng)
hr = HistoryRecorder(rng=rng, max_steps=400)
# NOTE(review): `b0` is not defined in this cell; it is whatever an earlier cell
# left in the global scope. Presumably harmless with NothingUpdater and a policy
# that ignores its input — confirm.
@time hist = simulate(hr, pomdp, still_policy, up, b0, s0);

  0.868439 seconds (3.82 M allocations: 112.693 MiB, 5.61% gc time)


In [110]:
# Render the recorded states only, using the POMDP's own time step as sim_dt and
# a StaticCamera.
duration, fps, render_rec = animate_scenes(hist.state_hist, pomdp.env, sim_dt=pomdp.ΔT, cam = StaticCamera(VecE2(0., -8.), 14.0))
# speed_factor scales fps up and duration down by the same factor.
speed_factor = 1
film = roll(render_rec, fps = speed_factor*fps, duration = duration/speed_factor)

In [103]:
# Number of recorded states in the last history (the saved output shows 401 for
# a recorder created with max_steps=400).
length(hist.state_hist)

401

In [77]:
# Inspect the safety mask on the first recorded step.
# NOTE(review): `s` is assigned but not used in this cell.
s = state_history[1]
o = observation_history[1]
# The saved output shows a 4-element Float64 vector.
# NOTE(review): presumably one value per available action — confirm in masking.jl.
p_sa = compute_probas(pomdp, policy.mask, o)

4-element Array{Float64,1}:
 0.9999913828130208
 0.9999913828130208
 0.9999913828130208
 0.9922843380022387

In [35]:
"""
    AutomotivePOMDPs.animate_hist(pomdp, scenes, observations, beliefs, actions, safe_actions;
                                  sim_dt=0.1, cam=StaticCamera(VecE2(0., -8.), 14.0))

Build a frame-rendering closure for a recorded episode that carries RNN beliefs.

Frame `i` is drawn from `scenes[i]`, `observations[i]`, `beliefs[i]`, `actions[i]`
and `safe_actions[i]`, so each of these vectors must hold at least
`length(scenes)` entries.

Each frame shows the scene with an ID overlay, the non-ego entities of the
observation, the occlusion overlay for the obstacles of `pomdp.env`, one overlay
per belief prediction, two histograms with the car / pedestrian presence values
averaged over the predictions, and text overlays for the ego velocity, the
chosen acceleration, the available actions and the step index.

# Keywords
- `sim_dt`: time step between two consecutive scenes; the default is the fixed
  value 0.1, not `pomdp.ΔT`.
- `cam`: camera forwarded to `AutoViz.render`.

# Returns
`(duration, fps, render_rec)` where `duration = length(scenes) * sim_dt`,
`fps = round(Int, 1 / sim_dt)` and `render_rec(t, dt)` renders the frame with
index `floor(t / dt) + 1`.
"""
function AutomotivePOMDPs.animate_hist(pomdp::UrbanPOMDP, 
                                         scenes::Vector{Scene}, 
                                         observations::Vector{Vector{Float64}}, 
                                         beliefs::Vector{PedCarRNNBelief}, 
                                         actions::Vector{UrbanAction},
                                         safe_actions::Vector{Any};
                                         sim_dt = 0.1,
                                         cam = StaticCamera(VecE2(0., -8.), 14.0))
    env = pomdp.env
    duration = length(scenes)*sim_dt
    # round(Int, ...) instead of Int(...): Int throws InexactError as soon as
    # 1/sim_dt is not an exact integer.
    fps = round(Int, 1/sim_dt)
    x_text = env.params.x_min + 3.
    function render_rec(t, dt)
        frame_index = floor(Int, t/dt) + 1
        scene = scenes[frame_index]
        belief = beliefs[frame_index]
        overlays = SceneOverlay[IDOverlay()]

        # Observed (non-ego) entities.
        obs = [veh for veh in obs_to_scene(pomdp, observations[frame_index]) if veh.id != EGO_ID]
        push!(overlays, GaussianSensorOverlay(sensor=pomdp.sensor, o=obs, color=MONOKAI["color2"]))

        # Obstacles come from the environment of the `pomdp` argument rather than
        # from a global `mdp`, so the function no longer depends on notebook state.
        push!(overlays, OcclusionOverlay(obstacles=env.obstacles))

        # One overlay per prediction, plus the mean presence values.
        predictions = belief.predictions
        cp, pp = 0., 0.
        for pred in predictions
            bb, car_pres, ped_pres = process_prediction(pomdp, pred, belief.obs)
            cp += car_pres
            pp += ped_pres
            bel = [veh for veh in obs_to_scene(pomdp, bb) if veh.id != EGO_ID]
            push!(overlays, GaussianSensorOverlay(sensor=pomdp.sensor, o=bel, color=MONOKAI["color4"]))
        end
        n_pred = length(predictions)
        cp /= n_pred
        pp /= n_pred
        push!(overlays, HistogramOverlay(pos=VecE2(-15., -20.), val=cp, label="car"))
        push!(overlays, HistogramOverlay(pos=VecE2(-12., -20.), val=pp, label="ped"))
        push!(overlays, TextOverlay(text=["Probability of presence"], pos=VecSE2(-17,-14.), font_size=15, incameraframe=true))

        # Textual status lines.
        push!(overlays, TextOverlay(text = ["v: $(get_ego(scene).state.v)"], font_size=20, 
                                    pos=VecE2(x_text,6.), incameraframe=true))
        push!(overlays, TextOverlay(text = ["Acc: $(actions[frame_index].acc)"], font_size=20,
                                    pos=VecE2(x_text,8.), incameraframe=true))
        push!(overlays, TextOverlay(text = ["Available Actions: $([a.acc for a in safe_actions[frame_index]])"], font_size=20,
                                    pos=VecE2(x_text,10.), incameraframe=true))
        push!(overlays, TextOverlay(text = ["step: $frame_index"], font_size=20,
                                    pos=VecE2(x_text,4.), incameraframe=true))

        return AutoViz.render(scene, env, overlays, cam=cam)
    end
    return duration, fps, render_rec
end
    