In [2]:
using ReinforcementLearning
using Flux
using Flux.Losses
using IntervalSets
using StableRNGs
using ControlSystems, FurutaPendulums
using BSON
using Logging
using TensorBoardLogger

In [3]:
"""
    pidEnv <: AbstractEnv

Mutable environment bundling a pendulum object (`furuta`) with the bookkeeping
of a discrete PID controller. The fields are filled positionally by the keyword
constructor `pidEnv(; ...)`, so their order must not change.
"""
mutable struct pidEnv <: AbstractEnv
    # Measurement taken with `measure(furuta)`; the step method reads `state[3]`.
    state
    # Value returned by `RLBase.reward`.
    reward::AbstractFloat
    # Spaces returned by `RLBase.action_space` / `RLBase.state_space`.
    action_space
    state_space
    # Termination flag returned by `RLBase.is_terminated`.
    done
    # Pendulum object handed to `measure`, `control` and `periodic_wait`.
    furuta
    # Elapsed episode time, sample period and episode horizon
    # (`done` is set once `t >= tmax`).
    t
    dt
    tmax
    # Initialised with `time()`; overwritten with the result of `periodic_wait`.
    last_time
    # Controller parameters; overwritten from the action vector on every step,
    # in the order (Beta, K, N, Td, Ti, Tr).
    Beta
    K
    N
    Td
    Ti
    Tr
    # Previous-sample values of the controlled variable `y` and the error `e`.
    yOld
    eOld
    # Current error, integral term, derivative term and clamped control signal.
    e
    I
    D
    u
end

"""
    pidEnv(; max_u, min_u, max_dθ, max_dϕ, dt, tmax)

Build a `pidEnv` around a fresh `SimulatedFurutaPendulum`.

# Keywords
- `max_u`, `min_u`: element-wise upper/lower bounds of the six-component action space.
- `max_dθ`, `max_dϕ`: symmetric bounds used for the 2nd and 4th state-space components.
- `dt`: sample period stored in the environment.
- `tmax`: episode horizon stored in the environment.
"""
function pidEnv(;
    max_u=[3,5,10,10,10,4],
    min_u=[0, 0, 0, 0, 0, 0],
    max_dθ=1e10,
    max_dϕ=1e10,
    dt = 0.01,
    tmax = 5.
)
    # State-space box: components 1 and 3 live in [0, 2π], 2 and 4 are bounded
    # symmetrically by the supplied limits.
    state_low = [0, -max_dθ, 0, -max_dϕ]
    state_high = [2 * pi, max_dθ, 2 * pi, max_dϕ]

    pendulum = SimulatedFurutaPendulum()
    action_box = Space(ClosedInterval{Float64}.(min_u, max_u))
    state_box = Space(ClosedInterval{Float64}.(state_low, state_high))

    # Positional arguments follow the field order of the struct.
    return pidEnv(
        measure(pendulum),  # state
        0.0,                # reward
        action_box,         # action_space
        state_box,          # state_space
        false,              # done
        pendulum,           # furuta
        0.0,                # t
        dt,                 # dt
        tmax,               # tmax
        time(),             # last_time
        1.0,                # Beta
        1.0,                # K
        7.0,                # N
        1.0,                # Td
        0.0,                # Ti
        10.0,               # Tr
        0.0,                # yOld
        0.0,                # eOld
        0.0,                # e
        0.0,                # I
        0.0,                # D
        0.0,                # u
    )
end

# RLBase accessors: each one simply hands back the matching stored field.
function RLBase.action_space(env::pidEnv)
    return env.action_space
end

function RLBase.state_space(env::pidEnv)
    return env.state_space
end

function RLBase.reward(env::pidEnv)
    return env.reward
end
"""
    _reward(env::pidEnv)

Return `-(1 - cos(θ))`, where `θ` is the third value yielded by
`measure(env.furuta)`. The result is 0 when `cos(θ) == 1` and negative otherwise.
"""
function _reward(env::pidEnv)
    # Only the third measurement is needed; the others are discarded.
    _, _, θ, _ = measure(env.furuta)
    return -(1 - cos(θ))
end
# Episode-termination query: reports the stored `done` flag.
function RLBase.is_terminated(env::pidEnv)
    return env.done
end
"""
    RLBase.state(env::pidEnv)

Take a fresh measurement from `env.furuta` and return its first four values as
a vector, in the order they are produced by `measure`.
"""
function RLBase.state(env::pidEnv)
    phi, dphi, theta, dtheta = measure(env.furuta)
    return [phi, dphi, theta, dtheta]
end

# Step method: advance the environment by one sample period `env.dt`.
#
# `a` holds the six controller parameters, in the order
# (Beta, K, N, Td, Ti, Tr), and must lie inside `env.action_space`.
# The controlled variable is `y = cos(env.state[3])` with a fixed setpoint of 1.
# The control law follows the usual discrete PID form with setpoint weighting,
# a filtered derivative and tracking anti-windup:
#
#     P = K * (Beta * r - y),              r = 1
#     D = ad * D + bd * (e - eOld),        ad = Td / (N*dt + Td), bd = K*N*ad
#     v = P + I + D,   u = clamp(v, -5, 5)
#     I = I + (K*dt/Ti) * e + (dt/Tr) * (u - v)
#
# Changes relative to the previous version:
#   * the signature accepts any real-valued vector (`Vector{AbstractFloat}` never
#     matched `Vector{Float64}`/`Vector{Float32}` because of type invariance);
#   * the integral state is written to `env.I` (it was a local, undefined `I`);
#   * the derivative term differences like-with-like (`e - eOld`) instead of
#     mixing `y` with `eOld`;
#   * the proportional term uses the measurement `y` rather than the error;
#   * `env.state` and `env.reward` are refreshed after the plant has advanced;
#   * zero denominators at the edge of the action space no longer produce NaN;
#   * the per-step `@show` is demoted to `@debug`.
#
# The value of the final assignment (`env.e`) is still the method's result.
function (env::pidEnv)(a::AbstractVector{<:Real})
    @assert a in env.action_space
    @debug "pidEnv step" a
    env.Beta, env.K, env.N, env.Td, env.Ti, env.Tr = a

    # Controlled variable and error against the setpoint r = 1.
    y = cos(env.state[3])
    env.e = 1 - y

    # Filtered derivative. N = Td = 0 is inside the action space; treat the
    # resulting 0/0 as "no derivative action".
    den = env.N * env.dt + env.Td
    ad = iszero(den) ? zero(den) : env.Td / den
    bd = env.K * env.N * ad
    env.D = ad * env.D + bd * (env.e - env.eOld)

    # Unsaturated and saturated control signal.
    v = env.K * (env.Beta - y) + env.I + env.D
    env.u = clamp(v, -5, 5)

    # Apply the control signal and wait out the sample period.
    control(env.furuta, env.u)
    env.last_time = periodic_wait(env.furuta, env.t, env.dt)
    env.t += env.dt
    env.done = env.t >= env.tmax

    # Refresh the cached measurement and the reward for the new plant state.
    env.state = measure(env.furuta)
    env.reward = _reward(env)

    # Integral update with tracking anti-windup; Ti == 0 disables integration,
    # Tr == 0 disables the tracking term (avoids Inf * 0).
    if env.Ti == 0
        env.I = 0.0
    else
        tracking = env.Tr == 0 ? zero(v) : (env.dt / env.Tr) * (env.u - v)
        env.I += (env.K * env.dt / env.Ti) * env.e + tracking
    end
    env.yOld = y
    env.eOld = env.e
end

"""
    RLBase.reset!(env::pidEnv)

Start a new episode: replace the pendulum with a fresh
`SimulatedFurutaPendulum`, take an initial measurement, and zero the clock,
the reward and every piece of controller memory.

The derivative term `D`, the error `e` and the control signal `u` are cleared
together with `I`, `yOld` and `eOld`, so no controller state leaks from one
episode into the next. The controller parameters (`Beta`, `K`, `N`, `Td`,
`Ti`, `Tr`) are left untouched; they are overwritten by the next action.
"""
function RLBase.reset!(env::pidEnv)
    env.last_time = time()
    env.reward = 0.
    env.t = 0.
    env.furuta = SimulatedFurutaPendulum()
    env.state = measure(env.furuta)
    # Controller memory.
    env.I = 0.0
    env.D = 0.0
    env.e = 0.0
    env.u = 0.0
    env.yOld = 0.0
    env.eOld = 0.0
    env.done = false
end

In [42]:
"""
    PIDNeuralNet(chain)

Thin wrapper around a Flux `Chain`. Calling the wrapper evaluates the chain on
the input, multiplies the result element-wise with that same input and sums
over the first dimension.
"""
struct PIDNeuralNet
    chain::Chain
end

# Forward pass: the chain output acts as per-element weights on `x`, so the
# chain's output must be broadcast-compatible with `x`.
function (net::PIDNeuralNet)(x)
    weights = net.chain(x)
    return sum(weights .* x; dims=1)
end

# Register the wrapper with Flux so its fields are traversed.
Flux.@functor PIDNeuralNet

# _init = glorot_uniform(StableRNG(123))
"""
    create_actor()

Build a `PIDNeuralNet` wrapping a two-layer chain (5 → 30 with `relu`,
30 → 5 with `tanh`) and move it to the CPU.
"""
function create_actor()
    layers = (
        Dense(5, 30, relu),
        # An extra hidden layer, Dense(30, 30, relu; init = init), is left disabled.
        Dense(30, 5, tanh),
    )
    return cpu(PIDNeuralNet(Chain(layers...)))
end

# Smoke test: push a 5×2 batch (one sample per column) through a fresh actor.
model = create_actor()
batch = adjoint([1 1 1 1 1; 1 2 1 2 1])
model(batch)

PIDNeuralNet(Chain(Dense(5, 30, relu), Dense(30, 6, tanh)), pidEnv([6.010434288348817, 0.0, 3.141592653589793, 0.0], 0.0, -5.0..5.0, Space{Vector{ClosedInterval{Float64}}}(ClosedInterval{Float64}[0.0..6.283185307179586, -1.0e10..1.0e10, 0.0..6.283185307179586, -1.0e10..1.0e10]), false, SimulatedFurutaPendulum{Float64, Random._GLOBAL_RNG}(), 0.0, 0.01, 5.0, 1.685572934208e9, 1.0, 1.0, 7.0, 1.0, 0.0, 10.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0))

In [43]:
"""
    RL.Experiment(::Val{:JuliaRL}, ::Val{:TD3}, ::Val{:Pendulum}, ::Nothing; seed = 123)

Assemble a TD3 experiment on a `pidEnv`, reachable through
``E`JuliaRL_TD3_Pendulum` ``.

Actor and critic networks are sized from the environment: `ns` is the length
of `state(env)` and `na` the length of the action space. Runs for five episodes
with a `TotalRewardPerEpisode` hook.

# Keywords
- `seed`: seed for the `StableRNG` shared by weight initialisation, the
  warm-up random policy and the TD3 policy.
"""
function RL.Experiment(
    ::Val{:JuliaRL},
    ::Val{:TD3},
    ::Val{:Pendulum},
    ::Nothing;
    seed = 123,
)
    rng = StableRNG(seed)
    env = pidEnv()
    A = action_space(env)
    ns = length(state(env))
    init = glorot_uniform(rng)
    na = length(A)

    create_actor() = Chain(
        Dense(ns, 32, relu; init = init),
        Dense(32, 32, relu; init = init),
        Dense(32, na, tanh; init = init),
    ) |> cpu

    create_critic_model() = Chain(
        Dense(ns + na, 32, relu; init = init),
        Dense(32, 32, relu; init = init),
        Dense(32, 1; init = init),
    ) |> cpu

    create_critic() = TD3Critic(create_critic_model(), create_critic_model())

    # NOTE(review): the BoundsError recorded for this cell (`index []` on a
    # 6-element vector) suggests the installed TD3Policy may expect a scalar
    # action; confirm that it supports an `na`-dimensional action space.
    agent = Agent(
        policy = TD3Policy(
            behavior_actor = NeuralNetworkApproximator(
                model = create_actor(),
                optimizer = ADAM(),
            ),
            behavior_critic = NeuralNetworkApproximator(
                model = create_critic(),
                optimizer = ADAM(),
            ),
            target_actor = NeuralNetworkApproximator(
                model = create_actor(),
                optimizer = ADAM(),
            ),
            target_critic = NeuralNetworkApproximator(
                model = create_critic(),
                optimizer = ADAM(),
            ),
            γ = 0.9f0,
            ρ = 0.99f0,
            batch_size = 64,
            start_steps = 1000,
            # Warm-up actions are sampled from the environment's own action
            # space (the former `{...}..{...}` brace syntax is not valid Julia).
            start_policy = RandomPolicy(A; rng = rng),
            update_after = 1000,
            update_freq = 1,
            policy_freq = 2,
            target_act_limit = 5.0,
            target_act_noise = 0.1,
            act_limit = 5.0,
            act_noise = 0.1,
            rng = rng,
        ),
        trajectory = CircularArraySARTTrajectory(
            capacity = 1_500_000,
            state = Vector{Float32} => (ns,),
            # Concrete element type, matching the state buffer; an abstract
            # eltype such as `AbstractFloat` would box every stored value.
            action = Vector{Float32} => (na,),
        ),
    )

    # Alternative: StopAfterStep(10000, is_show_progress=!haskey(ENV, "CI"))
    stop_condition = StopAfterEpisode(5; cur = 0, is_show_progress = true)
    hook = TotalRewardPerEpisode()
    return Experiment(agent, env, stop_condition, hook, "# Play Pendulum with TD3")
end

using Plots
# Build the experiment, run it, and plot the rewards gathered by its hook.
# The plot reads from `pid` (the experiment that was just run); the previous
# `ex` is not defined in this cell sequence.
pid = E`JuliaRL_TD3_Pendulum`
run(pid)
plot(pid.hook.rewards)

# Play Pendulum with TD3


LoadError: BoundsError: attempt to access 6-element Vector{Float64} at index []

In [None]:
# NOTE(review): stray scratch-cell content; `dd` is not defined anywhere in the
# visible source and the cell was never executed. Candidate for removal.
dd

In [9]:
# Drive a fresh `pidEnv` with `RandomPolicy()` for ten episodes, using a
# `TotalRewardPerEpisode` hook. (`ex.policy` and `model` were earlier
# candidates for the policy argument.)
run(RandomPolicy(), pidEnv(), StopAfterEpisode(10), TotalRewardPerEpisode())

LoadError: method not implemented