In [None]:
import Base: +, *
using Flux
import Flux: params
using Flux.Optimise
using Flux: crossentropy, ADAM
using LinearAlgebra
using Plots
import ProgressMeter
import ProgressMeter: @showprogress
PM = ProgressMeter
PM.ijulia_behavior(:clear)
using Statistics
import Random: seed!
using CUDA

In [None]:
"""
    stage_reward(x; sp, ap)

Return the reward earned in state `x` for one step, scaled by the global `Δt`.

The reward is `-x[3]*ap` minus `x[4]` times a switch factor, which is `1` while
`x[3] < π/2` and `-sp` otherwise. Edit the formula to change the task.

# Keywords
- `ap`: weight applied to `x[3]`.
- `sp`: weight applied (negated) to `x[4]` once `x[3]` reaches `π/2`.
"""
function stage_reward(x; sp, ap)
    switch = ifelse(x[3] < π/2, 1, -sp)
    return Δt * (-x[3] * ap - x[4] * switch)
end

In [None]:
"""
    Total_reward(x_init, n_steps, cp, sp)

Roll the global `policy` forward through `sim` for `n_steps` steps starting at
`x_init`, and return the sum of the stage rewards of the visited states.

# Arguments
- `x_init`: initial state passed to `policy` and `sim`.
- `n_steps`: number of simulation steps.
- `cp`: forwarded to `stage_reward` as its `ap` keyword.
- `sp`: forwarded to `stage_reward` as its `sp` keyword.

# Returns
The accumulated reward; the integer `0` if no step is taken.
"""
function Total_reward(x_init, n_steps, cp, sp)
    x = x_init
    total = 0
    for _ in 1:n_steps
        x = sim(x, policy(x))
        # `stage_reward` only declares the keywords `sp` and `ap`; the previous
        # `cp=cp` call could not be dispatched.
        # NOTE(review): assumes `cp` is the weight `stage_reward` calls `ap` — confirm.
        total += stage_reward(x; ap=cp, sp=sp)
    end
    return total
end

In [None]:
"""
    play(p::Net; x_init, n_steps, cp, sp, ϵ)

Simulate `n_steps` steps from `x_init` with an ϵ-greedy version of policy `p` and
return the recorded transitions.

At every step the action is `p(x)` when `rand() > ϵ`; otherwise it is a
one-element vector drawn uniformly from `[-1, 4)`.

# Keywords
- `x_init`: initial state.
- `n_steps`: number of simulation steps.
- `cp`: forwarded to `stage_reward` as its `ap` keyword.
- `sp`: forwarded to `stage_reward` as its `sp` keyword.
- `ϵ`: threshold for taking the random exploratory action.

# Returns
A vector of named tuples `(x, u, x₊, sc)`: state, action, successor state and the
stage reward of the successor state.
"""
function play(p :: Net; x_init, n_steps, cp, sp, ϵ)
    # Named `transitions` rather than `log` so it does not shadow `Base.log` or
    # the global `log` used by other cells.
    transitions = []
    x = x_init
    for _ in 1:n_steps
        u = rand() > ϵ ? p(x) : [5*rand() - 1.0]
        x₊ = sim(x, u)
        # `stage_reward` only declares the keywords `sp` and `ap`; the previous
        # `cp=cp` call could not be dispatched.
        # NOTE(review): assumes `cp` is the weight `stage_reward` calls `ap` — confirm.
        sc = stage_reward(x₊; ap=cp, sp=sp)
        push!(transitions, (x=x, u=u, x₊=x₊, sc=sc))
        x = x₊
    end
    return transitions
end

In [None]:
"""
    run(p::Net; x_init, n_steps)

Apply policy `p` through `sim` for `n_steps` steps starting at `x_init` and return
the visited states in order. The initial state itself is not included.
"""
function run(p :: Net; x_init, n_steps)
    trajectory = []
    state = x_init
    for _ in 1:n_steps
        state = sim(state, p(state))
        push!(trajectory, state)
    end
    return trajectory
end

In [6]:
# Draw a random action, differentiate `Q_function` with respect to it, and return
# the action.
#
# NOTE(review): the argument `u` is overwritten on the first line, so the value
# supplied by the caller is never used.
# NOTE(review): `x` is neither a parameter nor a local; it is looked up as a global
# when the closure runs.
# NOTE(review): `grad` is computed and then discarded; the function returns the
# random action, not a gradient — confirm the intended return value.
function policy_gradient(u)
    # NOTE(review): the original comment called this "a simple linear policy", but
    # the action is a plain `rand()` draw.
    u = rand()
    # Gradient of the Q estimate with respect to the action (result unused).
    grad = gradient(u -> Q_function(x,u), u)
    return u
end

# Overwrite columns 1-2 and column 4 of every row of the global `log` with fixed
# values.
# NOTE(review): `log` is not defined in the visible part of the notebook. The 4×2
# and 4-element right-hand sides presumably require it to be an array with four
# rows and at least four columns — confirm where it is created.
log[1:end, 1:2] = [200 201; 200 201; 200 201; 200 201]
log[1:end, 4] = [4, 4, 4, 4]


4-element Vector{Int64}:
 4
 4
 4
 4

In [None]:
"""
    Q_function(x, u)

Placeholder for the neural-network estimate of the Q value of the state-action
pair `(x, u)`.

The current body ignores `x` and `u`. It slices columns 1-2 (inputs) and column 4
(targets) out of the global `log` and returns the targets.
"""
function Q_function(x, u)
    # x,u -> Neural network -> estimated Q value
    inputs = log[:, 1:2]
    targets = log[:, 4]
    return targets
end

"""
    Q_learning(x, u, steps)

Perform `steps` tabular Q-learning updates on the global table `q_values`,
starting from state `x` and action `u`, and append one record per update to the
global `log`.

`x`, `u` and the successor state are rounded to integers so they can be used as
table indices. The globals `α` and `γ` weight the update. Returns `nothing`.
"""
function Q_learning(x, u, steps)
    for _ in 1:steps
        # NOTE(review): other callers in this notebook use `sim` as returning only
        # the next state — confirm that it also yields a reward here.
        x₊, reward = sim(x, u)
        # Array indices must be integers; plain `round` returns a float.
        x, u = round(Int, x), round(Int, u)
        # Best value over all actions of the successor state (row slice of the
        # table); replaces the NumPy-style `np.max(q_values[...])`.
        best_next = maximum(q_values[round(Int, x₊), :])
        # NOTE(review): `γ` acts as the step size and `α` as the discount here,
        # the reverse of the usual Q-learning notation and of how `γ` is used in
        # `RL_MODEL` — confirm which is intended.
        q_values[x, u] += γ * (reward + α * best_next - q_values[x, u])
        # NOTE(review): other cells index `log` as a matrix, which `push!` does
        # not support — confirm the container type.
        push!(log, [x u x₊ q_values[x, u]])
        # NOTE(review): neither `x` nor `u` is advanced to the successor state, so
        # every iteration starts from the same (rounded) pair.
    end
    return nothing
end

In [None]:
"""
    RL_MODEL(x_init, n_steps, critic_cost_bound, n_iters, replay_buffer_size,
             n_samples, p_layers, q_layers, γ, ap, sp)

Example RL algorithm assembled from the functions above (intended as DDPG): train
an actor `p` and a critic `q`, then return the total reward of one rollout of the
trained actor.

# Arguments
- `x_init`: initial state for pre-training and for the final rollout.
- `n_steps`: length of the final rollout.
- `critic_cost_bound`: training stops once the critic cost reaches this value.
- `n_iters`: maximum number of training iterations.
- `replay_buffer_size`: currently unused.
- `n_samples`: batch size drawn from the replay buffer.
- `p_layers`, `q_layers`: layers wrapped by `Net` for the actor and the critic.
- `γ`: factor applied to the target critic's value in the Q target.
- `ap`, `sp`: reward weights handed to `play`.

# Returns
The sum of the stage rewards (`sc`) over the final rollout.
"""
function RL_MODEL(x_init, n_steps, critic_cost_bound, n_iters, replay_buffer_size, n_samples, p_layers, q_layers, γ, ap, sp)
    # Pre-training
    # NOTE(review): `u_init` is not a parameter; it has to exist as a global.
    Q_learning(x_init, u_init, 500)
    # Initialize the replay buffer
    # NOTE(review): the buffer is never filled, so `sample` below has nothing to
    # draw from — confirm where transitions should be stored.
    replay_buffer = []
    # Initialize the critic and actor networks
    p = Net(p_layers)
    q = Net(q_layers)
    # Initialize the target networks
    p̂ = deepcopy(p)
    q̂ = deepcopy(q)
    # Initialize the optimizer
    opt_p = ADAM(0.001)
    opt_q = ADAM(0.001)
    # Initialize the critic cost
    critic_cost = 0
    # Initialize the iteration counter
    k = 0
    # Actor training
    while critic_cost < critic_cost_bound && k < n_iters
        # Sample a batch of transitions from the replay buffer
        # NOTE(review): `sample` is not imported in the visible part of the file.
        batch = sample(replay_buffer, n_samples)
        # Calculate the target Q value
        q_target = batch[1:end,4] + γ * q̂(batch[1:end,3])
        # Update the critic network
        # NOTE(review): cross-entropy is used as the critic loss on Q targets —
        # confirm this is intended rather than a regression loss.
        gs = gradient(() -> crossentropy(q(batch[1:end,1:2]), q_target), params(q))
        update!(opt_q, params(q), gs)
        # Update the actor network
        gs = gradient(() -> -q(p(batch[1:end,1])), params(p))
        update!(opt_p, params(p), gs)
        # Update the target networks (full copy on every iteration)
        p̂ = deepcopy(p)
        q̂ = deepcopy(q)
        # Update the critic cost
        critic_cost = crossentropy(q(batch[1:end,1:2]), q_target)
        # Update the iteration counter
        k += 1
    end
    # Play the game. `play` declares the keyword `cp`, not `ap`, so the weight is
    # passed under that name; the previous `ap` keyword could not be dispatched.
    rollout = play(p; x_init, n_steps, cp=ap, sp, ϵ=0.0)
    # Calculate the total reward
    return sum(step.sc for step in rollout)
end

In [None]:
# Initial conditions and hyperparameters. Most names match parameters of
# `RL_MODEL`.

# Initial state with four components.
x_init = [0.0, 0.0, 0.0, 0.0]
# Critic and actor layer stacks; each ends in `softmax`.
# NOTE(review): both stacks take 2 inputs (`Dense(2, 4, ...)`) while `x_init` has
# 4 entries — confirm how `Net` feeds states into these chains.
q_layers = Chain(Dense(2, 4, relu), Dense(4, 2, relu), Dense(2, 2), softmax)
p_layers = Chain(Dense(2, 4, relu), Dense(4, 2, relu), Dense(2, 2), softmax)
# Maximum number of training iterations.
n_iters = 40
# Number of simulation steps per rollout.
n_steps = 1000
# NOTE(review): not referenced by any code in the visible part of the notebook.
n_critic_pre_steps = 100
# Batch size drawn from the replay buffer.
n_samples = 80
# NOTE(review): `RL_MODEL` accepts this value but never uses it.
replay_buffer_size = 100_000
# Training stops once the critic cost reaches this bound.
critic_cost_bound = 1000
# Factor applied to the target critic's value when building the Q target.
γ = 0.05
# Reward weights handed to `play` by `RL_MODEL`.
ap = 1
sp = 1