In [None]:
import Base: +, *
using Flux
import Flux: params
using Flux.Optimise
using Flux: crossentropy, ADAM
using LinearAlgebra
using Plots
import ProgressMeter
import ProgressMeter: @showprogress
PM = ProgressMeter
PM.ijulia_behavior(:clear)
using Statistics
import Random: seed!
using CUDA

In [None]:
# Stage reward for a single state `x`, scaled by the global time step `Δt`.
# You can adjust it however you like, but choose it wisely: a poorly shaped
# reward may not result in the intended behavior.
#
# Keywords:
#   sp - weight used for the x[4] term once x[3] >= π/2 (the term is then +sp*x[4];
#        below π/2 it is -x[4])
#   ap - weight of the -x[3] term
#   cp - alias for `ap`, accepted because other functions in this notebook pass
#        `cp=` (NOTE(review): assumed to be a stale name for `ap` — confirm)
#
# Throws an ArgumentError when neither `ap` nor `cp` is given.
function stage_reward(x; sp, ap=nothing, cp=nothing)
    weight = isnothing(ap) ? cp : ap
    isnothing(weight) &&
        throw(ArgumentError("stage_reward needs the keyword `ap` (or its alias `cp`)"))
    return Δt * (-x[3] * weight - x[4] * ifelse(x[3] < π / 2, 1, -sp))
end

In [None]:
# Calling the environment simulates one transition from its current state under
# `action`, stores the resulting reward on the environment and returns
# `(next_state, reward)`. It can be used during training to explore the
# environment and generate data.
#
# NOTE(review): assumes `LotteryEnv` is mutable with `state` and `reward` fields.
# The weights `ap` and `sp` are read from global scope, the reward is not scaled
# by `Δt` (unlike `stage_reward`), and the stored state is not advanced here —
# confirm all three are intended.
function (env::LotteryEnv)(action)
    x = env.state
    x₊ = sim(x, action)
    reward = -x₊[3] * ap - x₊[4] * ifelse(x₊[3] < π / 2, 1, -sp)
    env.reward = reward
    return x₊, reward
end

# Roll out policy `p` for `n_steps` steps from `x_init` with ϵ-greedy exploration
# and return the visited transitions as a freshly allocated replay buffer.
#
# Keywords:
#   x_init  - initial state
#   n_steps - number of simulated steps
#   sp      - reward weight forwarded to `stage_reward`
#   ϵ       - probability of taking a uniformly random action in [-5, 5)
#   ap      - reward weight forwarded to `stage_reward`
#   cp      - alias for `ap` (callers in this notebook use both names)
#
# Returns a vector of named tuples `(x, u, x₊, sc)`: state, action, next state
# and stage reward of each step.
function play(p::Net; x_init, n_steps, sp, ϵ, cp=nothing, ap=cp)
    isnothing(ap) &&
        throw(ArgumentError("play needs the keyword `ap` (or its alias `cp`)"))
    replay_buffer = NamedTuple[]
    sizehint!(replay_buffer, n_steps)
    x = x_init
    for _ in 1:n_steps
        # Follow the policy with probability 1 - ϵ, otherwise explore.
        u = rand() > ϵ ? p(x) : rand() * 10 - 5
        x₊ = sim(x, u)
        sc = stage_reward(x₊; ap=ap, sp=sp)
        push!(replay_buffer, (x=x, u=u, x₊=x₊, sc=sc))
        x = x₊
    end
    return replay_buffer
end

In [None]:
# Run policy `p` without exploration for `n_steps` steps starting at `x_init`.
# Returns `(xs, c)`: the vector of visited states (one entry per step, `x_init`
# itself is not included) and the accumulated stage reward.
#
# NOTE(review): the reward weights `ap` and `sp` are read from global scope
# (the hyperparameter cell defines them) — consider passing them in explicitly.
function run(p::Net; x_init, n_steps)
    x = x_init
    xs = []
    c = 0
    for _ in 1:n_steps
        u = p(x)
        x = sim(x, u)
        push!(xs, x)
        c += stage_reward(x; ap=ap, sp=sp)
    end
    return xs, c
end

In [None]:
# Gradient of the critic `q(x, u)` with respect to the action `u`, evaluated at
# the action the policy `p` selects in state `x`.
function policy_gradient(x, p, q)
    u = p(x)
    # `gradient` returns one entry per differentiated argument; there is only one.
    grad = first(gradient(a -> q(x, a), u))
    return grad
end

# Scratch assignments: overwrite columns 1-2 and column 4 of `log` with constants.
# `log` is not defined in this part of the file; for a plain array these
# assignments require it to have exactly 4 rows.
# NOTE(review): unless `log` was rebound earlier, it refers to `Base.log` here —
# confirm this cell is still needed.
log[:, 1:2] = repeat([200 201], 4)
log[:, 4] = fill(4, 4)


4-element Vector{Int64}:
 4
 4
 4
 4

In [None]:
# Tabular Q-learning on a rounded state/action grid.
# Rolls out an ϵ-greedy version of policy `p` for `steps` steps and returns the
# learned table: a Dict mapping `(rounded_state, rounded_action)` to the
# estimated Q value. Entries that were never visited count as 0.
#
# Keywords:
#   x_init - initial state
#   p      - policy, called as `p(x)` (required)
#   steps  - number of simulated steps
#   γ      - discount factor
#   α      - learning rate
#   ϵ      - probability of taking a uniformly random action in [-5, 5)
#   log    - accepted for compatibility; currently unused
#
# NOTE(review): the reward weights `ap` and `sp` are read from global scope.
function Q_learning(; x_init=[0, 0, 0, 0], p, steps=500, γ=0.5, α=0.9, ϵ=0.5, log=[])
    q_values = Dict{Any,Float64}()
    x = x_init
    for _ in 1:steps
        u = rand() > ϵ ? p(x) : rand() * 10 - 5
        x₊ = sim(x, u)
        reward = stage_reward(x₊; ap=ap, sp=sp)

        # Discretise only the table keys; the simulation keeps the exact state.
        s, a, s_next = round.(x), round.(u), round.(x₊)

        # Best known value among the actions already tried in the next state.
        next_vals = [v for ((sk, _), v) in q_values if sk == s_next]
        best_next = isempty(next_vals) ? 0.0 : maximum(next_vals)

        # Standard update: Q ← Q + α (r + γ max Q' − Q).
        q_old = get(q_values, (s, a), 0.0)
        q_values[(s, a)] = q_old + α * (reward + γ * best_next - q_old)

        x = x₊
    end
    return q_values
end

# Fit the critic network `q` to the table produced by `Q_learning` and return
# the summed squared error after the training pass.
#
# `q_values` maps `(state, action)` keys to Q values. Each key becomes one input
# column `vcat(state, action)`; the values form a 1×N target row.
# NOTE(review): assumes `q` accepts a features×batch matrix and returns one row
# of estimates — confirm against `Net`.
#
# Throws an ArgumentError when `q_values` is empty.
function Critic(q_values, q::Net)
    isempty(q_values) &&
        throw(ArgumentError("q_values is empty; there is nothing to fit"))

    samples = collect(q_values)
    q_x = reduce(hcat, [vcat(x, u) for ((x, u), _) in samples])
    q_y = reshape([v for (_, v) in samples], 1, :)

    # `train!` needs a loss *function* that it calls on each data tuple.
    q_loss(inputs, targets) = sum(abs2, targets .- q(inputs))

    # WeightDecay ahead of ADAM applies L2 regularisation.
    opt = Flux.Optimise.Optimiser(WeightDecay(0.42), ADAM(0.1))
    train!(q_loss, params(q), [(q_x, q_y)], opt; cb=() -> println("critic"))
    return q_loss(q_x, q_y)
end

In [None]:
# Use the functions above to implement your own RL algorithm.
# As an example, this implements a DDPG-style actor-critic training loop.
#
# Arguments:
#   x_init             - initial state for pre-training and for the final rollout
#   n_steps            - number of steps per rollout
#   critic_cost_bound  - training stops once the critic loss reaches this value
#   n_iters            - maximum number of training iterations
#   replay_buffer_size - maximum number of transitions kept in the replay buffer
#   n_samples          - minibatch size
#   p_layers, q_layers - layer definitions handed to `Net` for actor and critic
#   γ                  - discount factor
#   ap, sp             - reward weights forwarded to `play`
# Keywords:
#   ϵ                  - exploration probability while filling the replay buffer
#
# Returns `(p, q)`, the trained actor and critic.
#
# NOTE(review): assumes `p` maps a state (or a matrix with one state per column)
# to actions, and `q` maps `vcat(state, action)` columns to one Q value per
# column — confirm against `Net` and the layer sizes that are passed in.
function RL_MODEL(
    x_init, n_steps, critic_cost_bound, n_iters, replay_buffer_size, n_samples,
    p_layers, q_layers, γ, ap, sp; ϵ=0.5,
)
    # Actor and critic networks plus their target copies
    p = Net(p_layers)
    q = Net(q_layers)
    p̂ = deepcopy(p)
    q̂ = deepcopy(q)
    opt_p = ADAM(0.001)
    opt_q = ADAM(0.001)
    critic_cost = 0.0
    k = 0

    # Pre-training
    # NOTE(review): the returned Q table is not consumed yet; presumably it is
    # meant to be passed to `Critic`.
    Q_learning(x_init=x_init, p=p, steps=500)

    println("Building initial replay_buffer")
    replay_buffer = []
    for y_init in range(-1, 2, length=5), dy_init in range(-1, 1, length=3)
        # NOTE(review): the grid values are assumed to be the first two state
        # components; the remaining components are taken from `x_init`.
        x0 = float.(x_init)
        x0[1] = y_init
        x0[2] = dy_init
        append!(replay_buffer, play(p; x_init=x0, n_steps=n_steps, cp=ap, sp=sp, ϵ=ϵ))
    end
    # Keep only the most recent transitions when the buffer is over capacity.
    excess = length(replay_buffer) - replay_buffer_size
    excess > 0 && deleteat!(replay_buffer, 1:excess)
    isempty(replay_buffer) && throw(ArgumentError(
        "replay buffer is empty; n_steps and replay_buffer_size must be positive"))

    # Actor/critic training. The bound acts as a divergence guard.
    while critic_cost < critic_cost_bound && k < n_iters
        # Sample a minibatch (with replacement) and stack it column-wise
        batch = rand(replay_buffer, n_samples)
        xs = reduce(hcat, [t.x for t in batch])
        us = reduce(hcat, [vcat(t.u) for t in batch])  # actions may be scalars
        x₊s = reduce(hcat, [t.x₊ for t in batch])
        rs = reshape([t.sc for t in batch], 1, :)

        # Bootstrapped target from the target networks (computed outside the
        # differentiated closures, so no gradient flows through it)
        q_target = rs .+ γ .* q̂(vcat(x₊s, p̂(x₊s)))

        # Critic update: mean squared error against the target
        critic_loss = () -> mean(abs2, q(vcat(xs, us)) .- q_target)
        gs = gradient(critic_loss, params(q))
        update!(opt_q, params(q), gs)

        # Actor update: maximise the critic's value of the actor's actions
        actor_loss = () -> -mean(q(vcat(xs, p(xs))))
        gs = gradient(actor_loss, params(p))
        update!(opt_p, params(p), gs)

        # Hard update of the target networks
        p̂ = deepcopy(p)
        q̂ = deepcopy(q)

        critic_cost = critic_loss()
        k += 1
    end

    # Play the game greedily and report the total reward
    rollout = play(p; x_init=x_init, n_steps=n_steps, cp=ap, sp=sp, ϵ=0.0)
    total_reward = isempty(rollout) ? 0.0 : sum(t.sc for t in rollout)
    println("Total reward of final rollout: ", total_reward)
    return p, q
end

In [None]:
# Tune the hyperparameters
x_init = [0.0, 0.0, 0.0, 0.0]
state_dim = length(x_init)
action_dim = 1  # random exploration draws a single scalar action
# Critic: [state; action] -> one unbounded Q value, so no softmax on the output.
q_layers = Chain(Dense(state_dim + action_dim, 4, relu), Dense(4, 2, relu), Dense(2, 1))
# Actor: state -> action.
p_layers = Chain(Dense(state_dim, 4, relu), Dense(4, 2, relu), Dense(2, action_dim))
n_iters = 40
n_steps = 1000
n_critic_pre_steps = 100
n_samples = 80
replay_buffer_size = 100_000
critic_cost_bound = 1000
γ = 0.05
ap = 1
sp = 1

In [None]:
# Train the model
p, q = RL_MODEL(x_init, n_steps, critic_cost_bound, n_iters, replay_buffer_size, n_samples, p_layers, q_layers, γ, ap, sp)

# Save the model
# `@save` is imported by name so it is in scope even if BSON does not export it.
using BSON: @save
@save "Actor.bson" p
@save "Critic.bson" q

In [None]:
# Test the final model
xs, c = run(p, x_init=[0,0,0,0], n_steps=2000)
# `run` returns a vector of state vectors; stack them as columns so that each
# row holds one state component over time.
traj = reduce(hcat, xs)
plot(traj[1, :], traj[2, :], label="trajectory")