In [None]:
## Init Bionic VTOL
include("../Flyonic.jl");
using .Flyonic;

using Rotations; # used for initial position

using ReinforcementLearning;
using StableRNGs;
using Flux;
using Flux.Losses;
using Random;
using IntervalSets;
using LinearAlgebra;
using Distributions;

using Plots;
using Statistics;

using BSON: @save, @load # save mode
create_visualization();

# indicates how many threads Julia was started with. This is important for the multi-threaded environment
Threads.nthreads()
### Create Reinforcement Learning Environment

mutable struct VtolEnv{A,T,ACT,R<:AbstractRNG} <: AbstractEnv # parametric subtype of AbstractEnv
    action_space::A # action space
    observation_space::Space{Vector{ClosedInterval{T}}} # observation space
    state::Vector{T} # current observation vector (what RLBase.state returns)
    action::ACT # stored action (random at construction, zeroed on reset)
    done::Bool # true once the episode has terminated
    t::T # elapsed simulation time
    rng::R # random number generator

    name::String # distinguishes multiple environments (used as visualization name)
    visualization::Bool # whether the VTOL is drawn in the visualizer
    realtime::Bool # whether each step sleeps for delta_t

    x_previous::Vector{T} # position before the latest step
    x_W::Vector{T} # current position
    v_B::Vector{T} # velocity
    R_W::Matrix{T} # current rotation
    w_B::Vector{T} # rotation velocity
    wind_W::Vector{T} # wind
    delta_t::T # simulation time step

    ################################ TODO ################################
    # Extend the environment here.
    # Everything you additionally need in your environment also goes in here.
    # E.g. a trajectory

    waypoints::Vector{Vector{T}} # waypoints spanning the guiding path
    proximity_tolerance::T # proximity tolerance used by the waypoint reward
    v_max::T # maximum allowed velocity (episode terminates above it)

    ######################################################################
end

################################ TODO ################################
# You can initialize global constants here.
# E.g. a fixed point in the beginning of training (for testing/overfitting)
# Define global constants for initial position and rotation

#####
##### first coordinate - red axis - x
##### second coordinate - green axis - y
##### third coordinate - blue axis - z
#####

# Equals the number of entries in SAMPLE_WAYPOINTS; not referenced in the visible code.
const NUM_WAYPOINTS = 2;
# World-frame origin; not referenced in the visible code (reset! hard-codes its own start).
const INITIAL_POSITION = [0.0, 0.0, 0.0];
# 3x3 identity matrix; not referenced in the visible code.
const INITIAL_ROTATION = [1.0 0.0 0.0; 0.0 1.0 0.0; 0.0 0.0 1.0];
# Waypoints assigned to env.waypoints by the constructor and by reset!
# (both lie on the z axis, at heights 3.0 and 5.0).
const SAMPLE_WAYPOINTS = [[0.0, 0.0, 3.0], [0.0, 0.0, 5.0]]
######################################################################
# define a keyword-based constructor for the type declared in the mutable struct typedef.
# It could also be done with the macro Base.@kwdef.
function VtolEnv(;
    rng = Random.GLOBAL_RNG, # random number generator stored in the environment
    name = "vtol",           # name of the vehicle in the visualization
    visualization = false,   # draw the vehicle in the visualizer
    realtime = false,        # sleep for delta_t in every step
    kwargs...)               # further keyword arguments are accepted and ignored

    # Element type used for every numeric field. The rotation matrix below is
    # written with Float64 literals, so this cannot simply be changed.
    T = Float64

    # Two actions: thrust of propeller 1 and of propeller 2, each in [0, 2].
    action_space = Space(ClosedInterval{T}[0.0..2.0 for _ in 1:2])

    # Eleven unbounded observations, in this order:
    #    1, 2  previous position along x and z (WORLD)
    #    3, 4  current position along x and z (WORLD)
    #    5, 6  orientation along x and z (WORLD)
    #    7, 8  velocity (BODY), two in-plane components
    #    9     rotational velocity around z (BODY)
    #   10, 11 position of the target along x and z (WORLD)
    n_observations = 11
    observation_space = Space(
        ClosedInterval{T}[typemin(T)..typemax(T) for _ in 1:n_observations],
    )

    # Create the vehicle in the visualizer if requested.
    visualization && create_VTOL(name, actuators = true, color_vec=[1.0; 1.0; 0.6; 1.0])

    # The struct has no keyword constructor of its own, so the fields are
    # passed positionally in declaration order.
    environment = VtolEnv(
        action_space,
        observation_space,
        zeros(T, length(observation_space)),      # state
        rand(action_space),                       # action
        false,                                    # done
        zero(T),                                  # t
        rng,
        name,
        visualization,
        realtime,
        zeros(T, 3),                              # x_previous
        zeros(T, 3),                              # x_W
        zeros(T, 3),                              # v_B
        [1.0 0.0 0.0; 0.0 1.0 0.0; 0.0 0.0 1.0],  # R_W
        zeros(T, 3),                              # w_B
        zeros(T, 3),                              # wind_W
        T(0.025),                                 # delta_t
        SAMPLE_WAYPOINTS,                         # waypoints
        1e-5,                                     # proximity_tolerance
        100.0,                                    # v_max
    )

    # Bring the freshly built environment into its start configuration.
    reset!(environment)
    return environment
end;

methods(VtolEnv)


# Just for explanation:
# 1. A mutable struct is declared. Declaring a struct also creates a default constructor, i.e. a function that creates new objects of that type.
# 2. An outer keyword-based constructor method is added for the type declared in the mutable struct definition above.
# So now we have a function with two methods. Julia decides which method to call by multiple dispatch.

## Define the RL interface

# Seed the random number generator owned by the environment.
function Random.seed!(env::VtolEnv, seed)
    return Random.seed!(env.rng, seed)
end

# Space of admissible actions.
function RLBase.action_space(env::VtolEnv)
    return env.action_space
end

# Space of possible observations.
function RLBase.state_space(env::VtolEnv)
    return env.observation_space
end

# Whether the current episode has ended.
function RLBase.is_terminated(env::VtolEnv)
    return env.done
end

# Current observation vector.
function RLBase.state(env::VtolEnv)
    return env.state
end


# Hyperparameters: weights of the individual terms in computeReward.
# NOTE(review): these are non-constant globals read inside computeReward.
global kp = 5.0; # weight of the progress reward
global kw = 0.01; # weight of the rotation-speed penalty
global kwp = 5.0; # weight of the waypoint reward
# Distance covered along the guiding path up to the point `phi`, where `phi`
# lies on the line segment with index `lp` (the segment starting at
# `gates[lp]`): the lengths of all complete segments before `lp` plus the
# covered part of segment `lp`, which is added exactly once.
function _reached_distance(gates, lp, phi)
    covered = norm(phi - gates[lp])
    for i in 1:(lp - 1)
        covered += norm(gates[i + 1] - gates[i])
    end
    return covered
end

# Reward of the latest step:
#
#     kp * rpt + ks * spt + kwp * rwp + rt - kw * w
#
# with the progress reward `rpt`, the reached-distance reward `ks * spt`,
# the waypoint reward `rwp`, the terminal reward `rt` and the rotation
# speed `w`. `kp`, `kwp` and `kw` are the global hyperparameters.
function computeReward(env::VtolEnv{A,T}) where {A,T}
    ################################ TODO ################################
    # Implement the reward function.
    # Orientate on the paper.

    gates = env.waypoints

    # Closest point on the guiding path (phi) and the index of the line
    # segment it lies on (lp), for the previous and the current position.
    lp_old, phi_old = calculate_progress(gates, env.x_previous)
    lp_new, phi_new = calculate_progress(gates, env.x_W)

    # Reached distance along the path at the previous and the current step.
    # Each value is computed from its own segment index and projection.
    spt_old = _reached_distance(gates, lp_old, phi_old)
    spt_new = _reached_distance(gates, lp_new, phi_new)

    # Progress reward: difference in reached distance between the two steps.
    rpt = spt_new - spt_old
    spt = spt_new

    # Total length of the guiding path; normalises the reached-distance reward.
    path_length = 0.0
    for i in 1:(length(gates) - 1)
        path_length += norm(gates[i + 1] - gates[i])
    end

    # Reached-distance weight. A path without length (fewer than two gates,
    # or coincident gates) would divide by zero, so the term is disabled then.
    ks = path_length > 0 ? 2 * env.v_max * env.delta_t / path_length : 0.0

    # The waypoint index is the line-segment index of the previous position.
    wpi = lp_old
    # Distance from the current position to that waypoint.
    dwp = norm(env.x_W - gates[wpi])
    # Waypoint reward: decays exponentially with the distance, scaled by the
    # proximity tolerance.
    rwp = exp(-dwp / env.proximity_tolerance)

    # Terminal reward; there are no obstacles, so a collision never happens.
    collision = false
    rt = collision ? -10.0 : 0.0

    # Rotation speed, penalised with weight kw.
    w = norm(env.w_B)

    reward = kp * rpt + ks * spt + kwp * rwp + rt - kw * w
    ######################################################################

    return reward
end

# RLBase interface: the reward of the latest step is whatever computeReward yields.
function RLBase.reward(env::VtolEnv{A,T}) where {A,T}
    return computeReward(env)
end

# Put the environment back into its start configuration. Called by the
# constructor and whenever an episode has terminated.
#
# Everything that changes during an episode is reset here, including the
# observation vector `env.state`, so the first observation of a new episode
# describes the start configuration and not the end of the previous episode.
function RLBase.reset!(env::VtolEnv{A,T}) where {A,T}
    # Vehicle at the origin, at rest, in its start attitude.
    env.x_W = zeros(T, 3)
    env.v_B = zeros(T, 3)
    env.R_W = Matrix(QuatRotation(RotZ(-pi/2.0) * RotY(-pi/2.0) * RotX(pi)))

    env.w_B = zeros(T, 3)
    env.wind_W = zeros(T, 3)

    env.t = 0.0
    env.action = [0.0, 0.0]
    env.done = false

    ################################ TODO ################################
    # Reset environment.
    # Is called if the training terminates
    # (e.g. if drone crashes or successfully reaches point)
    # HINT: Everything you added to your environment needs to be reset.
    #       Compare it with the initialization.

    env.x_previous = zeros(T, 3) # starting position
    env.delta_t = T(0.025) # Δ time

    env.waypoints = SAMPLE_WAYPOINTS
    env.proximity_tolerance = 1e-5

    # Rebuild the observation with the same layout that _step! writes.
    env.state[1] = env.x_previous[1]   # previous position along x
    env.state[2] = env.x_previous[3]   # previous position along z
    env.state[3] = env.x_W[1]          # position along x
    env.state[4] = env.x_W[3]          # position along z
    env.state[5] = env.R_W[1,1]        # orientation along x
    env.state[6] = env.R_W[3,1]        # orientation along z
    env.state[7] = env.v_B[1]          # body velocity, first component
    env.state[8] = env.v_B[2]          # body velocity, second component
    env.state[9] = env.w_B[3]          # rotational velocity around body z
    env.state[10] = env.waypoints[1][1] # target position along x
    env.state[11] = env.waypoints[1][3] # target position along z
    ######################################################################

    # Visualize the initial state. This has to happen after the reset above,
    # otherwise the pose of the finished episode would be drawn.
    if env.visualization
        set_transform(env.name, env.x_W, QuatRotation(env.R_W))
        set_actuators(env.name, [0.0; 0.0; 0.0; 0.0])
    end

    return nothing
end;

# Makes every VtolEnv object callable: `env(a)` advances the simulation by
# one step with action `a`.
function (env::VtolEnv)(a)
    # 2D case: only the two propeller thrusts come from the action,
    # both flaps are held at 0.0.
    thrust_1, thrust_2 = a[1], a[2]
    return _step!(env, [thrust_1, thrust_2, 0.0, 0.0])
end

env = VtolEnv();
methods(env) # Just to explain which methods the object has


# Advance the environment by one time step `env.delta_t` using the actuator
# command `next_action` (two propeller thrusts followed by two flap values).
# Updates the physical state, the observation vector `env.state` and the
# termination flag `env.done`. Returns `nothing`.
function _step!(env::VtolEnv, next_action)

    ################################ TODO ################################
    # Implement step.
    # HINT: This is related to your environment.
    #       Compare to struct VtolEnv.
    #       How does it change in every step.

    env.state[1] = env.state[3] # update previous x-coordinate
    env.state[2] = env.state[4] # update previous z-coordinate
    # Copy, so that the previous position can never share its array with the
    # current position.
    env.x_previous = copy(env.x_W)

    ######################################################################

    # calculate wind impact
    v_in_wind_B = vtol_add_wind(env.v_B, env.R_W, env.wind_W)

    # calculate aerodynamic forces
    torque_B, force_B = vtol_model(v_in_wind_B, next_action, eth_vtol_param)

    # Limit to 2D: no force/velocity along body z, no torque/rotation around
    # body x and y.
    force_B[3] = 0.0
    env.v_B[3] = 0.0
    torque_B[1] = 0.0
    torque_B[2] = 0.0
    env.w_B[1] = 0.0
    env.w_B[2] = 0.0

    # Integrate rigid body dynamics for delta_t. The fifth return value is
    # not used; the environment time is advanced below.
    env.x_W, env.v_B, env.R_W, env.w_B, _ = rigid_body_simple(
        torque_B, force_B, env.x_W, env.v_B, env.R_W, env.w_B,
        env.t, env.delta_t, eth_vtol_param,
    )

    if env.realtime
        sleep(env.delta_t) # just a dirty hack. this is of course slower than real time.
    end

    # Visualize the new state
    if env.visualization
        set_transform(env.name, env.x_W, QuatRotation(env.R_W))
        set_actuators(env.name, next_action)
    end

    env.t += env.delta_t

    env.state[3] = env.x_W[1] # position along x
    env.state[4] = env.x_W[3] # position along z

    env.state[5] = env.R_W[1,1] # orientation along x
    env.state[6] = env.R_W[3,1] # orientation along z

    env.state[7] = env.v_B[1] # body velocity, first component
    env.state[8] = env.v_B[2] # body velocity, second component

    env.state[9] = env.w_B[3] # rotational velocity around body z

    ################################ TODO ################################
    # Implement step.
    # HINT: This is related to your environment.
    #       Compare to struct VtolEnv.
    #       How does it change in every step.

    env.state[10] = env.waypoints[1][1] # position of target along x WORLD coordinates
    env.state[11] = env.waypoints[1][3] # position of target along z WORLD coordinates
    ######################################################################

    ################################ TODO ################################
    # Add termination criteria.
    # Use many termination criteria so that you do not train unnecessarily in wrong areas.
    env.done =
        norm(env.w_B) > 100.0 || # stop if body rate is too high
        norm(env.v_B) > env.v_max || # stop if body is too fast
        env.t > 10.0 # stop after 10 s of simulated time
    ######################################################################

    return nothing
end;

RLBase.test_runnable!(env)

# changed to 10s (5s before) per point and 5.0m too far off path (2.0 before)
# Show an overview of the environment.

## Setup of a reinforcement learning experiment.

seed = 123
rng = StableRNG(seed)
N_ENV = 8          # number of environments stepped in parallel
UPDATE_FREQ = 1024 # policy update frequency, also the trajectory capacity

# One environment per parallel worker; each gets its own RNG and its own
# name (the name identifies it in the visualization).
vtol_envs = [
    VtolEnv(; rng = StableRNG(hash(seed+i)), name = "vtol$i") for i in 1:N_ENV
];
# Bundle the environments for multi-threaded training.
env = MultiThreadEnv(vtol_envs)

# Function approximator: a Gaussian actor and a critic, both small tanh
# MLPs with two hidden layers of 16 units, trained with ADAM.
# (optional) TODO: change architecture
# (optional) TODO: change optimizer
ns, na = length(state(env[1])), length(action_space(env[1]))
approximator = ActorCritic(
    actor = GaussianNetwork(
        pre = Chain(
            Dense(ns, 16, tanh; initW = glorot_uniform(rng)),
            Dense(16, 16, tanh; initW = glorot_uniform(rng)),
        ),
        μ = Chain(Dense(16, na; initW = glorot_uniform(rng))),
        logσ = Chain(Dense(16, na; initW = glorot_uniform(rng))),
    ),
    critic = Chain(
        Dense(ns, 16, tanh; initW = glorot_uniform(rng)),
        Dense(16, 16, tanh; initW = glorot_uniform(rng)),
        Dense(16, 1; initW = glorot_uniform(rng)),
    ),
    optimizer = ADAM(1e-3),
);

# The agent couples the PPO policy with the trajectory buffer that stores
# the transitions collected from all N_ENV environments.
agent = Agent(
    # (optional) TODO: change eventually
    policy = PPOPolicy(;
        approximator = approximator |> gpu,
        update_freq=UPDATE_FREQ,
        dist = Normal,
        # For parameters visit the docu: https://juliareinforcementlearning.org/docs/rlzoo/#ReinforcementLearningZoo.PPOPolicy
    ),

    # One column per environment for states and actions, one entry per
    # environment for log-probabilities, rewards and terminal flags.
    trajectory = PPOTrajectory(;
        capacity = UPDATE_FREQ,
        state = Matrix{Float64} => (ns, N_ENV),
        action = Matrix{Float64} => (na, N_ENV),
        action_log_prob = Vector{Float64} => (N_ENV,),
        reward = Vector{Float64} => (N_ENV,),
        terminal = Vector{Bool} => (N_ENV,),
    ),
);


# Hook callback: store the current approximator (moved to the CPU) as
# "./RL_models/vtol_2D_ppo_<t>.bson", where `t` is the training step.
# `env` is part of the hook signature and is not used.
function saveModel(t, agent, env)
    model = cpu(agent.policy.approximator)
    f = joinpath("./RL_models/", "vtol_2D_ppo_$t.bson") # TODO: save model here
    # Make sure the target directory exists; writing the file would fail otherwise.
    mkpath(dirname(f))
    @save f model
    println("parameters at step $t saved to $f")
end;

# Load the approximator that saveModel stored for training step `t` from
# "./RL_models/vtol_2D_ppo_<t>.bson". Without an argument the checkpoint of
# step 1_500_000 is loaded, as before.
# Throws an error with the expected path if no such checkpoint exists.
function loadModel(t = 1_500_000)
    f = joinpath("./RL_models/", "vtol_2D_ppo_$t.bson") # TODO: load model here
    isfile(f) || error("no saved model found at $f")
    @load f model
    return model
end;

# Hook callback: run the current policy for one episode on the global
# `test_env` and print the total reward that the global
# `episode_test_reward_hook` recorded for it. `env` is not used.
function validate_policy(t, agent, env)
    policy = agent.policy
    run(policy, test_env, StopAfterEpisode(1), episode_test_reward_hook)
    # The last entry of the hook belongs to the episode that has just finished.
    println("test reward at step $t: $(episode_test_reward_hook.rewards[end])")
    return nothing
end;

episode_test_reward_hook = TotalRewardPerEpisode(;is_display_on_exit=false)
# create a env only for reward test
test_env = VtolEnv(;name = "testVTOL", visualization = true, realtime = true);

ReinforcementLearning.run(
    agent,
    env,
    StopAfterStep(1_500_000),
    ComposedHook(
        DoEveryNStep(saveModel, n=100_000),
        DoEveryNStep(validate_policy, n=10_000)),
)

[36m[1m┌ [22m[39m[36m[1mInfo: [22m[39mMeshCat server started. You can open the visualizer by visiting the following URL in your browser:
[36m[1m└ [22m[39mhttp://127.0.0.1:8700


[0m[1mTest Summary:              | [22m[32m[1mPass  [22m[39m[36m[1mTotal  [22m[39m[0m[1mTime[22m
random policy with VtolEnv | [32m2000  [39m[36m 2000  [39m[0m1.1s


[36m[1m┌ [22m[39m[36m[1mInfo: [22m[39mThe GPU function is being called but the GPU is not accessible. 
[36m[1m└ [22m[39mDefaulting back to the CPU. (No action is required if you want to run on the CPU).
[32mProgress:   0%|                                         |  ETA: 12:21:26[39m9m

test reward at step 10000: -46.340577342801886


[32mProgress:   1%|▌                                        |  ETA: 0:57:29[39m

test reward at step 20000: -46.45111178798262


[32mProgress:   2%|▊                                        |  ETA: 0:37:42[39m

test reward at step 30000: -56.74967885342572


[32mProgress:   2%|█                                        |  ETA: 0:34:17[39m

test reward at step 40000: -48.77046965657638


[32mProgress:   3%|█▍                                       |  ETA: 0:27:22[39m

test reward at step 50000: -55.53335983631496


[32mProgress:   4%|█▌                                       |  ETA: 0:26:14[39m

test reward at step 60000: -71.71230968196889


[32mProgress:   5%|█▉                                       |  ETA: 0:25:11[39m

test reward at step 70000: -48.454913581772


[32mProgress:   5%|██▎                                      |  ETA: 0:27:59[39m

test reward at step 80000: -51.71980662838415


[32mProgress:   6%|██▍                                      |  ETA: 0:26:04[39m

test reward at step 90000: -80.63170371382472


[32mProgress:   7%|██▊                                      |  ETA: 0:25:41[39m

parameters at step 100000 saved to ./RL_models/vtol_2D_ppo_100000.bson
test reward at step 100000: -75.89883417566884


[32mProgress:   7%|██▉                                      |  ETA: 0:27:01[39m

test reward at step 110000: -55.48313011132846


[32mProgress:   8%|███▎                                     |  ETA: 0:25:45[39m

test reward at step 120000: -15.045747304964639


[32mProgress:   8%|███▍                                     |  ETA: 0:26:34[39m

test reward at step 130000: -13.97003085669134


[32mProgress:   9%|███▊                                     |  ETA: 0:26:04[39m

test reward at step 140000: -42.08770707027078


[32mProgress:  10%|████▏                                    |  ETA: 0:25:36[39m

test reward at step 150000: -13.025799408510359


[32mProgress:  10%|████▎                                    |  ETA: 0:26:11[39m

test reward at step 160000: -10.049245379494126


[32mProgress:  11%|████▋                                    |  ETA: 0:25:41[39m

test reward at step 170000: -5.375132899585065


[32mProgress:  12%|████▊                                    |  ETA: 0:26:12[39m

test reward at step 180000: -3.6095466518618347


[32mProgress:  12%|█████▏                                   |  ETA: 0:25:47[39m

test reward at step 190000: -3.537302712504238


[32mProgress:  13%|█████▍                                   |  ETA: 0:25:20[39m

parameters at step 200000 saved to ./RL_models/vtol_2D_ppo_200000.bson
test reward at step 200000: -3.7117024886100327


[32mProgress:  14%|█████▋                                   |  ETA: 0:25:40[39m

test reward at step 210000: -4.310138324386568


[32mProgress:  14%|█████▉                                   |  ETA: 0:25:19[39m

test reward at step 220000: -4.246758769818594


[32mProgress:  15%|██████▎                                  |  ETA: 0:24:53[39m

test reward at step 230000: -4.605170277630629


[32mProgress:  16%|██████▌                                  |  ETA: 0:25:10[39m

test reward at step 240000: -4.709127960958299


[32mProgress:  17%|██████▊                                  |  ETA: 0:24:50[39m

test reward at step 250000: -4.262103199412175


[32mProgress:  17%|███████▏                                 |  ETA: 0:25:18[39m

test reward at step 260000: -4.173960655119529


[32mProgress:  18%|███████▎                                 |  ETA: 0:24:40[39m

test reward at step 270000: -4.426356958879046


[32mProgress:  19%|███████▋                                 |  ETA: 0:24:18[39m

test reward at step 280000: -4.3427424378525705


[32mProgress:  19%|███████▊                                 |  ETA: 0:24:30[39m

test reward at step 290000: -4.532444031909873


[32mProgress:  20%|████████▏                                |  ETA: 0:24:05[39m

parameters at step 300000 saved to ./RL_models/vtol_2D_ppo_300000.bson
test reward at step 300000: -5.304952904060155


[32mProgress:  20%|████████▎                                |  ETA: 0:24:14[39m

test reward at step 310000: -4.649329200203804


[32mProgress:  21%|████████▋                                |  ETA: 0:23:49[39m

test reward at step 320000: -4.8322497943477325


[32mProgress:  22%|█████████                                |  ETA: 0:23:28[39m

test reward at step 330000: -5.105329170383905


[32mProgress:  23%|█████████▎                               |  ETA: 0:23:12[39m

test reward at step 340000: -4.645146218875226


[32mProgress:  23%|█████████▍                               |  ETA: 0:23:25[39m

test reward at step 350000: -5.154342431620806


[32mProgress:  24%|█████████▊                               |  ETA: 0:23:04[39m

test reward at step 360000: -5.059846145453372


[32mProgress:  25%|██████████▏                              |  ETA: 0:22:42[39m

test reward at step 370000: -4.955943886355248


[32mProgress:  25%|██████████▎                              |  ETA: 0:22:47[39m

test reward at step 380000: -5.290341568377019


[32mProgress:  26%|██████████▋                              |  ETA: 0:22:29[39m

test reward at step 390000: -8.483916415403545


[32mProgress:  26%|██████████▊                              |  ETA: 0:22:33[39m

parameters at step 400000 saved to ./RL_models/vtol_2D_ppo_400000.bson


[32mProgress:  27%|██████████▉                              |  ETA: 0:22:38[39m

test reward at step 400000: -4.462562345790456


[32mProgress:  27%|███████████▏                             |  ETA: 0:22:12[39m

test reward at step 410000: -9.460731366135192


[32mProgress:  28%|███████████▌                             |  ETA: 0:21:51[39m

test reward at step 420000: -4.8394568473352475


[32mProgress:  28%|███████████▋                             |  ETA: 0:21:55[39m

test reward at step 430000: -5.095446873421124


[32mProgress:  29%|████████████                             |  ETA: 0:21:35[39m

test reward at step 440000: -5.138981753028657


[32mProgress:  30%|████████████▎                            |  ETA: 0:21:15[39m

test reward at step 450000: -5.729211097734909


[32mProgress:  30%|████████████▌                            |  ETA: 0:21:18[39m

test reward at step 460000: -5.274804498185783


[32mProgress:  31%|████████████▊                            |  ETA: 0:21:00[39m

test reward at step 470000: -5.466485689859748


[32mProgress:  32%|█████████████▏                           |  ETA: 0:20:39[39m

test reward at step 480000: -4.972002650504066


[32mProgress:  32%|█████████████▎                           |  ETA: 0:20:41[39m

test reward at step 490000: -4.789466028156276


[32mProgress:  33%|█████████████▋                           |  ETA: 0:20:25[39m

parameters at step 500000 saved to ./RL_models/vtol_2D_ppo_500000.bson
test reward at step 500000: -5.695387488612208


[32mProgress:  34%|██████████████                           |  ETA: 0:20:25[39m

test reward at step 510000: -5.814957976315709


[32mProgress:  34%|██████████████▏                          |  ETA: 0:20:05[39m

test reward at step 520000: -5.560020008290055


[32mProgress:  35%|██████████████▌                          |  ETA: 0:19:46[39m

test reward at step 530000: -5.442095273999983


[32mProgress:  36%|██████████████▋                          |  ETA: 0:19:48[39m

test reward at step 540000: -4.881175561025651


[32mProgress:  36%|███████████████                          |  ETA: 0:19:29[39m

test reward at step 550000: -5.415314047471877


[32mProgress:  37%|███████████████▎                         |  ETA: 0:19:10[39m

test reward at step 560000: -9.674148951705266


[32mProgress:  38%|███████████████▌                         |  ETA: 0:19:09[39m

test reward at step 570000: -6.138685030833354


[32mProgress:  38%|███████████████▊                         |  ETA: 0:18:52[39m

test reward at step 580000: -8.022218638751468


[32mProgress:  39%|████████████████▏                        |  ETA: 0:18:34[39m

test reward at step 590000: -5.2235166066145124


[32mProgress:  40%|████████████████▎                        |  ETA: 0:18:33[39m

parameters at step 600000 saved to ./RL_models/vtol_2D_ppo_600000.bson
test reward at step 600000: -5.091094916888947


[32mProgress:  40%|████████████████▋                        |  ETA: 0:18:18[39m

test reward at step 610000: -5.459517433990495


[32mProgress:  41%|████████████████▉                        |  ETA: 0:17:59[39m

test reward at step 620000: -5.801180113783272


[32mProgress:  42%|█████████████████▏                       |  ETA: 0:17:58[39m

test reward at step 630000: -4.931456076419264


[32mProgress:  42%|█████████████████▍                       |  ETA: 0:17:42[39m

test reward at step 640000: -6.342266490911784


[32mProgress:  43%|█████████████████▊                       |  ETA: 0:17:23[39m

test reward at step 650000: -5.290627438425884


[32mProgress:  44%|█████████████████▉                       |  ETA: 0:17:21[39m

test reward at step 660000: -5.78623679806344


[32mProgress:  44%|██████████████████▎                      |  ETA: 0:17:04[39m

test reward at step 670000: -4.689806628265799


[32mProgress:  45%|██████████████████▋                      |  ETA: 0:16:45[39m

test reward at step 680000: -7.182809070139873


[32mProgress:  46%|██████████████████▊                      |  ETA: 0:16:43[39m

test reward at step 690000: -5.2236331806179015


[32mProgress:  46%|███████████████████                      |  ETA: 0:16:31[39m

parameters at step 700000 saved to ./RL_models/vtol_2D_ppo_700000.bson
test reward at step 700000: -5.44519665940104


[32mProgress:  47%|███████████████████▍                     |  ETA: 0:16:14[39m

test reward at step 710000: -5.584419581456965


[32mProgress:  48%|███████████████████▌                     |  ETA: 0:16:12[39m

test reward at step 720000: -5.229751332987006


[32mProgress:  48%|███████████████████▉                     |  ETA: 0:15:54[39m

test reward at step 730000: -5.595442067524357


[32mProgress:  49%|████████████████████▎                    |  ETA: 0:15:36[39m

test reward at step 740000: -6.628649642487952


[32mProgress:  50%|████████████████████▍                    |  ETA: 0:15:32[39m

test reward at step 750000: -5.017712487088355


[32mProgress:  51%|████████████████████▊                    |  ETA: 0:15:15[39m

test reward at step 760000: -7.714206235813664


[32mProgress:  51%|████████████████████▉                    |  ETA: 0:15:11[39m

test reward at step 770000: -5.617417971751619


[32mProgress:  52%|█████████████████████▎                   |  ETA: 0:14:53[39m

test reward at step 780000: -6.590257901457464


[32mProgress:  53%|█████████████████████▋                   |  ETA: 0:14:35[39m

test reward at step 790000: -6.373780831662406


[32mProgress:  53%|█████████████████████▊                   |  ETA: 0:14:31[39m

parameters at step 800000 saved to ./RL_models/vtol_2D_ppo_800000.bson
test reward at step 800000: -6.254084835702116


[32mProgress:  54%|██████████████████████▏                  |  ETA: 0:14:14[39m

test reward at step 810000: -5.980037048575429


[32mProgress:  54%|██████████████████████▎                  |  ETA: 0:14:09[39m

test reward at step 820000: -5.335548829565564


[32mProgress:  55%|██████████████████████▋                  |  ETA: 0:13:50[39m

test reward at step 830000: -6.227931511066823


[32mProgress:  56%|███████████████████████                  |  ETA: 0:13:41[39m

test reward at step 840000: -5.921535858009676


[32mProgress:  56%|███████████████████████▏                 |  ETA: 0:13:28[39m

test reward at step 850000: -5.890172947498714


[32mProgress:  57%|███████████████████████▌                 |  ETA: 0:13:10[39m

test reward at step 860000: -6.467035189725857


[32mProgress:  58%|███████████████████████▋                 |  ETA: 0:13:05[39m

test reward at step 870000: -6.859186526978995


[32mProgress:  59%|████████████████████████                 |  ETA: 0:12:49[39m

test reward at step 880000: -14.05374747985643


[32mProgress:  59%|████████████████████████▏                |  ETA: 0:12:43[39m

test reward at step 890000: -6.6611258547715435


[32mProgress:  60%|████████████████████████▌                |  ETA: 0:12:26[39m

parameters at step 900000 saved to ./RL_models/vtol_2D_ppo_900000.bson
test reward at step 900000: -7.291744917489778


[32mProgress:  61%|████████████████████████▉                |  ETA: 0:12:08[39m

test reward at step 910000: -6.976193127119966


[32mProgress:  61%|█████████████████████████                |  ETA: 0:12:04[39m

test reward at step 920000: -7.1246056157035245


[32mProgress:  62%|█████████████████████████▍               |  ETA: 0:11:47[39m

test reward at step 930000: -7.463733394457223


[32mProgress:  62%|█████████████████████████▌               |  ETA: 0:11:42[39m

test reward at step 940000: -8.207780317395287


[32mProgress:  63%|█████████████████████████▉               |  ETA: 0:11:24[39m

test reward at step 950000: -6.278128993036093


[32mProgress:  64%|██████████████████████████▎              |  ETA: 0:11:08[39m

In [None]:
agent.policy.approximator = loadModel();

ReinforcementLearning.run(
    agent,
    env,
    StopAfterStep(1_500_000),
    ComposedHook(
        DoEveryNStep(saveModel, n=100_000),
        DoEveryNStep(validate_policy, n=10_000)),
)

In [None]:
### Plot the stuff
plot(episode_test_reward_hook.rewards)

close_visualization(); # closes the MeshCat visualization