In [1]:
## Init Bionic VTOL
include("../Flyonic.jl");
using .Flyonic;

using Rotations; # used for initial position

using ReinforcementLearning;
using StableRNGs;
using Flux;
using Flux.Losses;
using Random;
using IntervalSets;
using LinearAlgebra;
using Distributions;

using Plots;
using Statistics;

using BSON: @save, @load # save mode
create_visualization();

# indicates how many threads Julia was started with. This is important for the multi-threaded environment
Threads.nthreads()
### Create Reinforcement Learning Environment


################################ TODO ################################
# You can initialization global constants here.
# E.g. a fixed point in the beginning of training (for testing/overfitting)
# Define global constants for initial position and rotation

#####
##### first coordinate - red axis - x
##### second coordinate - green axis - y
##### third coordinate - blue axis - z
#####
x_init = [0.0, 0.0, 0.0];
rot_init = [1.0 0.0 0.0; 0.0 1.0 0.0; 0.0 0.0 1.0];

# Defaault values for VtolEnv variables
waypoints_default = [[0.0, 0.0, 0.0], [-10.0, 0.0, 5.0], [1.0, 0.0, 10.0], [5.0, 0.0, 0.0], [-3.0, 0.0, 3.0]];
proximity_tolerance_default = 0.01;
timeout_default = 15.0;

v_min_default = 0.5;
v_expected_default = 3.0;
v_max_default = 100.0;
w_expected_default = 10.0;
w_max_default = 25.0;

# Initial values, assuming the first waypoint is the origin and has been reached upon start of the simuilation
wpi_init = 2;
v_init = 0.25 * [waypoints_default[2][3], -waypoints_default[2][1], 0.0];

# Multiplier for doing the same mistake over and over again
mistake_amplifier_default = 1.0;
last_reward_default = -1e30;

######################################################################

[36m[1m┌ [22m[39m[36m[1mInfo: [22m[39mMeshCat server started. You can open the visualizer by visiting the following URL in your browser:
[36m[1m└ [22m[39mhttp://127.0.0.1:8700


In [3]:
v_init = 0.25 * [waypoints_default[2][3], -waypoints_default[2][1], 0.0];
last_reward_default = 1e3;

mutable struct VtolEnv{A,T,ACT,R<:AbstractRNG} <: AbstractEnv # Parametric Constructor for a subtype of AbstractEnv
    action_space::A # action space
    observation_space::Space{Vector{ClosedInterval{T}}} # observation space
    state::Vector{T} # current state space
    action::ACT # action space
    done::Bool # done
    t::T # time
    rng::R # random number generator

    name::String # for multible environments
    visualization::Bool # visualization
    realtime::Bool # realtime

    x_previous::Vector{T} # previous position
    x_W::Vector{T} # current position
    v_B::Vector{T} # velocity
    R_W::Matrix{T} # current rotation
    w_B::Vector{T} # rotation velocitiy
    wind_W::Vector{T} # wind
    delta_t::T # simulation time step

    ################################ TODO ################################
    # Extend the environment here.
    # Everything you need additionaly in your environment also go in here.
    # E.g. a trajectory

    # Pathway variables
    waypoints::Vector{Vector{T}}
    proximity_tolerance::T
    timeout::T

    # Kinetic variables
    v_min::T
    v_expected::T # minimum required velocity
    v_max::T # maximum allowed velocity
    w_expected::T # minimum required velocity
    w_max::T # maximum allowed velocity

    ### Very important variable
    # counting index of current steered towards waypoint
    # Assuming the first waypoint is origin and is reached at the start of the simulation
    wpi::Int

    # Mistake amplification variables
    mistake_amplifier::T
    last_reward::T

    ######################################################################
end


# define a keyword-based constructor for the type declared in the mutable struct typedef.
# It could also be done with the macro Base.@kwdef.
function VtolEnv(;
    rng = Random.GLOBAL_RNG, # random number generation
    name = "vtol",
            visualization = false,
    realtime = false,
    kwargs...) # let the function take an arbitrary number of keyword arguments

    T = Float64; # explicit type which is used e.g. in state. Cannot be altered due to the poor matrix defininon.
    A = Space{Vector{ClosedInterval{T}}};

    action_space = Space(
        ClosedInterval{T}[
            0.0..2.0, # propeller 1
            0.0..2.0, # propeller 2
            ],
    ) # propeller 1 and 2

    state_space = Space( # Three continuous values in state space.
        ClosedInterval{T}[
            ################################ TODO ################################
            # Implement an observation space.
            # Here is an example space. You can change it if desired.
            # You have to extend it.
            # Orientate yourself on the observation space from the paper.

            typemin(T)..typemax(T), #1 previous position along x WORLD coordinates
            typemin(T)..typemax(T), #2 previous position along z WORLD coordinates

            typemin(T)..typemax(T), #3 current position along x WORLD coordinates
            typemin(T)..typemax(T), #4 current position along z WORLD coordinates

            typemin(T)..typemax(T), #5 orientation along x WORLD coordinates
            typemin(T)..typemax(T), #6 orientation along z WORLD coordinates

            typemin(T)..typemax(T), #7 velocity along x BODY coordinates
            typemin(T)..typemax(T), #8 velocity along y BODY coordinates

            typemin(T)..typemax(T), #9 rotational velocity along z BODY coordinates
            ######################################################################
            ],
    )

    if visualization # visualizes VTOL
        create_VTOL(name, actuators = true, color_vec=[1.0; 1.0; 0.6; 1.0]);
    end

    environment = VtolEnv(
        action_space, # action space
        state_space, # observation space
        zeros(T, length(state_space)), # current state space
        rand(action_space), # initialization action
        false, # episode done
        0.0, # time
        rng, # random number generator

        name,
        visualization,
        realtime,

        zeros(T, 3), # x_previous, previous position
        zeros(T, 3), # x_W, current position
        v_init, # v_B, velocity
        [1.0 0.0 0.0; 0.0 1.0 0.0; 0.0 0.0 1.0], # R_W, current rotation
        zeros(T, 3), # w_B
        zeros(T, 3), # wind_W
        T(0.02), # simulation time step

        ################################## TODO ##################################
        # Initialization everything you need additionaly in your environment here

        waypoints_default,
        proximity_tolerance_default,
        timeout_default,
        v_min_default,
        v_expected_default,
        v_max_default,
        w_expected_default,
        w_max_default,
        wpi_init,
        mistake_amplifier_default,
        last_reward_default,
        ##########################################################################
    )

    reset!(environment)
    return environment
end;

methods(VtolEnv)

# Just for explanation:
# 1. A mutable Struct is created. A struct is a constructor and a constructor is a function that creates new objects.
# 2. A outer keyword-based constructor method is added for the type declared in the mutable struct typedef before.'
# So now we have a function with two methods. Julia will decide which method to call by multiple dispatch.

## Define the RL interface

Random.seed!(env::VtolEnv, seed) = Random.seed!(env.rng, seed)
RLBase.action_space(env::VtolEnv) = env.action_space
RLBase.state_space(env::VtolEnv) = env.observation_space
RLBase.is_terminated(env::VtolEnv) = env.done
RLBase.state(env::VtolEnv) = env.state

function computeReward(env::VtolEnv{A,T}) where {A,T}
    reward = 0.0

    ################################ TODO ################################
    # Implement the reward function.
    # Orientate on the paper.

    # Hyperparameters
    kp = 5.0;
    kw = 0.1;
    kwp = 5.0;
    kv = 0.1;
    # max distance
    dmax = 0.3;

    # Gates and their length
    gates = env.waypoints;
    n = length(gates);

    # closest point on the guiding path phi
    # its line-segment index lp
    x_old = [env.state[1], 0.0, env.state[2]];
    x_new = [env.state[3], 0.0, env.state[4]];
    lp_old, phi_old = calculate_progress(gates, x_old);
    lp_new, phi_new = calculate_progress(gates, x_new);

    # previous time step spt_old
    # current time step spt_new
    spt_old = 0.0;
    spt_new = 0.0;
    for i in 1:(lp_old-1)
        spt_old += norm(gates[i+1] - gates[i]);
    end
    spt_old += norm(phi_old - gates[lp_old])
    for i in 1:(lp_new-1)
        spt_new += norm(gates[i+1] - gates[i]);
    end
    spt_new += norm(phi_new - gates[lp_new])

    # rogress reward rpt at time t is as a difference in reached
    # distance between the current and previous time step
    rpt = spt_new - spt_old
    spt = spt_new

    # The sum that is later going to be the divisor
    # for the reached distance reward
    divisor_g = 0.0;
    for i in 1:(n-1)
        divisor_g += norm(gates[i+1] - gates[i]);
    end

    # reached distance reward ks
    ks = 2 * env.v_max * env.delta_t / divisor_g;

    # steered at waypoint index wpi must be 1 bigger than the current line segment
    # distance to new waypoint dw
    dwp = norm(x_new - gates[env.wpi]);
    # tolerance for proximity to a waypoint
    r_tol = env.proximity_tolerance;

    # waypoint reward rwp
    rwp = exp(-dwp/r_tol);

    # no obstacles
    collision = false;
    # terminal reward rt
    rt = collision ? -10 : 0;

    # absolute velocity
    v_vector = [env.state[7], env.state[8], 0.0]
    v = norm(v_vector);
    # rotation speed
    w = abs(env.state[9]);
    # distance from closes point on the guiding path
    gd = norm(x_new - phi_new);
    kgd = 5.0;

    # punishment for flying too damn fast
    # don't punish slow flight
    rv = max(0, v - env.v_expected);
    rw = max(0, w - env.w_expected);

    # Scaling factors
    svmax = v > env.v_max ? 10^(env.v_max - v) : 1.0;
    svmin = v < env.v_min ?  10^(v - env.v_min) : 1.0;
    sgd = gd > dmax  ? exp(dmax - gd) : 1.0;

    # Ultimate scaling factor
    s = svmax * svmin * sgd;

    # Scaling the rewards
    kp = s * kp;
    ks = s * ks;

    # Punish idiotic rotation acceleration
    k_idiot = 1000.0;
    r_idiot = rw * (env.timeout - env.t);

    reward = kp * rpt + ks * spt + kwp * rwp + rt - kw * rw - kgd * gd - k_idiot * r_idiot;

    # Amplifying autistic drone stupidity
    amp_tol = 5.0;
    if (reward < 0.0 && abs(env.last_reward) < amp_tol * abs(reward))
        env.mistake_amplifier *= 1.3;
    else
        env.mistake_amplifier /= 1.3;
    end
    reward *= env.mistake_amplifier;
    env.last_reward = reward;

    ################################################################################################

    return reward
end

RLBase.reward(env::VtolEnv{A,T}) where {A,T} = computeReward(env)

function RLBase.reset!(env::VtolEnv{A,T}) where {A,T}
    # Visualize initial state
    if env.visualization
        set_transform(env.name, env.x_W, QuatRotation(env.R_W));
        set_actuators(env.name, [0.0; 0.0; 0.0; 0.0]);
    end

    env.x_W = [0.0; 0.0; 0.0];
    env.v_B = v_init;
    env.R_W = Matrix(UnitQuaternion(RotZ(-pi/2.0) * RotY(-pi/2.0) * RotX(pi)));

    env.w_B = [0.0; 0.0; 0.0];
    env.wind_W = [0.0; 0.0; 0.0];

    env.t = 0.0;
    env.action = [0.0, 0.0];
    env.done = false;

    ################################ TODO ################################
    # Reset environment.
    # Is called if the training terminates
    # (e.g. if drone crashes or successfully reaches point)
    # HINT: Everything you added to your environment needs to be reseted.
    #       Compare it with the initialization.

    env.x_previous = x_init; # starting position
    env.delta_t = T(0.02); # Δ time

    env.waypoints = waypoints_default;
    env.proximity_tolerance = proximity_tolerance_default;
    env.timeout = timeout_default;
    env.v_min = v_min_default;
    env.v_expected = v_expected_default;
    env.v_max = v_max_default;
    env.w_expected = w_expected_default;
    env.w_max = w_max_default;
    env.wpi = wpi_init;
    env.last_reward = last_reward_default;

    # Visualize the waypoints
    radius = 0.1;
    visualize_waypoints(env.waypoints, radius);
    ######################################################################

    nothing
end;

# defines a methods for a callable object.
# So when a VtolEnv object is created, it has this method that can be called
function (env::VtolEnv)(a)
    # set the propeller trust and the two flaps 2D case
    # flaps set to 0.0
    next_action = [a[1], a[2], 0.0, 0.0]

    _step!(env, next_action)
end

env = VtolEnv();
methods(env) # Just to explain which methods the object has


function _step!(env::VtolEnv, next_action)
    # Update previous
    env.x_previous = [env.state[3], 0.0, env.state[4]];

    # caluclate wind impact
    v_in_wind_B = vtol_add_wind(env.v_B, env.R_W, env.wind_W);

    # caluclate aerodynamic forces
    torque_B, force_B = vtol_model(v_in_wind_B, next_action, eth_vtol_param);

    # Limit to 2D
    force_B[3] = 0.0; # Body Z
    env.v_B[3] = 0.0;
    torque_B[1] = 0.0;
    torque_B[2] = 0.0;  # Body X and Y
    env.w_B[1] = 0.0;
    env.w_B[2] = 0.0;

    # integrate rigid body dynamics for delta_t
    env.x_W, env.v_B, env.R_W, env.w_B, time = rigid_body_simple(torque_B, force_B, env.x_W, env.v_B, env.R_W, env.w_B, env.t, env.delta_t, eth_vtol_param);

    if env.realtime
        sleep(env.delta_t); # just a dirty hack. this is of course slower than real time.
    end

    # Visualize the new state
    if env.visualization
        set_transform(env.name, env.x_W, QuatRotation(env.R_W));
        set_actuators(env.name, next_action);
    end

    env.t += env.delta_t

    env.state[3] = env.x_W[1]; # position along x
    env.state[4] = env.x_W[3]; # position along z

    env.state[5] = env.R_W[1,1]; # orientation along x
    env.state[6] = env.R_W[3,1]; # orientation along z

    env.state[7] = env.v_B[1]; # velocity along x BODY coordinates
    env.state[8] = env.v_B[2]; # velocity along y BODY coordinates

    env.state[9] = env.w_B[3];  # rotational velocity along z BODY coordinates

    ################################ TODO ################################

    env.state[1] = env.x_previous[1]; # update previous x-coordinate
    env.state[2] = env.x_previous[3]; # update previous z-coordinate

    ### Very important update
    # counting index of current steered towards waypoint
    if(norm(env.waypoints[env.wpi] - env.x_W) < env.proximity_tolerance)
        println("wpi: "+wpi);
        env.wpi += 1;
        env.t = 0;
        env.wpi = min(env.wpi, length(env.waypoints))
    end

    # Instead of the floor level which assumes the simulation will only run upstairs,
    # Here a variables for being too far from the next wp after reaching a previous waypoint can be used instead

    segment_length = norm(env.waypoints[env.wpi] - env.waypoints[env.wpi - 1])
    k_failure = 3;
    wp_dist = norm(env.waypoints[env.wpi] - env.x_W)

    env.done =
        norm(env.w_B) > env.w_max || # stop if body rate is too high
        norm(env.v_B) > env.v_max || # stop if body is too fast
        wp_dist > k_failure * segment_length ||
        env.t > env.timeout
    ######################################################################

    nothing
end;

RLBase.test_runnable!(env)

# changed to 10s (5s before) per point and 5.0m too far off path (2.0 before)
# Show an overview of the environment.

## Setup of a reinforcement learning experiment.

seed = 123
rng = StableRNG(seed)
    N_ENV = 8
    UPDATE_FREQ = 1024

    vtol_envs = [
        # use different names for the visualization
        VtolEnv(; rng = StableRNG(hash(seed+i)), name = "vtol$i") for i in 1:N_ENV
    ];
    # define multiple environments for parallel training
    env = MultiThreadEnv(vtol_envs)

    # Define the function approximator
    # (optional) TODO: change architecture
    # TODO: research briefly what Actor Critic is
    # (optional) TODO: change optimizer
    # TODO: research what ADAM is
    # Define the function approximator
    ns, na = length(state(env[1])), length(action_space(env[1]))
    #ActorCritic Policy
    approximator = ActorCritic(
                #ns - number states as input
                #3 layer; last layer splitted in mean and variance; then action is sampled
                actor = GaussianNetwork(
                    pre = Chain(
                    Dense(ns, 16, relu; initW = glorot_uniform(rng)),#
                    Dense(16, 16, relu; initW = glorot_uniform(rng)),
                    ),
                    μ = Chain(Dense(16, na; initW = glorot_uniform(rng))),
                    logσ = Chain(Dense(16, na; initW = glorot_uniform(rng))),
                ),
                critic = Chain(
                    Dense(ns, 16, relu; initW = glorot_uniform(rng)),
                    Dense(16, 16, relu; initW = glorot_uniform(rng)),
                    Dense(16, 1; initW = glorot_uniform(rng)),
                ),
                optimizer = ADAM(1e-3),
            );

        agent = Agent( # A wrapper of an AbstractPolicy
        # AbstractPolicy: the policy to use
        # (optional) TODO: change eventually
        # TODO: research briefly what PPO is
        policy = PPOPolicy(;
                    approximator = approximator |> gpu,
                    update_freq=UPDATE_FREQ,
                    dist = Normal,
                    # For parameters visit the docu: https://juliareinforcementlearning.org/docs/rlzoo/#ReinforcementLearningZoo.PPOPolicy
                    ),

        # AbstractTrajectory: used to store transitions between an agent and an environment source
        trajectory = PPOTrajectory(;
            capacity = UPDATE_FREQ,
            state = Matrix{Float64} => (ns, N_ENV),
            action = Matrix{Float64} => (na, N_ENV),
            action_log_prob = Vector{Float64} => (N_ENV,),
            reward = Vector{Float64} => (N_ENV,),
            terminal = Vector{Bool} => (N_ENV,),
        ),
    );


function saveModel(t, agent, env)
    model = cpu(agent.policy.approximator)
    f = joinpath("./RL_models_leo/", "vtol_2D_ppo_$t.bson") # TODO: save model here
    @save f model
    println("parameters at step $t saved to $f")
end;

function loadModel()
    f = joinpath("./RL_models_leo/", "vtol_2D_ppo_1500000.bson") # TODO: load model here
    @load f model
    return model
end;

function validate_policy(t, agent, env)
    run(agent.policy, test_env, StopAfterEpisode(1), episode_test_reward_hook)
    # the result of the hook
    println("test reward at step $t: $(episode_test_reward_hook.rewards[end])")

end;

episode_test_reward_hook = TotalRewardPerEpisode(;is_display_on_exit=false)
# create a env only for reward test
test_env = VtolEnv(;name = "testVTOL", visualization = true, realtime = true);


ReinforcementLearning.run(
    agent,
    env,
    StopAfterStep(1_500_000),
    ComposedHook(
        DoEveryNStep(saveModel, n=100_000),
        DoEveryNStep(validate_policy, n=10_000)),
)

[0m[1mTest Summary:              | [22m[32m[1mPass  [22m[39m[36m[1mTotal  [22m[39m[0m[1mTime[22m
random policy with VtolEnv | [32m2000  [39m[36m 2000  [39m[0m0.0s


[32mProgress:   1%|▎                                        |  ETA: 0:05:19[39m

test reward at step 10000: -799732.2980165019


[32mProgress:   1%|▌                                        |  ETA: 0:12:20[39m

test reward at step 20000: -1.3819783592466598e6


[32mProgress:   2%|▉                                        |  ETA: 0:12:13[39m

test reward at step 30000: -1.6088421667579531e6


[32mProgress:   2%|█                                        |  ETA: 0:11:01[39m

test reward at step 40000: -2.1845684367354896e7


[32mProgress:   3%|█▎                                       |  ETA: 0:11:06[39m

test reward at step 50000: -147800.45708779237


[32mProgress:   4%|█▌                                       |  ETA: 0:11:42[39m

test reward at step 60000: -19391.503996078696


[32mProgress:   4%|█▉                                       |  ETA: 0:11:26[39m

test reward at step 70000: -1.7688562815818608e7


[32mProgress:   5%|██▏                                      |  ETA: 0:11:18[39m

test reward at step 80000: -4.826222219000004e7


[32mProgress:   6%|██▍                                      |  ETA: 0:11:12[39m

test reward at step 90000: -40325.56870125506


[32mProgress:   7%|██▊                                      |  ETA: 0:11:14[39m

parameters at step 100000 saved to ./RL_models_leo/vtol_2D_ppo_100000.bson
test reward at step 100000: -28345.24956863999


[32mProgress:   7%|███                                      |  ETA: 0:11:12[39m

test reward at step 110000: -25725.708927476677


[32mProgress:   8%|███▎                                     |  ETA: 0:11:03[39m

test reward at step 120000: -30583.445584894474


[32mProgress:   9%|███▌                                     |  ETA: 0:10:56[39m

test reward at step 130000: -73071.02983299


[32mProgress:   9%|███▊                                     |  ETA: 0:10:49[39m

test reward at step 140000: -2.5816864663006917e7


[32mProgress:  10%|████▏                                    |  ETA: 0:10:45[39m

test reward at step 150000: -48136.15500648403


[32mProgress:  11%|████▍                                    |  ETA: 0:11:12[39m

test reward at step 160000: -165713.10543506392


[32mProgress:  11%|████▌                                    |  ETA: 0:10:55[39m

test reward at step 170000: -19154.882644597157


[32mProgress:  12%|████▉                                    |  ETA: 0:10:45[39m

test reward at step 180000: -4.825971141135499e6


[32mProgress:  12%|█████▏                                   |  ETA: 0:10:39[39m

test reward at step 190000: -20530.150492625748


[32mProgress:  13%|█████▍                                   |  ETA: 0:10:31[39m

parameters at step 200000 saved to ./RL_models_leo/vtol_2D_ppo_200000.bson
test reward at step 200000: -41621.5619205702


[32mProgress:  14%|█████▋                                   |  ETA: 0:10:29[39m

test reward at step 210000: -27289.920132420135


[32mProgress:  15%|██████                                   |  ETA: 0:10:22[39m

test reward at step 220000: -20524.225624900657


[32mProgress:  15%|██████▎                                  |  ETA: 0:10:13[39m

test reward at step 230000: -1.747875104969912e6


[32mProgress:  16%|██████▍                                  |  ETA: 0:10:19[39m

test reward at step 240000: -1.5661228273924524e7


[32mProgress:  16%|██████▊                                  |  ETA: 0:10:15[39m

test reward at step 250000: -2.669888105931471e7


[32mProgress:  17%|███████                                  |  ETA: 0:10:12[39m

test reward at step 260000: -9.540132985807803e6


[32mProgress:  18%|███████▎                                 |  ETA: 0:10:01[39m

test reward at step 270000: -302178.65724383306


[32mProgress:  18%|███████▌                                 |  ETA: 0:09:56[39m

test reward at step 280000: -6.93599561775306e7


[32mProgress:  19%|███████▉                                 |  ETA: 0:09:49[39m

test reward at step 290000: -4.2715576567575745e6


[32mProgress:  20%|████████▏                                |  ETA: 0:09:45[39m

parameters at step 300000 saved to ./RL_models_leo/vtol_2D_ppo_300000.bson
test reward at step 300000: -36848.04811339888


[32mProgress:  20%|████████▍                                |  ETA: 0:09:41[39m

test reward at step 310000: -1.302940226465874e7


[32mProgress:  21%|████████▋                                |  ETA: 0:09:37[39m

test reward at step 320000: -47066.46930368912


[32mProgress:  22%|█████████                                |  ETA: 0:09:35[39m

test reward at step 330000: -1.9457544064830686e6


[32mProgress:  23%|█████████▎                               |  ETA: 0:09:32[39m

test reward at step 340000: -5.848608280434052e6


[32mProgress:  23%|█████████▌                               |  ETA: 0:09:28[39m

test reward at step 350000: -35281.91530565419


[32mProgress:  24%|█████████▊                               |  ETA: 0:09:22[39m

test reward at step 360000: -21790.708349376317


[32mProgress:  25%|██████████▏                              |  ETA: 0:09:15[39m

test reward at step 370000: -33604.38561499189


[32mProgress:  25%|██████████▍                              |  ETA: 0:09:18[39m

test reward at step 380000: -5.2790573691306785e7


[32mProgress:  26%|██████████▌                              |  ETA: 0:09:11[39m

test reward at step 390000: -924670.1483497496


[32mProgress:  26%|██████████▉                              |  ETA: 0:09:06[39m

parameters at step 400000 saved to ./RL_models_leo/vtol_2D_ppo_400000.bson
test reward at step 400000: -42380.46762680773


[32mProgress:  27%|███████████▏                             |  ETA: 0:09:02[39m

test reward at step 410000: -4.848900205993076e6


[32mProgress:  28%|███████████▍                             |  ETA: 0:08:56[39m

test reward at step 420000: -3.576572517767277e7


[32mProgress:  28%|███████████▋                             |  ETA: 0:08:51[39m

test reward at step 430000: -35644.76370166768


[32mProgress:  29%|████████████                             |  ETA: 0:08:46[39m

test reward at step 440000: -1.292995197935401e6


[32mProgress:  30%|████████████▎                            |  ETA: 0:08:41[39m

test reward at step 450000: -23045.781800244953


[32mProgress:  31%|████████████▌                            |  ETA: 0:08:34[39m

test reward at step 460000: -2.772541684244405e7


[32mProgress:  31%|████████████▉                            |  ETA: 0:08:28[39m

test reward at step 470000: -2.4055953303647712e7


[32mProgress:  32%|█████████████                            |  ETA: 0:08:29[39m

test reward at step 480000: -1.2932647568249732e7


[32mProgress:  32%|█████████████▎                           |  ETA: 0:08:24[39m

test reward at step 490000: -250986.62323598607


[32mProgress:  33%|█████████████▋                           |  ETA: 0:08:19[39m

parameters at step 500000 saved to ./RL_models_leo/vtol_2D_ppo_500000.bson
test reward at step 500000: -1.5558985645596578e6


[32mProgress:  34%|█████████████▉                           |  ETA: 0:08:14[39m

test reward at step 510000: -27805.33404209162


[32mProgress:  35%|██████████████▏                          |  ETA: 0:08:08[39m

test reward at step 520000: -27097.714927220743


[32mProgress:  35%|██████████████▍                          |  ETA: 0:08:03[39m

test reward at step 530000: -519818.9420906236


[32mProgress:  36%|██████████████▊                          |  ETA: 0:07:58[39m

test reward at step 540000: -3.1867089601157643e6


[32mProgress:  37%|███████████████                          |  ETA: 0:07:54[39m

test reward at step 550000: -7.742348801840053e6


[32mProgress:  37%|███████████████▎                         |  ETA: 0:07:48[39m

test reward at step 560000: -5.109972794512085e6


[32mProgress:  38%|███████████████▌                         |  ETA: 0:07:43[39m

test reward at step 570000: -43528.65685101004


[32mProgress:  39%|███████████████▉                         |  ETA: 0:07:39[39m

test reward at step 580000: -3.758469927779063e7


[32mProgress:  39%|████████████████▏                        |  ETA: 0:07:40[39m

test reward at step 590000: -48529.1568018974


[32mProgress:  40%|████████████████▎                        |  ETA: 0:07:35[39m

parameters at step 600000 saved to ./RL_models_leo/vtol_2D_ppo_600000.bson
test reward at step 600000: -824173.2936209526


[32mProgress:  40%|████████████████▋                        |  ETA: 0:07:29[39m

test reward at step 610000: -27460.675286352525


[32mProgress:  41%|████████████████▉                        |  ETA: 0:07:23[39m

test reward at step 620000: -18184.475635592753


[32mProgress:  42%|█████████████████▏                       |  ETA: 0:07:17[39m

test reward at step 630000: -8.59457708473359e6


[32mProgress:  42%|█████████████████▍                       |  ETA: 0:07:10[39m

test reward at step 640000: -484936.76262948115


[32mProgress:  43%|█████████████████▊                       |  ETA: 0:07:05[39m

test reward at step 650000: -2.6487353571926534e7


[32mProgress:  44%|██████████████████                       |  ETA: 0:07:01[39m

test reward at step 660000: -5.146251379515645e6


[32mProgress:  45%|██████████████████▎                      |  ETA: 0:06:57[39m

test reward at step 670000: -27032.01848443463


[32mProgress:  45%|██████████████████▌                      |  ETA: 0:06:51[39m

test reward at step 680000: -4.086103896801995e6


[32mProgress:  46%|██████████████████▉                      |  ETA: 0:06:46[39m

test reward at step 690000: -1.673957322951378e7


[32mProgress:  46%|███████████████████                      |  ETA: 0:06:45[39m

parameters at step 700000 saved to ./RL_models_leo/vtol_2D_ppo_700000.bson
test reward at step 700000: -1.4026226941740971e7


[32mProgress:  47%|███████████████████▎                     |  ETA: 0:06:39[39m

test reward at step 710000: -2.4060180022206098e7


[32mProgress:  48%|███████████████████▋                     |  ETA: 0:06:36[39m

test reward at step 720000: -3.4366914331392255e6


[32mProgress:  48%|███████████████████▉                     |  ETA: 0:06:31[39m

test reward at step 730000: -43892.441263009416


[32mProgress:  49%|████████████████████▏                    |  ETA: 0:06:26[39m

test reward at step 740000: -19346.144947256405


[32mProgress:  50%|████████████████████▍                    |  ETA: 0:06:20[39m

test reward at step 750000: -2.045397803455977e7


[32mProgress:  50%|████████████████████▋                    |  ETA: 0:06:15[39m

test reward at step 760000: -37330.29527943512


[32mProgress:  51%|█████████████████████                    |  ETA: 0:06:10[39m

test reward at step 770000: -81660.4686502497


[32mProgress:  52%|█████████████████████▎                   |  ETA: 0:06:05[39m

test reward at step 780000: -158771.951266751


[32mProgress:  53%|█████████████████████▌                   |  ETA: 0:06:00[39m

test reward at step 790000: -34679.97630773464


[32mProgress:  53%|█████████████████████▉                   |  ETA: 0:05:54[39m

parameters at step 800000 saved to ./RL_models_leo/vtol_2D_ppo_800000.bson
test reward at step 800000: -30609.808067928403


[32mProgress:  54%|██████████████████████▏                  |  ETA: 0:05:49[39m

test reward at step 810000: -5.1772175737853326e7


[32mProgress:  55%|██████████████████████▍                  |  ETA: 0:05:44[39m

test reward at step 820000: -36658.55373594264


[32mProgress:  55%|██████████████████████▊                  |  ETA: 0:05:40[39m

test reward at step 830000: -1.6919978601745155e7


[32mProgress:  56%|██████████████████████▉                  |  ETA: 0:05:36[39m

test reward at step 840000: -2.9087653446810663e7


[32mProgress:  56%|███████████████████████▏                 |  ETA: 0:05:30[39m

test reward at step 850000: -6.240038241821922e6


[32mProgress:  57%|███████████████████████▍                 |  ETA: 0:05:25[39m

test reward at step 860000: -9.73635109209074e6


[32mProgress:  58%|███████████████████████▊                 |  ETA: 0:05:19[39m

test reward at step 870000: -2.3882517045993675e6


[32mProgress:  58%|████████████████████████                 |  ETA: 0:05:13[39m

test reward at step 880000: -5.587337677831586e6


[32mProgress:  59%|████████████████████████▎                |  ETA: 0:05:08[39m

test reward at step 890000: -27992.10432045336


[32mProgress:  60%|████████████████████████▋                |  ETA: 0:05:02[39m

parameters at step 900000 saved to ./RL_models_leo/vtol_2D_ppo_900000.bson
test reward at step 900000: -4.154728035467341e7


[32mProgress:  60%|████████████████████████▊                |  ETA: 0:05:00[39m

test reward at step 910000: -4.049641049433878e7


[32mProgress:  61%|█████████████████████████                |  ETA: 0:04:53[39m

test reward at step 920000: -3.729145910250129e6


[32mProgress:  62%|█████████████████████████▍               |  ETA: 0:04:48[39m

test reward at step 930000: -5.176707725928944e6


[32mProgress:  62%|█████████████████████████▋               |  ETA: 0:04:42[39m

test reward at step 940000: -1.993348418056053e7


[32mProgress:  63%|█████████████████████████▉               |  ETA: 0:04:37[39m

test reward at step 950000: -2.9211562192852236e7


[32mProgress:  64%|██████████████████████████▎              |  ETA: 0:04:32[39m

test reward at step 960000: -33233.75374156904


[32mProgress:  65%|██████████████████████████▌              |  ETA: 0:04:27[39m

test reward at step 970000: -980354.9430501047


[32mProgress:  65%|██████████████████████████▊              |  ETA: 0:04:21[39m

test reward at step 980000: -2.3954146335117253e6


[32mProgress:  66%|███████████████████████████              |  ETA: 0:04:16[39m

test reward at step 990000: -3.127670991043682e7


[32mProgress:  67%|███████████████████████████▎             |  ETA: 0:04:11[39m

parameters at step 1000000 saved to ./RL_models_leo/vtol_2D_ppo_1000000.bson
test reward at step 1000000: -1.53996289396053e7


[32mProgress:  67%|███████████████████████████▋             |  ETA: 0:04:05[39m

test reward at step 1010000: -2.0929678095942806e6


[32mProgress:  68%|███████████████████████████▉             |  ETA: 0:04:02[39m

test reward at step 1020000: -1.3617130152243607e7


[32mProgress:  68%|████████████████████████████             |  ETA: 0:03:58[39m

test reward at step 1030000: -4.637084925210269e7


[32mProgress:  69%|████████████████████████████▍            |  ETA: 0:03:53[39m

test reward at step 1040000: -2.8386210054035604e7


[32mProgress:  70%|████████████████████████████▋            |  ETA: 0:03:47[39m

test reward at step 1050000: -3.7808868070618487e6


[32mProgress:  70%|████████████████████████████▉            |  ETA: 0:03:42[39m

test reward at step 1060000: -2.148246167354283e7


[32mProgress:  71%|█████████████████████████████▏           |  ETA: 0:03:36[39m

test reward at step 1070000: -33433.08837025966


[32mProgress:  72%|█████████████████████████████▌           |  ETA: 0:03:31[39m

test reward at step 1080000: -27976.670199203676


[32mProgress:  73%|█████████████████████████████▊           |  ETA: 0:03:26[39m

test reward at step 1090000: -1.2977602883017477e7


[32mProgress:  73%|██████████████████████████████           |  ETA: 0:03:20[39m

parameters at step 1100000 saved to ./RL_models_leo/vtol_2D_ppo_1100000.bson
test reward at step 1100000: -374062.7588190768


[32mProgress:  74%|██████████████████████████████▍          |  ETA: 0:03:15[39m

test reward at step 1110000: -2.067352455806659e7


[32mProgress:  75%|██████████████████████████████▋          |  ETA: 0:03:11[39m

test reward at step 1120000: -4.8738308694861665e7


[32mProgress:  75%|██████████████████████████████▉          |  ETA: 0:03:06[39m

test reward at step 1130000: -2.3316845601195097e7


[32mProgress:  76%|███████████████████████████████▏         |  ETA: 0:03:01[39m

test reward at step 1140000: -5.468776796618579e6


[32mProgress:  76%|███████████████████████████████▍         |  ETA: 0:02:58[39m

test reward at step 1150000: -30574.606546447085


[32mProgress:  77%|███████████████████████████████▋         |  ETA: 0:02:52[39m

test reward at step 1160000: -5.882050368585386e6


[32mProgress:  78%|███████████████████████████████▉         |  ETA: 0:02:47[39m

test reward at step 1170000: -2.639673279733025e7


[32mProgress:  79%|████████████████████████████████▏        |  ETA: 0:02:42[39m

test reward at step 1180000: -6.244452089254611e6


[32mProgress:  79%|████████████████████████████████▌        |  ETA: 0:02:36[39m

test reward at step 1190000: -4.22548239604011e6


[32mProgress:  80%|████████████████████████████████▊        |  ETA: 0:02:31[39m

parameters at step 1200000 saved to ./RL_models_leo/vtol_2D_ppo_1200000.bson
test reward at step 1200000: -1.813768469226467e7


[32mProgress:  81%|█████████████████████████████████        |  ETA: 0:02:26[39m

test reward at step 1210000: -36310.36730840573


[32mProgress:  81%|█████████████████████████████████▍       |  ETA: 0:02:20[39m

test reward at step 1220000: -333402.9134974944


[32mProgress:  82%|█████████████████████████████████▋       |  ETA: 0:02:15[39m

test reward at step 1230000: -2.00499784440248e7


[32mProgress:  82%|█████████████████████████████████▊       |  ETA: 0:02:13[39m

test reward at step 1240000: -3.367696018251504e7


[32mProgress:  83%|██████████████████████████████████       |  ETA: 0:02:08[39m

test reward at step 1250000: -2.5965085666195676e7


[32mProgress:  84%|██████████████████████████████████▍      |  ETA: 0:02:02[39m

test reward at step 1260000: -2.976073315271607e7


[32mProgress:  84%|██████████████████████████████████▋      |  ETA: 0:01:57[39m

test reward at step 1270000: -27575.148921358752


[32mProgress:  85%|██████████████████████████████████▉      |  ETA: 0:01:52[39m

test reward at step 1280000: -4.368685852605861e7


[32mProgress:  86%|███████████████████████████████████▎     |  ETA: 0:01:46[39m

test reward at step 1290000: -1.516227018269989e7


[32mProgress:  87%|███████████████████████████████████▌     |  ETA: 0:01:41[39m

parameters at step 1300000 saved to ./RL_models_leo/vtol_2D_ppo_1300000.bson
test reward at step 1300000: -37812.549608261645


[32mProgress:  87%|███████████████████████████████████▊     |  ETA: 0:01:36[39m

test reward at step 1310000: -8.29713448563881e6


[32mProgress:  88%|████████████████████████████████████     |  ETA: 0:01:31[39m

test reward at step 1320000: -3.3534101336491723e6


[32mProgress:  89%|████████████████████████████████████▍    |  ETA: 0:01:26[39m

test reward at step 1330000: -1.776778548888056e7


[32mProgress:  89%|████████████████████████████████████▋    |  ETA: 0:01:21[39m

test reward at step 1340000: -152380.8268599607


[32mProgress:  90%|████████████████████████████████████▉    |  ETA: 0:01:15[39m

test reward at step 1350000: -7.994705101990858e6


[32mProgress:  90%|█████████████████████████████████████    |  ETA: 0:01:13[39m

test reward at step 1360000: -2.351138801507992e7


[32mProgress:  91%|█████████████████████████████████████▍   |  ETA: 0:01:07[39m

test reward at step 1370000: -2.2506639752511244e7


[32mProgress:  92%|█████████████████████████████████████▋   |  ETA: 0:01:02[39m

test reward at step 1380000: -39998.46391374209


[32mProgress:  92%|█████████████████████████████████████▉   |  ETA: 0:00:57[39m

test reward at step 1390000: -30047.282376901727


[32mProgress:  93%|██████████████████████████████████████▏  |  ETA: 0:00:52[39m

parameters at step 1400000 saved to ./RL_models_leo/vtol_2D_ppo_1400000.bson
test reward at step 1400000: -4.057456268266435e7


[32mProgress:  94%|██████████████████████████████████████▌  |  ETA: 0:00:46[39m

test reward at step 1410000: -3.5677990221180907e6


[32mProgress:  94%|██████████████████████████████████████▊  |  ETA: 0:00:41[39m

test reward at step 1420000: -8.503031983076964e6


[32mProgress:  95%|███████████████████████████████████████  |  ETA: 0:00:36[39m

test reward at step 1430000: -3.188861599059456e7


[32mProgress:  96%|███████████████████████████████████████▎ |  ETA: 0:00:31[39m

test reward at step 1440000: -269281.9221994902


[32mProgress:  97%|███████████████████████████████████████▋ |  ETA: 0:00:26[39m

test reward at step 1450000: -3.959848287146317e7


[32mProgress:  97%|███████████████████████████████████████▉ |  ETA: 0:00:21[39m

test reward at step 1460000: -3.821680122169447e7


[32mProgress:  98%|████████████████████████████████████████▏|  ETA: 0:00:15[39m

test reward at step 1470000: -4.432260162107362e7


[32mProgress:  99%|████████████████████████████████████████▌|  ETA: 0:00:10[39m

test reward at step 1480000: -3.005220103773562e7


[32mProgress:  99%|████████████████████████████████████████▊|  ETA: 0:00:05[39m

test reward at step 1490000: -2.4063833920675725e7


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:12:27[39m


parameters at step 1500000 saved to ./RL_models_leo/vtol_2D_ppo_1500000.bson
test reward at step 1500000: -1.475999873747642e6


In [None]:

agent.policy.approximator = loadModel();

ReinforcementLearning.run(
    agent,
    env,
    StopAfterStep(1_500_000),
    ComposedHook(
        DoEveryNStep(saveModel, n=100_000),
        DoEveryNStep(validate_policy, n=10_000)),
)

In [None]:
### Plot the stuff
plot(episode_test_reward_hook.rewards)

In [None]:
close_visualization(); # closes the MeshCat visualization