# Init Bionic VTOL

In [1]:
include("../Flyonic.jl");
using .Flyonic;

using Rotations; # used for initial position

using ReinforcementLearning;
using StableRNGs;
using Flux;
using Flux.Losses;
using Random;
using IntervalSets;
using LinearAlgebra;
using Distributions;

using Plots;
using Statistics;

using BSON: @save, @load # save mode

In [2]:
# Start the visualizer used by all `visualization = true` environments below.
# (The recorded cell output shows a MeshCat server being started.)
create_visualization();

┌ Info: MeshCat server started. You can open the visualizer by visiting the following URL in your browser:
│ http://127.0.0.1:8700
└ @ MeshCat /Users/leonardoigler/.julia/packages/MeshCat/Ax8pH/src/visualizer.jl:73


In [3]:
# indicates how many threads Julia was started with. This is important for the multi-threaded environment
Threads.nthreads()

1

In [4]:
# Override the "gravity" entry of the shared VTOL parameter dictionary that is
# later passed to `vtol_model` and `rigid_body_simple`.
# NOTE(review): value is presumably in m/s^2 -- confirm against Flyonic.
eth_vtol_param["gravity"] = 9.81;

In [5]:
# TODO: All this stuff must be replaced later by your guiding paths.

# Fixed goal pose, used as the initial target of every environment and shown
# in the visualizer as the green "fixgoal" model.
DESIRED_x = [-4.0, 0.0, 4.0] # desired distance    
# Angle passed to RotY below; computed by the Flyonic helper from the unit z axis and the goal position.
# NOTE(review): this global named `angle` shadows `Base.angle` in this session.
angle = calculateAngle([0.0 ,0.0, 1.0], DESIRED_x) # 
# Desired orientation as a plain 3x3 matrix.
DESIRED_R = Matrix(UnitQuaternion(RotY(angle)*RotX(pi/2.0)*RotZ(pi/2.0)))

# Goal marker: a VTOL model without actuators, RGBA colour green.
create_VTOL("fixgoal", actuators = false, color_vec=[0.0; 1.0; 0.0; 1.0]);
set_transform("fixgoal", DESIRED_x ,QuatRotation(DESIRED_R)); 

# Create Reinforcement Learning Environment

In [6]:
# Reinforcement-learning environment for the 2D VTOL task.
# Type parameters: A = action space type, T = scalar type, ACT = action type,
# R = random number generator type.
mutable struct VtolEnv{A,T,ACT,R<:AbstractRNG} <: AbstractEnv # parametric subtype of AbstractEnv
    action_space::A
    observation_space::Space{Vector{ClosedInterval{T}}}
    state::Vector{T}    # observation vector handed to the agent
    action::ACT         # last action
    done::Bool          # termination flag, set in `_step!`
    t::T                # elapsed episode time
    rng::R

    name::String # name of the model in the visualizer (needed for multiple environments)
    visualization::Bool
    realtime::Bool
    
    # Everything you need additionally can also go in here.
    x_W::Vector{T}      # position
    v_B::Vector{T}      # velocity (body coordinates)
    R_W::Matrix{T}      # orientation matrix
    ω_B::Vector{T}      # rotational velocity (body coordinates)
    wind_W::Vector{T}   # wind vector passed to `vtol_add_wind`
    Δt::T               # integration time step
    
    # Bonus / Target
    x_d_W::Vector{T}    # desired position
    R_d_W::Matrix{T}    # desired orientation

    # Progress bookkeeping used by the reward
    covered_line::T             # projection of x_W onto the start-goal line, normalised by the goal distance
    previously_covered_line::T  # value of `covered_line` at the last reward evaluation
    reached_goal::Bool          # set once the goal bonus has been paid out
end

In [7]:
# Keyword-based outer constructor for the `VtolEnv` type declared above.
# (It could also be generated with the macro Base.@kwdef.)
#
# Keywords:
#   rng           -- random number generator stored in the environment
#   name          -- name of the model in the visualizer
#   visualization -- create and update a model in the visualizer
#   realtime      -- sleep for Δt after every step
#   kwargs...     -- accepted so callers may pass extra keywords; currently ignored
#
# Returns a fully reset environment.
function VtolEnv(;
    rng = Random.GLOBAL_RNG, # Random number generation
    name = "vtol",
    visualization = false,
    realtime = false,
    kwargs... # let the function take an arbitrary number of keyword arguments
)

    T = Float64; # explicit scalar type used e.g. in the state

    action_space = Space(
        ClosedInterval{T}[
            0.0..2.0, # propeller 1
            0.0..2.0, # propeller 2
            ],
    )

    state_space = Space( # 13 unbounded continuous values
        ClosedInterval{T}[

            # If you are not flying horizontally, you can later switch gravitation
            # back on and counteract it with the rotors as well.
            # In addition, once the drone has flown over its target,
            # it can "fall down" and does not have to turn around.

            # orientate yourself on the state space from the paper
            typemin(T)..typemax(T), # position along x
            typemin(T)..typemax(T), # position along z
            typemin(T)..typemax(T), # orientation along x
            typemin(T)..typemax(T), # orientation along z
            typemin(T)..typemax(T), # velocity along x BODY coordinates
            typemin(T)..typemax(T), # velocity along y BODY coordinates
            typemin(T)..typemax(T), # rotational velocity along z BODY coordinates

            typemin(T)..typemax(T), # position error along x
            typemin(T)..typemax(T), # position error along z
            # Not used in Paper!!!
            typemin(T)..typemax(T), # target rotation along x (better than angle for neural networks)
            typemin(T)..typemax(T), # target rotation along z (better than angle for neural networks)

            typemin(T)..typemax(T), # The distance along the connecting line which has been passed
            typemin(T)..typemax(T), # The distance along the connecting line which has been previously passed
            ],
    )

    if visualization
        create_VTOL(name, actuators = true, color_vec=[1.0; 1.0; 0.6; 1.0]);
    end

    environment = VtolEnv(
        action_space,
        state_space,
        # Size the state buffer from the observation space so the two cannot drift
        # apart (it used to be hard-coded to 11 while the space has 13 entries).
        zeros(T, length(state_space)),
        rand(action_space),
        false, # episode done ?
        zero(T), # time
        rng, # random number generator
        name,
        visualization,
        realtime,
        zeros(T, 3), # x_W
        zeros(T, 3), # v_B
        Matrix{T}(I, 3, 3), # R_W (identity; replaced in reset!)
        zeros(T, 3), # ω_B
        zeros(T, 3), # wind_W
        T(0.025), # Δt
        # TODO Random
        copy(DESIRED_x), # desired position; copied so the global is never aliased
        Matrix{T}(I, 3, 3), # desired orientation (replaced in reset!)

        zero(T), # Covered line
        zero(T), # Previously covered line
        false,   # Reached Goal
    )

    reset!(environment)

    return environment

end;

In [8]:
# TODO Don't get that part. Ask next meeting
# Print the types produced by composing and converting rotations.
# According to the recorded output, the product of two axis rotations is a
# `RotYX{Float64}`, and both `UnitQuaternion(...)` and `QuatRotation(...)`
# yield a `QuatRotation{Float64}` in the installed Rotations version.
print(typeof(RotY(-pi/2.0)*RotX(pi)))
print(typeof(UnitQuaternion(RotY(-pi/2.0)*RotX(pi))))
print(typeof(QuatRotation(UnitQuaternion(RotY(-pi/2.0)*RotX(pi)))))

RotYX{Float64}QuatRotation{Float64}QuatRotation{Float64}

Just for explanation:

1. A mutable struct type is declared. Julia automatically provides a default constructor for it; a constructor is a function that creates new objects of that type.
2. An outer keyword-based constructor method is added for the type declared in the mutable struct definition before.

So now we have a function with two methods. Julia will decide which method to call by multiple dispatch.

In [9]:
# List the constructor methods currently defined for VtolEnv.
methods(VtolEnv)

# Define the RL interface

In [10]:
# Minimal RLBase interface: each method forwards to the matching field of the environment.

# Re-seed the environment's own random number generator.
Random.seed!(env::VtolEnv, seed) = Random.seed!(env.rng, seed)
# Space of valid actions.
RLBase.action_space(env::VtolEnv) = env.action_space
# Space of observations (stored in the `observation_space` field).
RLBase.state_space(env::VtolEnv) = env.observation_space
# True once `_step!` has set the termination flag.
RLBase.is_terminated(env::VtolEnv) = env.done
# Current observation vector (returned by reference, not copied).
RLBase.state(env::VtolEnv) = env.state

In [11]:
# Compute the reward for the current environment state.
#
# Terms:
#   + 100 once per episode, the first time the drone is closer than 1 to the goal
#   + covered_line                  (sign flipped once covered_line exceeds 1)
#   + 100 * change of covered_line  (sign flipped once covered_line exceeds 1)
#   - 0.1 * ω_B[3]^2
#   - squared distance between the first columns of R_W and R_d_W
#
# NOTE(review): this function mutates the environment (`reached_goal`,
# `previously_covered_line`), so calling `reward(env)` twice for the same step
# yields different values. Confirm every consumer calls it exactly once per step.
function computeReward(env::VtolEnv{A,T}) where {A,T}

    # TODO: Add tolerance for VTOL-Drone
    # One-time bonus for getting close to the goal.
    near_goal = zero(T)
    if !env.reached_goal && norm(env.x_W - env.x_d_W) < 1
        near_goal = T(100)
        env.reached_goal = true
    end

    # Penalise spinning around the body z axis.
    limit_rotation = 0.1 * abs2(env.ω_B[3])

    # Progress along the start-goal line counts positively up to the goal
    # (covered_line == 1) and negatively once the drone has passed it.
    direction = env.covered_line > 1 ? -one(T) : one(T)
    progress = direction * env.covered_line
    new_progress = direction * (env.covered_line - env.previously_covered_line) * 100

    # Squared distance between the first columns of current and desired orientation.
    heading_error = env.R_W[:, 1] - env.R_d_W[:, 1]
    difference_angle = sum(abs2, heading_error)

    # Remember the progress for the next reward evaluation.
    env.previously_covered_line = env.covered_line

    return near_goal + progress + new_progress - limit_rotation - difference_angle
end


# RLBase entry point: the reward of an environment is `computeReward(env)`.
RLBase.reward(env::VtolEnv{A,T}) where {A,T} = computeReward(env)

In [12]:
# Reset the environment to the start of a new episode.
#
# - puts the drone at the origin, at rest, in its start orientation
# - draws a new random goal position from the environment's own RNG
#   (x in [-5, 5], y = 0, z in [0, 5]) and derives the desired orientation
# - clears progress bookkeeping, time, action and termination flags
# - rebuilds the 13-element observation vector
# - updates the visualizer, if enabled, with the *new* initial pose and goal
#
# Returns `nothing`.
function RLBase.reset!(env::VtolEnv{A,T}) where {A,T}

    # Rigid-body state at episode start.
    env.x_W = [0.0; 0.0; 0.0];
    env.v_B = [0.0; 0.0; 0.0];
    env.R_W = Matrix(UnitQuaternion(RotZ(-pi/2.0)*RotY(-pi/2.0)*RotX(pi)));
    env.ω_B = [0.0; 0.0; 0.0];
    env.wind_W = [0.0; 0.0; 0.0];

    # Random goal. Sampling from env.rng (instead of the global RNG) keeps
    # separately seeded environments reproducible.
    env.x_d_W = [rand(env.rng, Uniform(-5,5)), 0.0, rand(env.rng, Uniform(0,5))]

    # Desired orientation: start orientation rotated about y by the angle the
    # Flyonic helper computes from the unit z axis and the goal position.
    # Stored explicitly as a matrix, like R_W.
    env.R_d_W = Matrix(UnitQuaternion(RotY(calculateAngle([0.0 ,0.0, 1.0], env.x_d_W))*env.R_W))

    env.covered_line = 0.0
    env.previously_covered_line = 0.0

    env.state = [env.x_W[1];
                 env.x_W[3];
                 env.R_W[1,1];
                 env.R_W[3,1];
                 env.v_B[1];
                 env.v_B[2];
                 env.ω_B[3];
                 env.x_W[1] - env.x_d_W[1];
                 env.x_W[3] - env.x_d_W[3];
                 env.R_d_W[1,1];
                 env.R_d_W[3,1];
                 env.covered_line;
                 env.previously_covered_line]

    env.t = 0.0
    env.action = [0.0, 0.0]
    env.done = false
    env.reached_goal = false

    # Visualize only after the state has been reset; doing it earlier would
    # display the final pose of the previous episode.
    if env.visualization
        set_transform(env.name, env.x_W, QuatRotation(env.R_W));
        set_actuators(env.name, [0.0; 0.0; 0.0; 0.0])

        create_VTOL("fixgoal", actuators = false, color_vec=[0.0; 1.0; 0.0; 1.0]);
        set_transform("fixgoal", env.x_d_W ,QuatRotation(env.R_d_W));
    end

    return nothing

end;

In [13]:
# Make VtolEnv objects callable: `env(a)` applies action `a` and advances the
# simulation by one step.
#
# `a` holds the two propeller commands. The simulation step takes four
# actuator values, so the remaining two (the flaps) are fixed to zero in this
# 2D setup.
function (env::VtolEnv)(a)
    propeller_1, propeller_2 = a[1], a[2]
    return _step!(env, [propeller_1, propeller_2, 0.0, 0.0])
end

In [14]:
# Single environment with default settings, used for the interface checks below.
env = VtolEnv();

In [15]:
methods(env) # Just to explain which methods the object has

In [16]:
# Advance the environment by one time step Δt using the four actuator
# commands in `next_action`.
#
# Updates the rigid-body state, the progress along the start-goal line, the
# elapsed time, the observation vector and the termination flag.
# Returns `nothing`.
function _step!(env::VtolEnv, next_action)

    # Velocity with wind applied, then the forces and torques from the VTOL model.
    v_in_wind_B = vtol_add_wind(env.v_B, env.R_W, env.wind_W)
    torque_B, force_B = vtol_model(v_in_wind_B, next_action, eth_vtol_param)

    # Restrict the motion to 2D: no force/velocity along body z,
    # no torque/rotation about body x and y.
    force_B[3] = 0.0
    env.v_B[3] = 0.0
    torque_B[1] = 0.0
    torque_B[2] = 0.0
    env.ω_B[1] = 0.0
    env.ω_B[2] = 0.0

    # Integrate the rigid-body dynamics over Δt. The fifth return value is
    # not needed because env.t is advanced explicitly below.
    env.x_W, env.v_B, env.R_W, env.ω_B, _ = rigid_body_simple(
        torque_B, force_B, env.x_W, env.v_B, env.R_W, env.ω_B, env.t, env.Δt, eth_vtol_param,
    )

    # Projection of the position onto the start-goal line, normalised so that
    # the goal itself corresponds to 1.
    env.covered_line = dot(env.x_W, env.x_d_W) / (norm(env.x_d_W)^2)

    if env.realtime
        sleep(env.Δt) # TODO: just a dirty hack. this is of course slower than real time.
    end

    # Show the new state.
    if env.visualization
        set_transform(env.name, env.x_W, QuatRotation(env.R_W))
        set_actuators(env.name, next_action)
    end

    env.t += env.Δt

    # Observation vector, in the order of the declared state space.
    observation = (
        env.x_W[1],                   #  1: position along x
        env.x_W[3],                   #  2: position along z
        env.R_W[1, 1],                #  3: orientation along x
        env.R_W[3, 1],                #  4: orientation along z
        env.v_B[1],                   #  5: body velocity along x
        env.v_B[2],                   #  6: body velocity along y
        env.ω_B[3],                   #  7: body rotational velocity about z
        env.x_W[1] - env.x_d_W[1],    #  8: position error along x
        env.x_W[3] - env.x_d_W[3],    #  9: position error along z
        env.R_d_W[1, 1],              # 10: target rotation along x
        env.R_d_W[3, 1],              # 11: target rotation along z
        env.covered_line,             # 12: covered distance along line after step
        env.previously_covered_line,  # 13: covered distance along line before step
    )
    for (i, value) in enumerate(observation)
        env.state[i] = value
    end

    # Termination criteria
    # TODO: Use many termination criteria so that you do not train unnecessarily in wrong areas
    rotating_too_fast = norm(env.ω_B) > 100.0
    moving_too_fast = norm(env.v_B) > 100.0
    too_low = env.x_W[3] < -5.0                             # more than 5 below the start height
    at_target = sum(abs2, env.x_W - env.x_d_W) < 0.5        # squared distance to the goal
    out_of_time = env.t > 10.0                              # stop after 10s

    env.done = rotating_too_fast || moving_too_fast || too_low || at_target || out_of_time
    return nothing
end;

In [17]:
# Run the interface test from RLBase against the environment
# (the recorded output reports "random policy with VtolEnv": 2000 of 2000 passed).
RLBase.test_runnable!(env)

[0m[1mTest Summary:              | [22m[32m[1mPass  [22m[39m[36m[1mTotal  [22m[39m[0m[1mTime[22m


random policy with VtolEnv | [32m2000  [39m[36m 2000  [39m[0m0.7s


Test.DefaultTestSet("random policy with VtolEnv", Any[], 2000, false, false, true, 1.669566671617692e9, 1.669566672313042e9)

Show an overview of the environment.

# Setup of a reinforcement learning experiment.

In [18]:
# Experiment-wide settings.
seed = 123
rng = StableRNG(seed)   # RNG used for the network initialisation
N_ENV = 8               # number of environments stepped in parallel
UPDATE_FREQ = 1024      # passed as update_freq to the policy and as capacity to the trajectory

# Multiple environments for parallel training. Each one gets its own seeded
# RNG and its own name (names must differ for the visualization).
env = MultiThreadEnv(
    map(i -> VtolEnv(; rng = StableRNG(hash(seed + i)), name = "vtol$i"), 1:N_ENV),
)

MultiThreadEnv(8 x VtolEnv)

In [19]:
# Function approximator: an actor-critic pair whose networks each start with
# two hidden layers of 32 relu units.
ns = length(state(env[1]))          # number of observation entries
na = length(action_space(env[1]))   # number of action entries

approximator = let init = glorot_uniform(rng)
    # The two hidden layers shared in shape (not in weights) by actor and critic.
    trunk() = (
        Dense(ns, 32, relu; initW = init),
        Dense(32, 32, relu; initW = init),
    )

    ActorCritic(
        # Actor: Gaussian policy with separate heads for mean and log standard deviation.
        actor = GaussianNetwork(
            pre = Chain(trunk()...),
            μ = Chain(Dense(32, na; initW = init)),
            logσ = Chain(Dense(32, na; initW = init)),
        ),
        # Critic: single scalar output.
        critic = Chain(trunk()..., Dense(32, 1; initW = init)),
        optimizer = ADAM(1e-3),
    )
end;

In [20]:
    # The agent wraps a policy together with the trajectory buffer that stores
    # the transitions between the agent and the environments.
    agent = let
        # For parameters visit the docu: https://juliareinforcementlearning.org/docs/rlzoo/#ReinforcementLearningZoo.PPOPolicy
        ppo_policy = PPOPolicy(;
            approximator = gpu(approximator),
            update_freq = UPDATE_FREQ,
            dist = Normal,
        )

        # One column per environment for every stored quantity.
        ppo_trajectory = PPOTrajectory(;
            capacity = UPDATE_FREQ,
            state = Matrix{Float64} => (ns, N_ENV),
            action = Matrix{Float64} => (na, N_ENV),
            action_log_prob = Vector{Float64} => (N_ENV,),
            reward = Vector{Float64} => (N_ENV,),
            terminal = Vector{Bool} => (N_ENV,),
        )

        Agent(policy = ppo_policy, trajectory = ppo_trajectory)
    end;


┌ Info: The GPU function is being called but the GPU is not accessible. 
│ Defaulting back to the CPU. (No action is required if you want to run on the CPU).
└ @ Flux /Users/leonardoigler/.julia/packages/Flux/7nTyc/src/functor.jl:187


In [21]:
# Hook: save the current approximator (moved to the CPU) as a BSON file named
# after the training step `t`.
#
# `env` is unused; it is part of the signature expected by `DoEveryNStep`.
# The BSON key is the variable name `model`, which `loadModel` relies on.
function saveModel(t, agent, env)
    model = cpu(agent.policy.approximator)
    dir = "./RL_models/"
    # Create the target directory on first use; without it the save fails and
    # the training run aborts at the first checkpoint.
    mkpath(dir)
    f = joinpath(dir, "vtol_2D_ppo_$t.bson")
    @save f model
    println("parameters at step $t saved to $f")
    return nothing
end;

In [22]:
# Load an approximator previously written by `saveModel`.
#
# `step` selects the checkpoint file `./RL_models/vtol_2D_ppo_<step>.bson`.
# The default keeps the previously hard-coded step; pass the step of an
# existing checkpoint (e.g. `loadModel(400_000)`) to load a different one.
function loadModel(step::Integer = 2_000_000)
    f = joinpath("./RL_models/", "vtol_2D_ppo_$step.bson")
    @load f model
    return model
end;

In [23]:
# Hook: run the current policy for one episode in the dedicated test
# environment and print the total reward of that episode.
#
# `env` is unused; it is part of the signature expected by `DoEveryNStep`.
# Relies on the globals `test_env` and `episode_test_reward_hook` defined below.
function validate_policy(t, agent, env)
    one_episode = StopAfterEpisode(1)
    ReinforcementLearning.run(agent.policy, test_env, one_episode, episode_test_reward_hook)

    # TODO Modify: Not mean, rather last step or last 5 steps
    println("test reward at step $t: $(episode_test_reward_hook.rewards[end])")
    return nothing
end;

# Collects the total reward of every validation episode.
episode_test_reward_hook = TotalRewardPerEpisode(; is_display_on_exit = false)
# Environment used only for validation: visualized and slowed down to (roughly) real time.
test_env = VtolEnv(; name = "testVTOL", visualization = true, realtime = true);

In [24]:
#agent.policy.approximator = loadModel();

In [25]:
# Train the agent on the parallel environments for one million steps.
# Every 100_000 steps the model is saved, every 10_000 steps the policy is
# validated in the test environment.
ReinforcementLearning.run(
    agent,
    env,
    StopAfterStep(1_000_000),
    ComposedHook(
        DoEveryNStep(saveModel, n=100_000), 
        DoEveryNStep(validate_policy, n=10_000)),
)

[32mProgress:   0%|                                         |  ETA: 18.65 days[39m[K

[32mProgress:   0%|                                         |  ETA: 5:43:26[39m[K

[32mProgress:   1%|▎                                        |  ETA: 0:55:49[39m[K

test reward at step 10000: -570.5859795040183


[32mProgress:   1%|▌                                        |  ETA: 0:39:38[39m[K

[32mProgress:   1%|▋                                        |  ETA: 0:29:49[39m[K

[32mProgress:   2%|▊                                        |  ETA: 0:24:10[39m[K

test reward at step 20000: -36.0459806878078


[32mProgress:   2%|█                                        |  ETA: 0:22:11[39m[K

[32mProgress:   3%|█▏                                       |  ETA: 0:19:10[39m[K

test reward at step 30000: -386.635959772928


[32mProgress:   3%|█▎                                       |  ETA: 0:18:30[39m[K

[32mProgress:   4%|█▌                                       |  ETA: 0:16:48[39m[K

[32mProgress:   4%|█▋                                       |  ETA: 0:15:27[39m[K

test reward at step 40000: 18.879445615122318


[32mProgress:   4%|█▊                                       |  ETA: 0:14:59[39m[K

[32mProgress:   5%|██                                       |  ETA: 0:13:55[39m[K

test reward at step 50000: -14.568859368465965


[32mProgress:   5%|██▏                                      |  ETA: 0:13:51[39m[K

[32mProgress:   6%|██▎                                      |  ETA: 0:13:07[39m[K

[32mProgress:   6%|██▌                                      |  ETA: 0:12:28[39m[K

test reward at step 60000: -274.7849616496483


[32mProgress:   6%|██▋                                      |  ETA: 0:12:23[39m[K

[32mProgress:   7%|██▊                                      |  ETA: 0:11:46[39m[K

test reward at step 70000: -29.143700176266893


[32mProgress:   7%|███                                      |  ETA: 0:11:41[39m[K

[32mProgress:   8%|███▏                                     |  ETA: 0:11:12[39m[K

[32mProgress:   8%|███▎                                     |  ETA: 0:11:08[39m[K

test reward at step 80000: 20.264404890600158


[32mProgress:   8%|███▌                                     |  ETA: 0:10:41[39m[K

[32mProgress:   9%|███▋                                     |  ETA: 0:10:18[39m[K

test reward at step 90000: -140.5890027006948


[32mProgress:   9%|███▉                                     |  ETA: 0:10:17[39m[K

[32mProgress:  10%|████                                     |  ETA: 0:09:58[39m[K

parameters at step 100000 saved to ./RL_models/vtol_2D_ppo_100000.bson


test reward at step 100000: 64.07219796665733


[32mProgress:  10%|████▏                                    |  ETA: 0:10:11[39m[K

[32mProgress:  11%|████▍                                    |  ETA: 0:09:54[39m[K

[32mProgress:  11%|████▌                                    |  ETA: 0:09:37[39m[K

test reward at step 110000: 61.02453555202068


[32mProgress:  11%|████▋                                    |  ETA: 0:09:46[39m[K

[32mProgress:  12%|████▉                                    |  ETA: 0:09:30[39m[K

test reward at step 120000: -15.002679544689236


[32mProgress:  12%|█████                                    |  ETA: 0:09:29[39m[K

[32mProgress:  13%|█████▏                                   |  ETA: 0:09:16[39m[K

[32mProgress:  13%|█████▍                                   |  ETA: 0:09:03[39m[K

test reward at step 130000: 30.64169245516794


[32mProgress:  13%|█████▌                                   |  ETA: 0:09:06[39m[K

[32mProgress:  14%|█████▋                                   |  ETA: 0:08:58[39m[K

[32mProgress:  14%|█████▊                                   |  ETA: 0:09:02[39m[K

test reward at step 140000: 46.75008322440721


[32mProgress:  14%|█████▉                                   |  ETA: 0:08:52[39m[K

[32mProgress:  15%|██████                                   |  ETA: 0:08:42[39m[K

test reward at step 150000: 151.57300160109446


[32mProgress:  15%|██████▎                                  |  ETA: 0:08:58[39m[K

[32mProgress:  16%|██████▍                                  |  ETA: 0:08:48[39m[K

[32mProgress:  16%|██████▌                                  |  ETA: 0:08:39[39m[K

test reward at step 160000: -129.4227282586199


[32mProgress:  16%|██████▋                                  |  ETA: 0:08:40[39m[K

[32mProgress:  17%|██████▉                                  |  ETA: 0:08:30[39m[K

test reward at step 170000: 6.121818913538493


[32mProgress:  17%|███████                                  |  ETA: 0:08:30[39m[K

[32mProgress:  17%|███████▏                                 |  ETA: 0:08:21[39m[K

[32mProgress:  18%|███████▍                                 |  ETA: 0:08:13[39m[K

test reward at step 180000: 50.45407383420311


[32mProgress:  18%|███████▌                                 |  ETA: 0:08:13[39m[K

[32mProgress:  19%|███████▋                                 |  ETA: 0:08:05[39m[K

[32mProgress:  19%|███████▊                                 |  ETA: 0:07:58[39m[K

test reward at step 190000: 204.88175503382797


[32mProgress:  19%|███████▉                                 |  ETA: 0:08:00[39m[K

[32mProgress:  20%|████████▏                                |  ETA: 0:07:52[39m[K

parameters at step 200000 saved to ./RL_models/vtol_2D_ppo_200000.bson


test reward at step 200000: 79.43950871964581


[32mProgress:  20%|████████▎                                |  ETA: 0:07:59[39m[K

[32mProgress:  20%|████████▍                                |  ETA: 0:07:53[39m[K

[32mProgress:  21%|████████▌                                |  ETA: 0:07:47[39m[K

test reward at step 210000: -123.93955438278095


[32mProgress:  21%|████████▋                                |  ETA: 0:08:01[39m[K

[32mProgress:  21%|████████▊                                |  ETA: 0:07:54[39m[K

[32mProgress:  22%|█████████                                |  ETA: 0:07:48[39m[K

test reward at step 220000: 50.1355947645202


[32mProgress:  22%|█████████▏                               |  ETA: 0:07:49[39m[K

[32mProgress:  23%|█████████▎                               |  ETA: 0:07:43[39m[K

[32mProgress:  23%|█████████▍                               |  ETA: 0:07:37[39m[K

test reward at step 230000: -38.78966065021446


[32mProgress:  23%|█████████▌                               |  ETA: 0:07:36[39m[K

[32mProgress:  24%|█████████▊                               |  ETA: 0:07:30[39m[K

test reward at step 240000: 169.83650318685704


[32mProgress:  24%|█████████▉                               |  ETA: 0:07:27[39m[K

[32mProgress:  25%|██████████                               |  ETA: 0:07:20[39m[K

[32mProgress:  25%|██████████▎                              |  ETA: 0:07:14[39m[K

test reward at step 250000: -9.006658371798556


[32mProgress:  25%|██████████▍                              |  ETA: 0:07:11[39m[K

[32mProgress:  26%|██████████▌                              |  ETA: 0:07:05[39m[K

test reward at step 260000: 162.5280417008013


[32mProgress:  26%|██████████▋                              |  ETA: 0:07:13[39m[K

[32mProgress:  27%|██████████▉                              |  ETA: 0:07:06[39m[K

[32mProgress:  27%|███████████                              |  ETA: 0:07:00[39m[K

test reward at step 270000: 79.64466402162388


[32mProgress:  27%|███████████▎                             |  ETA: 0:07:01[39m[K

[32mProgress:  28%|███████████▍                             |  ETA: 0:06:55[39m[K

test reward at step 280000: 19.74694996250463


[32mProgress:  28%|███████████▌                             |  ETA: 0:06:56[39m[K

[32mProgress:  28%|███████████▋                             |  ETA: 0:06:50[39m[K

[32mProgress:  29%|███████████▉                             |  ETA: 0:06:45[39m[K

test reward at step 290000: -80.01076963716903


[32mProgress:  29%|████████████                             |  ETA: 0:06:44[39m[K

[32mProgress:  30%|████████████▏                            |  ETA: 0:06:40[39m[K

parameters at step 300000 saved to ./RL_models/vtol_2D_ppo_300000.bson


test reward at step 300000: 46.044137411666085


[32mProgress:  30%|████████████▎                            |  ETA: 0:06:36[39m[K

[32mProgress:  30%|████████████▌                            |  ETA: 0:06:32[39m[K

[32mProgress:  31%|████████████▋                            |  ETA: 0:06:28[39m[K

test reward at step 310000: 48.05547741596472


[32mProgress:  31%|████████████▊                            |  ETA: 0:06:25[39m[K

[32mProgress:  31%|████████████▉                            |  ETA: 0:06:21[39m[K

[32mProgress:  32%|█████████████                            |  ETA: 0:06:18[39m[K

test reward at step 320000: 74.03998681224247


[32mProgress:  32%|█████████████▏                           |  ETA: 0:06:22[39m[K

[32mProgress:  33%|█████████████▍                           |  ETA: 0:06:15[39m[K

test reward at step 330000: 63.80311120253946


[32mProgress:  33%|█████████████▌                           |  ETA: 0:06:15[39m[K

[32mProgress:  33%|█████████████▊                           |  ETA: 0:06:10[39m[K

[32mProgress:  34%|█████████████▉                           |  ETA: 0:06:06[39m[K

test reward at step 340000: 50.57779764589197


[32mProgress:  34%|██████████████                           |  ETA: 0:06:07[39m[K

[32mProgress:  35%|██████████████▏                          |  ETA: 0:06:03[39m[K

[32mProgress:  35%|██████████████▍                          |  ETA: 0:05:59[39m[K

test reward at step 350000: -8.383684070315054


[32mProgress:  35%|██████████████▌                          |  ETA: 0:05:59[39m[K

[32mProgress:  36%|██████████████▋                          |  ETA: 0:05:56[39m[K

[32mProgress:  36%|██████████████▊                          |  ETA: 0:05:52[39m[K

test reward at step 360000: 46.73167729994646


[32mProgress:  36%|██████████████▉                          |  ETA: 0:05:52[39m[K

[32mProgress:  37%|███████████████                          |  ETA: 0:05:49[39m[K

test reward at step 370000: 170.52698234428647


[32mProgress:  37%|███████████████▏                         |  ETA: 0:05:53[39m[K

[32mProgress:  37%|███████████████▎                         |  ETA: 0:05:51[39m[K

[32mProgress:  38%|███████████████▍                         |  ETA: 0:05:47[39m[K

[32mProgress:  38%|███████████████▋                         |  ETA: 0:05:44[39m[K

test reward at step 380000: -91.76957352519142


[32mProgress:  38%|███████████████▊                         |  ETA: 0:05:44[39m[K

[32mProgress:  39%|███████████████▊                         |  ETA: 0:05:42[39m[K

[32mProgress:  39%|███████████████▉                         |  ETA: 0:05:40[39m[K

test reward at step 390000: 23.84190566419688


[32mProgress:  39%|████████████████                         |  ETA: 0:05:41[39m[K

[32mProgress:  39%|████████████████▏                        |  ETA: 0:05:37[39m[K

[32mProgress:  40%|████████████████▍                        |  ETA: 0:05:33[39m[K

parameters at step 400000 saved to ./RL_models/vtol_2D_ppo_400000.bson


test reward at step 400000: -11.332597775646715


[32mProgress:  40%|████████████████▌                        |  ETA: 0:05:32[39m[K

In [None]:
# Plot the total reward of each validation episode collected during training.
plot(episode_test_reward_hook.rewards)

In [None]:
close_visualization(); # closes the MeshCat visualization