# Init Bionic VTOL

In [1]:
include("../Flyonic.jl");
using .Flyonic; #simulator kram

using Rotations; # used for initial position

#find packages on julia hub
using ReinforcementLearning; 
using StableRNGs;
using Flux; #DL
using Flux.Losses;
using Random;
using IntervalSets;
using LinearAlgebra;
using Distributions;

using Plots;
using Statistics;

using BSON: @save, @load # save and load model

In [2]:
create_visualization(); #from flyonics package

[36m[1m┌ [22m[39m[36m[1mInfo: [22m[39mMeshCat server started. You can open the visualizer by visiting the following URL in your browser:
[36m[1m└ [22m[39mhttp://127.0.0.1:8700


In [3]:
# indicates how many threads Julia was started with. This is important for the multi-threaded environment
Threads.nthreads()

1

# Create Reinforcement Learning Environment

In [4]:
#connect RL-library with simulation via this environment
mutable struct VtolEnv{A,T,ACT,R<:AbstractRNG} <: AbstractEnv # Parametric Constructor for a subtype of AbstractEnv
    #required
    action_space::A
    observation_space::Space{Vector{ClosedInterval{T}}}
    state::Vector{T} #state of system; goes in policy
    action::ACT #action that system does next
    done::Bool #e.g. drone crashed
    t::T
    rng::R

    name::String #for multible environoments
    visualization::Bool
    realtime::Bool # realtime
    
    # Everything you need aditionaly can also go in here.
    #additional states for simulation; not for policy
    x_W::Vector{T}
    v_B::Vector{T}
    R_W::Matrix{T}
    ω_B::Vector{T}
    wind_W::Vector{T}
    Δt::T
end

In [5]:
# define a keyword-based constructor for the type declared in the mutable struct typedef. 
# It could also be done with the macro Base.@kwdef.
function VtolEnv(;
     
    #continuous = true,
    rng = Random.GLOBAL_RNG, # Random number generation
    name = "vtol",
    visualization = false,
    realtime = false, # realtime
    kwargs... # let the function take an arbitrary number of keyword arguments 
)
    
    T = Float64; # explicit type which is used e.g. in state. Cannot be altered due to the poor matrix defininon.

    #action_space = Base.OneTo(21) # 21 discrete positions for the flaps
    
    #here: two actions; continuous
    #Beginning: stay 2D
    #Later: 4D: independent rotors and flaps (thats all actuators)
    action_space = Space(
        ClosedInterval{T}[
            0.0..2.0, # thrust
            -1.0..1.0, # flaps
            ], 
    )

    #reduced to 2D for now
    state_space = Space( # Three continuous values in state space.
        ClosedInterval{T}[
            typemin(T)..typemax(T), # rotation arround y
            typemin(T)..typemax(T), # rotation velocity arround y
            typemin(T)..typemax(T), # world position along x
            typemin(T)..typemax(T), # world position along z
            ], 
    )
    
    if visualization
        create_VTOL(name, actuators = true, color_vec=[1.0; 1.0; 0.6; 1.0]); #for viz
        set_transform(name, [0.0; 0.0; 0.0] ,QuatRotation(UnitQuaternion(RotY(-pi/2.0)*RotX(pi)))); #for viz
        set_actuators(name, [0.0; 0.0; 0.0; 0.0]) #for viz
    end

    #instantiates the sctruct from before
    environment = VtolEnv(
        action_space,
        state_space,
        zeros(T, length(state_space)), # current state, needs to be extended.
        rand(action_space),
        false, # episode done ?
        0.0, # time
        rng, # random number generator  
        name,
        visualization,
        realtime,
        zeros(T, 3), # x_W
        zeros(T, 3), # v_B
        Matrix(UnitQuaternion(RotY(-pi/2.0)*RotX(pi))), # Float64... so T needs to be Float64
        zeros(T, 3), # ω_B
        zeros(T, 3), # wind_W
        T(0.025), # Δt  
    )
    
    #do this for simulation start
    reset!(environment)
    
    return environment
    
end;

Just for explanation:

1. A mutable Struct is created. A struct is a constructor and a constructor is a function that creates new objects.
2. A outer keyword-based constructor method is added for the type declared in the mutable struct typedef before.

So now we have a function with two methods. Julia will decide which method to call by multiple dispatch.

In [6]:
methods(VtolEnv)

# Define the RL interface

In [7]:
Random.seed!(env::VtolEnv, seed) = Random.seed!(env.rng, seed)
RLBase.action_space(env::VtolEnv) = env.action_space
RLBase.state_space(env::VtolEnv) = env.observation_space
RLBase.is_terminated(env::VtolEnv) = env.done
RLBase.state(env::VtolEnv) = env.state

In [8]:
function computeReward(env::VtolEnv{A,T}) where {A,T}
    
    #this rewards makes drone go straight up :)
    stay_alive = 3.0
    not_upright_orientation = abs(env.state[1]-pi*0.5)*10.0
    not_centered_position = abs(env.state[3])*10.0
    hight = env.state[4]*100.0
    
    return stay_alive - not_upright_orientation - not_centered_position + hight
end


RLBase.reward(env::VtolEnv{A,T}) where {A,T} = computeReward(env)

In [9]:
function RLBase.reset!(env::VtolEnv{A,T}) where {A,T}
    
    # Visualize initial state
    if env.visualization
        set_transform(env.name, env.x_W,QuatRotation(env.R_W));
        set_actuators(env.name, [0.0; 0.0; 0.0; 0.0])
    end
        
    env.x_W = [0.0; 0.0; 0.0];
    env.v_B = [0.0; 0.0; 0.0];
    env.R_W = Matrix(UnitQuaternion(RotY(-pi/2.0)*RotX(pi)));
    env.ω_B = [0.0; 0.0; 0.0];
    env.wind_W = [0.0; 0.0; 0.0]; #use this for gusty condition

 
    env.state = [env.ω_B[2]; Rotations.params(RotYXZ(env.R_W))[1]; env.x_W[1]; env.x_W[3]]
    env.t = 0.0
    env.action = [0.0]
    env.done = false
    nothing
end;

In [10]:
# defines a methods for a callable object.
# So when a VtolEnv object is created, it has this method that can be called
# Apply chosen actions on simulator
# add a third function to environment with action a
function (env::VtolEnv)(a)

    # set the propeller trust and the two flaps 2D case
    next_action = [a[1], a[1], a[2], a[2]]
   
    _step!(env, next_action)
end

In [11]:
env = VtolEnv()

# VtolEnv

## Traits

| Trait Type        |                  Value |
|:----------------- | ----------------------:|
| NumAgentStyle     |          SingleAgent() |
| DynamicStyle      |           Sequential() |
| InformationStyle  | ImperfectInformation() |
| ChanceStyle       |           Stochastic() |
| RewardStyle       |           StepReward() |
| UtilityStyle      |           GeneralSum() |
| ActionStyle       |     MinimalActionSet() |
| StateStyle        |     Observation{Any}() |
| DefaultStateStyle |     Observation{Any}() |

## Is Environment Terminated?

No

## State Space

`Space{Vector{ClosedInterval{Float64}}}(ClosedInterval{Float64}[-Inf..Inf, -Inf..Inf, -Inf..Inf, -Inf..Inf])`

## Action Space

`Space{Vector{ClosedInterval{Float64}}}(ClosedInterval{Float64}[0.0..2.0, -1.0..1.0])`

## Current State

```
[0.0, 1.5707963267948966, 0.0, 0.0]
```


In [12]:
methods(env) # Just to explain which methods the object has

In [13]:
# actual step for each simulation iteration
# Trajectory: generate externally; and then do some calculation in the step; in the environment add additional state: v_soll; v_ist --> use in reward function
# Generate trajectories randomly, so that iterations train on different trajectories
# Test set: always use same trajectory
# TODO maybe we can get trajectory generation from last year
# Evaluation metric: how well do I track my velocity trajectory? Look for 2-3 metrics here and thats ok
function _step!(env::VtolEnv, next_action)
        
    # caluclate wind impact
    v_in_wind_B = vtol_add_wind(env.v_B, env.R_W, env.wind_W)
    # caluclate aerodynamic forces
    torque_B, force_B = vtol_model(v_in_wind_B, next_action, eth_vtol_param);
    # integrate rigid body dynamics for Δt
    # W-world KOS; B-body KOS; Simulation is already complete
    env.x_W, env.v_B, env.R_W, env.ω_B, time = rigid_body_simple(torque_B, force_B, env.x_W, env.v_B, env.R_W, env.ω_B, env.t, env.Δt, eth_vtol_param)


    if env.realtime
        sleep(env.Δt); # just a dirty hack. this is of course slower than real time.
    end
    
    # Visualize the new state 
    # TODO: Can be removed for real trainings
    if env.visualization
        set_transform(env.name, env.x_W, QuatRotation(env.R_W));
        set_actuators(env.name, next_action)
    end
 
    env.t += env.Δt
    
    # State space
    # Pass more states to RF-Learning-Agent, if required
    rot = Rotations.params(RotYXZ(env.R_W))[1]
    env.state[1] = rot # rotation arround y
    env.state[2] = env.ω_B[2] # rotation velocity arround y
    env.state[3] = env.x_W[1] # world position along x
    env.state[4] = env.x_W[3] # world position along z
    
    
    # Termination criteria
    env.done =
        #norm(v_B) > 2.0 || # stop if body is too fast
        env.x_W[3] < -1.0 || # stop if body is below -1m
        0.0 > rot || # Stop if the drone is pitched 90°.
        rot > pi || # Stop if the drone is pitched 90°.
        env.t > 10 # stop after 10s
    nothing #return nothing
end;

In [14]:
RLBase.test_runnable!(env)

[0m[1mTest Summary:              | [22m[32m[1mPass  [22m[39m[36m[1mTotal  [22m[39m[0m[1mTime[22m
random policy with VtolEnv | [32m2000  [39m[36m 2000  [39m[0m1.0s


Test.DefaultTestSet("random policy with VtolEnv", Any[], 2000, false, false, true, 1.673884352576474e9, 1.673884353612373e9)

Show an overview of the environment.

# Setup of a reinforcement learning experiment.

In [15]:
seed = 123    
rng = StableRNG(seed)
    N_ENV = 8 #number of envs
    UPDATE_FREQ = 1024
    
    
    # define multiple environments for parallel training
    env = MultiThreadEnv([
        # use different names for the visualization
        VtolEnv(; rng = StableRNG(hash(seed+i)), name = "vtol$i", visualization = false) for i in 1:N_ENV
    ])

MultiThreadEnv(8 x VtolEnv)

In [16]:
# Define the function approximator
    ns, na = length(state(env[1])), length(action_space(env[1]))
    #ActorCritic Policy
    approximator = ActorCritic(
                #ns - number states as input
                #3 layer; last layer splitted in mean and variance; then action is sampled
                actor = GaussianNetwork(
                    pre = Chain(
                    Dense(ns, 16, relu; initW = glorot_uniform(rng)),#
                    Dense(16, 16, relu; initW = glorot_uniform(rng)),
                    ),
                    μ = Chain(Dense(16, na; initW = glorot_uniform(rng))),
                    logσ = Chain(Dense(16, na; initW = glorot_uniform(rng))),
                ),
                critic = Chain(
                    Dense(ns, 16, relu; initW = glorot_uniform(rng)),
                    Dense(16, 16, relu; initW = glorot_uniform(rng)),
                    Dense(16, 1; initW = glorot_uniform(rng)),
                ),
                optimizer = ADAM(1e-3),
            );

In [17]:
    #learning
    agent = Agent( # A wrapper of an AbstractPolicy
        # AbstractPolicy: the policy to use
        policy = PPOPolicy(;
                    approximator = approximator |> gpu,
                    update_freq=UPDATE_FREQ,
                    dist = Normal,
                    # For parameters visit the docu: https://juliareinforcementlearning.org/docs/rlzoo/#ReinforcementLearningZoo.PPOPolicy
                    ),
        
        # AbstractTrajectory: used to store transitions between an agent and an environment source
        # depends on RL-Algorithm
        trajectory = PPOTrajectory(;
            capacity = UPDATE_FREQ,
            state = Matrix{Float64} => (ns, N_ENV),
            action = Matrix{Float64} => (na, N_ENV),
            action_log_prob = Vector{Float64} => (N_ENV,),
            reward = Vector{Float64} => (N_ENV,),
            terminal = Vector{Bool} => (N_ENV,),
        ),
    )


[36m[1m┌ [22m[39m[36m[1mInfo: [22m[39mThe GPU function is being called but the GPU is not accessible. 
[36m[1m└ [22m[39mDefaulting back to the CPU. (No action is required if you want to run on the CPU).


typename(Agent)
├─ policy => typename(PPOPolicy)
│  ├─ approximator => typename(ActorCritic)
│  │  ├─ actor => typename(GaussianNetwork)
│  │  │  ├─ pre => typename(Chain)
│  │  │  │  └─ layers
│  │  │  │     ├─ 1
│  │  │  │     │  └─ typename(Dense)
│  │  │  │     │     ├─ weight => 16×4 Matrix{Float32}
│  │  │  │     │     ├─ bias => 16-element Vector{Float32}
│  │  │  │     │     └─ σ => typename(typeof(relu))
│  │  │  │     └─ 2
│  │  │  │        └─ typename(Dense)
│  │  │  │           ├─ weight => 16×16 Matrix{Float32}
│  │  │  │           ├─ bias => 16-element Vector{Float32}
│  │  │  │           └─ σ => typename(typeof(relu))
│  │  │  ├─ μ => typename(Chain)
│  │  │  │  └─ layers
│  │  │  │     └─ 1
│  │  │  │        └─ typename(Dense)
│  │  │  │           ├─ weight => 2×16 Matrix{Float32}
│  │  │  │           ├─ bias => 2-element Vector{Float32}
│  │  │  │           └─ σ => typename(typeof(identity))
│  │  │  ├─ logσ => typename(Chain)
│  │  │  │  └─ layers
│  │  │  │     └─ 1


In [18]:
function saveModel(t, agent, env)
    model = cpu(agent.policy.approximator)   
    f = joinpath("./RL_models/", "vtol_ppo_2_$t.bson")
    @save f model
    println("parameters at step $t saved to $f")
end

saveModel (generic function with 1 method)

In [19]:
function loadModel()
    # TODO use correct relative path here
    f = joinpath("./RL_models/", "vtol_ppo_2_9320000.bson")
    @load f model
    return model
end

loadModel (generic function with 1 method)

In [20]:
function validate_policy(t, agent, env)
    run(agent.policy, test_env, StopAfterEpisode(1), episode_test_reward_hook)
    # the result of the hook
    println("test reward at step $t: $(episode_test_reward_hook.rewards[end])")
    
end;

episode_test_reward_hook = TotalRewardPerEpisode(;is_display_on_exit=false)
# create a env only for reward test
test_env = VtolEnv(;name = "testVTOL", visualization = true, realtime = true);

In [21]:
#use pretrained model
#agent.policy.approximator = loadModel();

In [22]:
#run actual training
#here: connect tensorboard
run(
           agent,
           env,
           StopAfterStep(1_000_000),
           ComposedHook(
                DoEveryNStep(saveModel, n=100_000), 
                DoEveryNStep(validate_policy, n=10_000)),
       )

[32mProgress:   1%|▍                                        |  ETA: 0:55:32[39m39m

test reward at step 10000: -899.1738639898911


[32mProgress:   2%|▋                                        |  ETA: 0:34:46[39m

test reward at step 20000: -1365.9739281720017


[32mProgress:   3%|█▎                                       |  ETA: 0:19:54[39m

test reward at step 30000: -1242.7983338250456


[32mProgress:   4%|█▌                                       |  ETA: 0:17:00[39m

test reward at step 40000: -1384.147594818335


[32mProgress:   5%|██                                       |  ETA: 0:13:13[39m

test reward at step 50000: 16010.00875925086


[32mProgress:   6%|██▎                                      |  ETA: 0:12:28[39m

test reward at step 60000: 15133.175797818838


[32mProgress:   7%|██▉                                      |  ETA: 0:10:54[39m

test reward at step 70000: 9839.027231205251


[32mProgress:   8%|███▏                                     |  ETA: 0:10:28[39m

test reward at step 80000: 10868.1229459878


[32mProgress:   9%|███▋                                     |  ETA: 0:09:16[39m

test reward at step 90000: 20929.20980738508


[32mProgress:  10%|████                                     |  ETA: 0:09:06[39m

parameters at step 100000 saved to ./RL_models/vtol_ppo_2_100000.bson
test reward at step 100000: 12441.889907208122


[32mProgress:  11%|████▌                                    |  ETA: 0:09:01[39m

test reward at step 110000: 20316.530804599442


[32mProgress:  12%|████▉                                    |  ETA: 0:08:30[39m

test reward at step 120000: 23865.10138993155


[32mProgress:  13%|█████▏                                   |  ETA: 0:08:21[39m

test reward at step 130000: 13008.557555647403


[32mProgress:  14%|█████▊                                   |  ETA: 0:07:46[39m

test reward at step 140000: 25552.644401255668


[32mProgress:  15%|██████                                   |  ETA: 0:07:39[39m

test reward at step 150000: 10492.772356690894


[32mProgress:  15%|██████▍                                  |  ETA: 0:07:30[39m

test reward at step 160000: 16340.189805970855


[32mProgress:  17%|██████▉                                  |  ETA: 0:07:03[39m

test reward at step 170000: 14650.067733976613


[32mProgress:  18%|███████▎                                 |  ETA: 0:06:56[39m

test reward at step 180000: 13051.946339558792


[32mProgress:  18%|███████▌                                 |  ETA: 0:06:49[39m

test reward at step 190000: 10594.273021932991


[32mProgress:  20%|████████▏                                |  ETA: 0:06:27[39m

parameters at step 200000 saved to ./RL_models/vtol_ppo_2_200000.bson
test reward at step 200000: 5015.997261348494


[32mProgress:  21%|████████▌                                |  ETA: 0:06:19[39m

test reward at step 210000: 8991.319726428703


[32mProgress:  21%|████████▊                                |  ETA: 0:06:11[39m

test reward at step 220000: 19783.292270381942


[32mProgress:  23%|█████████▍                               |  ETA: 0:05:54[39m

test reward at step 230000: 25845.18820322162


[32mProgress:  24%|█████████▋                               |  ETA: 0:05:51[39m

test reward at step 240000: 6870.9650506670005


[32mProgress:  24%|██████████                               |  ETA: 0:05:44[39m

test reward at step 250000: 5064.795174660288


[32mProgress:  26%|██████████▋                              |  ETA: 0:05:28[39m

test reward at step 260000: 3199.4734146870815


[32mProgress:  27%|██████████▉                              |  ETA: 0:05:22[39m

test reward at step 270000: 5809.57831008215


[32mProgress:  27%|███████████▎                             |  ETA: 0:05:16[39m

test reward at step 280000: 8574.356905885286


[32mProgress:  29%|███████████▉                             |  ETA: 0:05:02[39m

test reward at step 290000: 5804.154143822808


[32mProgress:  30%|████████████▏                            |  ETA: 0:04:57[39m

parameters at step 300000 saved to ./RL_models/vtol_ppo_2_300000.bson
test reward at step 300000: 12456.693647061242


[32mProgress:  30%|████████████▌                            |  ETA: 0:04:53[39m

test reward at step 310000: 13540.87725868163


[32mProgress:  32%|█████████████▏                           |  ETA: 0:04:44[39m

test reward at step 320000: 9920.843411107458


[32mProgress:  33%|█████████████▍                           |  ETA: 0:04:36[39m

test reward at step 330000: 15072.140735328621


[32mProgress:  33%|█████████████▊                           |  ETA: 0:04:33[39m

test reward at step 340000: 22119.097280283753


[32mProgress:  35%|██████████████▎                          |  ETA: 0:04:24[39m

test reward at step 350000: 27682.33407738047


[32mProgress:  36%|██████████████▋                          |  ETA: 0:04:22[39m

test reward at step 360000: 20489.652039283337


[32mProgress:  37%|███████████████▎                         |  ETA: 0:04:17[39m

test reward at step 370000: 26766.18415859122


[32mProgress:  38%|███████████████▌                         |  ETA: 0:04:10[39m

test reward at step 380000: 15086.142472647965


[32mProgress:  38%|███████████████▊                         |  ETA: 0:04:08[39m

test reward at step 390000: 15162.276511264747


[32mProgress:  40%|████████████████▍                        |  ETA: 0:03:59[39m

parameters at step 400000 saved to ./RL_models/vtol_ppo_2_400000.bson
test reward at step 400000: 24643.803175338522


[32mProgress:  41%|████████████████▋                        |  ETA: 0:03:56[39m

test reward at step 410000: 38749.05506559511


[32mProgress:  41%|█████████████████                        |  ETA: 0:03:54[39m

test reward at step 420000: 14917.204592906934


[32mProgress:  43%|█████████████████▋                       |  ETA: 0:03:45[39m

test reward at step 430000: 17893.60160139251


[32mProgress:  44%|██████████████████                       |  ETA: 0:03:43[39m

test reward at step 440000: 21431.791105666904


[32mProgress:  45%|██████████████████▍                      |  ETA: 0:03:37[39m

test reward at step 450000: 21427.035511751303


[32mProgress:  45%|██████████████████▋                      |  ETA: 0:03:35[39m

test reward at step 460000: 25204.237228380876


[32mProgress:  47%|███████████████████▎                     |  ETA: 0:03:28[39m

test reward at step 470000: 23806.22346180418


[32mProgress:  48%|███████████████████▌                     |  ETA: 0:03:26[39m

test reward at step 480000: 33944.639895039894


[32mProgress:  49%|████████████████████▏                    |  ETA: 0:03:21[39m

test reward at step 490000: 24400.98085444695


[32mProgress:  50%|████████████████████▍                    |  ETA: 0:03:16[39m

parameters at step 500000 saved to ./RL_models/vtol_ppo_2_500000.bson
test reward at step 500000: 21926.8284949202


[32mProgress:  51%|████████████████████▊                    |  ETA: 0:03:13[39m

test reward at step 510000: 31023.81670191517


[32mProgress:  52%|█████████████████████▍                   |  ETA: 0:03:08[39m

test reward at step 520000: 40584.23639826264


[32mProgress:  53%|█████████████████████▋                   |  ETA: 0:03:04[39m

test reward at step 530000: 25181.78221849841


[32mProgress:  53%|█████████████████████▉                   |  ETA: 0:03:01[39m

test reward at step 540000: 51673.34555053218


[32mProgress:  55%|██████████████████████▌                  |  ETA: 0:02:55[39m

test reward at step 550000: 42337.661387177715


[32mProgress:  56%|██████████████████████▊                  |  ETA: 0:02:53[39m

test reward at step 560000: 67750.58288644995


[32mProgress:  56%|███████████████████████▏                 |  ETA: 0:02:52[39m

test reward at step 570000: 48520.79927252033


[32mProgress:  58%|███████████████████████▋                 |  ETA: 0:02:45[39m

test reward at step 580000: 58174.6218151397


[32mProgress:  58%|████████████████████████                 |  ETA: 0:02:43[39m

test reward at step 590000: 77897.99754351944


[32mProgress:  59%|████████████████████████▍                |  ETA: 0:02:40[39m

parameters at step 600000 saved to ./RL_models/vtol_ppo_2_600000.bson
test reward at step 600000: 154246.4558890657


[32mProgress:  61%|█████████████████████████                |  ETA: 0:02:35[39m

test reward at step 610000: 96595.06197629748


[32mProgress:  62%|█████████████████████████▎               |  ETA: 0:02:33[39m

test reward at step 620000: 86091.9828854186


[32mProgress:  63%|█████████████████████████▉               |  ETA: 0:02:30[39m

test reward at step 630000: 92502.04698605643


[32mProgress:  64%|██████████████████████████▏              |  ETA: 0:02:26[39m

test reward at step 640000: 112859.44147371271


[32mProgress:  65%|██████████████████████████▌              |  ETA: 0:02:24[39m

test reward at step 650000: 91821.70404188079


[32mProgress:  66%|███████████████████████████              |  ETA: 0:02:18[39m

test reward at step 660000: 113715.86050433118


[32mProgress:  67%|███████████████████████████▎             |  ETA: 0:02:16[39m

test reward at step 670000: 130997.10200843193


[32mProgress:  67%|███████████████████████████▋             |  ETA: 0:02:14[39m

test reward at step 680000: 163327.82479413928


[32mProgress:  69%|████████████████████████████▎            |  ETA: 0:02:08[39m

test reward at step 690000: 57520.320157535396


[32mProgress:  70%|████████████████████████████▌            |  ETA: 0:02:06[39m

parameters at step 700000 saved to ./RL_models/vtol_ppo_2_700000.bson
test reward at step 700000: 37940.41240120534


[32mProgress:  70%|████████████████████████████▉            |  ETA: 0:02:03[39m

test reward at step 710000: 187640.88261707948


[32mProgress:  72%|█████████████████████████████▌           |  ETA: 0:01:57[39m

test reward at step 720000: 217584.8465656009


[32mProgress:  73%|█████████████████████████████▊           |  ETA: 0:01:55[39m

test reward at step 730000: 278931.44674931985


[32mProgress:  73%|██████████████████████████████▏          |  ETA: 0:01:53[39m

test reward at step 740000: 298500.12551030767


[32mProgress:  75%|██████████████████████████████▊          |  ETA: 0:01:47[39m

test reward at step 750000: 539855.6401860852


[32mProgress:  76%|███████████████████████████████          |  ETA: 0:01:46[39m

test reward at step 760000: 79502.59391161131


[32mProgress:  76%|███████████████████████████████▍         |  ETA: 0:01:43[39m

test reward at step 770000: 219690.90884988612


[32mProgress:  78%|████████████████████████████████         |  ETA: 0:01:37[39m

test reward at step 780000: 560528.187996154


[32mProgress:  79%|████████████████████████████████▎        |  ETA: 0:01:36[39m

test reward at step 790000: 529798.4745197479


[32mProgress:  79%|████████████████████████████████▌        |  ETA: 0:01:34[39m

parameters at step 800000 saved to ./RL_models/vtol_ppo_2_800000.bson
test reward at step 800000: 347012.5090271418


[32mProgress:  81%|█████████████████████████████████▏       |  ETA: 0:01:27[39m

test reward at step 810000: 478462.1722484009


[32mProgress:  82%|█████████████████████████████████▌       |  ETA: 0:01:25[39m

test reward at step 820000: 119476.08393930033


[32mProgress:  82%|█████████████████████████████████▊       |  ETA: 0:01:22[39m

test reward at step 830000: 441569.98038484534


[32mProgress:  84%|██████████████████████████████████▍      |  ETA: 0:01:16[39m

test reward at step 840000: 207815.049588758


[32mProgress:  85%|██████████████████████████████████▊      |  ETA: 0:01:13[39m

test reward at step 850000: 692799.5022442362


[32mProgress:  85%|███████████████████████████████████      |  ETA: 0:01:10[39m

test reward at step 860000: 922631.892110704


[32mProgress:  87%|███████████████████████████████████▋     |  ETA: 0:01:04[39m

test reward at step 870000: 163720.66205169013


[32mProgress:  88%|███████████████████████████████████▉     |  ETA: 0:01:01[39m

test reward at step 880000: 315.8492141528139


[32mProgress:  88%|████████████████████████████████████▎    |  ETA: 0:00:57[39m

test reward at step 890000: 969480.457131844


[32mProgress:  90%|████████████████████████████████████▉    |  ETA: 0:00:50[39m

parameters at step 900000 saved to ./RL_models/vtol_ppo_2_900000.bson
test reward at step 900000: 759416.2937521525


[32mProgress:  91%|█████████████████████████████████████▏   |  ETA: 0:00:47[39m

test reward at step 910000: 750744.5068174687


[32mProgress:  92%|█████████████████████████████████████▊   |  ETA: 0:00:42[39m

test reward at step 920000: 898012.2883187303


[32mProgress:  93%|██████████████████████████████████████   |  ETA: 0:00:38[39m

test reward at step 930000: 713082.7388620012


[32mProgress:  93%|██████████████████████████████████████▍  |  ETA: 0:00:34[39m

test reward at step 940000: 1.119983812523094e6


[32mProgress:  95%|██████████████████████████████████████▉  |  ETA: 0:00:27[39m

test reward at step 950000: 1.5112882181439463e6


[32mProgress:  96%|███████████████████████████████████████▎ |  ETA: 0:00:23[39m

test reward at step 960000: 1.0849653977598925e6


[32mProgress:  97%|███████████████████████████████████████▊ |  ETA: 0:00:16[39m

test reward at step 970000: 1.2737276580355105e6


[32mProgress:  98%|████████████████████████████████████████ |  ETA: 0:00:13[39m

test reward at step 980000: 486103.24072085787


[32mProgress:  99%|████████████████████████████████████████▋|  ETA: 0:00:06[39m

test reward at step 990000: 254455.7542446831


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:09:15[39m


parameters at step 1000000 saved to ./RL_models/vtol_ppo_2_1000000.bson
test reward at step 1000000: 42977.263683668876
