In [None]:
# https://stable-baselines3.readthedocs.io/en/master/guide/rl.html
# https://spinningup.openai.com/en/latest/spinningup/rl_intro2.html#a-taxonomy-of-rl-algorithms

# 1. Import dependencies

In [35]:
import os
import gym 
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# 2. Load and Test Environment

In [29]:
environment_name = "Pong-v4"

In [20]:
env = gym.make(environment_name)

In [21]:
episodes = 5
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:-21.0
Episode:2 Score:-21.0
Episode:3 Score:-20.0
Episode:4 Score:-18.0
Episode:5 Score:-20.0


In [22]:
env.action_space.sample()

2

In [6]:
env.observation_space.sample()

array([[[134,  85, 232],
        [ 85, 167, 244],
        [181,  85,   8],
        ...,
        [ 43, 252,  14],
        [ 71,   4,  26],
        [106,  96,  21]],

       [[  0, 140,  23],
        [250, 150,  15],
        [ 12, 137, 105],
        ...,
        [187,  67,   6],
        [149,   6, 201],
        [149,  28, 190]],

       [[  2, 191, 255],
        [117, 236,  69],
        [219,  83, 149],
        ...,
        [169,  43, 146],
        [184,  91,  10],
        [231,  15, 126]],

       ...,

       [[177,  90, 215],
        [176, 188,  46],
        [ 37, 142, 204],
        ...,
        [ 41, 168,  79],
        [ 28, 182,  58],
        [153,  12, 146]],

       [[ 19, 107,  24],
        [253, 179,   5],
        [235, 215, 240],
        ...,
        [204, 164, 169],
        [ 60, 187, 174],
        [ 71, 145, 149]],

       [[154, 146, 124],
        [175,  48, 169],
        [ 19,  43,  27],
        ...,
        [ 38,  37, 115],
        [224, 121, 194],
        [100, 137, 188]]

# 3. Train an RL Model

In [32]:
env = gym.make(environment_name)
model = DQN('CnnPolicy',env, verbose = 1,buffer_size = 10000 )

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


In [33]:
model.learn(total_timesteps=20000)

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.23e+03 |
|    ep_rew_mean      | -20.2    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 2737     |
|    time_elapsed     | 1        |
|    total_timesteps  | 4910     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.22e+03 |
|    ep_rew_mean      | -20.2    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 2758     |
|    time_elapsed     | 3        |
|    total_timesteps  | 9785     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.2e+03  |
|    ep_rew_mean      | -20.2    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes       

<stable_baselines3.dqn.dqn.DQN at 0x7f3fc15cd460>

# 4. Save and Reload Model

In [36]:
DQN_path = os.path.join('Training', 'Saved Models', 'DQN_model')

In [38]:
model.save(DQN_path)

In [39]:
del model

In [48]:
model = DQN.load(DQN_path, env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


# 4. Evaluation

In [49]:
from stable_baselines3.common.evaluation import evaluate_policy

In [50]:
evaluate_policy(model, env, n_eval_episodes=10, render=True)



(-21.0, 0.0)

In [51]:
env.close()

# 5. Test Model

In [52]:
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render()
    if done: 
        print('info', info)
        break

info {'lives': 0, 'episode_frame_number': 3056, 'frame_number': 93574}


In [53]:
env.close()

# 6. Viewing Logs in Tensorboard

In [56]:
log_path = os.path.join('Training','Logs')
training_log_path = os.path.join(log_path, 'DQN_Pong')

In [57]:
!tensorboard --logdir={training_log_path}

2022-08-14 23:37:06.171732: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-08-14 23:37:07.060327: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-14 23:37:07.148370: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-14 23:37:07.148495: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero

NOTE: Using experimental fast data loading lo

# 7. Adding a callback to the training Stage

In [61]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
import os

In [62]:
save_path = os.path.join('Training', 'Saved Models')
log_path = os.path.join('Training', 'Logs')

In [75]:
env = gym.make(environment_name)


In [76]:
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=20, verbose=1)
eval_callback = EvalCallback(env, 
                             callback_on_new_best=stop_callback, 
                             eval_freq=10000, 
                             best_model_save_path=save_path, 
                             verbose=1)

In [77]:
model = DQN('CnnPolicy', env, verbose = 1, buffer_size = 10000, tensorboard_log=log_path)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


In [78]:
model.learn(total_timesteps=8000000, callback=eval_callback)

Logging to Training/Logs/DQN_5




----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.18e+03 |
|    ep_rew_mean      | -20.2    |
|    exploration_rate | 0.994    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 2729     |
|    time_elapsed     | 1        |
|    total_timesteps  | 4734     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.17e+03 |
|    ep_rew_mean      | -20.1    |
|    exploration_rate | 0.989    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 2735     |
|    time_elapsed     | 3        |
|    total_timesteps  | 9397     |
----------------------------------
Eval num_timesteps=10000, episode_reward=-21.00 +/- 0.00
Episode length: 1017.80 +/- 5.95
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.02e+03 |
|    mean_reward      | -21      |


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.27e+03 |
|    ep_rew_mean      | -21.4    |
|    exploration_rate | 0.915    |
| time/               |          |
|    episodes         | 56       |
|    fps              | 732      |
|    time_elapsed     | 97       |
|    total_timesteps  | 71331    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000934 |
|    n_updates        | 5332     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.27e+03 |
|    ep_rew_mean      | -21.4    |
|    exploration_rate | 0.909    |
| time/               |          |
|    episodes         | 60       |
|    fps              | 700      |
|    time_elapsed     | 109      |
|    total_timesteps  | 76357    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000347 |
|    n_updates      

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.27e+03 |
|    ep_rew_mean      | -21.2    |
|    exploration_rate | 0.849    |
| time/               |          |
|    episodes         | 100      |
|    fps              | 506      |
|    time_elapsed     | 251      |
|    total_timesteps  | 127255   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00126  |
|    n_updates        | 19313    |
----------------------------------
Eval num_timesteps=130000, episode_reward=-20.20 +/- 1.17
Episode length: 1266.60 +/- 64.32
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.27e+03 |
|    mean_reward      | -20.2    |
| rollout/            |          |
|    exploration_rate | 0.846    |
| time/               |          |
|    total_timesteps  | 130000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00109  

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.35e+03 |
|    ep_rew_mean      | -21.3    |
|    exploration_rate | 0.781    |
| time/               |          |
|    episodes         | 140      |
|    fps              | 433      |
|    time_elapsed     | 425      |
|    total_timesteps  | 184631   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000815 |
|    n_updates        | 33657    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.35e+03 |
|    ep_rew_mean      | -21.3    |
|    exploration_rate | 0.774    |
| time/               |          |
|    episodes         | 144      |
|    fps              | 433      |
|    time_elapsed     | 437      |
|    total_timesteps  | 189949   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000951 |
|    n_updates      

New best mean reward!
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.38e+03 |
|    ep_rew_mean      | -21.1    |
|    exploration_rate | 0.714    |
| time/               |          |
|    episodes         | 180      |
|    fps              | 397      |
|    time_elapsed     | 606      |
|    total_timesteps  | 241243   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00122  |
|    n_updates        | 47810    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.39e+03 |
|    ep_rew_mean      | -21      |
|    exploration_rate | 0.708    |
| time/               |          |
|    episodes         | 184      |
|    fps              | 397      |
|    time_elapsed     | 619      |
|    total_timesteps  | 246202   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000255 

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.48e+03 |
|    ep_rew_mean      | -21.4    |
|    exploration_rate | 0.64     |
| time/               |          |
|    episodes         | 220      |
|    fps              | 373      |
|    time_elapsed     | 811      |
|    total_timesteps  | 303105   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00436  |
|    n_updates        | 63276    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.48e+03 |
|    ep_rew_mean      | -21.4    |
|    exploration_rate | 0.633    |
| time/               |          |
|    episodes         | 224      |
|    fps              | 373      |
|    time_elapsed     | 825      |
|    total_timesteps  | 308772   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00158  |
|    n_updates      

Eval num_timesteps=370000, episode_reward=-20.60 +/- 0.49
Episode length: 1272.60 +/- 148.21
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.27e+03 |
|    mean_reward      | -20.6    |
| rollout/            |          |
|    exploration_rate | 0.561    |
| time/               |          |
|    total_timesteps  | 370000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000862 |
|    n_updates        | 79999    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.58e+03 |
|    ep_rew_mean      | -21.4    |
|    exploration_rate | 0.559    |
| time/               |          |
|    episodes         | 260      |
|    fps              | 355      |
|    time_elapsed     | 1042     |
|    total_timesteps  | 371128   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00136 

Eval num_timesteps=430000, episode_reward=-19.00 +/- 1.79
Episode length: 1665.60 +/- 408.14
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.67e+03 |
|    mean_reward      | -19      |
| rollout/            |          |
|    exploration_rate | 0.489    |
| time/               |          |
|    total_timesteps  | 430000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00313  |
|    n_updates        | 94999    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.64e+03 |
|    ep_rew_mean      | -21.1    |
|    exploration_rate | 0.484    |
| time/               |          |
|    episodes         | 300      |
|    fps              | 343      |
|    time_elapsed     | 1265     |
|    total_timesteps  | 434861   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0055  

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.63e+03 |
|    ep_rew_mean      | -21      |
|    exploration_rate | 0.415    |
| time/               |          |
|    episodes         | 336      |
|    fps              | 334      |
|    time_elapsed     | 1470     |
|    total_timesteps  | 492488   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00167  |
|    n_updates        | 110621   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.61e+03 |
|    ep_rew_mean      | -21      |
|    exploration_rate | 0.409    |
| time/               |          |
|    episodes         | 340      |
|    fps              | 335      |
|    time_elapsed     | 1484     |
|    total_timesteps  | 497484   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00718  |
|    n_updates      

Eval num_timesteps=550000, episode_reward=-20.40 +/- 0.80
Episode length: 1246.80 +/- 187.59
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.25e+03 |
|    mean_reward      | -20.4    |
| rollout/            |          |
|    exploration_rate | 0.347    |
| time/               |          |
|    total_timesteps  | 550000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00636  |
|    n_updates        | 124999   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.54e+03 |
|    ep_rew_mean      | -20.7    |
|    exploration_rate | 0.341    |
| time/               |          |
|    episodes         | 380      |
|    fps              | 330      |
|    time_elapsed     | 1681     |
|    total_timesteps  | 554995   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000916

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.58e+03 |
|    ep_rew_mean      | -20.4    |
|    exploration_rate | 0.264    |
| time/               |          |
|    episodes         | 416      |
|    fps              | 323      |
|    time_elapsed     | 1918     |
|    total_timesteps  | 619615   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0113   |
|    n_updates        | 142403   |
----------------------------------
Eval num_timesteps=620000, episode_reward=-17.80 +/- 1.17
Episode length: 2204.60 +/- 435.37
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.2e+03  |
|    mean_reward      | -17.8    |
| rollout/            |          |
|    exploration_rate | 0.264    |
| time/               |          |
|    total_timesteps  | 620000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00218 

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.74e+03 |
|    ep_rew_mean      | -20      |
|    exploration_rate | 0.181    |
| time/               |          |
|    episodes         | 452      |
|    fps              | 312      |
|    time_elapsed     | 2204     |
|    total_timesteps  | 689930   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00321  |
|    n_updates        | 159982   |
----------------------------------
Eval num_timesteps=690000, episode_reward=-17.60 +/- 1.36
Episode length: 2092.20 +/- 293.20
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.09e+03 |
|    mean_reward      | -17.6    |
| rollout/            |          |
|    exploration_rate | 0.181    |
| time/               |          |
|    total_timesteps  | 690000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00505 

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.92e+03 |
|    ep_rew_mean      | -19.3    |
|    exploration_rate | 0.0977   |
| time/               |          |
|    episodes         | 488      |
|    fps              | 304      |
|    time_elapsed     | 2496     |
|    total_timesteps  | 759810   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00174  |
|    n_updates        | 177452   |
----------------------------------
Eval num_timesteps=760000, episode_reward=-17.20 +/- 1.17
Episode length: 2275.20 +/- 139.28
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.28e+03 |
|    mean_reward      | -17.2    |
| rollout/            |          |
|    exploration_rate | 0.0975   |
| time/               |          |
|    total_timesteps  | 760000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00449 

New best mean reward!
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.1e+03  |
|    ep_rew_mean      | -19.3    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 520      |
|    fps              | 297      |
|    time_elapsed     | 2809     |
|    total_timesteps  | 836729   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00182  |
|    n_updates        | 196682   |
----------------------------------
Eval num_timesteps=840000, episode_reward=-16.40 +/- 1.62
Episode length: 2198.60 +/- 168.96
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.2e+03  |
|    mean_reward      | -16.4    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 840000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss 

Eval num_timesteps=910000, episode_reward=-18.00 +/- 0.00
Episode length: 1631.00 +/- 113.07
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.63e+03 |
|    mean_reward      | -18      |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 910000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00127  |
|    n_updates        | 214999   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.17e+03 |
|    ep_rew_mean      | -18.9    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 556      |
|    fps              | 293      |
|    time_elapsed     | 3116     |
|    total_timesteps  | 914886   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00173 

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.23e+03 |
|    ep_rew_mean      | -19      |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 588      |
|    fps              | 290      |
|    time_elapsed     | 3381     |
|    total_timesteps  | 982618   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00197  |
|    n_updates        | 233154   |
----------------------------------
Eval num_timesteps=990000, episode_reward=-17.80 +/- 2.40
Episode length: 1844.60 +/- 247.45
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.84e+03 |
|    mean_reward      | -17.8    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 990000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00503 

Eval num_timesteps=1060000, episode_reward=-16.20 +/- 1.72
Episode length: 2141.80 +/- 86.30
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.14e+03 |
|    mean_reward      | -16.2    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 1060000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00204  |
|    n_updates        | 252499   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.22e+03 |
|    ep_rew_mean      | -18.5    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 624      |
|    fps              | 287      |
|    time_elapsed     | 3715     |
|    total_timesteps  | 1067931  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0107  

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.23e+03 |
|    ep_rew_mean      | -18.9    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 656      |
|    fps              | 285      |
|    time_elapsed     | 3986     |
|    total_timesteps  | 1137493  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00629  |
|    n_updates        | 271873   |
----------------------------------
Eval num_timesteps=1140000, episode_reward=-18.00 +/- 0.63
Episode length: 1728.00 +/- 196.40
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.73e+03 |
|    mean_reward      | -18      |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 1140000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00147

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.17e+03 |
|    ep_rew_mean      | -19.4    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 692      |
|    fps              | 284      |
|    time_elapsed     | 4252     |
|    total_timesteps  | 1208895  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00607  |
|    n_updates        | 289723   |
----------------------------------
Eval num_timesteps=1210000, episode_reward=-19.20 +/- 1.33
Episode length: 1360.80 +/- 180.90
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.36e+03 |
|    mean_reward      | -19.2    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 1210000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00943

Eval num_timesteps=1280000, episode_reward=-19.00 +/- 0.89
Episode length: 2190.80 +/- 347.74
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.19e+03 |
|    mean_reward      | -19      |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 1280000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000818 |
|    n_updates        | 307499   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.07e+03 |
|    ep_rew_mean      | -20      |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 728      |
|    fps              | 282      |
|    time_elapsed     | 4544     |
|    total_timesteps  | 1282049  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0117 

Eval num_timesteps=1350000, episode_reward=-17.60 +/- 1.50
Episode length: 1698.60 +/- 202.66
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.7e+03  |
|    mean_reward      | -17.6    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 1350000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00222  |
|    n_updates        | 324999   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.03e+03 |
|    ep_rew_mean      | -20.1    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 764      |
|    fps              | 281      |
|    time_elapsed     | 4819     |
|    total_timesteps  | 1355048  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00122

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.09e+03 |
|    ep_rew_mean      | -19.2    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 796      |
|    fps              | 279      |
|    time_elapsed     | 5088     |
|    total_timesteps  | 1424364  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00395  |
|    n_updates        | 343590   |
----------------------------------
Eval num_timesteps=1430000, episode_reward=-16.60 +/- 1.36
Episode length: 1889.60 +/- 187.28
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.89e+03 |
|    mean_reward      | -16.6    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 1430000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00793

Eval num_timesteps=1500000, episode_reward=-14.20 +/- 2.32
Episode length: 2278.80 +/- 294.92
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.28e+03 |
|    mean_reward      | -14.2    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 1500000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00156  |
|    n_updates        | 362499   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.15e+03 |
|    ep_rew_mean      | -19.1    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 832      |
|    fps              | 278      |
|    time_elapsed     | 5397     |
|    total_timesteps  | 1503745  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00329

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.19e+03 |
|    ep_rew_mean      | -19      |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 864      |
|    fps              | 277      |
|    time_elapsed     | 5664     |
|    total_timesteps  | 1573976  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0132   |
|    n_updates        | 380993   |
----------------------------------
Eval num_timesteps=1580000, episode_reward=-17.00 +/- 2.61
Episode length: 2125.00 +/- 234.97
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.12e+03 |
|    mean_reward      | -17      |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 1580000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00336

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.14e+03 |
|    ep_rew_mean      | -19.4    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 900      |
|    fps              | 277      |
|    time_elapsed     | 5935     |
|    total_timesteps  | 1646266  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00203  |
|    n_updates        | 399066   |
----------------------------------
Eval num_timesteps=1650000, episode_reward=-15.00 +/- 1.79
Episode length: 2292.20 +/- 167.67
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.29e+03 |
|    mean_reward      | -15      |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 1650000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00218

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.18e+03 |
|    ep_rew_mean      | -20.1    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 932      |
|    fps              | 276      |
|    time_elapsed     | 6237     |
|    total_timesteps  | 1721825  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00287  |
|    n_updates        | 417956   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.16e+03 |
|    ep_rew_mean      | -20      |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 936      |
|    fps              | 276      |
|    time_elapsed     | 6261     |
|    total_timesteps  | 1729702  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00155  |
|    n_updates      

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.08e+03 |
|    ep_rew_mean      | -20.3    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 968      |
|    fps              | 275      |
|    time_elapsed     | 6498     |
|    total_timesteps  | 1791629  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00186  |
|    n_updates        | 435407   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.07e+03 |
|    ep_rew_mean      | -20.3    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 972      |
|    fps              | 275      |
|    time_elapsed     | 6520     |
|    total_timesteps  | 1799066  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00118  |
|    n_updates      

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.05e+03 |
|    ep_rew_mean      | -20.1    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1004     |
|    fps              | 275      |
|    time_elapsed     | 6766     |
|    total_timesteps  | 1861820  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00629  |
|    n_updates        | 452954   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.03e+03 |
|    ep_rew_mean      | -19.9    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1008     |
|    fps              | 275      |
|    time_elapsed     | 6790     |
|    total_timesteps  | 1869678  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00281  |
|    n_updates      

Eval num_timesteps=1930000, episode_reward=-18.40 +/- 1.02
Episode length: 1867.20 +/- 153.93
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.87e+03 |
|    mean_reward      | -18.4    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 1930000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00568  |
|    n_updates        | 469999   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.88e+03 |
|    ep_rew_mean      | -19.4    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1044     |
|    fps              | 274      |
|    time_elapsed     | 7043     |
|    total_timesteps  | 1934078  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00154

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.96e+03 |
|    ep_rew_mean      | -19.2    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1076     |
|    fps              | 273      |
|    time_elapsed     | 7314     |
|    total_timesteps  | 2002979  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00458  |
|    n_updates        | 488244   |
----------------------------------
Eval num_timesteps=2010000, episode_reward=-16.00 +/- 1.41
Episode length: 2145.20 +/- 233.94
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.15e+03 |
|    mean_reward      | -16      |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 2010000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00196

Eval num_timesteps=2080000, episode_reward=-18.00 +/- 1.10
Episode length: 1887.60 +/- 143.35
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.89e+03 |
|    mean_reward      | -18      |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 2080000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.002    |
|    n_updates        | 507499   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.07e+03 |
|    ep_rew_mean      | -19.1    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1112     |
|    fps              | 273      |
|    time_elapsed     | 7620     |
|    total_timesteps  | 2081480  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00357

Eval num_timesteps=2150000, episode_reward=-19.00 +/- 1.10
Episode length: 1992.60 +/- 255.68
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.99e+03 |
|    mean_reward      | -19      |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 2150000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00534  |
|    n_updates        | 524999   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.12e+03 |
|    ep_rew_mean      | -19.3    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1148     |
|    fps              | 272      |
|    time_elapsed     | 7894     |
|    total_timesteps  | 2154128  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00132

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.14e+03 |
|    ep_rew_mean      | -19.4    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1180     |
|    fps              | 272      |
|    time_elapsed     | 8183     |
|    total_timesteps  | 2226943  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00241  |
|    n_updates        | 544235   |
----------------------------------
Eval num_timesteps=2230000, episode_reward=-16.40 +/- 1.50
Episode length: 2339.40 +/- 317.79
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.34e+03 |
|    mean_reward      | -16.4    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 2230000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00189

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.05e+03 |
|    ep_rew_mean      | -20.2    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1216     |
|    fps              | 271      |
|    time_elapsed     | 8437     |
|    total_timesteps  | 2292335  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00151  |
|    n_updates        | 560583   |
----------------------------------
Eval num_timesteps=2300000, episode_reward=-17.60 +/- 1.74
Episode length: 1922.00 +/- 246.12
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.92e+03 |
|    mean_reward      | -17.6    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 2300000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00215

Eval num_timesteps=2360000, episode_reward=-19.60 +/- 0.49
Episode length: 1322.60 +/- 143.34
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.32e+03 |
|    mean_reward      | -19.6    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 2360000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00312  |
|    n_updates        | 577499   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.89e+03 |
|    ep_rew_mean      | -20.2    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1256     |
|    fps              | 271      |
|    time_elapsed     | 8694     |
|    total_timesteps  | 2361497  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00477

Eval num_timesteps=2430000, episode_reward=-19.40 +/- 0.49
Episode length: 1867.00 +/- 157.61
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.87e+03 |
|    mean_reward      | -19.4    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 2430000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00429  |
|    n_updates        | 594999   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.8e+03  |
|    ep_rew_mean      | -20      |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1292     |
|    fps              | 271      |
|    time_elapsed     | 8961     |
|    total_timesteps  | 2431467  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00204

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.82e+03 |
|    ep_rew_mean      | -19.9    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1328     |
|    fps              | 271      |
|    time_elapsed     | 9200     |
|    total_timesteps  | 2495953  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000874 |
|    n_updates        | 611488   |
----------------------------------
Eval num_timesteps=2500000, episode_reward=-21.00 +/- 0.00
Episode length: 1116.80 +/- 94.19
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.12e+03 |
|    mean_reward      | -21      |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 2500000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00221 

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.72e+03 |
|    ep_rew_mean      | -20.3    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1368     |
|    fps              | 271      |
|    time_elapsed     | 9412     |
|    total_timesteps  | 2555618  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00255  |
|    n_updates        | 626404   |
----------------------------------
Eval num_timesteps=2560000, episode_reward=-20.40 +/- 0.49
Episode length: 1305.80 +/- 151.57
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.31e+03 |
|    mean_reward      | -20.4    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 2560000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00162

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.57e+03 |
|    ep_rew_mean      | -20.9    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1408     |
|    fps              | 271      |
|    time_elapsed     | 9618     |
|    total_timesteps  | 2612164  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000947 |
|    n_updates        | 640540   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.55e+03 |
|    ep_rew_mean      | -20.9    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1412     |
|    fps              | 271      |
|    time_elapsed     | 9632     |
|    total_timesteps  | 2616892  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00174  |
|    n_updates      

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.48e+03 |
|    ep_rew_mean      | -20.9    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1448     |
|    fps              | 271      |
|    time_elapsed     | 9841     |
|    total_timesteps  | 2673634  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00194  |
|    n_updates        | 655908   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.49e+03 |
|    ep_rew_mean      | -20.9    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1452     |
|    fps              | 271      |
|    time_elapsed     | 9860     |
|    total_timesteps  | 2679911  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00366  |
|    n_updates      

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.51e+03 |
|    ep_rew_mean      | -20.9    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1488     |
|    fps              | 271      |
|    time_elapsed     | 10070    |
|    total_timesteps  | 2735262  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00292  |
|    n_updates        | 671315   |
----------------------------------
Eval num_timesteps=2740000, episode_reward=-16.40 +/- 2.87
Episode length: 2373.80 +/- 372.95
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.37e+03 |
|    mean_reward      | -16.4    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 2740000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0133 

Eval num_timesteps=2810000, episode_reward=-15.80 +/- 1.72
Episode length: 2388.00 +/- 125.15
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.39e+03 |
|    mean_reward      | -15.8    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 2810000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00288  |
|    n_updates        | 689999   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.76e+03 |
|    ep_rew_mean      | -20.3    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1524     |
|    fps              | 271      |
|    time_elapsed     | 10372    |
|    total_timesteps  | 2812143  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00494

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2e+03    |
|    ep_rew_mean      | -20      |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1556     |
|    fps              | 270      |
|    time_elapsed     | 10659    |
|    total_timesteps  | 2886279  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00298  |
|    n_updates        | 709069   |
----------------------------------
Eval num_timesteps=2890000, episode_reward=-20.40 +/- 0.49
Episode length: 1099.20 +/- 30.43
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.1e+03  |
|    mean_reward      | -20.4    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 2890000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00223 

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.27e+03 |
|    ep_rew_mean      | -19.1    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1588     |
|    fps              | 270      |
|    time_elapsed     | 10949    |
|    total_timesteps  | 2962032  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00347  |
|    n_updates        | 728007   |
----------------------------------
Eval num_timesteps=2970000, episode_reward=-17.60 +/- 2.06
Episode length: 2281.60 +/- 174.82
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.28e+03 |
|    mean_reward      | -17.6    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 2970000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00231

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.23e+03 |
|    ep_rew_mean      | -19.6    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1624     |
|    fps              | 270      |
|    time_elapsed     | 11220    |
|    total_timesteps  | 3035278  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00324  |
|    n_updates        | 746319   |
----------------------------------
Eval num_timesteps=3040000, episode_reward=-20.20 +/- 0.40
Episode length: 1609.20 +/- 328.95
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.61e+03 |
|    mean_reward      | -20.2    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 3040000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00296

Eval num_timesteps=3110000, episode_reward=-18.40 +/- 1.50
Episode length: 1909.60 +/- 397.26
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.91e+03 |
|    mean_reward      | -18.4    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 3110000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00349  |
|    n_updates        | 764999   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.16e+03 |
|    ep_rew_mean      | -20.4    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1660     |
|    fps              | 270      |
|    time_elapsed     | 11518    |
|    total_timesteps  | 3112002  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00167

Eval num_timesteps=3180000, episode_reward=-16.40 +/- 2.42
Episode length: 2183.00 +/- 302.74
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.18e+03 |
|    mean_reward      | -16.4    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 3180000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00283  |
|    n_updates        | 782499   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.03e+03 |
|    ep_rew_mean      | -20.9    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1696     |
|    fps              | 270      |
|    time_elapsed     | 11793    |
|    total_timesteps  | 3185194  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00153

Eval num_timesteps=3250000, episode_reward=-17.40 +/- 1.02
Episode length: 1834.80 +/- 182.60
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.83e+03 |
|    mean_reward      | -17.4    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 3250000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00329  |
|    n_updates        | 799999   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.03e+03 |
|    ep_rew_mean      | -20      |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1732     |
|    fps              | 269      |
|    time_elapsed     | 12058    |
|    total_timesteps  | 3254996  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0117 

Eval num_timesteps=3330000, episode_reward=-15.20 +/- 1.94
Episode length: 2391.80 +/- 219.05
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.39e+03 |
|    mean_reward      | -15.2    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 3330000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00556  |
|    n_updates        | 819999   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.1e+03  |
|    ep_rew_mean      | -19.1    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1764     |
|    fps              | 269      |
|    time_elapsed     | 12366    |
|    total_timesteps  | 3332554  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00168

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.21e+03 |
|    ep_rew_mean      | -18.3    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1796     |
|    fps              | 269      |
|    time_elapsed     | 12657    |
|    total_timesteps  | 3406140  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00382  |
|    n_updates        | 839034   |
----------------------------------
Eval num_timesteps=3410000, episode_reward=-17.80 +/- 1.60
Episode length: 1910.80 +/- 332.83
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.91e+03 |
|    mean_reward      | -17.8    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 3410000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00234

Eval num_timesteps=3480000, episode_reward=-20.00 +/- 0.63
Episode length: 1245.40 +/- 139.41
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.25e+03 |
|    mean_reward      | -20      |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 3480000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0164   |
|    n_updates        | 857499   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.3e+03  |
|    ep_rew_mean      | -18.4    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1832     |
|    fps              | 268      |
|    time_elapsed     | 12963    |
|    total_timesteps  | 3484910  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0139 

Eval num_timesteps=3550000, episode_reward=-15.80 +/- 3.25
Episode length: 2150.80 +/- 385.61
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.15e+03 |
|    mean_reward      | -15.8    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 3550000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00103  |
|    n_updates        | 874999   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.14e+03 |
|    ep_rew_mean      | -19.1    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1868     |
|    fps              | 268      |
|    time_elapsed     | 13235    |
|    total_timesteps  | 3557083  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0097 

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.13e+03 |
|    ep_rew_mean      | -19.8    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1900     |
|    fps              | 268      |
|    time_elapsed     | 13509    |
|    total_timesteps  | 3627311  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00545  |
|    n_updates        | 894327   |
----------------------------------
Eval num_timesteps=3630000, episode_reward=-16.80 +/- 3.06
Episode length: 1937.40 +/- 428.45
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.94e+03 |
|    mean_reward      | -16.8    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 3630000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00886

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.98e+03 |
|    ep_rew_mean      | -20.1    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1936     |
|    fps              | 268      |
|    time_elapsed     | 13757    |
|    total_timesteps  | 3692251  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00239  |
|    n_updates        | 910562   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.95e+03 |
|    ep_rew_mean      | -20.1    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1940     |
|    fps              | 268      |
|    time_elapsed     | 13776    |
|    total_timesteps  | 3698698  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00287  |
|    n_updates      

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.84e+03 |
|    ep_rew_mean      | -20.2    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1976     |
|    fps              | 268      |
|    time_elapsed     | 13997    |
|    total_timesteps  | 3758395  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0112   |
|    n_updates        | 927098   |
----------------------------------
Eval num_timesteps=3760000, episode_reward=-18.20 +/- 0.40
Episode length: 1881.60 +/- 143.75
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.88e+03 |
|    mean_reward      | -18.2    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 3760000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00474

Eval num_timesteps=3830000, episode_reward=-16.80 +/- 2.04
Episode length: 2300.40 +/- 399.35
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.3e+03  |
|    mean_reward      | -16.8    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 3830000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00182  |
|    n_updates        | 944999   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.84e+03 |
|    ep_rew_mean      | -20      |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2012     |
|    fps              | 268      |
|    time_elapsed     | 14302    |
|    total_timesteps  | 3836232  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00161

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.99e+03 |
|    ep_rew_mean      | -20.2    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2044     |
|    fps              | 267      |
|    time_elapsed     | 14570    |
|    total_timesteps  | 3904725  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00308  |
|    n_updates        | 963681   |
----------------------------------
Eval num_timesteps=3910000, episode_reward=-17.80 +/- 0.40
Episode length: 2387.20 +/- 379.71
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.39e+03 |
|    mean_reward      | -17.8    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 3910000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00124

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.12e+03 |
|    ep_rew_mean      | -20.1    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2080     |
|    fps              | 267      |
|    time_elapsed     | 14858    |
|    total_timesteps  | 3978523  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00384  |
|    n_updates        | 982130   |
----------------------------------
Eval num_timesteps=3980000, episode_reward=-17.80 +/- 2.23
Episode length: 1995.00 +/- 429.29
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2e+03    |
|    mean_reward      | -17.8    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 3980000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0122 

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.03e+03 |
|    ep_rew_mean      | -20.4    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2116     |
|    fps              | 267      |
|    time_elapsed     | 15118    |
|    total_timesteps  | 4046764  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00308  |
|    n_updates        | 999190   |
----------------------------------
Eval num_timesteps=4050000, episode_reward=-15.80 +/- 1.94
Episode length: 2142.20 +/- 222.48
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.14e+03 |
|    mean_reward      | -15.8    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 4050000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00191

Eval num_timesteps=4120000, episode_reward=-17.40 +/- 1.36
Episode length: 2515.00 +/- 268.91
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.52e+03 |
|    mean_reward      | -17.4    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 4120000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00162  |
|    n_updates        | 1017499  |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.02e+03 |
|    ep_rew_mean      | -20      |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2152     |
|    fps              | 267      |
|    time_elapsed     | 15426    |
|    total_timesteps  | 4125691  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00158

Eval num_timesteps=4200000, episode_reward=-20.00 +/- 0.00
Episode length: 1323.40 +/- 143.08
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.32e+03 |
|    mean_reward      | -20      |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 4200000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00383  |
|    n_updates        | 1037499  |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.13e+03 |
|    ep_rew_mean      | -19.8    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2184     |
|    fps              | 267      |
|    time_elapsed     | 15728    |
|    total_timesteps  | 4201094  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00145

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.1e+03  |
|    ep_rew_mean      | -19.6    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2220     |
|    fps              | 267      |
|    time_elapsed     | 15973    |
|    total_timesteps  | 4266317  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0025   |
|    n_updates        | 1054079  |
----------------------------------
Eval num_timesteps=4270000, episode_reward=-17.00 +/- 1.10
Episode length: 1728.40 +/- 252.21
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.73e+03 |
|    mean_reward      | -17      |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 4270000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00079

Eval num_timesteps=4340000, episode_reward=-17.20 +/- 1.60
Episode length: 1761.40 +/- 271.10
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.76e+03 |
|    mean_reward      | -17.2    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 4340000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00925  |
|    n_updates        | 1072499  |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.11e+03 |
|    ep_rew_mean      | -19.6    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2256     |
|    fps              | 266      |
|    time_elapsed     | 16281    |
|    total_timesteps  | 4346429  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00368

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.07e+03 |
|    ep_rew_mean      | -20      |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2288     |
|    fps              | 266      |
|    time_elapsed     | 16549    |
|    total_timesteps  | 4414294  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00615  |
|    n_updates        | 1091073  |
----------------------------------
Eval num_timesteps=4420000, episode_reward=-17.20 +/- 1.94
Episode length: 2080.80 +/- 258.96
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.08e+03 |
|    mean_reward      | -17.2    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 4420000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00244

Eval num_timesteps=4490000, episode_reward=-17.60 +/- 2.15
Episode length: 2088.20 +/- 109.40
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.09e+03 |
|    mean_reward      | -17.6    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 4490000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00194  |
|    n_updates        | 1109999  |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.18e+03 |
|    ep_rew_mean      | -19.7    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2324     |
|    fps              | 266      |
|    time_elapsed     | 16858    |
|    total_timesteps  | 4493712  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00989

Eval num_timesteps=4570000, episode_reward=-17.20 +/- 2.32
Episode length: 2371.60 +/- 256.06
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.37e+03 |
|    mean_reward      | -17.2    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 4570000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0146   |
|    n_updates        | 1129999  |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.25e+03 |
|    ep_rew_mean      | -20.2    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2356     |
|    fps              | 266      |
|    time_elapsed     | 17166    |
|    total_timesteps  | 4571650  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0029 

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.28e+03 |
|    ep_rew_mean      | -19.9    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2388     |
|    fps              | 266      |
|    time_elapsed     | 17442    |
|    total_timesteps  | 4642040  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0139   |
|    n_updates        | 1148009  |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.24e+03 |
|    ep_rew_mean      | -19.9    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2392     |
|    fps              | 266      |
|    time_elapsed     | 17460    |
|    total_timesteps  | 4647965  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0106   |
|    n_updates      

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.18e+03 |
|    ep_rew_mean      | -20.3    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2424     |
|    fps              | 266      |
|    time_elapsed     | 17710    |
|    total_timesteps  | 4711515  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00254  |
|    n_updates        | 1165378  |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.17e+03 |
|    ep_rew_mean      | -20.2    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2428     |
|    fps              | 266      |
|    time_elapsed     | 17733    |
|    total_timesteps  | 4719106  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00164  |
|    n_updates      

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.08e+03 |
|    ep_rew_mean      | -20      |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2460     |
|    fps              | 265      |
|    time_elapsed     | 18002    |
|    total_timesteps  | 4787087  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00201  |
|    n_updates        | 1184271  |
----------------------------------
Eval num_timesteps=4790000, episode_reward=-16.60 +/- 3.01
Episode length: 1795.60 +/- 368.63
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.8e+03  |
|    mean_reward      | -16.6    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 4790000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00132

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.17e+03 |
|    ep_rew_mean      | -19.8    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2492     |
|    fps              | 265      |
|    time_elapsed     | 18308    |
|    total_timesteps  | 4865374  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00127  |
|    n_updates        | 1203843  |
----------------------------------
Eval num_timesteps=4870000, episode_reward=-17.20 +/- 1.47
Episode length: 2317.60 +/- 109.74
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.32e+03 |
|    mean_reward      | -17.2    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 4870000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00757

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.38e+03 |
|    ep_rew_mean      | -19.2    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2524     |
|    fps              | 265      |
|    time_elapsed     | 18643    |
|    total_timesteps  | 4949609  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0011   |
|    n_updates        | 1224902  |
----------------------------------
Eval num_timesteps=4950000, episode_reward=-17.80 +/- 1.17
Episode length: 1750.20 +/- 273.54
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.75e+03 |
|    mean_reward      | -17.8    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 4950000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00164

Eval num_timesteps=5020000, episode_reward=-14.80 +/- 2.71
Episode length: 2148.60 +/- 340.39
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.15e+03 |
|    mean_reward      | -14.8    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 5020000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00883  |
|    n_updates        | 1242499  |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.39e+03 |
|    ep_rew_mean      | -19      |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2560     |
|    fps              | 265      |
|    time_elapsed     | 18950    |
|    total_timesteps  | 5026523  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0118 

Eval num_timesteps=5080000, episode_reward=-20.40 +/- 0.80
Episode length: 1180.20 +/- 108.16
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.18e+03 |
|    mean_reward      | -20.4    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 5080000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00321  |
|    n_updates        | 1257499  |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2e+03    |
|    ep_rew_mean      | -19.9    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2600     |
|    fps              | 265      |
|    time_elapsed     | 19157    |
|    total_timesteps  | 5084324  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00036

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.65e+03 |
|    ep_rew_mean      | -20.2    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2636     |
|    fps              | 265      |
|    time_elapsed     | 19390    |
|    total_timesteps  | 5147923  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00634  |
|    n_updates        | 1274480  |
----------------------------------
Eval num_timesteps=5150000, episode_reward=-17.00 +/- 1.67
Episode length: 2018.40 +/- 300.96
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.02e+03 |
|    mean_reward      | -17      |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 5150000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00073

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.86e+03 |
|    ep_rew_mean      | -20.6    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2668     |
|    fps              | 265      |
|    time_elapsed     | 19695    |
|    total_timesteps  | 5224497  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00192  |
|    n_updates        | 1293624  |
----------------------------------
Eval num_timesteps=5230000, episode_reward=-16.40 +/- 2.58
Episode length: 2347.60 +/- 339.15
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.35e+03 |
|    mean_reward      | -16.4    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 5230000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00141

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.2e+03  |
|    ep_rew_mean      | -20.1    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2700     |
|    fps              | 264      |
|    time_elapsed     | 20022    |
|    total_timesteps  | 5304738  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00186  |
|    n_updates        | 1313684  |
----------------------------------
Eval num_timesteps=5310000, episode_reward=-17.80 +/- 1.17
Episode length: 2495.00 +/- 219.06
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.5e+03  |
|    mean_reward      | -17.8    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 5310000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00469

Eval num_timesteps=5380000, episode_reward=-20.00 +/- 0.89
Episode length: 1508.60 +/- 272.80
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.51e+03 |
|    mean_reward      | -20      |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 5380000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00225  |
|    n_updates        | 1332499  |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.39e+03 |
|    ep_rew_mean      | -19.5    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2736     |
|    fps              | 264      |
|    time_elapsed     | 20342    |
|    total_timesteps  | 5386526  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00481

Eval num_timesteps=5460000, episode_reward=-14.20 +/- 3.92
Episode length: 2872.80 +/- 631.86
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.87e+03 |
|    mean_reward      | -14.2    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 5460000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00135  |
|    n_updates        | 1352499  |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.38e+03 |
|    ep_rew_mean      | -18.7    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2768     |
|    fps              | 264      |
|    time_elapsed     | 20653    |
|    total_timesteps  | 5462772  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00175

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.33e+03 |
|    ep_rew_mean      | -18.9    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2800     |
|    fps              | 264      |
|    time_elapsed     | 20944    |
|    total_timesteps  | 5537611  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00415  |
|    n_updates        | 1371902  |
----------------------------------
Eval num_timesteps=5540000, episode_reward=-16.20 +/- 1.17
Episode length: 2384.60 +/- 33.30
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.38e+03 |
|    mean_reward      | -16.2    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 5540000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00328 

Eval num_timesteps=5620000, episode_reward=-14.80 +/- 1.47
Episode length: 2851.00 +/- 177.99
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.85e+03 |
|    mean_reward      | -14.8    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 5620000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0143   |
|    n_updates        | 1392499  |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.44e+03 |
|    ep_rew_mean      | -19.6    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2832     |
|    fps              | 264      |
|    time_elapsed     | 21288    |
|    total_timesteps  | 5621945  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00599

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.47e+03 |
|    ep_rew_mean      | -19.8    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2864     |
|    fps              | 264      |
|    time_elapsed     | 21584    |
|    total_timesteps  | 5699055  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00369  |
|    n_updates        | 1412263  |
----------------------------------
Eval num_timesteps=5700000, episode_reward=-17.60 +/- 1.62
Episode length: 2122.80 +/- 226.43
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.12e+03 |
|    mean_reward      | -17.6    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 5700000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00246

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.46e+03 |
|    ep_rew_mean      | -19.8    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2896     |
|    fps              | 263      |
|    time_elapsed     | 21889    |
|    total_timesteps  | 5774598  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0011   |
|    n_updates        | 1431149  |
----------------------------------
Eval num_timesteps=5780000, episode_reward=-19.80 +/- 0.98
Episode length: 1621.00 +/- 217.75
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.62e+03 |
|    mean_reward      | -19.8    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 5780000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00916

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.2e+03  |
|    ep_rew_mean      | -19.8    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2932     |
|    fps              | 263      |
|    time_elapsed     | 22141    |
|    total_timesteps  | 5841808  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00246  |
|    n_updates        | 1447951  |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.16e+03 |
|    ep_rew_mean      | -19.8    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2936     |
|    fps              | 263      |
|    time_elapsed     | 22160    |
|    total_timesteps  | 5848085  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00122  |
|    n_updates      

Eval num_timesteps=5910000, episode_reward=-17.80 +/- 1.94
Episode length: 1737.40 +/- 321.75
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.74e+03 |
|    mean_reward      | -17.8    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 5910000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00138  |
|    n_updates        | 1464999  |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.99e+03 |
|    ep_rew_mean      | -20      |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2972     |
|    fps              | 263      |
|    time_elapsed     | 22422    |
|    total_timesteps  | 5916695  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.012  

Eval num_timesteps=5980000, episode_reward=-17.40 +/- 1.50
Episode length: 2328.20 +/- 179.30
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.33e+03 |
|    mean_reward      | -17.4    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 5980000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0119   |
|    n_updates        | 1482499  |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.92e+03 |
|    ep_rew_mean      | -20.1    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 3008     |
|    fps              | 263      |
|    time_elapsed     | 22689    |
|    total_timesteps  | 5987242  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00269

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.96e+03 |
|    ep_rew_mean      | -20.3    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 3040     |
|    fps              | 263      |
|    time_elapsed     | 22944    |
|    total_timesteps  | 6053325  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00416  |
|    n_updates        | 1500831  |
----------------------------------
Eval num_timesteps=6060000, episode_reward=-17.60 +/- 1.74
Episode length: 2294.80 +/- 223.74
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.29e+03 |
|    mean_reward      | -17.6    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 6060000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00257

Eval num_timesteps=6130000, episode_reward=-18.00 +/- 1.90
Episode length: 2950.60 +/- 215.64
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.95e+03 |
|    mean_reward      | -18      |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 6130000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000714 |
|    n_updates        | 1519999  |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.08e+03 |
|    ep_rew_mean      | -20.1    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 3076     |
|    fps              | 263      |
|    time_elapsed     | 23255    |
|    total_timesteps  | 6133856  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00367

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.21e+03 |
|    ep_rew_mean      | -20.3    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 3108     |
|    fps              | 263      |
|    time_elapsed     | 23546    |
|    total_timesteps  | 6208670  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00478  |
|    n_updates        | 1539667  |
----------------------------------
Eval num_timesteps=6210000, episode_reward=-16.40 +/- 2.42
Episode length: 2408.40 +/- 321.28
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.41e+03 |
|    mean_reward      | -16.4    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 6210000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00168

Eval num_timesteps=6280000, episode_reward=-11.00 +/- 2.76
Episode length: 2793.00 +/- 318.72
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.79e+03 |
|    mean_reward      | -11      |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 6280000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00411  |
|    n_updates        | 1557499  |
----------------------------------
New best mean reward!
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.21e+03 |
|    ep_rew_mean      | -20.2    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 3144     |
|    fps              | 263      |
|    time_elapsed     | 23840    |
|    total_timesteps  | 6282760  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.18e+03 |
|    ep_rew_mean      | -19.8    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 3176     |
|    fps              | 263      |
|    time_elapsed     | 24112    |
|    total_timesteps  | 6351862  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00638  |
|    n_updates        | 1575465  |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.15e+03 |
|    ep_rew_mean      | -19.6    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 3180     |
|    fps              | 263      |
|    time_elapsed     | 24134    |
|    total_timesteps  | 6359123  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00144  |
|    n_updates      

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.1e+03  |
|    ep_rew_mean      | -19.4    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 3212     |
|    fps              | 263      |
|    time_elapsed     | 24402    |
|    total_timesteps  | 6428508  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00188  |
|    n_updates        | 1594626  |
----------------------------------
Eval num_timesteps=6430000, episode_reward=-16.80 +/- 1.72
Episode length: 2181.20 +/- 341.11
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.18e+03 |
|    mean_reward      | -16.8    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 6430000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00090

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.2e+03  |
|    ep_rew_mean      | -19.2    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 3244     |
|    fps              | 263      |
|    time_elapsed     | 24701    |
|    total_timesteps  | 6502809  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00272  |
|    n_updates        | 1613202  |
----------------------------------
Eval num_timesteps=6510000, episode_reward=-18.20 +/- 0.75
Episode length: 2188.20 +/- 237.32
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.19e+03 |
|    mean_reward      | -18.2    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 6510000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00192

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.14e+03 |
|    ep_rew_mean      | -19.9    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 3280     |
|    fps              | 263      |
|    time_elapsed     | 24970    |
|    total_timesteps  | 6573559  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00427  |
|    n_updates        | 1630889  |
----------------------------------
Eval num_timesteps=6580000, episode_reward=-15.40 +/- 1.85
Episode length: 2419.60 +/- 211.40
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.42e+03 |
|    mean_reward      | -15.4    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 6580000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0024 

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.23e+03 |
|    ep_rew_mean      | -19.8    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 3312     |
|    fps              | 263      |
|    time_elapsed     | 25283    |
|    total_timesteps  | 6651965  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00113  |
|    n_updates        | 1650491  |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.21e+03 |
|    ep_rew_mean      | -19.7    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 3316     |
|    fps              | 263      |
|    time_elapsed     | 25304    |
|    total_timesteps  | 6659073  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00398  |
|    n_updates      

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.12e+03 |
|    ep_rew_mean      | -19.7    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 3348     |
|    fps              | 263      |
|    time_elapsed     | 25565    |
|    total_timesteps  | 6723844  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00154  |
|    n_updates        | 1668460  |
----------------------------------
Eval num_timesteps=6730000, episode_reward=-17.20 +/- 0.98
Episode length: 2289.40 +/- 119.45
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.29e+03 |
|    mean_reward      | -17.2    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 6730000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00461

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.13e+03 |
|    ep_rew_mean      | -19.8    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 3384     |
|    fps              | 262      |
|    time_elapsed     | 25850    |
|    total_timesteps  | 6797569  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00237  |
|    n_updates        | 1686892  |
----------------------------------
Eval num_timesteps=6800000, episode_reward=-20.80 +/- 0.40
Episode length: 1205.00 +/- 96.55
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.2e+03  |
|    mean_reward      | -20.8    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 6800000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0105  

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.02e+03 |
|    ep_rew_mean      | -20.3    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 3420     |
|    fps              | 262      |
|    time_elapsed     | 26126    |
|    total_timesteps  | 6869975  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0099   |
|    n_updates        | 1704993  |
----------------------------------
Eval num_timesteps=6870000, episode_reward=-15.80 +/- 1.72
Episode length: 2216.80 +/- 191.89
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.22e+03 |
|    mean_reward      | -15.8    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 6870000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0118 

Eval num_timesteps=6940000, episode_reward=-17.00 +/- 3.29
Episode length: 2514.00 +/- 555.60
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.51e+03 |
|    mean_reward      | -17      |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 6940000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00501  |
|    n_updates        | 1722499  |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.99e+03 |
|    ep_rew_mean      | -19.8    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 3456     |
|    fps              | 262      |
|    time_elapsed     | 26413    |
|    total_timesteps  | 6943089  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00099

Eval num_timesteps=7010000, episode_reward=-18.20 +/- 1.47
Episode length: 1619.40 +/- 303.02
----------------------------------
| eval/               |          |
|    mean_ep_length   | 1.62e+03 |
|    mean_reward      | -18.2    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 7010000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00399  |
|    n_updates        | 1739999  |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2e+03    |
|    ep_rew_mean      | -19.7    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 3492     |
|    fps              | 262      |
|    time_elapsed     | 26674    |
|    total_timesteps  | 7011998  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00314

Eval num_timesteps=7080000, episode_reward=-16.00 +/- 2.19
Episode length: 2080.40 +/- 282.48
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.08e+03 |
|    mean_reward      | -16      |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 7080000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00198  |
|    n_updates        | 1757499  |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.94e+03 |
|    ep_rew_mean      | -19.9    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 3528     |
|    fps              | 262      |
|    time_elapsed     | 26939    |
|    total_timesteps  | 7082301  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0118 

Eval num_timesteps=7150000, episode_reward=-15.40 +/- 1.20
Episode length: 2247.80 +/- 195.86
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.25e+03 |
|    mean_reward      | -15.4    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 7150000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00279  |
|    n_updates        | 1774999  |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.96e+03 |
|    ep_rew_mean      | -19.6    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 3564     |
|    fps              | 262      |
|    time_elapsed     | 27223    |
|    total_timesteps  | 7155321  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00069

Eval num_timesteps=7230000, episode_reward=-17.20 +/- 2.04
Episode length: 3154.60 +/- 356.03
----------------------------------
| eval/               |          |
|    mean_ep_length   | 3.15e+03 |
|    mean_reward      | -17.2    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 7230000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00628  |
|    n_updates        | 1794999  |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.16e+03 |
|    ep_rew_mean      | -19.6    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 3596     |
|    fps              | 262      |
|    time_elapsed     | 27537    |
|    total_timesteps  | 7234575  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00326

Eval num_timesteps=7310000, episode_reward=-14.60 +/- 1.85
Episode length: 2821.60 +/- 289.43
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.82e+03 |
|    mean_reward      | -14.6    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 7310000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0025   |
|    n_updates        | 1814999  |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.32e+03 |
|    ep_rew_mean      | -18.7    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 3628     |
|    fps              | 262      |
|    time_elapsed     | 27860    |
|    total_timesteps  | 7314720  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0016 

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.59e+03 |
|    ep_rew_mean      | -18.9    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 3656     |
|    fps              | 262      |
|    time_elapsed     | 28197    |
|    total_timesteps  | 7397076  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0029   |
|    n_updates        | 1836768  |
----------------------------------
Eval num_timesteps=7400000, episode_reward=-15.60 +/- 0.49
Episode length: 2294.40 +/- 122.71
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.29e+03 |
|    mean_reward      | -15.6    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 7400000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00231

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.67e+03 |
|    ep_rew_mean      | -19.1    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 3688     |
|    fps              | 262      |
|    time_elapsed     | 28536    |
|    total_timesteps  | 7478938  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00391  |
|    n_updates        | 1857234  |
----------------------------------
Eval num_timesteps=7480000, episode_reward=-16.40 +/- 2.58
Episode length: 2429.80 +/- 358.29
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.43e+03 |
|    mean_reward      | -16.4    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 7480000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0012 

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.59e+03 |
|    ep_rew_mean      | -19.6    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 3720     |
|    fps              | 261      |
|    time_elapsed     | 28859    |
|    total_timesteps  | 7553697  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000731 |
|    n_updates        | 1875924  |
----------------------------------
Eval num_timesteps=7560000, episode_reward=-17.80 +/- 1.60
Episode length: 1995.40 +/- 138.53
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2e+03    |
|    mean_reward      | -17.8    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 7560000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00109

Eval num_timesteps=7640000, episode_reward=-17.20 +/- 0.98
Episode length: 2547.60 +/- 248.64
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.55e+03 |
|    mean_reward      | -17.2    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 7640000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00151  |
|    n_updates        | 1897499  |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.57e+03 |
|    ep_rew_mean      | -20.2    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 3752     |
|    fps              | 261      |
|    time_elapsed     | 29230    |
|    total_timesteps  | 7642214  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00111

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.67e+03 |
|    ep_rew_mean      | -20      |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 3780     |
|    fps              | 261      |
|    time_elapsed     | 29583    |
|    total_timesteps  | 7725516  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0031   |
|    n_updates        | 1918878  |
----------------------------------
Eval num_timesteps=7730000, episode_reward=-17.60 +/- 1.62
Episode length: 2252.40 +/- 204.40
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.25e+03 |
|    mean_reward      | -17.6    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 7730000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0021 

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.74e+03 |
|    ep_rew_mean      | -19.6    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 3812     |
|    fps              | 260      |
|    time_elapsed     | 29922    |
|    total_timesteps  | 7808153  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.01     |
|    n_updates        | 1939538  |
----------------------------------
Eval num_timesteps=7810000, episode_reward=-18.40 +/- 1.36
Episode length: 2415.20 +/- 321.32
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.42e+03 |
|    mean_reward      | -18.4    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 7810000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00197

Eval num_timesteps=7890000, episode_reward=-16.80 +/- 1.60
Episode length: 2661.00 +/- 293.48
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.66e+03 |
|    mean_reward      | -16.8    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 7890000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00274  |
|    n_updates        | 1959999  |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.77e+03 |
|    ep_rew_mean      | -18.6    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 3844     |
|    fps              | 260      |
|    time_elapsed     | 30281    |
|    total_timesteps  | 7894949  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0161 

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.77e+03 |
|    ep_rew_mean      | -18.5    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 3872     |
|    fps              | 260      |
|    time_elapsed     | 30640    |
|    total_timesteps  | 7979795  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00156  |
|    n_updates        | 1982448  |
----------------------------------
Eval num_timesteps=7980000, episode_reward=-16.60 +/- 2.06
Episode length: 2062.00 +/- 380.52
----------------------------------
| eval/               |          |
|    mean_ep_length   | 2.06e+03 |
|    mean_reward      | -16.6    |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 7980000  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.009  

<stable_baselines3.dqn.dqn.DQN at 0x7f3f5cedfe80>

In [80]:
model.save(DQN_path)

In [87]:
evaluate_policy(model, env, n_eval_episodes=10, render=True)

RuntimeError: CUDA error: unspecified launch failure
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [86]:
env.close()

# 8. Changing Policies

In [None]:
model = PPO('MlpPolicy', env, verbose = 1, policy_kwargs={'net_arch': net_arch})

In [None]:
model.learn(total_timesteps=20000, callback=eval_callback)

# 9. Using an Alternate Algorithm

In [16]:
from stable_baselines3 import DQN

In [18]:
model = DQN('MlpPolicy', env, verbose = 1)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


MemoryError: Unable to allocate 93.9 GiB for an array with shape (1000000, 1, 3, 210, 160) and data type uint8

In [None]:
model.learn(total_timesteps=20000, callback=eval_callback)

In [None]:
dqn_path = os.path.join('Training', 'Saved Models', 'DQN_model')

In [None]:
model.save(dqn_path)

In [None]:
model = DQN.load(dqn_path, env=env)

In [None]:
evaluate_policy(model, env, n_eval_episodes=10, render=True)

In [None]:
env.close()