In [50]:
# imports
import os
import json
import re

import torch 
from torch.utils import data
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

import numpy as np

from MDP import MDP

import stable_baselines3
import sb3_contrib

import gym

In [51]:
# check torch
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
torch.cuda.get_device_name()

'NVIDIA GeForce RTX 3080'

In [52]:
# create Neural Network

class Net(nn.Module):
    """
    input : 2 X 4 X 4 grid
    label : Move [0;6]
    """
    def __init__(self):
        super(Net, self).__init__()
        
        self.conv1 = nn.Conv2d(11, 32, 2, padding = 1)

        self.conv2 = nn.Conv2d(32, 64, 3, padding= 1)

        self.conv3 = nn.Conv2d(64, 128, 2)

        self.conv4 = nn.Conv2d(128, 64, 3)

        self.fc1 = nn.Linear(64, 32)

        self.fc2 = nn.Linear(32, 16)

        self.out = nn.Linear(16, 6)


    def forward(self, x):
        x = x.float()

        x = F.relu(self.lstm(x))

        x = F.relu(self.conv1(x))

        x = F.relu(self.conv2(x))

        x = F.relu(self.conv3(x))

        x = F.relu(self.conv4(x))

        x = torch.flatten(x,start_dim=1)

        x = F.relu(self.fc1(x))

        x = F.relu(self.fc2(x))

        x = self.out(x)
    
        return x   

In [53]:
#creating model
net = Net()
net.cuda()
print(net)

params = list(net.parameters())
print(f"number of parameters: {len(params)}")

#loss function
loss = nn.CrossEntropyLoss()

#optimizer
optimizer = torch.optim.Adam(net.parameters())
optimizer

Net(
  (conv1): Conv2d(11, 32, kernel_size=(2, 2), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(64, 128, kernel_size=(2, 2), stride=(1, 1))
  (conv4): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=64, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=16, bias=True)
  (out): Linear(in_features=16, out_features=6, bias=True)
)
number of parameters: 14


Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 0
)

In [54]:
#custom environment
from gym import spaces

class Gridworld(gym.Env):

    metadata = {"render.modes" : ["human"]}

    def __init__(self, episodes, dir = ["data_easy", "generated_easy", "data_medium", "generated_med", "data", "generated_imitation"], type = "train", 
                p = [0.1, 0.1, 0.15, 0.1, 0.15, 0.4], lambda1 = 0.01, lambda2 = 0.1, lambda3 = 1, load_optimal = False, epsilon = 0.1) -> None:
        super(Gridworld, self).__init__()
        self.action_space = spaces.Discrete(6)
        self.observation_space = spaces.Box(low = 0, high = 1, shape = (11, 4, 4))

        #available MDPs
        self.dir = dir
        self.type = type
        self.lambda1 = lambda1
        self.lambda2 = lambda2
        self.lambda3 = lambda3

        self.actions = ["move", "turnLeft", "turnRight", "pickMarker", "putMarker","finish"]
        self.actionsDict = {"move" : 0, "turnLeft" : 1, "turnRight" : 2, "pickMarker" : 3, "putMarker" : 4, "finish" : 5}

        self.epsilon = epsilon

        self.load_optimal = load_optimal

        #generate curriculum
        #{"data_easy" : 15% ,"generated_easy" : 20% , "data_medium" : 15% , "generated_med" : 15% "data" : 30%, "generated_imitation" : 15%}
        self.curriculum = []
        #data_easy
        for directory,amount in zip(dir, p):
            tasks = list(np.random.choice(os.listdir(os.sep.join(["datasets", directory, type, "task"])), int(episodes * amount)))
            tasks = zip([directory for _ in range(int(episodes * amount))], tasks)
            self.curriculum += tasks
        self.currentTask = 0
    

    def construct_feature_matrices(self, matrix):
        """
        creates the input for the Neural Network: 11 x 4 x 4 boolean tensor for:
        - if a wall is present
        - if a marker in the pregrid is present
        - if a marker in the postgrid is present
        - if the agent facing left, down, right, up in the pregrid (one for each direction)
        - if the agent facing left, down, right, up in the postgrid (one for each direction)
        """
        wallMat = matrix[0] == 10

        marker_pre = (matrix[0] > 4) & (matrix[0] < 10)
 
        marker_post = (matrix[1] > 4) & (matrix[1] < 10)

        agent_pre = np.zeros((4,4,4))
        agent_dir = (matrix[(0, *self.currentMDP.agentPosition)] - 1) % 4
        agent_pre[(int(agent_dir), *self.currentMDP.agentPosition)] = 1
        
        agent_post = np.zeros((4,4,4))
        agent_pos_post = ((matrix[1] > 0) & (matrix[1] < 9)).nonzero()
        agent_dir = (matrix[(1, *agent_pos_post)] - 1) % 4
        agent_post[(int(agent_dir), *agent_pos_post)] = 1
        
        return np.array([wallMat, marker_pre, marker_post, *agent_pre, *agent_post])


    def reset(self):
        nextDir, self.nexti = self.curriculum[self.currentTask]
        self.nexti = re.sub(r"\D", "", self.nexti)
        self.currentTask += 1
        self.currentMDP = MDP(nextDir, self.type, self.nexti, lambda1= self.lambda1, lambda2 = self.lambda2, lambda3 =self.lambda3)

        # load optimal sequence if possible
        try:
            with open(os.sep.join(["datasets", nextDir, self.type, "seq", self.nexti + "_seq.json"])) as grid:
                grid = json.load(grid)
                self.optimal_seq = grid["sequence"]
                # current step
                assert self.optimal_seq[-1] == "finish"
        except:  
            self.optimal_seq = []
        self.steps = 0

        #with probability 1 - epsilon use either the optimal sequence or better masks
        self.use_optimal =  (np.random.rand() < (1 - self.epsilon)) and self.load_optimal

        return self.construct_feature_matrices(self.currentMDP.get_current_state())

    def step(self, action):
        nextState, rew, done, info = self.currentMDP.sample_next_state_and_reward(self.actions[action])
        self.steps += 1
        if self.steps > 500:
            return nextState, -self.lambda3, True, info 

        return self.construct_feature_matrices(nextState), rew - 0.01, done, info 
        
    def render(self):
        self.currentMDP.print_grid()

    def close(self):
        pass
    
    def action_masks(self):

        mat = self.currentMDP.get_current_state()
        if np.array_equal(mat[0], mat[1]):
            return np.array([0,0,0,0,0,1])

        # force agent to take optimal action if possible
        mask = self.currentMDP.action_mask()

        if self.use_optimal and self.optimal_seq:
            mask = np.zeros(6)
            mask[self.actionsDict[self.optimal_seq[self.steps]]] = 1
            return mask
        
        return mask

    # functions bellow are only used for inheritance 
    def get_MDP(self):
        return self.currentMDP

    def get_MDP_name(self):
        return self.nextDir, self.nextType, self.nexti

In [55]:
testEnv = Gridworld(100, dir = ["generated_imitation"], load_optimal= True)

In [56]:
m = testEnv.reset()
testEnv.render()
print(testEnv.action_masks())
m, r, _, _ = testEnv.step(np.argmax(testEnv.action_masks()))
print(r)
testEnv.render()
testEnv.action_masks()
_, r, _, _ = testEnv.step(np.argmax(testEnv.action_masks()))
print(r)
testEnv.render()

[[['#' '.' '#' '#']
  ['#' '.' '#' '#']
  ['v' '#' '.' '#']
  ['.' '#' '#' '#']]

 [['#' '.' '#' '#']
  ['#' '.' '#' '#']
  ['<' '#' '.' '#']
  ['O' '#' '#' '#']]]
[1. 0. 0. 0. 0. 0.]
-0.02
[[['#' '.' '#' '#']
  ['#' '.' '#' '#']
  ['.' '#' '.' '#']
  ['v' '#' '#' '#']]

 [['#' '.' '#' '#']
  ['#' '.' '#' '#']
  ['<' '#' '.' '#']
  ['O' '#' '#' '#']]]
0.09000000000000001
[[['#' '.' '#' '#']
  ['#' '.' '#' '#']
  ['.' '#' '.' '#']
  ['d' '#' '#' '#']]

 [['#' '.' '#' '#']
  ['#' '.' '#' '#']
  ['<' '#' '.' '#']
  ['O' '#' '#' '#']]]


In [57]:
np.random.seed(123)

def test_RL_models(model):
    totalCorrect, totalOptimal = 0,0
    for dir, num in zip(["data_easy", "data_medium", "data"], [80, 24, 480]):
        print("current data: " + dir)
        total_steps = 0
        valDataset = Gridworld(num * 10, dir = [dir], p = [1], type = "val", lambda1=0, lambda2=0, load_optimal= False)
        correct, total, optimal = 0, num*10, 0
        for task in range(int(total)):
            if task % num == num-1:
                print(f"{(task+1) / total *100} %, running acc: {(correct*100)/(task+1)}, task solved optimaly: {optimal*100/task} %, average steps to solve: {total_steps / task}")
            currMDP = valDataset.reset()
            lenOptimalSeq = len(valDataset.optimal_seq)
            done = False
            steps = 0
            while not done and steps < 50:
                action = model.predict(currMDP, action_masks = valDataset.action_masks(), deterministic = False)[0]
                currMDP, rew, done, _ = valDataset.step(action)
                steps += 1
                if rew > 0:
                    correct += 1
                    total_steps += steps
                    if steps == lenOptimalSeq:
                        optimal += 1
        totalCorrect += correct
        totalOptimal += optimal
    print(f"Total Accuracy : {totalCorrect*100/5840}, Solved Optimally : {totalOptimal *100 / 5840}")

In [58]:
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from sb3_contrib.ppo_mask import MaskablePPO
from stable_baselines3.common.env_util import make_vec_env

import torch.nn.functional as F
from torch import nn

class CustomFeatureExtractorTorch(BaseFeaturesExtractor):
    def __init__(self, observation_space: gym.Space, features_dim: int = 16):
        super().__init__(observation_space, features_dim)

        self.conv1 = net.conv1
        self.conv2 = net.conv2
        self.conv3 = net.conv3
        self.conv4 = net.conv4
        
        self.fc1 = net.fc1
        self.fc2 = net.fc2
  
    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))

        x = F.max_pool2d(x, kernel_size = 2, stride = 2)
        x = torch.flatten(x, start_dim=1)
        
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))

        return x

net_arch = [
    32, 128,
    dict(vf = [64, 32, 18], pi = [32, 16, 6])
]

policy_kwargs = dict(
    features_extractor_class = CustomFeatureExtractorTorch,
    net_arch = net_arch
)

In [59]:
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from sb3_contrib.ppo_mask import MaskablePPO
from stable_baselines3.common.env_util import make_vec_env
from sb3_contrib.common.maskable.policies import MaskableActorCriticPolicy

from stable_baselines3.ppo import PPO
from stable_baselines3.common.policies import ActorCriticPolicy

In [60]:
episodes = 5 * 1e5

FinalEnv = make_vec_env(Gridworld, n_envs= 2  , 
    env_kwargs={"episodes" : episodes, "lambda1" : 0, "lambda2" : 0.25, "lambda3": 1, 
            "dir" : ["data_easy", "data_medium", "data", "generated_imitation", "generated_easy", "generated_med"], "load_optimal" : True, "epsilon" : 0.5})

FinalModel = MaskablePPO(MaskableActorCriticPolicy, FinalEnv,  policy_kwargs = policy_kwargs, verbose = 1, n_steps= 500)#

FinalModel.learn(episodes)

Using cuda device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 10.6     |
|    ep_rew_mean     | 0.894    |
| time/              |          |
|    fps             | 1109     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 1000     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 14.9        |
|    ep_rew_mean          | 0.851       |
| time/                   |             |
|    fps                  | 727         |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 2000        |
| train/                  |             |
|    approx_kl            | 0.000995809 |
|    clip_fraction        | 0           |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.765      |
|    explained_variance   | -0.0607     |
|    learnin

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 11.1       |
|    ep_rew_mean          | 0.889      |
| time/                   |            |
|    fps                  | 573        |
|    iterations           | 11         |
|    time_elapsed         | 19         |
|    total_timesteps      | 11000      |
| train/                  |            |
|    approx_kl            | 0.02709507 |
|    clip_fraction        | 0.227      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.671     |
|    explained_variance   | 0.272      |
|    learning_rate        | 0.0003     |
|    loss                 | -0.0104    |
|    n_updates            | 100        |
|    policy_gradient_loss | -0.0306    |
|    value_loss           | 0.0139     |
----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 16.5        |
|    ep_rew_m

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 7.38       |
|    ep_rew_mean          | 0.926      |
| time/                   |            |
|    fps                  | 558        |
|    iterations           | 21         |
|    time_elapsed         | 37         |
|    total_timesteps      | 21000      |
| train/                  |            |
|    approx_kl            | 0.04254249 |
|    clip_fraction        | 0.248      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.597     |
|    explained_variance   | 0.215      |
|    learning_rate        | 0.0003     |
|    loss                 | -0.0555    |
|    n_updates            | 200        |
|    policy_gradient_loss | -0.0317    |
|    value_loss           | 0.0172     |
----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 16.1       |
|    ep_rew_mean

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 11.3        |
|    ep_rew_mean          | 0.887       |
| time/                   |             |
|    fps                  | 554         |
|    iterations           | 31          |
|    time_elapsed         | 55          |
|    total_timesteps      | 31000       |
| train/                  |             |
|    approx_kl            | 0.033599127 |
|    clip_fraction        | 0.195       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.47       |
|    explained_variance   | 0.413       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0491     |
|    n_updates            | 300         |
|    policy_gradient_loss | -0.0178     |
|    value_loss           | 0.0233      |
-----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 7.68    

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 11.1        |
|    ep_rew_mean          | 0.889       |
| time/                   |             |
|    fps                  | 552         |
|    iterations           | 41          |
|    time_elapsed         | 74          |
|    total_timesteps      | 41000       |
| train/                  |             |
|    approx_kl            | 0.036704887 |
|    clip_fraction        | 0.288       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.467      |
|    explained_variance   | 0.51        |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0907     |
|    n_updates            | 400         |
|    policy_gradient_loss | -0.0482     |
|    value_loss           | 0.0131      |
-----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 5.67    

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 3.84       |
|    ep_rew_mean          | 0.962      |
| time/                   |            |
|    fps                  | 548        |
|    iterations           | 51         |
|    time_elapsed         | 92         |
|    total_timesteps      | 51000      |
| train/                  |            |
|    approx_kl            | 0.06438805 |
|    clip_fraction        | 0.27       |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.372     |
|    explained_variance   | 0.423      |
|    learning_rate        | 0.0003     |
|    loss                 | -0.0164    |
|    n_updates            | 500        |
|    policy_gradient_loss | -0.0279    |
|    value_loss           | 0.0116     |
----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 4.14       |
|    ep_rew_mean

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 3.22        |
|    ep_rew_mean          | 0.968       |
| time/                   |             |
|    fps                  | 545         |
|    iterations           | 61          |
|    time_elapsed         | 111         |
|    total_timesteps      | 61000       |
| train/                  |             |
|    approx_kl            | 0.055443227 |
|    clip_fraction        | 0.122       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.163      |
|    explained_variance   | 0.508       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0522     |
|    n_updates            | 600         |
|    policy_gradient_loss | -0.000786   |
|    value_loss           | 0.00336     |
-----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 7.32    

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.6         |
|    ep_rew_mean          | 0.974       |
| time/                   |             |
|    fps                  | 543         |
|    iterations           | 71          |
|    time_elapsed         | 130         |
|    total_timesteps      | 71000       |
| train/                  |             |
|    approx_kl            | 0.023422062 |
|    clip_fraction        | 0.139       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.205      |
|    explained_variance   | 0.631       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.08       |
|    n_updates            | 700         |
|    policy_gradient_loss | -0.012      |
|    value_loss           | 0.00295     |
-----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 3.99    

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5.13        |
|    ep_rew_mean          | 0.949       |
| time/                   |             |
|    fps                  | 542         |
|    iterations           | 81          |
|    time_elapsed         | 149         |
|    total_timesteps      | 81000       |
| train/                  |             |
|    approx_kl            | 0.039208118 |
|    clip_fraction        | 0.1         |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.126      |
|    explained_variance   | -0.583      |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0463     |
|    n_updates            | 800         |
|    policy_gradient_loss | -0.0307     |
|    value_loss           | 0.000551    |
-----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 8.13    

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 3.03        |
|    ep_rew_mean          | 0.97        |
| time/                   |             |
|    fps                  | 540         |
|    iterations           | 91          |
|    time_elapsed         | 168         |
|    total_timesteps      | 91000       |
| train/                  |             |
|    approx_kl            | 0.043298464 |
|    clip_fraction        | 0.185       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.41       |
|    explained_variance   | 0.512       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0151     |
|    n_updates            | 900         |
|    policy_gradient_loss | -0.014      |
|    value_loss           | 0.0107      |
-----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 2.74    

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 3.08        |
|    ep_rew_mean          | 0.969       |
| time/                   |             |
|    fps                  | 539         |
|    iterations           | 101         |
|    time_elapsed         | 187         |
|    total_timesteps      | 101000      |
| train/                  |             |
|    approx_kl            | 0.033064023 |
|    clip_fraction        | 0.0938      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.105      |
|    explained_variance   | 0.43        |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0393     |
|    n_updates            | 1000        |
|    policy_gradient_loss | -0.0202     |
|    value_loss           | 0.000246    |
-----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 5.98    

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 2.53       |
|    ep_rew_mean          | 0.975      |
| time/                   |            |
|    fps                  | 538        |
|    iterations           | 111        |
|    time_elapsed         | 205        |
|    total_timesteps      | 111000     |
| train/                  |            |
|    approx_kl            | 0.09715406 |
|    clip_fraction        | 0.187      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.299     |
|    explained_variance   | 0.728      |
|    learning_rate        | 0.0003     |
|    loss                 | -0.0129    |
|    n_updates            | 1100       |
|    policy_gradient_loss | -0.00955   |
|    value_loss           | 0.00597    |
----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 3.11       |
|    ep_rew_mean

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.73        |
|    ep_rew_mean          | 0.973       |
| time/                   |             |
|    fps                  | 537         |
|    iterations           | 121         |
|    time_elapsed         | 224         |
|    total_timesteps      | 121000      |
| train/                  |             |
|    approx_kl            | 0.048141755 |
|    clip_fraction        | 0.158       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.203      |
|    explained_variance   | 0.571       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.00949    |
|    n_updates            | 1200        |
|    policy_gradient_loss | -0.0301     |
|    value_loss           | 0.00378     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 3.16  

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 2.45       |
|    ep_rew_mean          | 0.975      |
| time/                   |            |
|    fps                  | 536        |
|    iterations           | 131        |
|    time_elapsed         | 244        |
|    total_timesteps      | 131000     |
| train/                  |            |
|    approx_kl            | 0.05651129 |
|    clip_fraction        | 0.141      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.24      |
|    explained_variance   | 0.347      |
|    learning_rate        | 0.0003     |
|    loss                 | -0.0639    |
|    n_updates            | 1300       |
|    policy_gradient_loss | -0.0109    |
|    value_loss           | 0.00429    |
----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.83        |
|    ep_rew_m

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 2.62       |
|    ep_rew_mean          | 0.974      |
| time/                   |            |
|    fps                  | 536        |
|    iterations           | 141        |
|    time_elapsed         | 262        |
|    total_timesteps      | 141000     |
| train/                  |            |
|    approx_kl            | 0.02835552 |
|    clip_fraction        | 0.119      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.237     |
|    explained_variance   | 0.607      |
|    learning_rate        | 0.0003     |
|    loss                 | -0.00709   |
|    n_updates            | 1400       |
|    policy_gradient_loss | -0.0178    |
|    value_loss           | 0.00361    |
----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.54        |
|    ep_rew_m

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 3.09       |
|    ep_rew_mean          | 0.969      |
| time/                   |            |
|    fps                  | 535        |
|    iterations           | 151        |
|    time_elapsed         | 282        |
|    total_timesteps      | 151000     |
| train/                  |            |
|    approx_kl            | 0.07136517 |
|    clip_fraction        | 0.146      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.19      |
|    explained_variance   | 0.442      |
|    learning_rate        | 0.0003     |
|    loss                 | -0.0728    |
|    n_updates            | 1500       |
|    policy_gradient_loss | -0.0206    |
|    value_loss           | 0.00425    |
----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 3.14       |
|    ep_rew_mean

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 2.78       |
|    ep_rew_mean          | 0.972      |
| time/                   |            |
|    fps                  | 534        |
|    iterations           | 161        |
|    time_elapsed         | 301        |
|    total_timesteps      | 161000     |
| train/                  |            |
|    approx_kl            | 0.13953134 |
|    clip_fraction        | 0.238      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.208     |
|    explained_variance   | 0.333      |
|    learning_rate        | 0.0003     |
|    loss                 | -0.0096    |
|    n_updates            | 1600       |
|    policy_gradient_loss | -0.0457    |
|    value_loss           | 0.00269    |
----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 2.61       |
|    ep_rew_mean

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5.62        |
|    ep_rew_mean          | 0.944       |
| time/                   |             |
|    fps                  | 533         |
|    iterations           | 171         |
|    time_elapsed         | 320         |
|    total_timesteps      | 171000      |
| train/                  |             |
|    approx_kl            | 0.041143544 |
|    clip_fraction        | 0.0715      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.0993     |
|    explained_variance   | -0.522      |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0448     |
|    n_updates            | 1700        |
|    policy_gradient_loss | -0.0241     |
|    value_loss           | 0.000771    |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 3.38  

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.5         |
|    ep_rew_mean          | 0.975       |
| time/                   |             |
|    fps                  | 533         |
|    iterations           | 181         |
|    time_elapsed         | 339         |
|    total_timesteps      | 181000      |
| train/                  |             |
|    approx_kl            | 0.028157212 |
|    clip_fraction        | 0.0672      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.0871     |
|    explained_variance   | 0.28        |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0485     |
|    n_updates            | 1800        |
|    policy_gradient_loss | -0.0287     |
|    value_loss           | 0.000542    |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 3     

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 2.61       |
|    ep_rew_mean          | 0.974      |
| time/                   |            |
|    fps                  | 532        |
|    iterations           | 191        |
|    time_elapsed         | 358        |
|    total_timesteps      | 191000     |
| train/                  |            |
|    approx_kl            | 0.16569726 |
|    clip_fraction        | 0.166      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.159     |
|    explained_variance   | 0.296      |
|    learning_rate        | 0.0003     |
|    loss                 | -0.0968    |
|    n_updates            | 1900       |
|    policy_gradient_loss | -0.0388    |
|    value_loss           | 0.00364    |
----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 2.58       |
|    ep_rew_mean

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 2.69       |
|    ep_rew_mean          | 0.973      |
| time/                   |            |
|    fps                  | 532        |
|    iterations           | 201        |
|    time_elapsed         | 377        |
|    total_timesteps      | 201000     |
| train/                  |            |
|    approx_kl            | 0.06415797 |
|    clip_fraction        | 0.133      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.143     |
|    explained_variance   | 0.286      |
|    learning_rate        | 0.0003     |
|    loss                 | -0.0689    |
|    n_updates            | 2000       |
|    policy_gradient_loss | -0.0332    |
|    value_loss           | 0.00127    |
----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 5.8        |
|    ep_rew_mean

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.59        |
|    ep_rew_mean          | 0.974       |
| time/                   |             |
|    fps                  | 531         |
|    iterations           | 211         |
|    time_elapsed         | 396         |
|    total_timesteps      | 211000      |
| train/                  |             |
|    approx_kl            | 0.027336102 |
|    clip_fraction        | 0.0873      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.142      |
|    explained_variance   | 0.786       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0535     |
|    n_updates            | 2100        |
|    policy_gradient_loss | -0.0132     |
|    value_loss           | 0.00382     |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 2.81

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5.68        |
|    ep_rew_mean          | 0.943       |
| time/                   |             |
|    fps                  | 530         |
|    iterations           | 221         |
|    time_elapsed         | 416         |
|    total_timesteps      | 221000      |
| train/                  |             |
|    approx_kl            | 0.020509195 |
|    clip_fraction        | 0.136       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.138      |
|    explained_variance   | -0.302      |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0216     |
|    n_updates            | 2200        |
|    policy_gradient_loss | -0.0112     |
|    value_loss           | 0.000855    |
-----------------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 3.52      

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.76        |
|    ep_rew_mean          | 0.972       |
| time/                   |             |
|    fps                  | 530         |
|    iterations           | 231         |
|    time_elapsed         | 435         |
|    total_timesteps      | 231000      |
| train/                  |             |
|    approx_kl            | 0.027722765 |
|    clip_fraction        | 0.0514      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.0865     |
|    explained_variance   | 0.451       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0218     |
|    n_updates            | 2300        |
|    policy_gradient_loss | -0.0234     |
|    value_loss           | 0.000173    |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.67  

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 3.7         |
|    ep_rew_mean          | 0.963       |
| time/                   |             |
|    fps                  | 529         |
|    iterations           | 241         |
|    time_elapsed         | 454         |
|    total_timesteps      | 241000      |
| train/                  |             |
|    approx_kl            | 0.020718308 |
|    clip_fraction        | 0.118       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.206      |
|    explained_variance   | 0.624       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0435     |
|    n_updates            | 2400        |
|    policy_gradient_loss | -0.0296     |
|    value_loss           | 0.00387     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.54  

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.59        |
|    ep_rew_mean          | 0.974       |
| time/                   |             |
|    fps                  | 528         |
|    iterations           | 251         |
|    time_elapsed         | 474         |
|    total_timesteps      | 251000      |
| train/                  |             |
|    approx_kl            | 0.058250222 |
|    clip_fraction        | 0.0746      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.0802     |
|    explained_variance   | 0.232       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0261     |
|    n_updates            | 2500        |
|    policy_gradient_loss | -0.021      |
|    value_loss           | 0.00109     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.55  

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.59        |
|    ep_rew_mean          | 0.974       |
| time/                   |             |
|    fps                  | 528         |
|    iterations           | 261         |
|    time_elapsed         | 493         |
|    total_timesteps      | 261000      |
| train/                  |             |
|    approx_kl            | 0.094754666 |
|    clip_fraction        | 0.153       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.226      |
|    explained_variance   | 0.563       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0451     |
|    n_updates            | 2600        |
|    policy_gradient_loss | -0.0298     |
|    value_loss           | 0.00461     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.55  

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.59        |
|    ep_rew_mean          | 0.974       |
| time/                   |             |
|    fps                  | 528         |
|    iterations           | 271         |
|    time_elapsed         | 512         |
|    total_timesteps      | 271000      |
| train/                  |             |
|    approx_kl            | 0.016785689 |
|    clip_fraction        | 0.0677      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.158      |
|    explained_variance   | 0.476       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0114      |
|    n_updates            | 2700        |
|    policy_gradient_loss | -0.00694    |
|    value_loss           | 0.00305     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.55  

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.77        |
|    ep_rew_mean          | 0.972       |
| time/                   |             |
|    fps                  | 528         |
|    iterations           | 281         |
|    time_elapsed         | 532         |
|    total_timesteps      | 281000      |
| train/                  |             |
|    approx_kl            | 0.016751155 |
|    clip_fraction        | 0.0556      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.102      |
|    explained_variance   | 0.519       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.00115     |
|    n_updates            | 2800        |
|    policy_gradient_loss | -0.0186     |
|    value_loss           | 0.00175     |
-----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 2.58    

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 2.85       |
|    ep_rew_mean          | 0.972      |
| time/                   |            |
|    fps                  | 528        |
|    iterations           | 291        |
|    time_elapsed         | 551        |
|    total_timesteps      | 291000     |
| train/                  |            |
|    approx_kl            | 0.07536399 |
|    clip_fraction        | 0.0734     |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.0804    |
|    explained_variance   | 0.423      |
|    learning_rate        | 0.0003     |
|    loss                 | -0.0443    |
|    n_updates            | 2900       |
|    policy_gradient_loss | -0.025     |
|    value_loss           | 0.00111    |
----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.74        |
|    ep_rew_m

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.67        |
|    ep_rew_mean          | 0.973       |
| time/                   |             |
|    fps                  | 527         |
|    iterations           | 301         |
|    time_elapsed         | 570         |
|    total_timesteps      | 301000      |
| train/                  |             |
|    approx_kl            | 0.010524962 |
|    clip_fraction        | 0.0648      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.0971     |
|    explained_variance   | 0.299       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0152     |
|    n_updates            | 3000        |
|    policy_gradient_loss | -0.0227     |
|    value_loss           | 0.00115     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.58  

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.85        |
|    ep_rew_mean          | 0.972       |
| time/                   |             |
|    fps                  | 527         |
|    iterations           | 311         |
|    time_elapsed         | 589         |
|    total_timesteps      | 311000      |
| train/                  |             |
|    approx_kl            | 0.026606528 |
|    clip_fraction        | 0.0784      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.138      |
|    explained_variance   | 0.173       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0206     |
|    n_updates            | 3100        |
|    policy_gradient_loss | -0.0211     |
|    value_loss           | 0.00336     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.64  

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 2.56       |
|    ep_rew_mean          | 0.974      |
| time/                   |            |
|    fps                  | 527        |
|    iterations           | 321        |
|    time_elapsed         | 608        |
|    total_timesteps      | 321000     |
| train/                  |            |
|    approx_kl            | 0.03454499 |
|    clip_fraction        | 0.08       |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.172     |
|    explained_variance   | 0.685      |
|    learning_rate        | 0.0003     |
|    loss                 | 0.436      |
|    n_updates            | 3200       |
|    policy_gradient_loss | -0.00582   |
|    value_loss           | 0.00323    |
----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 3.17        |
|    ep_rew_m

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.54        |
|    ep_rew_mean          | 0.975       |
| time/                   |             |
|    fps                  | 527         |
|    iterations           | 331         |
|    time_elapsed         | 627         |
|    total_timesteps      | 331000      |
| train/                  |             |
|    approx_kl            | 0.007596098 |
|    clip_fraction        | 0.0653      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.125      |
|    explained_variance   | 0.717       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0159      |
|    n_updates            | 3300        |
|    policy_gradient_loss | -0.0229     |
|    value_loss           | 0.0023      |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 2.62

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 3.92        |
|    ep_rew_mean          | 0.961       |
| time/                   |             |
|    fps                  | 527         |
|    iterations           | 341         |
|    time_elapsed         | 646         |
|    total_timesteps      | 341000      |
| train/                  |             |
|    approx_kl            | 0.040259816 |
|    clip_fraction        | 0.0958      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.107      |
|    explained_variance   | 0.334       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.00914     |
|    n_updates            | 3400        |
|    policy_gradient_loss | -0.0232     |
|    value_loss           | 0.002       |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.45  

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.79        |
|    ep_rew_mean          | 0.972       |
| time/                   |             |
|    fps                  | 527         |
|    iterations           | 351         |
|    time_elapsed         | 664         |
|    total_timesteps      | 351000      |
| train/                  |             |
|    approx_kl            | 0.025750682 |
|    clip_fraction        | 0.225       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.311      |
|    explained_variance   | -0.163      |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0591     |
|    n_updates            | 3500        |
|    policy_gradient_loss | -0.0319     |
|    value_loss           | 0.0197      |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 14.7  

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 6.31        |
|    ep_rew_mean          | 1.12        |
| time/                   |             |
|    fps                  | 528         |
|    iterations           | 361         |
|    time_elapsed         | 682         |
|    total_timesteps      | 361000      |
| train/                  |             |
|    approx_kl            | 0.048343733 |
|    clip_fraction        | 0.204       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.434      |
|    explained_variance   | 0.192       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.026      |
|    n_updates            | 3600        |
|    policy_gradient_loss | -0.0335     |
|    value_loss           | 0.023       |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 7.26  

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 5.11       |
|    ep_rew_mean          | 1.11       |
| time/                   |            |
|    fps                  | 529        |
|    iterations           | 371        |
|    time_elapsed         | 700        |
|    total_timesteps      | 371000     |
| train/                  |            |
|    approx_kl            | 0.04324929 |
|    clip_fraction        | 0.185      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.352     |
|    explained_variance   | 0.0668     |
|    learning_rate        | 0.0003     |
|    loss                 | -0.0071    |
|    n_updates            | 3700       |
|    policy_gradient_loss | -0.034     |
|    value_loss           | 0.0132     |
----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 5.44       |
|    ep_rew_mean

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 4.29        |
|    ep_rew_mean          | 1.11        |
| time/                   |             |
|    fps                  | 530         |
|    iterations           | 381         |
|    time_elapsed         | 718         |
|    total_timesteps      | 381000      |
| train/                  |             |
|    approx_kl            | 0.025820356 |
|    clip_fraction        | 0.165       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.287      |
|    explained_variance   | 0.136       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0506     |
|    n_updates            | 3800        |
|    policy_gradient_loss | -0.0329     |
|    value_loss           | 0.0116      |
-----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 4.63    

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 11          |
|    ep_rew_mean          | 1.04        |
| time/                   |             |
|    fps                  | 530         |
|    iterations           | 391         |
|    time_elapsed         | 736         |
|    total_timesteps      | 391000      |
| train/                  |             |
|    approx_kl            | 0.042672925 |
|    clip_fraction        | 0.22        |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.377      |
|    explained_variance   | 0.445       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0442     |
|    n_updates            | 3900        |
|    policy_gradient_loss | -0.0349     |
|    value_loss           | 0.0142      |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 3.93  

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 4.01       |
|    ep_rew_mean          | 1.13       |
| time/                   |            |
|    fps                  | 531        |
|    iterations           | 401        |
|    time_elapsed         | 754        |
|    total_timesteps      | 401000     |
| train/                  |            |
|    approx_kl            | 0.07620488 |
|    clip_fraction        | 0.155      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.208     |
|    explained_variance   | 0.407      |
|    learning_rate        | 0.0003     |
|    loss                 | -0.0195    |
|    n_updates            | 4000       |
|    policy_gradient_loss | -0.0314    |
|    value_loss           | 0.00869    |
----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 5.23       |
|    ep_rew_mean

---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 3.8       |
|    ep_rew_mean          | 1.12      |
| time/                   |           |
|    fps                  | 531       |
|    iterations           | 411       |
|    time_elapsed         | 772       |
|    total_timesteps      | 411000    |
| train/                  |           |
|    approx_kl            | 0.0477698 |
|    clip_fraction        | 0.143     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.282    |
|    explained_variance   | 0.567     |
|    learning_rate        | 0.0003    |
|    loss                 | -0.00939  |
|    n_updates            | 4100      |
|    policy_gradient_loss | -0.0113   |
|    value_loss           | 0.00873   |
---------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 3.9         |
|    ep_rew_mean          | 1.15  

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 4.24       |
|    ep_rew_mean          | 1.11       |
| time/                   |            |
|    fps                  | 531        |
|    iterations           | 421        |
|    time_elapsed         | 791        |
|    total_timesteps      | 421000     |
| train/                  |            |
|    approx_kl            | 0.04498528 |
|    clip_fraction        | 0.208      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.304     |
|    explained_variance   | 0.264      |
|    learning_rate        | 0.0003     |
|    loss                 | -0.0378    |
|    n_updates            | 4200       |
|    policy_gradient_loss | -0.0297    |
|    value_loss           | 0.0118     |
----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 3.86       |
|    ep_rew_mean

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 4           |
|    ep_rew_mean          | 1.12        |
| time/                   |             |
|    fps                  | 532         |
|    iterations           | 431         |
|    time_elapsed         | 809         |
|    total_timesteps      | 431000      |
| train/                  |             |
|    approx_kl            | 0.058108535 |
|    clip_fraction        | 0.0953      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.106      |
|    explained_variance   | 0.726       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.013      |
|    n_updates            | 4300        |
|    policy_gradient_loss | -0.0262     |
|    value_loss           | 0.00363     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 4.04  

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 4.29        |
|    ep_rew_mean          | 1.13        |
| time/                   |             |
|    fps                  | 533         |
|    iterations           | 441         |
|    time_elapsed         | 827         |
|    total_timesteps      | 441000      |
| train/                  |             |
|    approx_kl            | 0.027688624 |
|    clip_fraction        | 0.0761      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.107      |
|    explained_variance   | 0.72        |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0545     |
|    n_updates            | 4400        |
|    policy_gradient_loss | -0.0223     |
|    value_loss           | 0.00235     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 3.75  

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 4.23       |
|    ep_rew_mean          | 1.12       |
| time/                   |            |
|    fps                  | 533        |
|    iterations           | 451        |
|    time_elapsed         | 844        |
|    total_timesteps      | 451000     |
| train/                  |            |
|    approx_kl            | 0.12055193 |
|    clip_fraction        | 0.174      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.162     |
|    explained_variance   | 0.725      |
|    learning_rate        | 0.0003     |
|    loss                 | -0.0518    |
|    n_updates            | 4500       |
|    policy_gradient_loss | -0.0447    |
|    value_loss           | 0.00313    |
----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 5.83       |
|    ep_rew_mean

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 3.72       |
|    ep_rew_mean          | 1.12       |
| time/                   |            |
|    fps                  | 534        |
|    iterations           | 461        |
|    time_elapsed         | 862        |
|    total_timesteps      | 461000     |
| train/                  |            |
|    approx_kl            | 0.08135363 |
|    clip_fraction        | 0.115      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.137     |
|    explained_variance   | 0.762      |
|    learning_rate        | 0.0003     |
|    loss                 | -0.00194   |
|    n_updates            | 4600       |
|    policy_gradient_loss | -0.0295    |
|    value_loss           | 0.00274    |
----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 3.83       |
|    ep_rew_mean

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 3.94        |
|    ep_rew_mean          | 1.13        |
| time/                   |             |
|    fps                  | 534         |
|    iterations           | 471         |
|    time_elapsed         | 880         |
|    total_timesteps      | 471000      |
| train/                  |             |
|    approx_kl            | 0.058269355 |
|    clip_fraction        | 0.132       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.189      |
|    explained_variance   | 0.617       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0437     |
|    n_updates            | 4700        |
|    policy_gradient_loss | -0.0312     |
|    value_loss           | 0.00371     |
-----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 3.93    

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 3.87       |
|    ep_rew_mean          | 1.13       |
| time/                   |            |
|    fps                  | 535        |
|    iterations           | 481        |
|    time_elapsed         | 898        |
|    total_timesteps      | 481000     |
| train/                  |            |
|    approx_kl            | 0.09859938 |
|    clip_fraction        | 0.177      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.221     |
|    explained_variance   | 0.652      |
|    learning_rate        | 0.0003     |
|    loss                 | -0.0649    |
|    n_updates            | 4800       |
|    policy_gradient_loss | -0.0305    |
|    value_loss           | 0.00974    |
----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 4.32        |
|    ep_rew_m

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 4.08        |
|    ep_rew_mean          | 1.11        |
| time/                   |             |
|    fps                  | 535         |
|    iterations           | 491         |
|    time_elapsed         | 916         |
|    total_timesteps      | 491000      |
| train/                  |             |
|    approx_kl            | 0.077933215 |
|    clip_fraction        | 0.106       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.143      |
|    explained_variance   | 0.557       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0631     |
|    n_updates            | 4900        |
|    policy_gradient_loss | -0.0247     |
|    value_loss           | 0.0061      |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 4.12  

<sb3_contrib.ppo_mask.ppo_mask.MaskablePPO at 0x7f8403f214f0>

In [162]:
FinalModel.learn(1.5 * 1e5)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 3.7      |
|    ep_rew_mean     | 1.18     |
| time/              |          |
|    fps             | 1063     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 1000     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 3.63       |
|    ep_rew_mean          | 1.16       |
| time/                   |            |
|    fps                  | 713        |
|    iterations           | 2          |
|    time_elapsed         | 2          |
|    total_timesteps      | 2000       |
| train/                  |            |
|    approx_kl            | 0.06643972 |
|    clip_fraction        | 0.0586     |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.0456    |
|    explained_variance   | 0.79       |
|    learning_rate        | 0.0003     |
|   

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 3.58       |
|    ep_rew_mean          | 1.19       |
| time/                   |            |
|    fps                  | 559        |
|    iterations           | 11         |
|    time_elapsed         | 19         |
|    total_timesteps      | 11000      |
| train/                  |            |
|    approx_kl            | 0.06222095 |
|    clip_fraction        | 0.0656     |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.0484    |
|    explained_variance   | 0.536      |
|    learning_rate        | 0.0003     |
|    loss                 | -0.0471    |
|    n_updates            | 5100       |
|    policy_gradient_loss | -0.0287    |
|    value_loss           | 0.0122     |
----------------------------------------


KeyboardInterrupt: 

In [63]:
compare = MaskablePPO.load("RLModel-restrictiveActionMasks")

test_RL_models(compare)

current data: data_easy
10.0 %, running acc: 97.5, task solved optimaly: 91.13924050632912 %, average steps to solve: 2.6835443037974684
20.0 %, running acc: 98.125, task solved optimaly: 90.56603773584905 %, average steps to solve: 2.691823899371069
30.0 %, running acc: 98.75, task solved optimaly: 90.7949790794979 %, average steps to solve: 2.728033472803347
40.0 %, running acc: 99.0625, task solved optimaly: 90.59561128526646 %, average steps to solve: 2.7774294670846396
50.0 %, running acc: 99.25, task solved optimaly: 91.2280701754386 %, average steps to solve: 2.7669172932330826
60.0 %, running acc: 99.375, task solved optimaly: 91.44050104384134 %, average steps to solve: 2.7724425887265136
70.0 %, running acc: 99.46428571428571, task solved optimaly: 91.23434704830053 %, average steps to solve: 2.7924865831842576
80.0 %, running acc: 99.53125, task solved optimaly: 91.39280125195619 %, average steps to solve: 2.7746478873239435
90.0 %, running acc: 99.58333333333333, task solve

In [61]:
test_RL_models(FinalModel)

current data: data_easy
10.0 %, running acc: 98.75, task solved optimaly: 83.54430379746836 %, average steps to solve: 3.2151898734177213
20.0 %, running acc: 99.375, task solved optimaly: 87.42138364779875 %, average steps to solve: 3.0754716981132075
30.0 %, running acc: 99.58333333333333, task solved optimaly: 89.1213389121339 %, average steps to solve: 2.9748953974895396
40.0 %, running acc: 99.375, task solved optimaly: 89.96865203761756 %, average steps to solve: 2.909090909090909
50.0 %, running acc: 99.5, task solved optimaly: 90.47619047619048 %, average steps to solve: 2.887218045112782
60.0 %, running acc: 99.58333333333333, task solved optimaly: 89.97912317327766 %, average steps to solve: 2.903966597077244
70.0 %, running acc: 99.64285714285714, task solved optimaly: 90.16100178890876 %, average steps to solve: 2.892665474060823
80.0 %, running acc: 99.6875, task solved optimaly: 89.82785602503913 %, average steps to solve: 2.9029733959311423
90.0 %, running acc: 99.722222

In [64]:
FinalModel.save("RLModel-restrictiveActionMasks")

In [13]:
testEnv = Gridworld(100,load_optimal= False, dir = ["data"])

In [141]:
curr = testEnv.reset()
testEnv.render()

[[['#' '.' '.' '.']
  ['#' '.' '^' 'O']
  ['.' '.' '#' '#']
  ['.' '.' '#' '#']]

 [['#' '^' 'O' '.']
  ['#' '.' '.' 'O']
  ['.' '.' '#' '#']
  ['.' '.' '#' '#']]]


In [146]:
a = FinalModel.predict(curr, action_masks= testEnv.action_masks(), deterministic= True)[0]
curr, _, done, _ = testEnv.step(a)
print(testEnv.action_masks())
testEnv.render()
if done:
    print("finished")

[0 0 0 0 0 1]
[[['#' '^' 'O' '.']
  ['#' '.' '.' 'O']
  ['.' '.' '#' '#']
  ['.' '.' '#' '#']]

 [['#' '^' 'O' '.']
  ['#' '.' '.' 'O']
  ['.' '.' '#' '#']
  ['.' '.' '#' '#']]]
