In [1]:
import datetime

import gym

from time import sleep
from tqdm import tqdm
import torch
from torch import nn
import copy
import random
from torch.utils.tensorboard import SummaryWriter

class Config():
    def __init__(self, lr = 0.01,
                gamma = 0.6,
                min_epsilon = 0.1,
                renew_target = 100,
                store_and_replay = True,
                double_dqn = True):
        self.lr = lr
        self.gamma = gamma
        self.min_epsilon = min_epsilon
        self.store_and_replay = store_and_replay
        self.double_dqn = double_dqn
        self.max_time_steps = 1000
        self.renew_target = renew_target
        
        self.replay_memory_fail = []
        self.replay_memory_fail_size = 1000
        self.replay_memory_success = []
        
        self.prev_success = set()
        
        dt = datetime.datetime.now()
        dt_str = dt.strftime("%Y-%m-%d %H-%M-%S")
        self.opt_str = f"SR_{store_and_replay}_DD_{double_dqn}"
        self.writer = SummaryWriter(f"runs/{self.opt_str}_{dt_str}")
        
        self.num_episodes = 5000
        
    def get_epsilon(self, episode = -1):
        if episode < 0:
            return self.min_epsilon
        return max(self.min_epsilon, 0.99**episode)
    
    def _insert_record(self, memory, rec, memory_size = -1):
        if rec in memory:
            return
        if len(memory) < memory_size or memory_size == - 1:
            memory.append(rec)
        else:
            memory[random.randint(0, memory_size - 1)] = rec
    
    def insert_record(self, rec):
        done = rec[-1]
        if done:
            self._insert_record(self.replay_memory_success, rec)
            self.prev_success.add(rec[0])
        else:
            self._insert_record(self.replay_memory_fail, rec, self.replay_memory_fail_size)
        
    def get_replay_memory(self):
        return self.replay_memory_success + self.replay_memory_fail
    
    def get_replay_record(self):
        replay_memory = self.get_replay_memory()
        mid_replay = random.randint(0, len(replay_memory) - 1)
        return mid_replay, replay_memory[mid_replay]
    

class QNet(nn.Module):
    def __init__(self, num_states, num_actions, hidden_dim = 16):
        super().__init__()

        self.layers = nn.Sequential(
            nn.Embedding(num_states, 2*hidden_dim),
            nn.Linear(2 * hidden_dim, 2 * hidden_dim),
            nn.PReLU(),
            nn.Linear(2 * hidden_dim, hidden_dim),
            nn.PReLU(),
            nn.Linear(hidden_dim, num_actions)
        )

    def forward(self, x):
        x = self.layers(x)
        return x
        
def train(env, config, qnet):
    optimizer = torch.optim.SGD(qnet.parameters(), lr = 0.001)
    if config.double_dqn:
        qnet2 = copy.deepcopy(qnet)
    else:
        qnet2 = qnet
    
    criteria = nn.MSELoss()
    num_episodes = config.num_episodes
    
    for i in tqdm(range(1, num_episodes + 1)):
        train_episode(env, config, qnet, qnet2, optimizer, criteria, episode_count = i)
        if i % config.renew_target == 0:
            if config.double_dqn:
                qnet2 = copy.deepcopy(qnet)
                
        if i == 1 or i % 50 == 0:
            test(env, config, qnet, i)
            
    print("Training finished.\n")
    
def train_episode(env, config, qnet, qnet2, optimizer, criteria, episode_count = -1):
    done = False
    state = env.reset()
    n_steps = 0
    tot_reward = 0
    penalties = 0
    loss = 0
    
    while not done and n_steps < config.max_time_steps:
        state_t = torch.LongTensor([state])

        epsilon = config.get_epsilon(episode_count)
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            with torch.no_grad():
                q_hat = qnet(state_t)
                action = torch.argmax(q_hat[0]).item()

        next_state, reward, done, info = env.step(action)

        tot_reward += reward

        new_tuple = (state, action, next_state, reward, done)

        state = next_state
        if reward == -10:
            penalties += 1

        if config.store_and_replay:
            config.insert_record(new_tuple)
            loss_i, step_i = replay(qnet, qnet2, config, optimizer, criteria)
        else:
            loss_i, step_i = replay(qnet, qnet2, config, optimizer, criteria, rec = new_tuple)
        loss += loss_i
        n_steps += 1
        
def replay(qnet, qnet2, config, optimizer, criteria, num_instances = 5, rec = None):
    loss_i = 0
    
    for _ in range(num_instances):
        optimizer.zero_grad()
        if config.store_and_replay:
            mid_replay, rec = config.get_replay_record()
        
        state, action, next_state, reward, done = rec
        
        if done:
            y_t = torch.Tensor([reward])
        else:
            next_state_r_t = torch.LongTensor([next_state])
            with torch.no_grad():
                q_next = qnet2(next_state_r_t)
                
                y_t = reward + config.gamma * q_next.max(dim = -1)[0]
                
        state_r_t = torch.LongTensor([state])
        q_hat = qnet(state_r_t)
        q_hat = q_hat[:, action]
        
        loss = criteria(q_hat, y_t)
        loss.backward()
        optimizer.step()
        loss_i += loss.item()
        
    return loss_i, num_instances
        
    
def test(env, config, qnet, global_step = -1):
    qnet.eval()
    total_epochs, total_penalties = 0, 0
    episodes = 100
    
    total_reward = 0
    writer = config.writer
    
    for _ in tqdm(range(episodes)):
        state = env.reset()
        epochs, penalties, reward = 0, 0, 0
        
        done = False
        
        while not done and epochs < config.max_time_steps:
            with torch.no_grad():
                state_t = torch.LongTensor([state])
                q_hat = qnet(state_t)
                action = torch.argmax(q_hat[0]).item()
                
            state, reward, done, info = env.step(action)
            total_reward += reward
            if reward == -10:
                penalties += 1
            
            epochs += 1
            
        total_penalties += penalties
        total_epochs += epochs
    
    avg_steps = total_epochs / episodes
    avg_penalty = total_penalties / episodes
    avg_reward = total_reward / episodes
    
    print(f"Results after {episodes} episodes:")
    print(f"Average timesteps per episode: {avg_steps}")
    print(f"Average penalty per episode: {avg_penalty}")
    print(f"Average reward per episode: {avg_reward}")
    
    if global_step > 0:
        writer.add_scalar("Steps", avg_steps, global_step)
        writer.add_scalar("Penalty", avg_penalty, global_step)
        writer.add_scalar("Reward", avg_reward, global_step)
    
if __name__ == "__main__":
    env = gym.make("Taxi-v3").env
    
    print("Action Space {}".format(env.action_space))
    print("State Space {}".format(env.observation_space))
    store_and_replay = True
    double_dqn = True
    
    print(f"Store & replay: {store_and_replay}")
    print(f"Double DQN: {double_dqn}")
    
    qnet = QNet(env.observation_space.n, env.action_space.n, hidden_dim = 32)
    config = Config(store_and_replay = store_and_replay, double_dqn = double_dqn, lr = 0.01, gamma = 0.75, renew_target = 20)
    
    train(env, config, qnet)
    test(env, config, qnet, config.num_episodes)

Action Space Discrete(6)
State Space Discrete(500)
Store & replay: False
Double DQN: False


  0%|          | 0/5000 [00:00<?, ?it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  2%|▏         | 2/100 [00:00<00:05, 19.29it/s][A
  4%|▍         | 4/100 [00:00<00:05, 19.18it/s][A
  6%|▌         | 6/100 [00:00<00:04, 19.07it/s][A
  8%|▊         | 8/100 [00:00<00:04, 19.11it/s][A
 10%|█         | 10/100 [00:00<00:04, 19.11it/s][A
 12%|█▏        | 12/100 [00:00<00:04, 19.17it/s][A
 14%|█▍        | 14/100 [00:00<00:04, 19.09it/s][A
 16%|█▌        | 16/100 [00:00<00:04, 19.04it/s][A
 18%|█▊        | 18/100 [00:00<00:04, 19.11it/s][A
 20%|██        | 20/100 [00:01<00:04, 19.06it/s][A
 22%|██▏       | 22/100 [00:01<00:04, 19.07it/s][A
 24%|██▍       | 24/100 [00:01<00:03, 19.13it/s][A
 26%|██▌       | 26/100 [00:01<00:03, 19.13it/s][A
 28%|██▊       | 28/100 [00:01<00:03, 19.11it/s][A
 30%|███       | 30/100 [00:01<00:03, 19.11it/s][A
 32%|███▏      | 32/100 [00:01<00:03, 19.17it/s][A
 34%|███▍      | 34/100 [00:01<00:03, 19.10it/s][A
 36%|███▌      | 36/100 [00:01<00:03

Results after 100 episodes:
Average timesteps per episode: 1000.0
Average penalty per episode: 1000.0
Average reward per episode: -10000.0


  1%|          | 49/5000 [01:18<1:59:21,  1.45s/it]
  0%|          | 0/100 [00:00<?, ?it/s][A
  2%|▏         | 2/100 [00:00<00:05, 19.48it/s][A
  4%|▍         | 4/100 [00:00<00:04, 19.26it/s][A
  6%|▌         | 6/100 [00:00<00:04, 19.19it/s][A
  8%|▊         | 8/100 [00:00<00:04, 19.38it/s][A
 10%|█         | 10/100 [00:00<00:04, 19.35it/s][A
 12%|█▏        | 12/100 [00:00<00:04, 19.38it/s][A
 14%|█▍        | 14/100 [00:00<00:04, 19.36it/s][A
 16%|█▌        | 16/100 [00:00<00:04, 19.34it/s][A
 18%|█▊        | 18/100 [00:00<00:04, 19.12it/s][A
 20%|██        | 20/100 [00:01<00:04, 19.11it/s][A
 22%|██▏       | 22/100 [00:01<00:04, 19.17it/s][A
 24%|██▍       | 24/100 [00:01<00:03, 19.26it/s][A
 26%|██▌       | 26/100 [00:01<00:03, 19.33it/s][A
 28%|██▊       | 28/100 [00:01<00:03, 19.38it/s][A
 30%|███       | 30/100 [00:01<00:03, 19.35it/s][A
 32%|███▏      | 32/100 [00:01<00:03, 19.39it/s][A
 34%|███▍      | 34/100 [00:01<00:03, 19.44it/s][A
 36%|███▌      | 36/100 [

Results after 100 episodes:
Average timesteps per episode: 1000.0
Average penalty per episode: 0.0
Average reward per episode: -1000.0


  2%|▏         | 98/5000 [02:20<1:38:00,  1.20s/it]
  0%|          | 0/100 [00:00<?, ?it/s][A
  2%|▏         | 2/100 [00:00<00:05, 18.93it/s][A
  4%|▍         | 4/100 [00:00<00:05, 18.83it/s][A
  6%|▌         | 6/100 [00:00<00:04, 18.87it/s][A
  8%|▊         | 8/100 [00:00<00:04, 18.83it/s][A
 10%|█         | 10/100 [00:00<00:04, 18.67it/s][A
 12%|█▏        | 12/100 [00:00<00:04, 18.70it/s][A
 14%|█▍        | 14/100 [00:00<00:04, 18.72it/s][A
 16%|█▌        | 16/100 [00:00<00:04, 18.73it/s][A
 18%|█▊        | 18/100 [00:00<00:04, 18.86it/s][A
 20%|██        | 20/100 [00:01<00:04, 18.88it/s][A
 22%|██▏       | 22/100 [00:01<00:04, 18.90it/s][A
 24%|██▍       | 24/100 [00:01<00:03, 19.01it/s][A
 26%|██▌       | 26/100 [00:01<00:03, 19.04it/s][A
 28%|██▊       | 28/100 [00:01<00:03, 19.17it/s][A
 30%|███       | 30/100 [00:01<00:03, 19.21it/s][A
 32%|███▏      | 32/100 [00:01<00:03, 19.29it/s][A
 34%|███▍      | 34/100 [00:01<00:03, 19.29it/s][A
 36%|███▌      | 36/100 [

Results after 100 episodes:
Average timesteps per episode: 1000.0
Average penalty per episode: 0.0
Average reward per episode: -1000.0


  3%|▎         | 148/5000 [03:27<2:14:48,  1.67s/it]
  0%|          | 0/100 [00:00<?, ?it/s][A
  2%|▏         | 2/100 [00:00<00:05, 19.48it/s][A
  4%|▍         | 4/100 [00:00<00:04, 19.26it/s][A
  6%|▌         | 6/100 [00:00<00:04, 19.36it/s][A
  8%|▊         | 8/100 [00:00<00:04, 19.34it/s][A
 10%|█         | 10/100 [00:00<00:04, 19.32it/s][A
 12%|█▏        | 12/100 [00:00<00:04, 19.38it/s][A
 14%|█▍        | 14/100 [00:00<00:04, 19.41it/s][A
 16%|█▌        | 16/100 [00:00<00:04, 19.31it/s][A
 18%|█▊        | 18/100 [00:00<00:04, 19.19it/s][A
 20%|██        | 20/100 [00:01<00:04, 19.19it/s][A
 22%|██▏       | 22/100 [00:01<00:04, 19.22it/s][A
 24%|██▍       | 24/100 [00:01<00:03, 19.24it/s][A
 26%|██▌       | 26/100 [00:01<00:03, 19.20it/s][A
 28%|██▊       | 28/100 [00:01<00:03, 19.33it/s][A
 30%|███       | 30/100 [00:01<00:03, 19.32it/s][A
 32%|███▏      | 32/100 [00:01<00:03, 19.33it/s][A
 34%|███▍      | 34/100 [00:01<00:03, 19.43it/s][A
 36%|███▌      | 36/100 

Results after 100 episodes:
Average timesteps per episode: 1000.0
Average penalty per episode: 0.0
Average reward per episode: -1000.0


  4%|▍         | 199/5000 [04:45<2:24:26,  1.81s/it]
  0%|          | 0/100 [00:00<?, ?it/s][A
  2%|▏         | 2/100 [00:00<00:05, 19.11it/s][A
  4%|▍         | 4/100 [00:00<00:04, 19.22it/s][A
  6%|▌         | 6/100 [00:00<00:04, 19.34it/s][A
  8%|▊         | 8/100 [00:00<00:04, 19.18it/s][A
 10%|█         | 10/100 [00:00<00:04, 19.29it/s][A
 12%|█▏        | 12/100 [00:00<00:04, 19.41it/s][A
 14%|█▍        | 14/100 [00:00<00:04, 19.44it/s][A
 16%|█▌        | 16/100 [00:00<00:04, 19.16it/s][A
 18%|█▊        | 18/100 [00:00<00:04, 18.92it/s][A
 20%|██        | 20/100 [00:01<00:04, 18.70it/s][A
 22%|██▏       | 22/100 [00:01<00:04, 18.83it/s][A
 24%|██▍       | 24/100 [00:01<00:03, 19.02it/s][A
 26%|██▌       | 26/100 [00:01<00:03, 19.10it/s][A
 28%|██▊       | 28/100 [00:01<00:03, 19.22it/s][A
 30%|███       | 30/100 [00:01<00:03, 19.18it/s][A
 32%|███▏      | 32/100 [00:01<00:03, 19.16it/s][A
 34%|███▍      | 34/100 [00:01<00:03, 19.26it/s][A
 36%|███▌      | 36/100 

Results after 100 episodes:
Average timesteps per episode: 1000.0
Average penalty per episode: 0.0
Average reward per episode: -1000.0


  5%|▍         | 249/5000 [05:58<2:24:06,  1.82s/it]
  0%|          | 0/100 [00:00<?, ?it/s][A
  2%|▏         | 2/100 [00:00<00:05, 19.48it/s][A
  4%|▍         | 4/100 [00:00<00:04, 19.37it/s][A
  6%|▌         | 6/100 [00:00<00:04, 19.34it/s][A
  8%|▊         | 8/100 [00:00<00:04, 19.25it/s][A
 10%|█         | 10/100 [00:00<00:04, 19.32it/s][A
 12%|█▏        | 12/100 [00:00<00:04, 19.25it/s][A
 14%|█▍        | 14/100 [00:00<00:04, 19.18it/s][A
 17%|█▋        | 17/100 [00:00<00:03, 22.28it/s][A
 20%|██        | 20/100 [00:00<00:03, 21.02it/s][A
 23%|██▎       | 23/100 [00:01<00:03, 20.58it/s][A
 26%|██▌       | 26/100 [00:01<00:03, 20.23it/s][A
 29%|██▉       | 29/100 [00:01<00:03, 19.75it/s][A
 31%|███       | 31/100 [00:01<00:03, 19.64it/s][A
 33%|███▎      | 33/100 [00:01<00:03, 19.64it/s][A
 35%|███▌      | 35/100 [00:01<00:03, 19.50it/s][A
 37%|███▋      | 37/100 [00:01<00:03, 19.44it/s][A
 39%|███▉      | 39/100 [00:01<00:03, 19.45it/s][A
 41%|████      | 41/100 

Results after 100 episodes:
Average timesteps per episode: 990.06
Average penalty per episode: 0.0
Average reward per episode: -989.85


  6%|▌         | 299/5000 [07:01<1:24:31,  1.08s/it]
  0%|          | 0/100 [00:00<?, ?it/s][A
  2%|▏         | 2/100 [00:00<00:05, 19.48it/s][A
  4%|▍         | 4/100 [00:00<00:04, 19.26it/s][A
  6%|▌         | 6/100 [00:00<00:04, 19.01it/s][A
  8%|▊         | 8/100 [00:00<00:04, 19.12it/s][A
 10%|█         | 10/100 [00:00<00:04, 19.19it/s][A
 12%|█▏        | 12/100 [00:00<00:04, 19.22it/s][A
 14%|█▍        | 14/100 [00:00<00:04, 19.25it/s][A
 16%|█▌        | 16/100 [00:00<00:04, 18.75it/s][A
 19%|█▉        | 19/100 [00:00<00:03, 21.87it/s][A
 22%|██▏       | 22/100 [00:01<00:03, 20.85it/s][A
 25%|██▌       | 25/100 [00:01<00:03, 20.48it/s][A
 28%|██▊       | 28/100 [00:01<00:03, 22.77it/s][A
 31%|███       | 31/100 [00:01<00:03, 21.68it/s][A
 34%|███▍      | 34/100 [00:01<00:03, 20.92it/s][A
 37%|███▋      | 37/100 [00:01<00:03, 20.05it/s][A
 40%|████      | 40/100 [00:01<00:02, 20.01it/s][A
 43%|████▎     | 43/100 [00:02<00:02, 19.87it/s][A
 46%|████▌     | 46/100 

Results after 100 episodes:
Average timesteps per episode: 940.54
Average penalty per episode: 0.0
Average reward per episode: -939.28


  7%|▋         | 349/5000 [08:05<1:22:38,  1.07s/it]
  0%|          | 0/100 [00:00<?, ?it/s][A
  2%|▏         | 2/100 [00:00<00:05, 18.75it/s][A
  4%|▍         | 4/100 [00:00<00:05, 18.96it/s][A
  6%|▌         | 6/100 [00:00<00:04, 19.20it/s][A
  9%|▉         | 9/100 [00:00<00:03, 23.03it/s][A
 12%|█▏        | 12/100 [00:00<00:04, 21.58it/s][A
 15%|█▌        | 15/100 [00:00<00:03, 23.76it/s][A
 18%|█▊        | 18/100 [00:00<00:03, 25.41it/s][A
 21%|██        | 21/100 [00:00<00:02, 26.56it/s][A
 25%|██▌       | 25/100 [00:00<00:02, 30.02it/s][A
 29%|██▉       | 29/100 [00:01<00:02, 25.18it/s][A
 32%|███▏      | 32/100 [00:01<00:02, 23.03it/s][A
 35%|███▌      | 35/100 [00:01<00:02, 24.43it/s][A
 38%|███▊      | 38/100 [00:01<00:02, 22.79it/s][A
 41%|████      | 41/100 [00:01<00:02, 21.83it/s][A
 44%|████▍     | 44/100 [00:01<00:02, 21.14it/s][A
 47%|████▋     | 47/100 [00:02<00:02, 22.82it/s][A
 50%|█████     | 50/100 [00:02<00:02, 21.69it/s][A
 53%|█████▎    | 53/100 

Results after 100 episodes:
Average timesteps per episode: 851.54
Average penalty per episode: 0.0
Average reward per episode: -848.39


  8%|▊         | 399/5000 [09:08<1:08:18,  1.12it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  3%|▎         | 3/100 [00:00<00:03, 29.22it/s][A
  6%|▌         | 6/100 [00:00<00:04, 21.95it/s][A
  9%|▉         | 9/100 [00:00<00:04, 20.33it/s][A
 12%|█▏        | 12/100 [00:00<00:04, 19.96it/s][A
 15%|█▌        | 15/100 [00:00<00:04, 19.31it/s][A
 17%|█▋        | 17/100 [00:00<00:04, 19.26it/s][A
 19%|█▉        | 19/100 [00:00<00:04, 19.22it/s][A
 21%|██        | 21/100 [00:01<00:04, 19.19it/s][A
 23%|██▎       | 23/100 [00:01<00:04, 19.17it/s][A
 25%|██▌       | 25/100 [00:01<00:03, 18.99it/s][A
 27%|██▋       | 27/100 [00:01<00:03, 18.98it/s][A
 29%|██▉       | 29/100 [00:01<00:03, 19.07it/s][A
 31%|███       | 31/100 [00:01<00:03, 19.03it/s][A
 33%|███▎      | 33/100 [00:01<00:03, 19.05it/s][A
 35%|███▌      | 35/100 [00:01<00:03, 19.23it/s][A
 37%|███▋      | 37/100 [00:01<00:03, 19.25it/s][A
 39%|███▉      | 39/100 [00:01<00:03, 19.21it/s][A
 41%|████      | 41/100

Results after 100 episodes:
Average timesteps per episode: 970.2
Average penalty per episode: 0.0
Average reward per episode: -969.57


  9%|▉         | 448/5000 [09:52<44:18,  1.71it/s]  
  0%|          | 0/100 [00:00<?, ?it/s][A
  2%|▏         | 2/100 [00:00<00:05, 18.93it/s][A
  4%|▍         | 4/100 [00:00<00:05, 18.93it/s][A
  6%|▌         | 6/100 [00:00<00:04, 18.85it/s][A
  8%|▊         | 8/100 [00:00<00:04, 19.01it/s][A
 10%|█         | 10/100 [00:00<00:04, 18.98it/s][A
 13%|█▎        | 13/100 [00:00<00:03, 22.12it/s][A
 16%|█▌        | 16/100 [00:00<00:04, 20.83it/s][A
 19%|█▉        | 19/100 [00:00<00:04, 20.12it/s][A
 22%|██▏       | 22/100 [00:01<00:03, 19.79it/s][A
 25%|██▌       | 25/100 [00:01<00:03, 19.58it/s][A
 27%|██▋       | 27/100 [00:01<00:03, 19.47it/s][A
 29%|██▉       | 29/100 [00:01<00:03, 19.38it/s][A
 31%|███       | 31/100 [00:01<00:03, 19.26it/s][A
 35%|███▌      | 35/100 [00:01<00:02, 24.43it/s][A
 38%|███▊      | 38/100 [00:01<00:02, 22.17it/s][A
 41%|████      | 41/100 [00:02<00:02, 21.00it/s][A
 44%|████▍     | 44/100 [00:02<00:02, 20.32it/s][A
 47%|████▋     | 47/100 

Results after 100 episodes:
Average timesteps per episode: 920.73
Average penalty per episode: 0.0
Average reward per episode: -919.05


 10%|▉         | 499/5000 [10:32<29:14,  2.57it/s]  
  0%|          | 0/100 [00:00<?, ?it/s][A
  2%|▏         | 2/100 [00:00<00:05, 19.11it/s][A
  4%|▍         | 4/100 [00:00<00:05, 19.00it/s][A
  8%|▊         | 8/100 [00:00<00:03, 27.39it/s][A
 11%|█         | 11/100 [00:00<00:03, 23.37it/s][A
 14%|█▍        | 14/100 [00:00<00:03, 24.83it/s][A
 17%|█▋        | 17/100 [00:00<00:03, 25.72it/s][A
 20%|██        | 20/100 [00:00<00:03, 23.07it/s][A
 23%|██▎       | 23/100 [00:00<00:03, 24.47it/s][A
 26%|██▌       | 26/100 [00:01<00:02, 25.51it/s][A
 29%|██▉       | 29/100 [00:01<00:02, 26.13it/s][A
 32%|███▏      | 32/100 [00:01<00:02, 23.46it/s][A
 35%|███▌      | 35/100 [00:01<00:02, 21.91it/s][A
 38%|███▊      | 38/100 [00:01<00:02, 23.49it/s][A
 41%|████      | 41/100 [00:01<00:02, 21.90it/s][A
 44%|████▍     | 44/100 [00:01<00:02, 21.08it/s][A
 47%|████▋     | 47/100 [00:02<00:02, 20.51it/s][A
 50%|█████     | 50/100 [00:02<00:02, 20.17it/s][A
 53%|█████▎    | 53/100

Results after 100 episodes:
Average timesteps per episode: 881.14
Average penalty per episode: 0.0
Average reward per episode: -878.62


 11%|█         | 549/5000 [11:23<32:44,  2.27it/s]  
  0%|          | 0/100 [00:00<?, ?it/s][A
  3%|▎         | 3/100 [00:00<00:03, 28.13it/s][A
  6%|▌         | 6/100 [00:00<00:04, 21.78it/s][A
  9%|▉         | 9/100 [00:00<00:04, 20.64it/s][A
 12%|█▏        | 12/100 [00:00<00:04, 20.03it/s][A
 15%|█▌        | 15/100 [00:00<00:04, 19.76it/s][A
 18%|█▊        | 18/100 [00:00<00:03, 22.09it/s][A
 21%|██        | 21/100 [00:00<00:03, 23.82it/s][A
 24%|██▍       | 24/100 [00:01<00:03, 22.01it/s][A
 27%|██▋       | 27/100 [00:01<00:03, 21.22it/s][A
 30%|███       | 30/100 [00:01<00:03, 23.07it/s][A
 33%|███▎      | 33/100 [00:01<00:03, 21.58it/s][A
 36%|███▌      | 36/100 [00:01<00:03, 20.70it/s][A
 39%|███▉      | 39/100 [00:01<00:03, 20.25it/s][A
 42%|████▏     | 42/100 [00:01<00:02, 19.99it/s][A
 45%|████▌     | 45/100 [00:02<00:02, 22.00it/s][A
 48%|████▊     | 48/100 [00:02<00:02, 23.54it/s][A
 51%|█████     | 51/100 [00:02<00:02, 22.18it/s][A
 54%|█████▍    | 54/100

Results after 100 episodes:
Average timesteps per episode: 881.19
Average penalty per episode: 0.0
Average reward per episode: -878.67


 12%|█▏        | 599/5000 [11:58<49:28,  1.48it/s]  
  0%|          | 0/100 [00:00<?, ?it/s][A
  2%|▏         | 2/100 [00:00<00:05, 18.58it/s][A
  6%|▌         | 6/100 [00:00<00:03, 29.67it/s][A
  9%|▉         | 9/100 [00:00<00:03, 29.09it/s][A
 12%|█▏        | 12/100 [00:00<00:03, 28.71it/s][A
 15%|█▌        | 15/100 [00:00<00:03, 23.57it/s][A
 19%|█▉        | 19/100 [00:00<00:02, 27.66it/s][A
 22%|██▏       | 22/100 [00:00<00:02, 27.72it/s][A
 25%|██▌       | 25/100 [00:00<00:03, 24.38it/s][A
 28%|██▊       | 28/100 [00:01<00:03, 22.48it/s][A
 31%|███       | 31/100 [00:01<00:02, 23.87it/s][A
 34%|███▍      | 34/100 [00:01<00:02, 24.82it/s][A
 37%|███▋      | 37/100 [00:01<00:02, 25.80it/s][A
 40%|████      | 40/100 [00:01<00:02, 23.21it/s][A
 43%|████▎     | 43/100 [00:01<00:02, 21.64it/s][A
 46%|████▌     | 46/100 [00:01<00:02, 23.04it/s][A
 49%|████▉     | 49/100 [00:01<00:02, 24.19it/s][A
 52%|█████▏    | 52/100 [00:02<00:01, 25.25it/s][A
 55%|█████▌    | 55/100

Results after 100 episodes:
Average timesteps per episode: 762.63
Average penalty per episode: 0.0
Average reward per episode: -757.59


 13%|█▎        | 648/5000 [12:36<26:47,  2.71it/s]  
  0%|          | 0/100 [00:00<?, ?it/s][A
  2%|▏         | 2/100 [00:00<00:05, 19.30it/s][A
  4%|▍         | 4/100 [00:00<00:05, 18.76it/s][A
  8%|▊         | 8/100 [00:00<00:03, 27.44it/s][A
 11%|█         | 11/100 [00:00<00:03, 23.24it/s][A
 14%|█▍        | 14/100 [00:00<00:03, 21.52it/s][A
 17%|█▋        | 17/100 [00:00<00:03, 23.40it/s][A
 21%|██        | 21/100 [00:00<00:02, 27.33it/s][A
 24%|██▍       | 24/100 [00:00<00:02, 27.49it/s][A
 27%|██▋       | 27/100 [00:01<00:03, 24.11it/s][A
 30%|███       | 30/100 [00:01<00:02, 25.25it/s][A
 33%|███▎      | 33/100 [00:01<00:02, 22.96it/s][A
 37%|███▋      | 37/100 [00:01<00:02, 26.72it/s][A
 43%|████▎     | 43/100 [00:01<00:01, 34.43it/s][A
 47%|████▋     | 47/100 [00:01<00:01, 35.10it/s][A
 51%|█████     | 51/100 [00:01<00:01, 28.14it/s][A
 55%|█████▌    | 55/100 [00:02<00:01, 30.32it/s][A
 59%|█████▉    | 59/100 [00:02<00:01, 28.55it/s][A
 63%|██████▎   | 63/100

Results after 100 episodes:
Average timesteps per episode: 693.11
Average penalty per episode: 0.0
Average reward per episode: -686.6


 14%|█▍        | 698/5000 [13:04<46:34,  1.54it/s]  
  0%|          | 0/100 [00:00<?, ?it/s][A
  2%|▏         | 2/100 [00:00<00:05, 19.11it/s][A
  4%|▍         | 4/100 [00:00<00:05, 19.11it/s][A
  8%|▊         | 8/100 [00:00<00:03, 27.84it/s][A
 11%|█         | 11/100 [00:00<00:03, 28.27it/s][A
 14%|█▍        | 14/100 [00:00<00:03, 28.42it/s][A
 18%|█▊        | 18/100 [00:00<00:02, 31.55it/s][A
 22%|██▏       | 22/100 [00:00<00:02, 29.19it/s][A
 27%|██▋       | 27/100 [00:00<00:02, 34.42it/s][A
 31%|███       | 31/100 [00:01<00:02, 27.65it/s][A
 35%|███▌      | 35/100 [00:01<00:02, 30.23it/s][A
 39%|███▉      | 39/100 [00:01<00:01, 32.14it/s][A
 43%|████▎     | 43/100 [00:01<00:02, 26.73it/s][A
 46%|████▌     | 46/100 [00:01<00:01, 27.12it/s][A
 49%|████▉     | 49/100 [00:01<00:02, 24.53it/s][A
 52%|█████▏    | 52/100 [00:01<00:01, 25.47it/s][A
 55%|█████▌    | 55/100 [00:02<00:01, 22.99it/s][A
 58%|█████▊    | 58/100 [00:02<00:01, 24.31it/s][A
 61%|██████    | 61/100

Results after 100 episodes:
Average timesteps per episode: 772.58
Average penalty per episode: 0.0
Average reward per episode: -767.75


 15%|█▍        | 749/5000 [13:38<28:25,  2.49it/s]  
  0%|          | 0/100 [00:00<?, ?it/s][A
  2%|▏         | 2/100 [00:00<00:06, 15.92it/s][A
  5%|▌         | 5/100 [00:00<00:04, 22.62it/s][A
  8%|▊         | 8/100 [00:00<00:03, 24.92it/s][A
 13%|█▎        | 13/100 [00:00<00:02, 33.47it/s][A
 17%|█▋        | 17/100 [00:00<00:03, 26.40it/s][A
 20%|██        | 20/100 [00:00<00:03, 23.96it/s][A
 23%|██▎       | 23/100 [00:00<00:03, 22.33it/s][A
 26%|██▌       | 26/100 [00:01<00:03, 23.89it/s][A
 29%|██▉       | 29/100 [00:01<00:02, 25.13it/s][A
 32%|███▏      | 32/100 [00:01<00:02, 26.14it/s][A
 35%|███▌      | 35/100 [00:01<00:02, 26.83it/s][A
 40%|████      | 40/100 [00:01<00:01, 32.62it/s][A
 44%|████▍     | 44/100 [00:01<00:01, 34.24it/s][A
 48%|████▊     | 48/100 [00:01<00:01, 27.52it/s][A
 52%|█████▏    | 52/100 [00:02<00:01, 24.39it/s][A
 55%|█████▌    | 55/100 [00:02<00:01, 22.89it/s][A
 58%|█████▊    | 58/100 [00:02<00:01, 21.74it/s][A
 61%|██████    | 61/100

Results after 100 episodes:
Average timesteps per episode: 722.92
Average penalty per episode: 0.0
Average reward per episode: -717.04


 16%|█▌        | 798/5000 [14:07<30:48,  2.27it/s]  
  0%|          | 0/100 [00:00<?, ?it/s][A
  3%|▎         | 3/100 [00:00<00:03, 28.94it/s][A
  7%|▋         | 7/100 [00:00<00:02, 34.23it/s][A
 11%|█         | 11/100 [00:00<00:03, 29.30it/s][A
 15%|█▌        | 15/100 [00:00<00:02, 32.08it/s][A
 19%|█▉        | 19/100 [00:00<00:02, 33.67it/s][A
 25%|██▌       | 25/100 [00:00<00:01, 41.17it/s][A
 31%|███       | 31/100 [00:00<00:01, 45.80it/s][A
 38%|███▊      | 38/100 [00:00<00:01, 51.93it/s][A
 44%|████▍     | 44/100 [00:01<00:01, 52.96it/s][A
 50%|█████     | 50/100 [00:01<00:01, 47.14it/s][A
 55%|█████▌    | 55/100 [00:01<00:01, 42.10it/s][A
 61%|██████    | 61/100 [00:01<00:00, 45.78it/s][A
 66%|██████▌   | 66/100 [00:01<00:00, 41.03it/s][A
 71%|███████   | 71/100 [00:01<00:00, 34.36it/s][A
 75%|███████▌  | 75/100 [00:01<00:00, 35.15it/s][A
 79%|███████▉  | 79/100 [00:01<00:00, 35.82it/s][A
 83%|████████▎ | 83/100 [00:02<00:00, 29.44it/s][A
 87%|████████▋ | 87/10

Results after 100 episodes:
Average timesteps per episode: 505.5
Average penalty per episode: 0.0
Average reward per episode: -495.0


 17%|█▋        | 849/5000 [14:29<27:31,  2.51it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  4%|▍         | 4/100 [00:00<00:02, 37.86it/s][A
  8%|▊         | 8/100 [00:00<00:02, 38.07it/s][A
 12%|█▏        | 12/100 [00:00<00:03, 26.76it/s][A
 15%|█▌        | 15/100 [00:00<00:03, 27.28it/s][A
 18%|█▊        | 18/100 [00:00<00:02, 27.71it/s][A
 21%|██        | 21/100 [00:00<00:02, 28.00it/s][A
 25%|██▌       | 25/100 [00:00<00:02, 31.02it/s][A
 29%|██▉       | 29/100 [00:00<00:02, 33.20it/s][A
 33%|███▎      | 33/100 [00:01<00:02, 30.27it/s][A
 37%|███▋      | 37/100 [00:01<00:02, 28.65it/s][A
 40%|████      | 40/100 [00:01<00:02, 25.48it/s][A
 43%|████▎     | 43/100 [00:01<00:02, 23.57it/s][A
 46%|████▌     | 46/100 [00:01<00:02, 24.78it/s][A
 49%|████▉     | 49/100 [00:01<00:01, 25.65it/s][A
 53%|█████▎    | 53/100 [00:01<00:01, 28.92it/s][A
 57%|█████▋    | 57/100 [00:01<00:01, 31.39it/s][A
 61%|██████    | 61/100 [00:02<00:01, 32.87it/s][A
 65%|██████▌   | 65/100 

Results after 100 episodes:
Average timesteps per episode: 703.15
Average penalty per episode: 0.0
Average reward per episode: -696.85


 18%|█▊        | 898/5000 [14:47<11:38,  5.88it/s]  
  0%|          | 0/100 [00:00<?, ?it/s][A
  4%|▍         | 4/100 [00:00<00:02, 38.59it/s][A
  8%|▊         | 8/100 [00:00<00:03, 30.16it/s][A
 13%|█▎        | 13/100 [00:00<00:02, 37.16it/s][A
 17%|█▋        | 17/100 [00:00<00:02, 37.59it/s][A
 21%|██        | 21/100 [00:00<00:02, 32.61it/s][A
 27%|██▋       | 27/100 [00:00<00:01, 39.60it/s][A
 32%|███▏      | 32/100 [00:00<00:01, 36.65it/s][A
 36%|███▌      | 36/100 [00:01<00:01, 32.79it/s][A
 42%|████▏     | 42/100 [00:01<00:01, 38.83it/s][A
 47%|████▋     | 47/100 [00:01<00:01, 36.50it/s][A
 54%|█████▍    | 54/100 [00:01<00:01, 43.81it/s][A
 59%|█████▉    | 59/100 [00:01<00:01, 39.60it/s][A
 64%|██████▍   | 64/100 [00:01<00:00, 37.22it/s][A
 68%|██████▊   | 68/100 [00:01<00:00, 33.60it/s][A
 77%|███████▋  | 77/100 [00:01<00:00, 45.77it/s][A
 82%|████████▏ | 82/100 [00:02<00:00, 37.34it/s][A
 87%|████████▋ | 87/100 [00:02<00:00, 35.80it/s][A
 94%|█████████▍| 94/10

Results after 100 episodes:
Average timesteps per episode: 525.77
Average penalty per episode: 0.0
Average reward per episode: -515.69


 19%|█▉        | 949/5000 [15:02<15:07,  4.46it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  2%|▏         | 2/100 [00:00<00:05, 19.30it/s][A
  4%|▍         | 4/100 [00:00<00:05, 19.08it/s][A
  6%|▌         | 6/100 [00:00<00:04, 19.17it/s][A
  8%|▊         | 8/100 [00:00<00:04, 19.15it/s][A
 11%|█         | 11/100 [00:00<00:03, 22.59it/s][A
 14%|█▍        | 14/100 [00:00<00:04, 21.28it/s][A
 17%|█▋        | 17/100 [00:00<00:04, 20.67it/s][A
 20%|██        | 20/100 [00:00<00:03, 20.41it/s][A
 23%|██▎       | 23/100 [00:01<00:03, 22.48it/s][A
 26%|██▌       | 26/100 [00:01<00:03, 21.50it/s][A
 29%|██▉       | 29/100 [00:01<00:03, 20.94it/s][A
 32%|███▏      | 32/100 [00:01<00:03, 20.40it/s][A
 35%|███▌      | 35/100 [00:01<00:03, 20.13it/s][A
 38%|███▊      | 38/100 [00:01<00:03, 19.95it/s][A
 41%|████      | 41/100 [00:02<00:02, 19.80it/s][A
 43%|████▎     | 43/100 [00:02<00:02, 19.68it/s][A
 45%|████▌     | 45/100 [00:02<00:02, 19.63it/s][A
 47%|████▋     | 47/100 [0

Results after 100 episodes:
Average timesteps per episode: 970.22
Average penalty per episode: 0.0
Average reward per episode: -969.59


 20%|█▉        | 996/5000 [15:18<30:25,  2.19it/s]  
  0%|          | 0/100 [00:00<?, ?it/s][A
  3%|▎         | 3/100 [00:00<00:03, 28.57it/s][A
  6%|▌         | 6/100 [00:00<00:03, 28.42it/s][A
  9%|▉         | 9/100 [00:00<00:03, 23.26it/s][A
 12%|█▏        | 12/100 [00:00<00:03, 25.10it/s][A
 15%|█▌        | 15/100 [00:00<00:03, 26.25it/s][A
 19%|█▉        | 19/100 [00:00<00:02, 30.00it/s][A
 23%|██▎       | 23/100 [00:00<00:03, 25.09it/s][A
 26%|██▌       | 26/100 [00:00<00:02, 25.94it/s][A
 29%|██▉       | 29/100 [00:01<00:03, 23.50it/s][A
 32%|███▏      | 32/100 [00:01<00:02, 24.80it/s][A
 35%|███▌      | 35/100 [00:01<00:02, 25.76it/s][A
 38%|███▊      | 38/100 [00:01<00:02, 26.49it/s][A
 41%|████      | 41/100 [00:01<00:02, 23.74it/s][A
 45%|████▌     | 45/100 [00:01<00:02, 27.40it/s][A
 48%|████▊     | 48/100 [00:01<00:01, 27.60it/s][A
 51%|█████     | 51/100 [00:01<00:01, 27.82it/s][A
 54%|█████▍    | 54/100 [00:02<00:01, 27.99it/s][A
 57%|█████▋    | 57/100

Results after 100 episodes:
Average timesteps per episode: 634.52
Average penalty per episode: 0.0
Average reward per episode: -626.75


 21%|██        | 1048/5000 [15:36<26:08,  2.52it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  3%|▎         | 3/100 [00:00<00:03, 28.94it/s][A
  6%|▌         | 6/100 [00:00<00:04, 22.36it/s][A
  9%|▉         | 9/100 [00:00<00:04, 20.92it/s][A
 12%|█▏        | 12/100 [00:00<00:03, 23.49it/s][A
 15%|█▌        | 15/100 [00:00<00:03, 25.12it/s][A
 18%|█▊        | 18/100 [00:00<00:03, 26.23it/s][A
 21%|██        | 21/100 [00:00<00:03, 23.59it/s][A
 24%|██▍       | 24/100 [00:00<00:03, 24.67it/s][A
 27%|██▋       | 27/100 [00:01<00:02, 25.66it/s][A
 30%|███       | 30/100 [00:01<00:02, 23.34it/s][A
 33%|███▎      | 33/100 [00:01<00:02, 24.69it/s][A
 36%|███▌      | 36/100 [00:01<00:02, 25.71it/s][A
 40%|████      | 40/100 [00:01<00:02, 29.22it/s][A
 43%|████▎     | 43/100 [00:01<00:02, 25.64it/s][A
 46%|████▌     | 46/100 [00:01<00:02, 26.49it/s][A
 49%|████▉     | 49/100 [00:01<00:02, 24.01it/s][A
 52%|█████▏    | 52/100 [00:02<00:02, 22.30it/s][A
 55%|█████▌    | 55/100 

Results after 100 episodes:
Average timesteps per episode: 782.28
Average penalty per episode: 0.0
Average reward per episode: -777.66


 22%|██▏       | 1097/5000 [15:46<08:59,  7.23it/s]  
  0%|          | 0/100 [00:00<?, ?it/s][A
  3%|▎         | 3/100 [00:00<00:03, 28.94it/s][A
  6%|▌         | 6/100 [00:00<00:03, 28.94it/s][A
  9%|▉         | 9/100 [00:00<00:03, 28.94it/s][A
 12%|█▏        | 12/100 [00:00<00:03, 28.72it/s][A
 15%|█▌        | 15/100 [00:00<00:03, 24.70it/s][A
 18%|█▊        | 18/100 [00:00<00:03, 22.61it/s][A
 21%|██        | 21/100 [00:00<00:03, 24.16it/s][A
 24%|██▍       | 24/100 [00:00<00:02, 25.49it/s][A
 27%|██▋       | 27/100 [00:01<00:02, 26.48it/s][A
 30%|███       | 30/100 [00:01<00:02, 27.04it/s][A
 33%|███▎      | 33/100 [00:01<00:02, 24.20it/s][A
 36%|███▌      | 36/100 [00:01<00:02, 25.47it/s][A
 39%|███▉      | 39/100 [00:01<00:02, 26.43it/s][A
 43%|████▎     | 43/100 [00:01<00:01, 29.71it/s][A
 47%|████▋     | 47/100 [00:01<00:02, 25.19it/s][A
 51%|█████     | 51/100 [00:01<00:01, 28.44it/s][A
 56%|█████▌    | 56/100 [00:02<00:01, 33.37it/s][A
 60%|██████    | 60/10

Results after 100 episodes:
Average timesteps per episode: 742.69
Average penalty per episode: 0.0
Average reward per episode: -737.23


 23%|██▎       | 1148/5000 [15:58<05:55, 10.85it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  2%|▏         | 2/100 [00:00<00:05, 19.30it/s][A
  4%|▍         | 4/100 [00:00<00:04, 19.51it/s][A
  6%|▌         | 6/100 [00:00<00:04, 19.42it/s][A
  9%|▉         | 9/100 [00:00<00:03, 23.00it/s][A
 12%|█▏        | 12/100 [00:00<00:04, 21.50it/s][A
 15%|█▌        | 15/100 [00:00<00:03, 23.79it/s][A
 18%|█▊        | 18/100 [00:00<00:03, 22.07it/s][A
 21%|██        | 21/100 [00:00<00:03, 21.21it/s][A
 24%|██▍       | 24/100 [00:01<00:03, 23.14it/s][A
 27%|██▋       | 27/100 [00:01<00:02, 24.70it/s][A
 31%|███       | 31/100 [00:01<00:02, 28.55it/s][A
 34%|███▍      | 34/100 [00:01<00:02, 25.28it/s][A
 43%|████▎     | 43/100 [00:01<00:01, 40.97it/s][A
 48%|████▊     | 48/100 [00:01<00:01, 42.72it/s][A
 53%|█████▎    | 53/100 [00:01<00:01, 44.13it/s][A
 58%|█████▊    | 58/100 [00:01<00:01, 35.44it/s][A
 62%|██████▏   | 62/100 [00:02<00:01, 32.51it/s][A
 66%|██████▌   | 66/100 [

Results after 100 episodes:
Average timesteps per episode: 634.48
Average penalty per episode: 0.0
Average reward per episode: -626.71


 24%|██▍       | 1198/5000 [16:12<07:55,  7.99it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  5%|▌         | 5/100 [00:00<00:01, 47.78it/s][A
 10%|█         | 10/100 [00:00<00:02, 37.49it/s][A
 14%|█▍        | 14/100 [00:00<00:02, 37.50it/s][A
 19%|█▉        | 19/100 [00:00<00:01, 40.90it/s][A
 24%|██▍       | 24/100 [00:00<00:02, 36.72it/s][A
 28%|██▊       | 28/100 [00:00<00:02, 32.48it/s][A
 32%|███▏      | 32/100 [00:00<00:02, 26.96it/s][A
 35%|███▌      | 35/100 [00:01<00:02, 24.35it/s][A
 38%|███▊      | 38/100 [00:01<00:02, 22.54it/s][A
 41%|████      | 41/100 [00:01<00:02, 23.87it/s][A
 47%|████▋     | 47/100 [00:01<00:01, 31.82it/s][A
 51%|█████     | 51/100 [00:01<00:01, 33.46it/s][A
 55%|█████▌    | 55/100 [00:01<00:01, 34.64it/s][A
 59%|█████▉    | 59/100 [00:01<00:01, 31.41it/s][A
 63%|██████▎   | 63/100 [00:02<00:01, 29.67it/s][A
 67%|██████▋   | 67/100 [00:02<00:01, 25.76it/s][A
 70%|███████   | 70/100 [00:02<00:01, 26.41it/s][A
 73%|███████▎  | 73/10

Results after 100 episodes:
Average timesteps per episode: 604.43
Average penalty per episode: 0.0
Average reward per episode: -596.03


 25%|██▍       | 1245/5000 [16:20<06:55,  9.03it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  3%|▎         | 3/100 [00:00<00:03, 28.94it/s][A
  6%|▌         | 6/100 [00:00<00:03, 28.62it/s][A
  9%|▉         | 9/100 [00:00<00:03, 28.77it/s][A
 14%|█▍        | 14/100 [00:00<00:02, 36.13it/s][A
 23%|██▎       | 23/100 [00:00<00:01, 53.17it/s][A
 31%|███       | 31/100 [00:00<00:01, 60.03it/s][A
 37%|███▋      | 37/100 [00:00<00:01, 58.98it/s][A
 43%|████▎     | 43/100 [00:00<00:00, 58.12it/s][A
 49%|████▉     | 49/100 [00:01<00:01, 50.22it/s][A
 60%|██████    | 60/100 [00:01<00:00, 64.37it/s][A
 67%|██████▋   | 67/100 [00:01<00:00, 56.49it/s][A
 73%|███████▎  | 73/100 [00:01<00:00, 50.39it/s][A
 79%|███████▉  | 79/100 [00:01<00:00, 42.04it/s][A
 84%|████████▍ | 84/100 [00:01<00:00, 35.91it/s][A
 88%|████████▊ | 88/100 [00:01<00:00, 36.38it/s][A
 92%|█████████▏| 92/100 [00:02<00:00, 36.80it/s][A
100%|██████████| 100/100 [00:02<00:00, 46.28it/s][A
 25%|██▌       | 1253/50

Results after 100 episodes:
Average timesteps per episode: 417.77
Average penalty per episode: 0.0
Average reward per episode: -405.38


 26%|██▌       | 1298/5000 [16:28<13:37,  4.53it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  2%|▏         | 2/100 [00:00<00:05, 19.11it/s][A
  6%|▌         | 6/100 [00:00<00:03, 30.02it/s][A
 10%|█         | 10/100 [00:00<00:02, 33.31it/s][A
 14%|█▍        | 14/100 [00:00<00:02, 35.10it/s][A
 18%|█▊        | 18/100 [00:00<00:02, 30.68it/s][A
 23%|██▎       | 23/100 [00:00<00:02, 35.43it/s][A
 27%|██▋       | 27/100 [00:00<00:02, 31.28it/s][A
 31%|███       | 31/100 [00:01<00:02, 29.59it/s][A
 35%|███▌      | 35/100 [00:01<00:02, 28.41it/s][A
 38%|███▊      | 38/100 [00:01<00:02, 28.54it/s][A
 41%|████      | 41/100 [00:01<00:02, 25.52it/s][A
 44%|████▍     | 44/100 [00:01<00:02, 23.51it/s][A
 49%|████▉     | 49/100 [00:01<00:01, 29.31it/s][A
 53%|█████▎    | 53/100 [00:01<00:01, 28.28it/s][A
 56%|█████▌    | 56/100 [00:01<00:01, 28.44it/s][A
 60%|██████    | 60/100 [00:02<00:01, 31.07it/s][A
 64%|██████▍   | 64/100 [00:02<00:01, 29.30it/s][A
 68%|██████▊   | 68/100

Results after 100 episodes:
Average timesteps per episode: 614.25
Average penalty per episode: 0.0
Average reward per episode: -606.06


 27%|██▋       | 1349/5000 [16:41<11:17,  5.39it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  8%|▊         | 8/100 [00:00<00:01, 74.32it/s][A
 16%|█▌        | 16/100 [00:00<00:01, 58.92it/s][A
 23%|██▎       | 23/100 [00:00<00:01, 61.14it/s][A
 36%|███▌      | 36/100 [00:00<00:00, 81.67it/s][A
 45%|████▌     | 45/100 [00:00<00:00, 82.33it/s][A
 54%|█████▍    | 54/100 [00:00<00:00, 72.28it/s][A
 62%|██████▏   | 62/100 [00:00<00:00, 72.65it/s][A
 70%|███████   | 70/100 [00:00<00:00, 72.94it/s][A
 78%|███████▊  | 78/100 [00:01<00:00, 52.64it/s][A
 85%|████████▌ | 85/100 [00:01<00:00, 50.33it/s][A
 91%|█████████ | 91/100 [00:01<00:00, 46.88it/s][A
100%|██████████| 100/100 [00:01<00:00, 59.27it/s][A
 27%|██▋       | 1355/5000 [16:43<13:41,  4.44it/s]

Results after 100 episodes:
Average timesteps per episode: 328.92
Average penalty per episode: 0.0
Average reward per episode: -314.64


 28%|██▊       | 1396/5000 [16:49<15:37,  3.84it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  3%|▎         | 3/100 [00:00<00:03, 28.94it/s][A
  6%|▌         | 6/100 [00:00<00:03, 29.11it/s][A
 24%|██▍       | 24/100 [00:00<00:00, 90.91it/s][A
 36%|███▌      | 36/100 [00:00<00:00, 97.58it/s][A
 46%|████▌     | 46/100 [00:00<00:00, 72.48it/s][A
 54%|█████▍    | 54/100 [00:00<00:00, 64.46it/s][A
 65%|██████▌   | 65/100 [00:00<00:00, 73.72it/s][A
 73%|███████▎  | 73/100 [00:01<00:00, 73.81it/s][A
 81%|████████  | 81/100 [00:01<00:00, 65.25it/s][A
 88%|████████▊ | 88/100 [00:01<00:00, 58.69it/s][A
100%|██████████| 100/100 [00:01<00:00, 64.86it/s][A
 28%|██▊       | 1403/5000 [16:51<15:53,  3.77it/s]

Results after 100 episodes:
Average timesteps per episode: 299.35
Average penalty per episode: 0.0
Average reward per episode: -284.44


 29%|██▉       | 1447/5000 [16:55<04:03, 14.60it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  3%|▎         | 3/100 [00:00<00:03, 28.94it/s][A
  7%|▋         | 7/100 [00:00<00:02, 34.42it/s][A
 11%|█         | 11/100 [00:00<00:02, 36.16it/s][A
 15%|█▌        | 15/100 [00:00<00:02, 31.22it/s][A
 19%|█▉        | 19/100 [00:00<00:02, 33.59it/s][A
 24%|██▍       | 24/100 [00:00<00:02, 37.97it/s][A
 30%|███       | 30/100 [00:00<00:01, 43.72it/s][A
 35%|███▌      | 35/100 [00:01<00:02, 31.60it/s][A
 42%|████▏     | 42/100 [00:01<00:01, 39.90it/s][A
 50%|█████     | 50/100 [00:01<00:01, 48.85it/s][A
 56%|█████▌    | 56/100 [00:01<00:00, 45.29it/s][A
 62%|██████▏   | 62/100 [00:01<00:00, 43.07it/s][A
 68%|██████▊   | 68/100 [00:01<00:00, 46.31it/s][A
 73%|███████▎  | 73/100 [00:01<00:00, 41.61it/s][A
 78%|███████▊  | 78/100 [00:01<00:00, 38.86it/s][A
 83%|████████▎ | 83/100 [00:02<00:00, 41.00it/s][A
 88%|████████▊ | 88/100 [00:02<00:00, 37.94it/s][A
 92%|█████████▏| 92/100

Results after 100 episodes:
Average timesteps per episode: 496.24
Average penalty per episode: 0.0
Average reward per episode: -485.53


 30%|██▉       | 1498/5000 [17:06<07:00,  8.32it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  4%|▍         | 4/100 [00:00<00:02, 37.86it/s][A
  8%|▊         | 8/100 [00:00<00:03, 29.75it/s][A
 12%|█▏        | 12/100 [00:00<00:02, 32.85it/s][A
 16%|█▌        | 16/100 [00:00<00:02, 34.78it/s][A
 27%|██▋       | 27/100 [00:00<00:01, 57.34it/s][A
 33%|███▎      | 33/100 [00:00<00:01, 56.99it/s][A
 39%|███▉      | 39/100 [00:00<00:01, 49.66it/s][A
 45%|████▌     | 45/100 [00:01<00:01, 45.39it/s][A
 50%|█████     | 50/100 [00:01<00:01, 40.77it/s][A
 55%|█████▌    | 55/100 [00:01<00:01, 42.40it/s][A
 67%|██████▋   | 67/100 [00:01<00:00, 60.05it/s][A
 77%|███████▋  | 77/100 [00:01<00:00, 67.31it/s][A
 87%|████████▋ | 87/100 [00:01<00:00, 73.85it/s][A
100%|██████████| 100/100 [00:01<00:00, 58.20it/s][A
 30%|███       | 1504/5000 [17:08<10:48,  5.39it/s]

Results after 100 episodes:
Average timesteps per episode: 328.25
Average penalty per episode: 0.0
Average reward per episode: -313.97


 31%|███       | 1546/5000 [17:11<05:19, 10.82it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  2%|▏         | 2/100 [00:00<00:05, 19.11it/s][A
  8%|▊         | 8/100 [00:00<00:02, 40.91it/s][A
 13%|█▎        | 13/100 [00:00<00:02, 35.83it/s][A
 19%|█▉        | 19/100 [00:00<00:01, 42.69it/s][A
 24%|██▍       | 24/100 [00:00<00:01, 38.07it/s][A
 28%|██▊       | 28/100 [00:00<00:02, 33.62it/s][A
 41%|████      | 41/100 [00:00<00:01, 56.86it/s][A
 48%|████▊     | 48/100 [00:01<00:00, 52.47it/s][A
 54%|█████▍    | 54/100 [00:01<00:00, 53.45it/s][A
 60%|██████    | 60/100 [00:01<00:00, 48.25it/s][A
 67%|██████▋   | 67/100 [00:01<00:00, 52.79it/s][A
100%|██████████| 100/100 [00:01<00:00, 64.48it/s][A
 31%|███       | 1550/5000 [17:13<11:41,  4.92it/s]

Results after 100 episodes:
Average timesteps per episode: 299.01
Average penalty per episode: 0.0
Average reward per episode: -284.1


 32%|███▏      | 1599/5000 [17:17<04:43, 11.99it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  9%|▉         | 9/100 [00:00<00:01, 82.09it/s][A
 18%|█▊        | 18/100 [00:00<00:00, 82.09it/s][A
 27%|██▋       | 27/100 [00:00<00:00, 82.09it/s][A
 36%|███▌      | 36/100 [00:00<00:00, 81.80it/s][A
 45%|████▌     | 45/100 [00:00<00:00, 80.85it/s][A
 55%|█████▌    | 55/100 [00:00<00:00, 83.82it/s][A
 64%|██████▍   | 64/100 [00:00<00:00, 64.64it/s][A
 76%|███████▌  | 76/100 [00:00<00:00, 76.11it/s][A
100%|██████████| 100/100 [00:01<00:00, 90.80it/s] [A
 32%|███▏      | 1606/5000 [17:18<06:56,  8.15it/s]

Results after 100 episodes:
Average timesteps per episode: 210.15
Average penalty per episode: 0.0
Average reward per episode: -193.35


 33%|███▎      | 1648/5000 [17:20<02:23, 23.36it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  3%|▎         | 3/100 [00:00<00:03, 29.22it/s][A
  6%|▌         | 6/100 [00:00<00:03, 28.89it/s][A
 13%|█▎        | 13/100 [00:00<00:01, 45.72it/s][A
 18%|█▊        | 18/100 [00:00<00:01, 46.35it/s][A
 23%|██▎       | 23/100 [00:00<00:01, 46.55it/s][A
 28%|██▊       | 28/100 [00:00<00:02, 35.72it/s][A
 32%|███▏      | 32/100 [00:00<00:02, 32.20it/s][A
 36%|███▌      | 36/100 [00:01<00:02, 30.16it/s][A
 40%|████      | 40/100 [00:01<00:02, 28.84it/s][A
 44%|████▍     | 44/100 [00:01<00:02, 25.31it/s][A
 47%|████▋     | 47/100 [00:01<00:02, 25.99it/s][A
 50%|█████     | 50/100 [00:01<00:02, 24.02it/s][A
 53%|█████▎    | 53/100 [00:01<00:02, 22.64it/s][A
 56%|█████▌    | 56/100 [00:01<00:01, 24.09it/s][A
 59%|█████▉    | 59/100 [00:02<00:01, 25.25it/s][A
 62%|██████▏   | 62/100 [00:02<00:01, 26.21it/s][A
 65%|██████▌   | 65/100 [00:02<00:01, 23.84it/s][A
 68%|██████▊   | 68/100

Results after 100 episodes:
Average timesteps per episode: 643.95
Average penalty per episode: 0.0
Average reward per episode: -636.39


 34%|███▍      | 1698/5000 [17:28<04:26, 12.40it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 16%|█▌        | 16/100 [00:00<00:00, 142.07it/s][A
 39%|███▉      | 39/100 [00:00<00:00, 174.10it/s][A
 57%|█████▋    | 57/100 [00:00<00:00, 139.82it/s][A
 72%|███████▏  | 72/100 [00:00<00:00, 137.93it/s][A
 87%|████████▋ | 87/100 [00:00<00:00, 120.36it/s][A
100%|██████████| 100/100 [00:00<00:00, 129.13it/s][A
 34%|███▍      | 1705/5000 [17:30<07:41,  7.14it/s]

Results after 100 episodes:
Average timesteps per episode: 151.03
Average penalty per episode: 0.0
Average reward per episode: -132.97


 35%|███▍      | 1747/5000 [17:33<03:57, 13.68it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  8%|▊         | 8/100 [00:00<00:01, 71.67it/s][A
 17%|█▋        | 17/100 [00:00<00:01, 76.54it/s][A
 25%|██▌       | 25/100 [00:00<00:01, 52.99it/s][A
 37%|███▋      | 37/100 [00:00<00:00, 70.26it/s][A
 45%|████▌     | 45/100 [00:00<00:00, 62.83it/s][A
 52%|█████▏    | 52/100 [00:00<00:00, 63.77it/s][A
 59%|█████▉    | 59/100 [00:00<00:00, 64.13it/s][A
 66%|██████▌   | 66/100 [00:01<00:00, 57.48it/s][A
 73%|███████▎  | 73/100 [00:01<00:00, 43.32it/s][A
 79%|███████▉  | 79/100 [00:01<00:00, 46.22it/s][A
 85%|████████▌ | 85/100 [00:01<00:00, 43.92it/s][A
100%|██████████| 100/100 [00:01<00:00, 57.07it/s][A
 35%|███▌      | 1752/5000 [17:35<10:23,  5.21it/s]

Results after 100 episodes:
Average timesteps per episode: 338.8
Average penalty per episode: 0.0
Average reward per episode: -324.73


 36%|███▌      | 1797/5000 [17:36<01:53, 28.16it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 14%|█▍        | 14/100 [00:00<00:00, 126.55it/s][A
 27%|██▋       | 27/100 [00:00<00:00, 79.75it/s] [A
 36%|███▌      | 36/100 [00:00<00:01, 61.53it/s][A
 43%|████▎     | 43/100 [00:00<00:00, 62.65it/s][A
 50%|█████     | 50/100 [00:00<00:00, 63.32it/s][A
 57%|█████▋    | 57/100 [00:00<00:00, 63.82it/s][A
 65%|██████▌   | 65/100 [00:00<00:00, 67.06it/s][A
 78%|███████▊  | 78/100 [00:01<00:00, 82.28it/s][A
 87%|████████▋ | 87/100 [00:01<00:00, 82.23it/s][A
100%|██████████| 100/100 [00:01<00:00, 74.99it/s][A
 36%|███▌      | 1805/5000 [17:38<05:24,  9.85it/s]

Results after 100 episodes:
Average timesteps per episode: 259.31
Average penalty per episode: 0.0
Average reward per episode: -243.56


 37%|███▋      | 1848/5000 [17:39<01:53, 27.77it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 12%|█▏        | 12/100 [00:00<00:00, 111.48it/s][A
 24%|██▍       | 24/100 [00:00<00:01, 53.64it/s] [A
 35%|███▌      | 35/100 [00:00<00:00, 67.45it/s][A
 44%|████▍     | 44/100 [00:00<00:00, 63.74it/s][A
 58%|█████▊    | 58/100 [00:00<00:00, 80.74it/s][A
 69%|██████▉   | 69/100 [00:00<00:00, 86.36it/s][A
 79%|███████▉  | 79/100 [00:01<00:00, 87.65it/s][A
 89%|████████▉ | 89/100 [00:01<00:00, 70.34it/s][A
100%|██████████| 100/100 [00:01<00:00, 72.45it/s][A
 37%|███▋      | 1854/5000 [17:42<08:31,  6.15it/s]

Results after 100 episodes:
Average timesteps per episode: 269.31
Average penalty per episode: 0.0
Average reward per episode: -253.77


 38%|███▊      | 1897/5000 [17:47<03:59, 12.93it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  3%|▎         | 3/100 [00:00<00:03, 28.94it/s][A
  6%|▌         | 6/100 [00:00<00:03, 28.78it/s][A
 10%|█         | 10/100 [00:00<00:02, 32.96it/s][A
 14%|█▍        | 14/100 [00:00<00:03, 25.40it/s][A
 17%|█▋        | 17/100 [00:00<00:03, 26.42it/s][A
 20%|██        | 20/100 [00:00<00:02, 27.07it/s][A
 28%|██▊       | 28/100 [00:00<00:01, 41.42it/s][A
 33%|███▎      | 33/100 [00:00<00:01, 37.96it/s][A
 45%|████▌     | 45/100 [00:01<00:00, 58.02it/s][A
 52%|█████▏    | 52/100 [00:01<00:01, 43.12it/s][A
 58%|█████▊    | 58/100 [00:01<00:00, 46.24it/s][A
 64%|██████▍   | 64/100 [00:01<00:00, 43.37it/s][A
 69%|██████▉   | 69/100 [00:01<00:00, 44.20it/s][A
 74%|███████▍  | 74/100 [00:01<00:00, 44.77it/s][A
 79%|███████▉  | 79/100 [00:02<00:00, 32.78it/s][A
 83%|████████▎ | 83/100 [00:02<00:00, 33.73it/s][A
 87%|████████▋ | 87/100 [00:02<00:00, 30.78it/s][A
 91%|█████████ | 91/100

Results after 100 episodes:
Average timesteps per episode: 565.34
Average penalty per episode: 0.0
Average reward per episode: -556.1


 39%|███▉      | 1947/5000 [17:52<01:45, 28.98it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  3%|▎         | 3/100 [00:00<00:03, 29.22it/s][A
  6%|▌         | 6/100 [00:00<00:03, 28.92it/s][A
  9%|▉         | 9/100 [00:00<00:03, 28.68it/s][A
 20%|██        | 20/100 [00:00<00:01, 57.99it/s][A
 27%|██▋       | 27/100 [00:00<00:01, 60.75it/s][A
 38%|███▊      | 38/100 [00:00<00:00, 74.45it/s][A
 46%|████▌     | 46/100 [00:00<00:00, 65.10it/s][A
 53%|█████▎    | 53/100 [00:00<00:00, 65.22it/s][A
 60%|██████    | 60/100 [00:01<00:00, 51.44it/s][A
 68%|██████▊   | 68/100 [00:01<00:00, 57.26it/s][A
 75%|███████▌  | 75/100 [00:01<00:00, 53.11it/s][A
 81%|████████  | 81/100 [00:01<00:00, 47.76it/s][A
 87%|████████▋ | 87/100 [00:01<00:00, 45.24it/s][A
100%|██████████| 100/100 [00:01<00:00, 55.86it/s][A
 39%|███▉      | 1954/5000 [17:54<07:19,  6.93it/s]

Results after 100 episodes:
Average timesteps per episode: 347.98
Average penalty per episode: 0.0
Average reward per episode: -334.12


 40%|███▉      | 1999/5000 [17:58<02:17, 21.90it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  7%|▋         | 7/100 [00:00<00:01, 65.64it/s][A
 14%|█▍        | 14/100 [00:00<00:02, 42.35it/s][A
 19%|█▉        | 19/100 [00:00<00:01, 44.09it/s][A
 24%|██▍       | 24/100 [00:00<00:01, 45.31it/s][A
 31%|███       | 31/100 [00:00<00:01, 51.92it/s][A
 39%|███▉      | 39/100 [00:00<00:01, 58.89it/s][A
 46%|████▌     | 46/100 [00:00<00:01, 53.53it/s][A
 55%|█████▌    | 55/100 [00:01<00:00, 61.38it/s][A
 62%|██████▏   | 62/100 [00:01<00:00, 62.42it/s][A
 69%|██████▉   | 69/100 [00:01<00:00, 50.25it/s][A
 75%|███████▌  | 75/100 [00:01<00:00, 51.61it/s][A
 81%|████████  | 81/100 [00:01<00:00, 52.94it/s][A
 87%|████████▋ | 87/100 [00:01<00:00, 47.99it/s][A
 93%|█████████▎| 93/100 [00:01<00:00, 44.85it/s][A
100%|██████████| 100/100 [00:02<00:00, 47.68it/s][A
 40%|████      | 2003/5000 [18:00<09:59,  5.00it/s]

Results after 100 episodes:
Average timesteps per episode: 407.02
Average penalty per episode: 0.0
Average reward per episode: -394.42


 41%|████      | 2049/5000 [18:03<02:57, 16.66it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  7%|▋         | 7/100 [00:00<00:01, 65.64it/s][A
 14%|█▍        | 14/100 [00:00<00:01, 51.68it/s][A
 20%|██        | 20/100 [00:00<00:02, 39.20it/s][A
 25%|██▌       | 25/100 [00:00<00:01, 41.58it/s][A
 30%|███       | 30/100 [00:00<00:01, 38.08it/s][A
 35%|███▌      | 35/100 [00:00<00:01, 35.89it/s][A
 42%|████▏     | 42/100 [00:00<00:01, 43.51it/s][A
 47%|████▋     | 47/100 [00:01<00:01, 39.55it/s][A
 59%|█████▉    | 59/100 [00:01<00:00, 57.69it/s][A
 66%|██████▌   | 66/100 [00:01<00:00, 53.56it/s][A
 72%|███████▏  | 72/100 [00:01<00:00, 54.24it/s][A
 78%|███████▊  | 78/100 [00:01<00:00, 48.73it/s][A
 84%|████████▍ | 84/100 [00:01<00:00, 50.59it/s][A
 90%|█████████ | 90/100 [00:01<00:00, 51.97it/s][A
100%|██████████| 100/100 [00:01<00:00, 50.04it/s][A
 41%|████      | 2055/5000 [18:05<08:28,  5.79it/s]

Results after 100 episodes:
Average timesteps per episode: 387.58
Average penalty per episode: 0.0
Average reward per episode: -374.56


 42%|████▏     | 2096/5000 [18:06<01:45, 27.53it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 16%|█▌        | 16/100 [00:00<00:00, 142.06it/s][A
 31%|███       | 31/100 [00:00<00:00, 137.56it/s][A
 45%|████▌     | 45/100 [00:00<00:00, 130.43it/s][A
100%|██████████| 100/100 [00:00<00:00, 208.59it/s][A
 42%|████▏     | 2104/5000 [18:07<02:50, 16.95it/s]

Results after 100 episodes:
Average timesteps per episode: 91.83
Average penalty per episode: 0.0
Average reward per episode: -72.51


 43%|████▎     | 2148/5000 [18:09<01:49, 26.10it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  9%|▉         | 9/100 [00:00<00:01, 82.09it/s][A
 38%|███▊      | 38/100 [00:00<00:00, 177.09it/s][A
 56%|█████▌    | 56/100 [00:00<00:00, 140.96it/s][A
100%|██████████| 100/100 [00:00<00:00, 188.60it/s][A
 43%|████▎     | 2154/5000 [18:09<03:23, 13.97it/s]

Results after 100 episodes:
Average timesteps per episode: 101.84
Average penalty per episode: 0.0
Average reward per episode: -82.73


 44%|████▍     | 2197/5000 [18:11<01:48, 25.72it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 14%|█▍        | 14/100 [00:00<00:00, 125.42it/s][A
 44%|████▍     | 44/100 [00:00<00:00, 199.27it/s][A
 73%|███████▎  | 73/100 [00:00<00:00, 217.52it/s][A
100%|██████████| 100/100 [00:00<00:00, 171.33it/s][A
 44%|████▍     | 2204/5000 [18:12<03:09, 14.73it/s]

Results after 100 episodes:
Average timesteps per episode: 111.71
Average penalty per episode: 0.0
Average reward per episode: -92.81


 45%|████▍     | 2246/5000 [18:13<01:29, 30.87it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  4%|▍         | 4/100 [00:00<00:02, 38.15it/s][A
 27%|██▋       | 27/100 [00:00<00:00, 136.90it/s][A
 41%|████      | 41/100 [00:00<00:00, 81.75it/s] [A
 51%|█████     | 51/100 [00:00<00:00, 84.67it/s][A
 61%|██████    | 61/100 [00:00<00:00, 86.34it/s][A
 71%|███████   | 71/100 [00:00<00:00, 78.02it/s][A
 80%|████████  | 80/100 [00:01<00:00, 70.76it/s][A
 88%|████████▊ | 88/100 [00:01<00:00, 58.37it/s][A
100%|██████████| 100/100 [00:01<00:00, 75.17it/s][A
 45%|████▌     | 2254/5000 [18:15<04:38,  9.88it/s]

Results after 100 episodes:
Average timesteps per episode: 259.48
Average penalty per episode: 0.0
Average reward per episode: -243.73


 46%|████▌     | 2298/5000 [18:16<01:33, 28.77it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 15%|█▌        | 15/100 [00:00<00:00, 134.38it/s][A
 29%|██▉       | 29/100 [00:00<00:00, 81.83it/s] [A
 39%|███▉      | 39/100 [00:00<00:00, 61.45it/s][A
 47%|████▋     | 47/100 [00:00<00:00, 64.31it/s][A
 56%|█████▌    | 56/100 [00:00<00:00, 69.66it/s][A
 64%|██████▍   | 64/100 [00:00<00:00, 56.45it/s][A
 71%|███████   | 71/100 [00:01<00:00, 52.66it/s][A
 79%|███████▉  | 79/100 [00:01<00:00, 57.76it/s][A
 86%|████████▌ | 86/100 [00:01<00:00, 53.15it/s][A
 92%|█████████▏| 92/100 [00:01<00:00, 54.02it/s][A
100%|██████████| 100/100 [00:01<00:00, 61.47it/s][A
 46%|████▌     | 2306/5000 [18:18<05:21,  8.38it/s]

Results after 100 episodes:
Average timesteps per episode: 309.0
Average penalty per episode: 0.0
Average reward per episode: -294.3


 47%|████▋     | 2347/5000 [18:21<03:26, 12.82it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 22%|██▏       | 22/100 [00:00<00:00, 187.06it/s][A
 41%|████      | 41/100 [00:00<00:00, 85.31it/s] [A
100%|██████████| 100/100 [00:00<00:00, 183.76it/s][A
 47%|████▋     | 2353/5000 [18:21<04:32,  9.71it/s]

Results after 100 episodes:
Average timesteps per episode: 102.3
Average penalty per episode: 0.0
Average reward per episode: -83.19


 48%|████▊     | 2396/5000 [18:23<01:33, 27.79it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  3%|▎         | 3/100 [00:00<00:03, 28.67it/s][A
 15%|█▌        | 15/100 [00:00<00:01, 77.52it/s][A
 23%|██▎       | 23/100 [00:00<00:01, 62.96it/s][A
 32%|███▏      | 32/100 [00:00<00:00, 69.55it/s][A
 46%|████▌     | 46/100 [00:00<00:00, 88.55it/s][A
 71%|███████   | 71/100 [00:00<00:00, 129.44it/s][A
 85%|████████▌ | 85/100 [00:00<00:00, 128.90it/s][A
100%|██████████| 100/100 [00:00<00:00, 101.55it/s][A
 48%|████▊     | 2404/5000 [18:24<03:37, 11.94it/s]

Results after 100 episodes:
Average timesteps per episode: 190.66
Average penalty per episode: 0.0
Average reward per episode: -173.44


 49%|████▉     | 2449/5000 [18:27<02:04, 20.46it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  9%|▉         | 9/100 [00:00<00:01, 83.61it/s][A
 21%|██        | 21/100 [00:00<00:00, 98.40it/s][A
 39%|███▉      | 39/100 [00:00<00:00, 126.37it/s][A
 52%|█████▏    | 52/100 [00:00<00:00, 104.39it/s][A
 63%|██████▎   | 63/100 [00:00<00:00, 89.50it/s] [A
100%|██████████| 100/100 [00:00<00:00, 119.87it/s]A


Results after 100 episodes:
Average timesteps per episode: 161.0
Average penalty per episode: 0.0
Average reward per episode: -143.15


 50%|████▉     | 2499/5000 [18:30<01:21, 30.61it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 15%|█▌        | 15/100 [00:00<00:00, 131.86it/s][A
 29%|██▉       | 29/100 [00:00<00:00, 126.78it/s][A
 57%|█████▋    | 57/100 [00:00<00:00, 170.48it/s][A
 74%|███████▍  | 74/100 [00:00<00:00, 137.76it/s][A
100%|██████████| 100/100 [00:00<00:00, 154.80it/s][A
 50%|█████     | 2503/5000 [18:31<03:20, 12.42it/s]

Results after 100 episodes:
Average timesteps per episode: 121.47
Average penalty per episode: 0.0
Average reward per episode: -102.78


 51%|█████     | 2547/5000 [18:33<02:02, 20.02it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 14%|█▍        | 14/100 [00:00<00:00, 123.22it/s][A
 27%|██▋       | 27/100 [00:00<00:00, 92.69it/s] [A
 37%|███▋      | 37/100 [00:00<00:00, 67.00it/s][A
 45%|████▌     | 45/100 [00:00<00:01, 49.07it/s][A
 51%|█████     | 51/100 [00:00<00:00, 50.25it/s][A
 59%|█████▉    | 59/100 [00:00<00:00, 55.81it/s][A
 66%|██████▌   | 66/100 [00:01<00:00, 51.79it/s][A
 72%|███████▏  | 72/100 [00:01<00:00, 47.38it/s][A
 78%|███████▊  | 78/100 [00:01<00:00, 49.49it/s][A
100%|██████████| 100/100 [00:01<00:00, 63.50it/s][A
 51%|█████     | 2554/5000 [18:35<05:37,  7.25it/s]

Results after 100 episodes:
Average timesteps per episode: 299.0
Average penalty per episode: 0.0
Average reward per episode: -284.09


 52%|█████▏    | 2596/5000 [18:36<01:21, 29.32it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  3%|▎         | 3/100 [00:00<00:03, 27.87it/s][A
 17%|█▋        | 17/100 [00:00<00:00, 84.31it/s][A
 26%|██▌       | 26/100 [00:00<00:01, 67.54it/s][A
 33%|███▎      | 33/100 [00:00<00:01, 50.05it/s][A
 41%|████      | 41/100 [00:00<00:01, 56.24it/s][A
 48%|████▊     | 48/100 [00:00<00:01, 51.34it/s][A
 63%|██████▎   | 63/100 [00:00<00:00, 72.64it/s][A
 71%|███████   | 71/100 [00:01<00:00, 58.03it/s][A
 78%|███████▊  | 78/100 [00:01<00:00, 53.68it/s][A
 84%|████████▍ | 84/100 [00:01<00:00, 41.43it/s][A
100%|██████████| 100/100 [00:01<00:00, 57.80it/s][A
 52%|█████▏    | 2603/5000 [18:38<05:20,  7.48it/s]

Results after 100 episodes:
Average timesteps per episode: 328.67
Average penalty per episode: 0.0
Average reward per episode: -314.39


 53%|█████▎    | 2648/5000 [18:40<01:21, 28.73it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  3%|▎         | 3/100 [00:00<00:03, 27.87it/s][A
  9%|▉         | 9/100 [00:00<00:02, 44.75it/s][A
 14%|█▍        | 14/100 [00:00<00:01, 45.34it/s][A
 19%|█▉        | 19/100 [00:00<00:01, 45.78it/s][A
 24%|██▍       | 24/100 [00:00<00:02, 34.15it/s][A
 34%|███▍      | 34/100 [00:00<00:01, 49.77it/s][A
 40%|████      | 40/100 [00:00<00:01, 40.83it/s][A
 45%|████▌     | 45/100 [00:01<00:01, 37.80it/s][A
 53%|█████▎    | 53/100 [00:01<00:01, 46.22it/s][A
 62%|██████▏   | 62/100 [00:01<00:00, 55.10it/s][A
 69%|██████▉   | 69/100 [00:01<00:00, 45.89it/s][A
 83%|████████▎ | 83/100 [00:01<00:00, 64.40it/s][A
 91%|█████████ | 91/100 [00:01<00:00, 59.90it/s][A
100%|██████████| 100/100 [00:01<00:00, 51.61it/s][A
 53%|█████▎    | 2656/5000 [18:42<05:16,  7.42it/s]

Results after 100 episodes:
Average timesteps per episode: 367.29
Average penalty per episode: 0.0
Average reward per episode: -353.85


 54%|█████▍    | 2697/5000 [18:46<01:59, 19.35it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 20%|██        | 20/100 [00:00<00:00, 170.06it/s][A
 38%|███▊      | 38/100 [00:00<00:00, 105.41it/s][A
 50%|█████     | 50/100 [00:00<00:00, 91.50it/s] [A
 60%|██████    | 60/100 [00:00<00:00, 79.70it/s][A
 70%|███████   | 70/100 [00:00<00:00, 81.98it/s][A
 79%|███████▉  | 79/100 [00:00<00:00, 72.10it/s][A
100%|██████████| 100/100 [00:01<00:00, 89.27it/s][A
 54%|█████▍    | 2703/5000 [18:47<04:14,  9.03it/s]

Results after 100 episodes:
Average timesteps per episode: 210.6
Average penalty per episode: 0.0
Average reward per episode: -193.8


 55%|█████▍    | 2747/5000 [18:49<01:18, 28.76it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
100%|██████████| 100/100 [00:00<00:00, 440.07it/s][A
 55%|█████▌    | 2754/5000 [18:49<01:45, 21.37it/s]

Results after 100 episodes:
Average timesteps per episode: 42.52
Average penalty per episode: 0.0
Average reward per episode: -22.15


 56%|█████▌    | 2798/5000 [18:52<01:58, 18.59it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 13%|█▎        | 13/100 [00:00<00:00, 115.43it/s][A
 28%|██▊       | 28/100 [00:00<00:00, 123.31it/s][A
 44%|████▍     | 44/100 [00:00<00:00, 124.33it/s][A
 57%|█████▋    | 57/100 [00:00<00:00, 103.32it/s][A
100%|██████████| 100/100 [00:00<00:00, 140.13it/s][A
 56%|█████▌    | 2805/5000 [18:53<03:04, 11.90it/s]

Results after 100 episodes:
Average timesteps per episode: 131.25
Average penalty per episode: 0.0
Average reward per episode: -112.77


 57%|█████▋    | 2847/5000 [18:54<01:27, 24.58it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 18%|█▊        | 18/100 [00:00<00:00, 154.36it/s][A
 43%|████▎     | 43/100 [00:00<00:00, 185.93it/s][A
 62%|██████▏   | 62/100 [00:00<00:00, 146.67it/s][A
100%|██████████| 100/100 [00:00<00:00, 156.77it/s][A
 57%|█████▋    | 2853/5000 [18:55<03:31, 10.14it/s]

Results after 100 episodes:
Average timesteps per episode: 120.76
Average penalty per episode: 0.0
Average reward per episode: -102.07


 58%|█████▊    | 2896/5000 [18:57<01:10, 29.93it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  4%|▍         | 4/100 [00:00<00:02, 37.51it/s][A
  8%|▊         | 8/100 [00:00<00:02, 37.51it/s][A
 16%|█▌        | 16/100 [00:00<00:01, 53.94it/s][A
 23%|██▎       | 23/100 [00:00<00:01, 57.89it/s][A
 41%|████      | 41/100 [00:00<00:00, 91.90it/s][A
 50%|█████     | 50/100 [00:00<00:00, 67.68it/s][A
 60%|██████    | 60/100 [00:00<00:00, 73.92it/s][A
 68%|██████▊   | 68/100 [00:00<00:00, 73.66it/s][A
 86%|████████▌ | 86/100 [00:01<00:00, 97.17it/s][A
100%|██████████| 100/100 [00:01<00:00, 78.38it/s][A
 58%|█████▊    | 2904/5000 [18:58<03:27, 10.09it/s]

Results after 100 episodes:
Average timesteps per episode: 239.71
Average penalty per episode: 0.0
Average reward per episode: -223.54


 59%|█████▉    | 2949/5000 [19:00<01:09, 29.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 26%|██▌       | 26/100 [00:00<00:00, 210.32it/s][A
100%|██████████| 100/100 [00:00<00:00, 355.80it/s][A
 59%|█████▉    | 2953/5000 [19:00<01:58, 17.22it/s]

Results after 100 episodes:
Average timesteps per episode: 52.6
Average penalty per episode: 0.0
Average reward per episode: -32.44


 60%|█████▉    | 2997/5000 [19:02<01:10, 28.26it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 19%|█▉        | 19/100 [00:00<00:00, 162.94it/s][A
 36%|███▌      | 36/100 [00:00<00:00, 122.14it/s][A
 67%|██████▋   | 67/100 [00:00<00:00, 172.76it/s][A
100%|██████████| 100/100 [00:00<00:00, 168.63it/s][A
 60%|██████    | 3005/5000 [19:03<02:07, 15.69it/s]

Results after 100 episodes:
Average timesteps per episode: 112.02
Average penalty per episode: 0.0
Average reward per episode: -93.12


 61%|██████    | 3049/5000 [19:05<01:32, 21.18it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  5%|▌         | 5/100 [00:00<00:02, 46.89it/s][A
 11%|█         | 11/100 [00:00<00:01, 51.83it/s][A
 17%|█▋        | 17/100 [00:00<00:01, 53.17it/s][A
 23%|██▎       | 23/100 [00:00<00:01, 54.18it/s][A
 29%|██▉       | 29/100 [00:00<00:01, 46.53it/s][A
 35%|███▌      | 35/100 [00:00<00:01, 48.51it/s][A
 40%|████      | 40/100 [00:00<00:01, 45.86it/s][A
 48%|████▊     | 48/100 [00:00<00:00, 53.84it/s][A
 54%|█████▍    | 54/100 [00:01<00:00, 53.67it/s][A
 60%|██████    | 60/100 [00:01<00:00, 53.69it/s][A
 67%|██████▋   | 67/100 [00:01<00:00, 56.70it/s][A
 73%|███████▎  | 73/100 [00:01<00:00, 55.96it/s][A
 79%|███████▉  | 79/100 [00:01<00:00, 43.58it/s][A
 84%|████████▍ | 84/100 [00:01<00:00, 44.39it/s][A
 89%|████████▉ | 89/100 [00:01<00:00, 44.93it/s][A
 95%|█████████▌| 95/100 [00:01<00:00, 47.86it/s][A
100%|██████████| 100/100 [00:02<00:00, 49.38it/s][A
 61%|██████    | 3056

Results after 100 episodes:
Average timesteps per episode: 377.49
Average penalty per episode: 0.0
Average reward per episode: -364.26


 62%|██████▏   | 3098/5000 [19:09<01:22, 22.92it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 15%|█▌        | 15/100 [00:00<00:00, 125.42it/s][A
 37%|███▋      | 37/100 [00:00<00:00, 159.07it/s][A
 53%|█████▎    | 53/100 [00:00<00:00, 150.40it/s][A
 69%|██████▉   | 69/100 [00:00<00:00, 122.94it/s][A
100%|██████████| 100/100 [00:00<00:00, 153.18it/s][A


Results after 100 episodes:
Average timesteps per episode: 121.33
Average penalty per episode: 0.0
Average reward per episode: -102.64


 63%|██████▎   | 3146/5000 [19:11<01:00, 30.44it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  9%|▉         | 9/100 [00:00<00:01, 82.84it/s][A
 18%|█▊        | 18/100 [00:00<00:00, 82.40it/s][A
 31%|███       | 31/100 [00:00<00:00, 98.54it/s][A
 51%|█████     | 51/100 [00:00<00:00, 127.50it/s][A
 71%|███████   | 71/100 [00:00<00:00, 144.57it/s][A
100%|██████████| 100/100 [00:00<00:00, 120.16it/s][A
 63%|██████▎   | 3154/5000 [19:12<02:19, 13.24it/s]

Results after 100 episodes:
Average timesteps per episode: 160.48
Average penalty per episode: 0.0
Average reward per episode: -142.63


 64%|██████▍   | 3198/5000 [19:14<00:58, 30.77it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 23%|██▎       | 23/100 [00:00<00:00, 197.24it/s][A
 43%|████▎     | 43/100 [00:00<00:00, 182.11it/s][A
100%|██████████| 100/100 [00:00<00:00, 233.34it/s][A
 64%|██████▍   | 3207/5000 [19:15<01:30, 19.72it/s]

Results after 100 episodes:
Average timesteps per episode: 81.7
Average penalty per episode: 0.0
Average reward per episode: -62.17


 65%|██████▍   | 3249/5000 [19:16<00:51, 33.78it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 21%|██        | 21/100 [00:00<00:00, 181.64it/s][A
 40%|████      | 40/100 [00:00<00:00, 169.74it/s][A
 57%|█████▋    | 57/100 [00:00<00:00, 161.33it/s][A
100%|██████████| 100/100 [00:00<00:00, 187.54it/s][A
 65%|██████▌   | 3253/5000 [19:17<02:08, 13.59it/s]

Results after 100 episodes:
Average timesteps per episode: 101.82
Average penalty per episode: 0.0
Average reward per episode: -82.71


 66%|██████▌   | 3299/5000 [19:19<01:37, 17.50it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  2%|▏         | 2/100 [00:00<00:05, 18.93it/s][A
  5%|▌         | 5/100 [00:00<00:03, 24.36it/s][A
 11%|█         | 11/100 [00:00<00:02, 38.63it/s][A
 15%|█▌        | 15/100 [00:00<00:02, 31.86it/s][A
 19%|█▉        | 19/100 [00:00<00:02, 33.67it/s][A
 27%|██▋       | 27/100 [00:00<00:01, 46.21it/s][A
 32%|███▏      | 32/100 [00:00<00:01, 46.28it/s][A
 39%|███▉      | 39/100 [00:00<00:01, 51.59it/s][A
 45%|████▌     | 45/100 [00:01<00:01, 46.35it/s][A
 50%|█████     | 50/100 [00:01<00:01, 46.73it/s][A
 55%|█████▌    | 55/100 [00:01<00:01, 35.62it/s][A
 60%|██████    | 60/100 [00:01<00:01, 38.26it/s][A
 65%|██████▌   | 65/100 [00:01<00:00, 40.62it/s][A
 71%|███████   | 71/100 [00:01<00:00, 44.65it/s][A
 76%|███████▌  | 76/100 [00:01<00:00, 45.13it/s][A
 81%|████████  | 81/100 [00:02<00:00, 35.89it/s][A
 86%|████████▌ | 86/100 [00:02<00:00, 38.27it/s][A
 94%|█████████▍| 94/100

Results after 100 episodes:
Average timesteps per episode: 446.75
Average penalty per episode: 0.0
Average reward per episode: -434.99


 67%|██████▋   | 3348/5000 [19:23<01:00, 27.28it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 11%|█         | 11/100 [00:00<00:00, 98.54it/s][A
 25%|██▌       | 25/100 [00:00<00:00, 110.86it/s][A
 37%|███▋      | 37/100 [00:00<00:00, 89.89it/s] [A
 48%|████▊     | 48/100 [00:00<00:00, 92.30it/s][A
 67%|██████▋   | 67/100 [00:00<00:00, 115.56it/s][A
 79%|███████▉  | 79/100 [00:00<00:00, 112.54it/s][A
100%|██████████| 100/100 [00:00<00:00, 103.81it/s][A
 67%|██████▋   | 3352/5000 [19:24<03:09,  8.69it/s]

Results after 100 episodes:
Average timesteps per episode: 180.35
Average penalty per episode: 0.0
Average reward per episode: -162.92


 68%|██████▊   | 3398/5000 [19:25<00:56, 28.53it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 13%|█▎        | 13/100 [00:00<00:00, 114.42it/s][A
 50%|█████     | 50/100 [00:00<00:00, 216.34it/s][A
100%|██████████| 100/100 [00:00<00:00, 226.49it/s][A
 68%|██████▊   | 3405/5000 [19:26<01:35, 16.70it/s]

Results after 100 episodes:
Average timesteps per episode: 81.73
Average penalty per episode: 0.0
Average reward per episode: -62.2


 69%|██████▉   | 3447/5000 [19:27<00:46, 33.37it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 14%|█▍        | 14/100 [00:00<00:00, 120.06it/s][A
 27%|██▋       | 27/100 [00:00<00:00, 75.52it/s] [A
 36%|███▌      | 36/100 [00:00<00:00, 77.29it/s][A
 48%|████▊     | 48/100 [00:00<00:00, 87.27it/s][A
100%|██████████| 100/100 [00:00<00:00, 132.54it/s][A
 69%|██████▉   | 3455/5000 [19:28<01:47, 14.31it/s]

Results after 100 episodes:
Average timesteps per episode: 140.95
Average penalty per episode: 0.0
Average reward per episode: -122.68


 70%|██████▉   | 3497/5000 [19:30<00:48, 31.10it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 10%|█         | 10/100 [00:00<00:00, 91.21it/s][A
 31%|███       | 31/100 [00:00<00:00, 144.07it/s][A
 47%|████▋     | 47/100 [00:00<00:00, 143.16it/s][A
 62%|██████▏   | 62/100 [00:00<00:00, 138.76it/s][A
 76%|███████▌  | 76/100 [00:00<00:00, 133.16it/s][A
100%|██████████| 100/100 [00:00<00:00, 118.88it/s][A
 70%|███████   | 3505/5000 [19:31<01:51, 13.43it/s]

Results after 100 episodes:
Average timesteps per episode: 160.65
Average penalty per episode: 0.0
Average reward per episode: -142.8


 71%|███████   | 3547/5000 [19:34<02:40,  9.06it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  7%|▋         | 7/100 [00:00<00:01, 66.28it/s][A
 43%|████▎     | 43/100 [00:00<00:00, 202.98it/s][A
 63%|██████▎   | 63/100 [00:00<00:00, 155.63it/s][A
100%|██████████| 100/100 [00:00<00:00, 186.59it/s][A
 71%|███████   | 3554/5000 [19:35<02:24, 10.04it/s]

Results after 100 episodes:
Average timesteps per episode: 101.75
Average penalty per episode: 0.0
Average reward per episode: -82.64


 72%|███████▏  | 3599/5000 [19:37<01:21, 17.18it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 14%|█▍        | 14/100 [00:00<00:00, 122.15it/s][A
 40%|████      | 40/100 [00:00<00:00, 175.68it/s][A
100%|██████████| 100/100 [00:00<00:00, 203.53it/s][A
 72%|███████▏  | 3606/5000 [19:38<01:39, 14.01it/s]

Results after 100 episodes:
Average timesteps per episode: 91.34
Average penalty per episode: 0.0
Average reward per episode: -72.02


 73%|███████▎  | 3648/5000 [19:40<00:51, 26.05it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 12%|█▏        | 12/100 [00:00<00:00, 104.70it/s][A
 34%|███▍      | 34/100 [00:00<00:00, 149.15it/s][A
 53%|█████▎    | 53/100 [00:00<00:00, 154.76it/s][A
100%|██████████| 100/100 [00:00<00:00, 165.57it/s][A
 73%|███████▎  | 3655/5000 [19:41<01:34, 14.25it/s]

Results after 100 episodes:
Average timesteps per episode: 111.72
Average penalty per episode: 0.0
Average reward per episode: -92.82


 74%|███████▍  | 3699/5000 [19:42<00:45, 28.72it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  7%|▋         | 7/100 [00:00<00:01, 65.03it/s][A
 27%|██▋       | 27/100 [00:00<00:00, 127.08it/s][A
 40%|████      | 40/100 [00:00<00:00, 98.93it/s] [A
 58%|█████▊    | 58/100 [00:00<00:00, 115.97it/s][A
 72%|███████▏  | 72/100 [00:00<00:00, 117.98it/s][A
 85%|████████▌ | 85/100 [00:00<00:00, 99.03it/s] [A
100%|██████████| 100/100 [00:01<00:00, 96.01it/s][A
 74%|███████▍  | 3705/5000 [19:43<02:12,  9.75it/s]

Results after 100 episodes:
Average timesteps per episode: 190.49
Average penalty per episode: 0.0
Average reward per episode: -173.27


 75%|███████▍  | 3747/5000 [19:45<00:38, 32.97it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 20%|██        | 20/100 [00:00<00:00, 171.51it/s][A
 38%|███▊      | 38/100 [00:00<00:00, 161.42it/s][A
 64%|██████▍   | 64/100 [00:00<00:00, 186.28it/s][A
 83%|████████▎ | 83/100 [00:00<00:00, 128.94it/s][A
100%|██████████| 100/100 [00:00<00:00, 142.83it/s][A
 75%|███████▌  | 3754/5000 [19:46<01:27, 14.29it/s]

Results after 100 episodes:
Average timesteps per episode: 131.07
Average penalty per episode: 0.0
Average reward per episode: -112.59


 76%|███████▌  | 3797/5000 [19:47<00:36, 33.07it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  6%|▌         | 6/100 [00:00<00:01, 56.26it/s][A
 13%|█▎        | 13/100 [00:00<00:01, 61.78it/s][A
 20%|██        | 20/100 [00:00<00:01, 52.37it/s][A
 29%|██▉       | 29/100 [00:00<00:01, 62.95it/s][A
 36%|███▌      | 36/100 [00:00<00:01, 54.70it/s][A
 42%|████▏     | 42/100 [00:00<00:01, 55.28it/s][A
 49%|████▉     | 49/100 [00:00<00:00, 57.99it/s][A
 55%|█████▌    | 55/100 [00:01<00:00, 50.46it/s][A
 61%|██████    | 61/100 [00:01<00:00, 51.89it/s][A
 67%|██████▋   | 67/100 [00:01<00:00, 52.69it/s][A
 85%|████████▌ | 85/100 [00:01<00:00, 83.67it/s][A
100%|██████████| 100/100 [00:01<00:00, 57.04it/s][A
 76%|███████▌  | 3805/5000 [19:49<02:27,  8.12it/s]

Results after 100 episodes:
Average timesteps per episode: 338.3
Average penalty per episode: 0.0
Average reward per episode: -324.23


 77%|███████▋  | 3846/5000 [19:50<00:39, 28.88it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  5%|▌         | 5/100 [00:00<00:02, 46.89it/s][A
 60%|██████    | 60/100 [00:00<00:00, 270.61it/s][A
100%|██████████| 100/100 [00:00<00:00, 231.19it/s][A
 77%|███████▋  | 3854/5000 [19:51<01:02, 18.21it/s]

Results after 100 episodes:
Average timesteps per episode: 82.23
Average penalty per episode: 0.0
Average reward per episode: -62.7


 78%|███████▊  | 3899/5000 [19:53<00:34, 31.74it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  2%|▏         | 2/100 [00:00<00:05, 19.30it/s][A
  5%|▌         | 5/100 [00:00<00:03, 24.15it/s][A
 10%|█         | 10/100 [00:00<00:02, 34.50it/s][A
 14%|█▍        | 14/100 [00:00<00:02, 30.14it/s][A
 19%|█▉        | 19/100 [00:00<00:02, 34.70it/s][A
 25%|██▌       | 25/100 [00:00<00:01, 40.68it/s][A
 34%|███▍      | 34/100 [00:00<00:01, 53.49it/s][A
 40%|████      | 40/100 [00:00<00:01, 46.94it/s][A
 46%|████▌     | 46/100 [00:01<00:01, 49.44it/s][A
 71%|███████   | 71/100 [00:01<00:00, 97.70it/s][A
 82%|████████▏ | 82/100 [00:01<00:00, 98.19it/s][A
100%|██████████| 100/100 [00:01<00:00, 65.44it/s][A
 78%|███████▊  | 3903/5000 [19:54<02:40,  6.84it/s]

Results after 100 episodes:
Average timesteps per episode: 289.13
Average penalty per episode: 0.0
Average reward per episode: -274.01


 79%|███████▉  | 3948/5000 [19:55<00:31, 33.08it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 20%|██        | 20/100 [00:00<00:00, 172.99it/s][A
 38%|███▊      | 38/100 [00:00<00:00, 161.86it/s][A
100%|██████████| 100/100 [00:00<00:00, 265.24it/s][A
 79%|███████▉  | 3956/5000 [19:56<00:50, 20.76it/s]

Results after 100 episodes:
Average timesteps per episode: 72.72
Average penalty per episode: 0.0
Average reward per episode: -52.98


 80%|███████▉  | 3997/5000 [19:57<00:31, 31.55it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 14%|█▍        | 14/100 [00:00<00:00, 124.31it/s][A
 27%|██▋       | 27/100 [00:00<00:00, 119.09it/s][A
 51%|█████     | 51/100 [00:00<00:00, 158.18it/s][A
 74%|███████▍  | 74/100 [00:00<00:00, 173.70it/s][A
100%|██████████| 100/100 [00:00<00:00, 128.47it/s][A
 80%|████████  | 4005/5000 [19:58<01:10, 14.19it/s]

Results after 100 episodes:
Average timesteps per episode: 150.95
Average penalty per episode: 0.0
Average reward per episode: -132.89


 81%|████████  | 4046/5000 [19:59<00:31, 30.58it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 17%|█▋        | 17/100 [00:00<00:00, 149.62it/s][A
 32%|███▏      | 32/100 [00:00<00:00, 140.04it/s][A
 55%|█████▌    | 55/100 [00:00<00:00, 167.27it/s][A
 72%|███████▏  | 72/100 [00:00<00:00, 160.90it/s][A
100%|██████████| 100/100 [00:00<00:00, 159.42it/s][A
 81%|████████  | 4054/5000 [20:00<01:01, 15.36it/s]

Results after 100 episodes:
Average timesteps per episode: 121.37
Average penalty per episode: 0.0
Average reward per episode: -102.68


 82%|████████▏ | 4099/5000 [20:02<00:26, 33.77it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 13%|█▎        | 13/100 [00:00<00:00, 120.77it/s][A
 26%|██▌       | 26/100 [00:00<00:00, 77.37it/s] [A
 35%|███▌      | 35/100 [00:00<00:01, 60.08it/s][A
 42%|████▏     | 42/100 [00:00<00:00, 61.60it/s][A
 53%|█████▎    | 53/100 [00:00<00:00, 72.46it/s][A
 61%|██████    | 61/100 [00:00<00:00, 73.20it/s][A
 69%|██████▉   | 69/100 [00:01<00:00, 57.91it/s][A
 78%|███████▊  | 78/100 [00:01<00:00, 63.80it/s][A
 85%|████████▌ | 85/100 [00:01<00:00, 63.66it/s][A
 92%|█████████▏| 92/100 [00:01<00:00, 63.87it/s][A
100%|██████████| 100/100 [00:01<00:00, 66.87it/s][A
 82%|████████▏ | 4103/5000 [20:03<02:08,  6.99it/s]

Results after 100 episodes:
Average timesteps per episode: 289.21
Average penalty per episode: 0.0
Average reward per episode: -274.09


 83%|████████▎ | 4148/5000 [20:05<00:27, 31.52it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 24%|██▍       | 24/100 [00:00<00:00, 205.81it/s][A
100%|██████████| 100/100 [00:00<00:00, 447.92it/s][A
 83%|████████▎ | 4155/5000 [20:05<00:39, 21.60it/s]

Results after 100 episodes:
Average timesteps per episode: 42.59
Average penalty per episode: 0.0
Average reward per episode: -22.22


 84%|████████▍ | 4199/5000 [20:06<00:21, 37.07it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 50%|█████     | 50/100 [00:00<00:00, 366.18it/s][A
100%|██████████| 100/100 [00:00<00:00, 214.39it/s][A
 84%|████████▍ | 4203/5000 [20:07<00:50, 15.90it/s]

Results after 100 episodes:
Average timesteps per episode: 91.81
Average penalty per episode: 0.0
Average reward per episode: -72.49


 85%|████████▍ | 4248/5000 [20:09<00:26, 28.73it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  7%|▋         | 7/100 [00:00<00:01, 66.26it/s][A
 14%|█▍        | 14/100 [00:00<00:02, 42.31it/s][A
 22%|██▏       | 22/100 [00:00<00:01, 53.62it/s][A
 30%|███       | 30/100 [00:00<00:01, 60.38it/s][A
 43%|████▎     | 43/100 [00:00<00:00, 79.20it/s][A
 52%|█████▏    | 52/100 [00:00<00:00, 70.18it/s][A
 60%|██████    | 60/100 [00:00<00:00, 71.13it/s][A
 76%|███████▌  | 76/100 [00:01<00:00, 91.59it/s][A
 86%|████████▌ | 86/100 [00:01<00:00, 91.48it/s][A
100%|██████████| 100/100 [00:01<00:00, 71.76it/s][A
 85%|████████▌ | 4251/5000 [20:10<01:50,  6.80it/s]

Results after 100 episodes:
Average timesteps per episode: 269.47
Average penalty per episode: 0.0
Average reward per episode: -253.93


 86%|████████▌ | 4296/5000 [20:12<00:23, 29.79it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 34%|███▍      | 34/100 [00:00<00:00, 266.51it/s][A
 61%|██████    | 61/100 [00:00<00:00, 160.99it/s][A
 79%|███████▉  | 79/100 [00:00<00:00, 158.23it/s][A
100%|██████████| 100/100 [00:00<00:00, 168.91it/s][A
 86%|████████▌ | 4304/5000 [20:13<00:44, 15.77it/s]

Results after 100 episodes:
Average timesteps per episode: 111.75
Average penalty per episode: 0.0
Average reward per episode: -92.85


 87%|████████▋ | 4348/5000 [20:14<00:18, 34.77it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  4%|▍         | 4/100 [00:00<00:02, 38.22it/s][A
 13%|█▎        | 13/100 [00:00<00:01, 65.23it/s][A
 47%|████▋     | 47/100 [00:00<00:00, 167.56it/s][A
100%|██████████| 100/100 [00:00<00:00, 190.03it/s][A
 87%|████████▋ | 4355/5000 [20:15<00:40, 15.97it/s]

Results after 100 episodes:
Average timesteps per episode: 101.53
Average penalty per episode: 0.0
Average reward per episode: -82.42


 88%|████████▊ | 4397/5000 [20:16<00:17, 34.56it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  8%|▊         | 8/100 [00:00<00:01, 74.32it/s][A
 21%|██        | 21/100 [00:00<00:00, 99.48it/s][A
 34%|███▍      | 34/100 [00:00<00:00, 107.74it/s][A
 52%|█████▏    | 52/100 [00:00<00:00, 127.66it/s][A
 65%|██████▌   | 65/100 [00:00<00:00, 123.63it/s][A
 78%|███████▊  | 78/100 [00:00<00:00, 105.80it/s][A
100%|██████████| 100/100 [00:00<00:00, 120.30it/s][A
 88%|████████▊ | 4405/5000 [20:18<00:42, 14.04it/s]

Results after 100 episodes:
Average timesteps per episode: 161.25
Average penalty per episode: 0.0
Average reward per episode: -143.4


 89%|████████▉ | 4448/5000 [20:19<00:17, 31.36it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 11%|█         | 11/100 [00:00<00:00, 100.33it/s][A
 29%|██▉       | 29/100 [00:00<00:00, 132.81it/s][A
 43%|████▎     | 43/100 [00:00<00:00, 128.95it/s][A
 56%|█████▌    | 56/100 [00:00<00:00, 124.06it/s][A
100%|██████████| 100/100 [00:00<00:00, 145.94it/s][A
 89%|████████▉ | 4456/5000 [20:20<00:35, 15.36it/s]

Results after 100 episodes:
Average timesteps per episode: 131.48
Average penalty per episode: 0.0
Average reward per episode: -113.0


 90%|████████▉ | 4499/5000 [20:21<00:15, 33.22it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  8%|▊         | 8/100 [00:00<00:01, 74.32it/s][A
 25%|██▌       | 25/100 [00:00<00:00, 119.60it/s][A
 46%|████▌     | 46/100 [00:00<00:00, 147.35it/s][A
 61%|██████    | 61/100 [00:00<00:00, 142.29it/s][A
100%|██████████| 100/100 [00:00<00:00, 172.10it/s][A
 90%|█████████ | 4503/5000 [20:22<00:37, 13.42it/s]

Results after 100 episodes:
Average timesteps per episode: 111.91
Average penalty per episode: 0.0
Average reward per episode: -93.01


 91%|█████████ | 4549/5000 [20:23<00:14, 32.05it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  9%|▉         | 9/100 [00:00<00:01, 84.39it/s][A
 24%|██▍       | 24/100 [00:00<00:00, 114.34it/s][A
 43%|████▎     | 43/100 [00:00<00:00, 139.54it/s][A
100%|██████████| 100/100 [00:00<00:00, 161.83it/s][A
 91%|█████████ | 4553/5000 [20:24<00:34, 13.00it/s]

Results after 100 episodes:
Average timesteps per episode: 120.75
Average penalty per episode: 0.0
Average reward per episode: -102.06


 92%|█████████▏| 4599/5000 [20:25<00:12, 31.65it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 19%|█▉        | 19/100 [00:00<00:00, 167.22it/s][A
 36%|███▌      | 36/100 [00:00<00:00, 156.06it/s][A
 53%|█████▎    | 53/100 [00:00<00:00, 153.74it/s][A
 69%|██████▉   | 69/100 [00:00<00:00, 126.63it/s][A
100%|██████████| 100/100 [00:00<00:00, 136.85it/s][A
 92%|█████████▏| 4607/5000 [20:26<00:26, 14.67it/s]

Results after 100 episodes:
Average timesteps per episode: 141.13
Average penalty per episode: 0.0
Average reward per episode: -122.86


 93%|█████████▎| 4648/5000 [20:29<00:21, 16.00it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 23%|██▎       | 23/100 [00:00<00:00, 190.72it/s][A
 62%|██████▏   | 62/100 [00:00<00:00, 255.12it/s][A
100%|██████████| 100/100 [00:00<00:00, 157.26it/s][A
 93%|█████████▎| 4653/5000 [20:30<00:36,  9.60it/s]

Results after 100 episodes:
Average timesteps per episode: 121.64
Average penalty per episode: 0.0
Average reward per episode: -102.95


 94%|█████████▍| 4696/5000 [20:31<00:08, 33.99it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 12%|█▏        | 12/100 [00:00<00:00, 108.47it/s][A
 26%|██▌       | 26/100 [00:00<00:00, 118.54it/s][A
 38%|███▊      | 38/100 [00:00<00:00, 71.07it/s] [A
 47%|████▋     | 47/100 [00:00<00:00, 65.96it/s][A
 55%|█████▌    | 55/100 [00:00<00:00, 60.36it/s][A
 67%|██████▋   | 67/100 [00:00<00:00, 72.78it/s][A
 77%|███████▋  | 77/100 [00:01<00:00, 77.49it/s][A
100%|██████████| 100/100 [00:01<00:00, 88.00it/s][A
 94%|█████████▍| 4704/5000 [20:32<00:25, 11.53it/s]

Results after 100 episodes:
Average timesteps per episode: 219.84
Average penalty per episode: 0.0
Average reward per episode: -203.25


 95%|█████████▍| 4749/5000 [20:34<00:07, 35.34it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 18%|█▊        | 18/100 [00:00<00:00, 159.83it/s][A
100%|██████████| 100/100 [00:00<00:00, 370.24it/s][A
 95%|█████████▌| 4753/5000 [20:34<00:12, 20.34it/s]

Results after 100 episodes:
Average timesteps per episode: 52.25
Average penalty per episode: 0.0
Average reward per episode: -32.09


 96%|█████████▌| 4796/5000 [20:35<00:06, 33.44it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 10%|█         | 10/100 [00:00<00:00, 92.90it/s][A
 32%|███▏      | 32/100 [00:00<00:00, 151.08it/s][A
 47%|████▋     | 47/100 [00:00<00:00, 89.71it/s] [A
 58%|█████▊    | 58/100 [00:00<00:00, 92.73it/s][A
 74%|███████▍  | 74/100 [00:00<00:00, 107.27it/s][A
 87%|████████▋ | 87/100 [00:00<00:00, 110.56it/s][A
100%|██████████| 100/100 [00:01<00:00, 97.42it/s][A
 96%|█████████▌| 4804/5000 [20:37<00:16, 12.17it/s]

Results after 100 episodes:
Average timesteps per episode: 200.33
Average penalty per episode: 0.0
Average reward per episode: -183.32


 97%|█████████▋| 4848/5000 [20:38<00:04, 35.76it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 14%|█▍        | 14/100 [00:00<00:00, 125.42it/s][A
 31%|███       | 31/100 [00:00<00:00, 141.23it/s][A
 47%|████▋     | 47/100 [00:00<00:00, 141.04it/s][A
 74%|███████▍  | 74/100 [00:00<00:00, 175.61it/s][A
100%|██████████| 100/100 [00:00<00:00, 147.95it/s][A
 97%|█████████▋| 4857/5000 [20:39<00:08, 16.66it/s]

Results after 100 episodes:
Average timesteps per episode: 131.16
Average penalty per episode: 0.0
Average reward per episode: -112.68


 98%|█████████▊| 4897/5000 [20:40<00:03, 31.47it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 35%|███▌      | 35/100 [00:00<00:00, 278.71it/s][A
 63%|██████▎   | 63/100 [00:00<00:00, 206.41it/s][A
100%|██████████| 100/100 [00:00<00:00, 212.16it/s][A
 98%|█████████▊| 4905/5000 [20:41<00:05, 17.71it/s]

Results after 100 episodes:
Average timesteps per episode: 91.96
Average penalty per episode: 0.0
Average reward per episode: -72.64


 99%|█████████▉| 4947/5000 [20:42<00:01, 31.92it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  5%|▌         | 5/100 [00:00<00:01, 48.24it/s][A
 45%|████▌     | 45/100 [00:00<00:00, 215.26it/s][A
 66%|██████▌   | 66/100 [00:00<00:00, 200.14it/s][A
100%|██████████| 100/100 [00:00<00:00, 190.04it/s][A
 99%|█████████▉| 4954/5000 [20:43<00:02, 16.34it/s]

Results after 100 episodes:
Average timesteps per episode: 101.64
Average penalty per episode: 0.0
Average reward per episode: -82.53


100%|█████████▉| 4999/5000 [20:44<00:00, 31.73it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  5%|▌         | 5/100 [00:00<00:02, 47.33it/s][A
 12%|█▏        | 12/100 [00:00<00:01, 57.51it/s][A
 18%|█▊        | 18/100 [00:00<00:01, 56.22it/s][A
 46%|████▌     | 46/100 [00:00<00:00, 128.42it/s][A
 65%|██████▌   | 65/100 [00:00<00:00, 140.56it/s][A
 81%|████████  | 81/100 [00:00<00:00, 139.82it/s][A
100%|██████████| 100/100 [00:00<00:00, 104.73it/s][A
100%|██████████| 5000/5000 [20:45<00:00,  4.01it/s]


Results after 100 episodes:
Average timesteps per episode: 180.67
Average penalty per episode: 0.0
Average reward per episode: -163.24
Training finished.



100%|██████████| 100/100 [00:00<00:00, 114.28it/s]

Results after 100 episodes:
Average timesteps per episode: 170.75
Average penalty per episode: 0.0
Average reward per episode: -153.11



