Esta notebook contiene bloques de código útiles para realizar Q-learning en el entorno "Continuous Mountain Car"

In [2]:
from tqdm import tqdm
import numpy as np
import wandb
import gym
from car_model import Car
from mountain_car_agent import MountainCarAgent

In [4]:
from continuous_mountain_car_env_extended import ContinuousMountainCarEnvExtended

# Cambiar render_mode a rgb_array para entrenar/testear
env = ContinuousMountainCarEnvExtended(render_mode='rgb_array')

In [5]:
x_bins = 20  # Number of bins for position
vel_bins = 20  # Number of bins for velocity
action_bins = 5  # Number of discrete actions to sample from
model = Car(env, x_bins, vel_bins, action_bins)

In [6]:
alpha = 0.1  # Learning rate
gamma = 0.99  # Discount factor
agent = MountainCarAgent(model, alpha, gamma)

In [26]:
# Train the agent
num_training_episodes = 1000
epsilon = 0.2
average_training_rewards = agent.train(num_training_episodes, epsilon)
print(f"Average training reward over {num_training_episodes} episodes: {average_training_rewards}")

Training Progress:   0%|          | 0/1000 [00:00<?, ?episode/s]

Training Progress: 100%|██████████| 1000/1000 [00:03<00:00, 250.65episode/s, Episode Reward=-45]

Average training reward over 1000 episodes: -161.604





In [27]:
# Evaluate the agent
num_evaluation_episodes = 100
average_evaluation_rewards = agent.test(num_evaluation_episodes)
print(f"Average evaluation reward over {num_evaluation_episodes} episodes: {average_evaluation_rewards}")

Average evaluation reward over 100 episodes: -33.56


Obtener el estado a partir de la observación

In [28]:
wandb.init(project="mountain_car",
           config={
               'x_bins': x_bins,
               'vel_bins': vel_bins,
               'action_bins': action_bins,
               'alpha': alpha,
               'gamma': gamma,
               'epsilon': epsilon,
           })
    
epsilon_initial = epsilon
for t in range(10):
    train_value = agent.train(100, epsilon_initial)
    eval_value = agent.test(30)
    wandb.log({'train_value': train_value, 'eval_value': eval_value, "t": t})
    epsilon_initial *= 0.9  # Decay epsilon over iterations

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmateogiraz27[0m ([33mmateogiraz27-ort[0m). Use [1m`wandb login --relogin`[0m to force relogin


Training Progress: 100%|██████████| 100/100 [00:00<00:00, 266.57episode/s, Episode Reward=-152]
Training Progress: 100%|██████████| 100/100 [00:00<00:00, 230.79episode/s, Episode Reward=-52] 
Training Progress: 100%|██████████| 100/100 [00:00<00:00, 309.43episode/s, Episode Reward=-84]
Training Progress: 100%|██████████| 100/100 [00:00<00:00, 385.02episode/s, Episode Reward=-63]
Training Progress: 100%|██████████| 100/100 [00:00<00:00, 224.38episode/s, Episode Reward=-184]
Training Progress: 100%|██████████| 100/100 [00:00<00:00, 332.26episode/s, Episode Reward=-46] 
Training Progress: 100%|██████████| 100/100 [00:00<00:00, 238.91episode/s, Episode Reward=-82]
Training Progress: 100%|██████████| 100/100 [00:00<00:00, 344.05episode/s, Episode Reward=-46]
Training Progress: 100%|██████████| 100/100 [00:00<00:00, 310.74episode/s, Episode Reward=-46]
Training Progress: 100%|██████████| 100/100 [00:00<00:00, 307.73episode/s, Episode Reward=-40]


In [29]:
import wandb
from car_model import Car
from mountain_car_agent import MountainCarAgent
from continuous_mountain_car_env_extended import ContinuousMountainCarEnvExtended

def sweep():
    wandb.init()
    config = wandb.config

    env = ContinuousMountainCarEnvExtended(render_mode='rgb_array')
    model = Car(env, config.x_bins, config.vel_bins, config.action_bins)
    agent = MountainCarAgent(model, config.alpha, config.gamma)

    base_epsilon = config.epsilon

    for t in range(12):
        train_value = agent.train(500, base_epsilon)
        test_value = agent.test(50)
        wandb.log({"train_value": train_value, "test_value": test_value, "t" : t})
        base_epsilon *= (10-t)/10

sweep_config = {
        'name': 'bayesian-sweep-epsilon-decay',
        'method': 'bayes',
        'metric': {
            'name': 'test_value',
            'goal': 'maximize'
        },
        'parameters': {
            'alpha': {
                'distribution': 'uniform',
                'min': 0.5,
                'max': 0.99
            },
            'epsilon': {
                'distribution': 'uniform',
                'min': 0.5,
                'max': 0.99
            },
            'gamma': {
                'distribution': 'uniform',
                'min': 0.5,
                'max': 0.99
            },
            'action_bins': {
                'distribution': 'int_uniform',
                'max': 100,
                'min': 10
            },
            'vel_bins': {
                'distribution': 'int_uniform',
                'max': 100,
                'min': 10
            },
            'x_bins': {
                'distribution': 'int_uniform',
                'max': 100,
                'min': 10
            }
        }
    }
entity = "mateogiraz27-ort"
project = "mountain_car"
sweep_id = wandb.sweep(sweep_config, entity=entity, project=project)

Create sweep with ID: kvt4c6r7
Sweep URL: https://wandb.ai/mateogiraz27-ort/mountain_car/sweeps/kvt4c6r7


In [30]:
sweep_id = "mgvkiiaf"
wandb.agent(sweep_id, function=sweep, count=100, entity=entity, project=project)

[34m[1mwandb[0m: Agent Starting Run: mtuh3h6m with config:
[34m[1mwandb[0m: 	action_bins: 10
[34m[1mwandb[0m: 	alpha: 0.5349624470038623
[34m[1mwandb[0m: 	epsilon: 0.5111950360798202
[34m[1mwandb[0m: 	gamma: 0.7586095428869182
[34m[1mwandb[0m: 	vel_bins: 47
[34m[1mwandb[0m: 	x_bins: 84


[1;34mwandb[0m: 🚀 View run [33mglamorous-butterfly-512[0m at: [34mhttps://wandb.ai/mateogiraz27-ort/mountain_car/runs/mzk13lz1[0m
[1;34mwandb[0m: Find logs at: [1;35mwandb/run-20241213_010839-mzk13lz1/logs[0m


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 89.63episode/s, Episode Reward=-757]  
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 160.53episode/s, Episode Reward=-256]
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 203.98episode/s, Episode Reward=-234]
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 252.45episode/s, Episode Reward=-148]
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 274.79episode/s, Episode Reward=-67] 
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 323.89episode/s, Episode Reward=-50] 
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 325.27episode/s, Episode Reward=-10] 
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 311.60episode/s, Episode Reward=-7]  
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 324.06episode/s, Episode Reward=-43] 
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 402.48episode/s, Episode Reward=-64] 
Training Progress: 100%|██████████| 500

0,1
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▄▄▇▄███▁▃███
train_value,▁▅▆▇▇███████

0,1
t,11.0
test_value,-33.06
train_value,-48.896


[34m[1mwandb[0m: Agent Starting Run: fga0p2i5 with config:
[34m[1mwandb[0m: 	action_bins: 17
[34m[1mwandb[0m: 	alpha: 0.6853138323766247
[34m[1mwandb[0m: 	epsilon: 0.5109640314754463
[34m[1mwandb[0m: 	gamma: 0.7399405569303439
[34m[1mwandb[0m: 	vel_bins: 75
[34m[1mwandb[0m: 	x_bins: 87


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 80.61episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 104.52episode/s, Episode Reward=-428] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 141.80episode/s, Episode Reward=-525] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 161.34episode/s, Episode Reward=-289]
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 200.12episode/s, Episode Reward=-172]
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 220.01episode/s, Episode Reward=-149]
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 244.46episode/s, Episode Reward=-156]
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 256.95episode/s, Episode Reward=-103]
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 278.53episode/s, Episode Reward=-99] 
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 284.96episode/s, Episode Reward=-89] 
Training Progress: 100%|██████████| 50

0,1
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▇▇▇▆████▇██
train_value,▁▄▅▆▇▇██████

0,1
t,11.0
test_value,-84.64
train_value,-87.74


[34m[1mwandb[0m: Agent Starting Run: gtucxebv with config:
[34m[1mwandb[0m: 	action_bins: 13
[34m[1mwandb[0m: 	alpha: 0.5288530799749362
[34m[1mwandb[0m: 	epsilon: 0.9754818270101902
[34m[1mwandb[0m: 	gamma: 0.5567815836172685
[34m[1mwandb[0m: 	vel_bins: 87
[34m[1mwandb[0m: 	x_bins: 26


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 81.37episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 84.62episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 78.87episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 117.30episode/s, Episode Reward=-501] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 156.76episode/s, Episode Reward=-181]
Training Progress:  66%|██████▌   | 328/500 [00:01<00:00, 221.51episode/s, Episode Reward=-233][34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.
Training Progress:  73%|███████▎  | 366/500 [00:01<00:00, 203.13episode/s, Episode Reward=-128]

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x1081f8080>> (for post_run_cell), with arguments args (<ExecutionResult object at 10ffd8710, execution_count=30 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 10ffd8e30, raw_cell="sweep_id = "mgvkiiaf"
wandb.agent(sweep_id, functi.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/Users/mateogiraz/Documents/MountainCarContinuous%20%281%29/car-continuous/continuous_mountain_car.ipynb#X13sZmlsZQ%3D%3D> result=None>,),kwargs {}:


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 213.21episode/s, Episode Reward=-99] 
Training Progress:   6%|▌         | 31/500 [00:00<00:02, 221.39episode/s, Episode Reward=-80] 

BrokenPipeError: [Errno 32] Broken pipe

In [None]:
import wandb
from car_model import Car
from mountain_car_agent import MountainCarAgent
from continuous_mountain_car_env_extended import ContinuousMountainCarEnvExtended

def sweep_v2():
    wandb.init()
    config = wandb.config

    env = ContinuousMountainCarEnvExtended(render_mode='rgb_array')
    model = Car(env, config.x_bins, config.vel_bins, config.action_bins)
    agent = MountainCarAgent(model, config.alpha, config.gamma)

    base_epsilon = config.epsilon

    for t in range(20):
        train_value = agent.train(500, base_epsilon)
        test_value = agent.test(50)
        wandb.log({"train_value": train_value, "test_value": test_value, "t" : t})
        base_epsilon *= (10-t)/10

sweep_config = {
    'name': 'bayesian-sweep-epsilon-decay-v2',
    'method': 'bayes',
    'metric': {
        'name': 'test_value',
        'goal': 'maximize'
    },
    'parameters': {
        'alpha': {
            'distribution': 'uniform',
            'min': 0.508,
            'max': 0.99
        },
        'epsilon': {
            'distribution': 'uniform',
            'min': 0.503,
            'max': 0.989
        },
        'gamma': {
            'distribution': 'uniform',
            'min': 0.729,
            'max': 0.974
        },
        'action_bins': {
            'distribution': 'int_uniform',
            'min': 10,
            'max': 100
        },
        'vel_bins': {
            'distribution': 'int_uniform',
            'min': 10,
            'max': 25
        },
        'x_bins': {
            'distribution': 'int_uniform',
            'min': 10,
            'max': 24
        }
    }
}
entity = "mateogiraz27-ort"
project = "mountain_car"
sweep_id = wandb.sweep(sweep_config, entity=entity, project=project)

In [6]:
sweep_id = "l6wrf4n4"
wandb.agent(sweep_id, function=sweep_v2, count=100, entity=entity, project=project)

[34m[1mwandb[0m: Agent Starting Run: ohf1cb0f with config:
[34m[1mwandb[0m: 	action_bins: 20
[34m[1mwandb[0m: 	alpha: 0.6718927213010032
[34m[1mwandb[0m: 	epsilon: 0.830565126667738
[34m[1mwandb[0m: 	gamma: 0.8871260539991896
[34m[1mwandb[0m: 	vel_bins: 12
[34m[1mwandb[0m: 	x_bins: 14


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 84.60episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 76.44episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 78.84episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 84.54episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 80.64episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 87.43episode/s, Episode Reward=-1000] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 104.92episode/s, Episode Reward=-656] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 124.53episode/s, Episode Reward=-273] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 122.63episode/s, Episode Reward=-49]  
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 161.87episode/s, Episode Reward=-45]  
Training Progress: 100%|██████████|

0,1
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▂▁▁▁▁▆▆▁▁███
train_value,▁▁▁▁▁▂▄▅▆▇██

0,1
t,11.0
test_value,-29.8
train_value,-27.066


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 6zd5n5cx with config:
[34m[1mwandb[0m: 	action_bins: 26
[34m[1mwandb[0m: 	alpha: 0.5199549354506932
[34m[1mwandb[0m: 	epsilon: 0.9273838229092624
[34m[1mwandb[0m: 	gamma: 0.8667085989818117
[34m[1mwandb[0m: 	vel_bins: 18
[34m[1mwandb[0m: 	x_bins: 14


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 89.57episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 88.77episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 88.99episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 86.73episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 88.47episode/s, Episode Reward=-726] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 105.49episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 136.16episode/s, Episode Reward=-253] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 215.16episode/s, Episode Reward=-86]  
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 262.38episode/s, Episode Reward=-66]  
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 321.54episode/s, Episode Reward=-50] 
Training Progress: 100%|██████████| 

0,1
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▁▁▁▁▁▂▁▇███
train_value,▁▁▁▁▂▃▅▇▇███

0,1
t,11.0
test_value,-5.32
train_value,-89.592


[34m[1mwandb[0m: Agent Starting Run: sgqzcmpw with config:
[34m[1mwandb[0m: 	action_bins: 52
[34m[1mwandb[0m: 	alpha: 0.7828419662676925
[34m[1mwandb[0m: 	epsilon: 0.5486364810025504
[34m[1mwandb[0m: 	gamma: 0.9432563783004267
[34m[1mwandb[0m: 	vel_bins: 19
[34m[1mwandb[0m: 	x_bins: 18


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 68.71episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:07<00:00, 64.66episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 78.06episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 72.08episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 77.89episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 90.11episode/s, Episode Reward=-462]  
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 128.59episode/s, Episode Reward=-164] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 167.59episode/s, Episode Reward=-174] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 227.89episode/s, Episode Reward=-75]  
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 305.17episode/s, Episode Reward=-70]  
Training Progress: 100%|██████████|

0,1
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▂▁▁▁▁▇▁▁▇███
train_value,▁▁▁▁▁▃▅▆▇███

0,1
t,11.0
test_value,-14.0
train_value,-11.198


[34m[1mwandb[0m: Agent Starting Run: dvxuzxnm with config:
[34m[1mwandb[0m: 	action_bins: 60
[34m[1mwandb[0m: 	alpha: 0.7911898107470617
[34m[1mwandb[0m: 	epsilon: 0.9564179604629998
[34m[1mwandb[0m: 	gamma: 0.8979606865250944
[34m[1mwandb[0m: 	vel_bins: 20
[34m[1mwandb[0m: 	x_bins: 23


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 85.96episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 86.66episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 88.82episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 81.32episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 78.34episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 82.64episode/s, Episode Reward=-348] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 105.11episode/s, Episode Reward=-659] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 158.90episode/s, Episode Reward=-477] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 228.45episode/s, Episode Reward=-149] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 217.32episode/s, Episode Reward=-161] 
Training Progress: 100%|██████████| 

0,1
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▁▁▁▁▆▃▆▇█▇█
train_value,▁▁▁▁▁▂▃▆▇▇██

0,1
t,11.0
test_value,5.22
train_value,-91.486


[34m[1mwandb[0m: Agent Starting Run: wz5hony2 with config:
[34m[1mwandb[0m: 	action_bins: 24
[34m[1mwandb[0m: 	alpha: 0.6289323207980988
[34m[1mwandb[0m: 	epsilon: 0.899880060606207
[34m[1mwandb[0m: 	gamma: 0.9216564254309386
[34m[1mwandb[0m: 	vel_bins: 21
[34m[1mwandb[0m: 	x_bins: 21


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 83.70episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 77.26episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 83.43episode/s, Episode Reward=-1000] 
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 76.36episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 83.93episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 79.66episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 84.92episode/s, Episode Reward=-1000] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 173.03episode/s, Episode Reward=-560] 
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 304.74episode/s, Episode Reward=-214]
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 322.18episode/s, Episode Reward=-66] 
Training Progress: 100%|██████████| 5

0,1
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▁▁▁▁▁▁▆▁▇██
train_value,▁▁▁▁▁▂▅▆▇▇██

0,1
t,11.0
test_value,9.04
train_value,5.914


[34m[1mwandb[0m: Agent Starting Run: ubkhvkl3 with config:
[34m[1mwandb[0m: 	action_bins: 16
[34m[1mwandb[0m: 	alpha: 0.6309399333069997
[34m[1mwandb[0m: 	epsilon: 0.9227180355654708
[34m[1mwandb[0m: 	gamma: 0.9327721802686408
[34m[1mwandb[0m: 	vel_bins: 22
[34m[1mwandb[0m: 	x_bins: 24


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 87.72episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 80.66episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 86.17episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 83.67episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 85.91episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 85.58episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 137.69episode/s, Episode Reward=-160] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 209.73episode/s, Episode Reward=-130] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 221.55episode/s, Episode Reward=-223] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 167.60episode/s, Episode Reward=-278]
Training Progress: 100%|██████████| 5

0,1
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▁▁▁▁▁▇▇▅▇██
train_value,▁▁▁▁▁▂▅▇▇███

0,1
t,11.0
test_value,-28.08
train_value,-9.772


[34m[1mwandb[0m: Agent Starting Run: 96gj5h68 with config:
[34m[1mwandb[0m: 	action_bins: 27
[34m[1mwandb[0m: 	alpha: 0.709195235908292
[34m[1mwandb[0m: 	epsilon: 0.6145769002543273
[34m[1mwandb[0m: 	gamma: 0.7891815067094485
[34m[1mwandb[0m: 	vel_bins: 12
[34m[1mwandb[0m: 	x_bins: 14


Training Progress:  94%|█████████▍| 472/500 [00:06<00:00, 82.49episode/s, Episode Reward=-1000][34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.
Training Progress:  95%|█████████▍| 473/500 [00:06<00:00, 82.49episode/s, Episode Reward=-1000]

Training Progress: 100%|██████████| 500/500 [00:06<00:00, 76.39episode/s, Episode Reward=-1000]
Training Progress:  24%|██▎       | 118/500 [00:02<00:06, 56.59episode/s, Episode Reward=-1000]Exception in thread Exception in threading.excepthook:
Exception ignored in thread started by: <bound method Thread._bootstrap of <Thread(Thread-631 (_run_job), stopped 6272348160)>>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/threading.py", line 1032, in _bootstrap
    self._bootstrap_inner()
  File "/opt/anaconda3/lib/python3.12/threading.py", line 1077, in _bootstrap_inner
    self._invoke_excepthook(self)
  File "/opt/anaconda3/lib/python3.12/threading.py", line 1391, in invoke_excepthook
    local_print("Exception in threading.excepthook:",
  File "/Users/mateogiraz/Library/Caches/pypoetry/virtualenvs/ai-project-yycSt0xa-py3.12/lib/python3.12/site-packages/ipykernel/iostream.py", line 604, in flush
    self.pub_thread.schedule(self._flush)
  File "/Users/mateogiraz

In [21]:
# ejecutar el agente con los mejores parametros

alpha = 0.96276
epsilon = 0.75616
gamma = 0.88364
action_bins = 83
x_bins = 10
vel_bins = 13

env = ContinuousMountainCarEnvExtended(render_mode='rgb_array')
model = Car(env, x_bins, vel_bins, action_bins)
agent = MountainCarAgent(model, alpha, gamma)

for t in range(20):
    train_value = agent.train(500, epsilon)
    test_value = agent.test(50)
    epsilon *= (10-t)/10
    print(f"Train value: {train_value}, Test value: {test_value}, Epsilon: {epsilon}")

Training Progress: 100%|██████████| 500/500 [00:06<00:00, 76.44episode/s, Episode Reward=-1000]


Train value: -999.748, Test value: -1000.0, Epsilon: 0.75616


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 78.21episode/s, Episode Reward=-1000]


Train value: -1000.0, Test value: -1000.0, Epsilon: 0.680544


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 79.50episode/s, Episode Reward=-1000]


Train value: -998.956, Test value: -1000.0, Epsilon: 0.5444352


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 80.58episode/s, Episode Reward=-1000]


Train value: -998.43, Test value: -1000.0, Epsilon: 0.38110464


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 78.69episode/s, Episode Reward=-1000]


Train value: -986.656, Test value: -1000.0, Epsilon: 0.22866278399999998


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 85.90episode/s, Episode Reward=-1000]


Train value: -945.766, Test value: -1000.0, Epsilon: 0.11433139199999999


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 90.31episode/s, Episode Reward=-524]  


Train value: -814.97, Test value: -1000.0, Epsilon: 0.0457325568


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 112.32episode/s, Episode Reward=-1000]


Train value: -512.564, Test value: -100.98, Epsilon: 0.013719767039999999


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 160.87episode/s, Episode Reward=-316] 


Train value: -362.738, Test value: -13.42, Epsilon: 0.002743953408


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 248.43episode/s, Episode Reward=-442] 


Train value: -177.526, Test value: -176.8, Epsilon: 0.0002743953408


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 364.14episode/s, Episode Reward=-7]  


Train value: -51.12, Test value: -3.64, Epsilon: 0.0


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 365.54episode/s, Episode Reward=-5] 


Train value: -3.296, Test value: -4.46, Epsilon: -0.0


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 443.29episode/s, Episode Reward=0]  


Train value: -3.54, Test value: -3.22, Epsilon: 0.0


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 456.52episode/s, Episode Reward=-7] 


Train value: -3.29, Test value: -2.64, Epsilon: -0.0


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 433.20episode/s, Episode Reward=-8] 


Train value: -3.29, Test value: -3.22, Epsilon: 0.0


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 437.08episode/s, Episode Reward=-6] 


Train value: -3.224, Test value: -3.22, Epsilon: -0.0


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 311.42episode/s, Episode Reward=0]  


Train value: -3.234, Test value: -3.6, Epsilon: 0.0


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 405.51episode/s, Episode Reward=-10]


Train value: -3.02, Test value: -3.22, Epsilon: -0.0


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 435.90episode/s, Episode Reward=-5] 


Train value: -3.198, Test value: -4.16, Epsilon: 0.0


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 441.37episode/s, Episode Reward=2]  


Train value: -2.868, Test value: -3.24, Epsilon: -0.0


In [20]:
# Guardar el modelo entrenado
import pickle
with open('mountain_car_agent.pkl', 'wb') as f:
    pickle.dump({
        'agent': agent,
        'parameters': {
            'alpha': alpha,
            'epsilon': epsilon,
            'gamma': gamma,
            'action_bins': action_bins,
            'x_bins': x_bins,
            'vel_bins': vel_bins
        }
    }, f)

In [None]:
with open('mountain_car_agent.pkl', 'rb') as f:
    loaded_agent = pickle.load(f)

    test_value = loaded_agent['agent'].test(500)
    print(f"Test value: {test_value}")


In [7]:
from tqdm import tqd
import wandb
import gym
from car_model import Car
from stochastic_mountain_car_agent import StochasticMountainCarAgent
from continuous_mountain_car_env_extended import ContinuousMountainCarEnvExtended

# ejecutar el agente con los mejores parámetros, ahora usando stochastic q learning
alpha = 0.96276
epsilon = 0.75616
gamma = 0.88364
action_bins = 83
x_bins = 10
vel_bins = 13
log_sample_size = int(np.ceil(np.log(action_bins)))  # tamaño del subconjunto

env = ContinuousMountainCarEnvExtended(render_mode='rgb_array')
model = Car(env, x_bins, vel_bins, action_bins)

# Usar el agente estocástico
agent = StochasticMountainCarAgent(model, alpha, gamma, log_sample_size)

# Entrenar y evaluar el agente
for t in range(20):
    train_value = agent.train(500, epsilon)
    test_value = agent.test(50)
    epsilon *= (10-t)/10
    print(f"Train value: {train_value}, Test value: {test_value}, Epsilon: {epsilon}")


Training Progress: 100%|██████████| 500/500 [05:01<00:00,  1.66episode/s, Episode Reward=-1000]


Train value: -997.298, Test value: -1000.0, Epsilon: 0.75616


Training Progress: 100%|██████████| 500/500 [05:53<00:00,  1.41episode/s, Episode Reward=-1000]


Train value: -1000.0, Test value: -1000.0, Epsilon: 0.680544


Training Progress: 100%|██████████| 500/500 [05:23<00:00,  1.54episode/s, Episode Reward=-1000]


Train value: -999.018, Test value: -1000.0, Epsilon: 0.5444352


Training Progress: 100%|██████████| 500/500 [05:14<00:00,  1.59episode/s, Episode Reward=-762] 


Train value: -998.276, Test value: -989.76, Epsilon: 0.38110464


Training Progress: 100%|██████████| 500/500 [04:57<00:00,  1.68episode/s, Episode Reward=-1000]


Train value: -990.336, Test value: -984.84, Epsilon: 0.22866278399999998


Training Progress: 100%|██████████| 500/500 [04:12<00:00,  1.98episode/s, Episode Reward=-1000]


Train value: -951.99, Test value: -1000.0, Epsilon: 0.11433139199999999


Training Progress: 100%|██████████| 500/500 [03:37<00:00,  2.30episode/s, Episode Reward=-1000]


Train value: -770.986, Test value: -1000.0, Epsilon: 0.0457325568


Training Progress: 100%|██████████| 500/500 [01:54<00:00,  4.35episode/s, Episode Reward=-320] 


Train value: -568.99, Test value: -978.58, Epsilon: 0.013719767039999999


Training Progress: 100%|██████████| 500/500 [00:47<00:00, 10.43episode/s, Episode Reward=-1000]


Train value: -310.818, Test value: -966.28, Epsilon: 0.002743953408


Training Progress: 100%|██████████| 500/500 [00:35<00:00, 14.16episode/s, Episode Reward=-270] 


Train value: -239.766, Test value: -952.04, Epsilon: 0.0002743953408


Training Progress: 100%|██████████| 500/500 [00:25<00:00, 19.75episode/s, Episode Reward=-64]  


Train value: -171.852, Test value: -971.84, Epsilon: 0.0


Training Progress: 100%|██████████| 500/500 [00:13<00:00, 35.73episode/s, Episode Reward=-66] 


Train value: -89.958, Test value: -991.16, Epsilon: -0.0


Training Progress: 100%|██████████| 500/500 [00:10<00:00, 46.64episode/s, Episode Reward=-18] 


Train value: -47.976, Test value: -993.4, Epsilon: 0.0


Training Progress: 100%|██████████| 500/500 [00:14<00:00, 33.35episode/s, Episode Reward=-64]  


Train value: -99.93, Test value: -984.04, Epsilon: -0.0


Training Progress: 100%|██████████| 500/500 [00:11<00:00, 44.59episode/s, Episode Reward=-45] 


Train value: -70.042, Test value: -951.94, Epsilon: 0.0


Training Progress: 100%|██████████| 500/500 [00:13<00:00, 35.94episode/s, Episode Reward=-113]


Train value: -104.034, Test value: -987.12, Epsilon: -0.0


Training Progress: 100%|██████████| 500/500 [00:13<00:00, 38.18episode/s, Episode Reward=-134]


Train value: -132.238, Test value: -993.72, Epsilon: 0.0


Training Progress: 100%|██████████| 500/500 [00:12<00:00, 39.22episode/s, Episode Reward=-83] 


Train value: -92.334, Test value: -995.48, Epsilon: -0.0


Training Progress: 100%|██████████| 500/500 [00:15<00:00, 31.59episode/s, Episode Reward=-48]  


Train value: -104.394, Test value: -948.88, Epsilon: 0.0


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 61.46episode/s, Episode Reward=-45] 


Train value: -60.142, Test value: -978.24, Epsilon: -0.0


In [16]:
import wandb
import pickle
from car_model import Car
from stochastic_mountain_car_agent import StochasticMountainCarAgent
from continuous_mountain_car_env_extended import ContinuousMountainCarEnvExtended

def sweep_stochastic():
    wandb.init()
    config = wandb.config

    env = ContinuousMountainCarEnvExtended(render_mode='rgb_array')
    model = Car(env, config.x_bins, config.vel_bins, config.action_bins)

    log_sample_size = int(np.ceil(np.log(config.action_bins)))
    agent = StochasticMountainCarAgent(model, config.alpha, config.gamma, log_sample_size)

    # epsilon base y alpha base para el decay
    base_epsilon = config.epsilon
    base_alpha = config.alpha

    best_test_value = -np.inf
    best_model_path = "mountain_car_stochastic_agent.pkl"


    for t in range(12):
        # alpha decay
        alpha = base_alpha * (1 - t / 12)
        agent.alpha = alpha

        train_value = agent.train(500, base_epsilon)
        test_value = agent.test(50)
        
        wandb.log({
            "train_value": train_value, 
            "test_value": test_value, 
            "epsilon": base_epsilon, 
            "alpha": alpha, 
            "t": t
        })

        base_epsilon *= (10 - t) / 10


        if test_value > best_test_value:
            best_test_value = test_value
            with open(best_model_path, "wb") as f:
                pickle.dump(agent, f)
            print(f"Nuevo mejor modelo guardado con test_value: {best_test_value}")

    artifact = wandb.Artifact("best_model", type="model")
    artifact.add_file(best_model_path)
    wandb.log_artifact(artifact)

sweep_stochastic_confi = {
    'name': 'bayesian-sweep-stochastic-q-learning-alpha-decay',
    'method': 'bayes',
    'metric': {
        'name': 'test_value',
        'goal': 'maximize'
    },
    'parameters': {
        'alpha': {
            'distribution': 'uniform',
            'min': 0.5,
            'max': 0.99
        },
        'epsilon': {
            'distribution': 'uniform',
            'min': 0.1,
            'max': 0.9
        },
        'gamma': {
            'distribution': 'uniform',
            'min': 0.5,
            'max': 0.99
        },
        'action_bins': {
            'distribution': 'int_uniform',
            'min': 20,
            'max': 100
        },
        'vel_bins': {
            'distribution': 'int_uniform',
            'min': 10,
            'max': 100
        },
        'x_bins': {
            'distribution': 'int_uniform',
            'min': 10,
            'max': 100
        }
    }
}

# Registrar el sweep en WandB
entity = "mateogiraz27-ort"
project = "mountain_car"
sweep_id = wandb.sweep(sweep_stochastic_confi, entity=entity, project=project)


Create sweep with ID: vw754ly1
Sweep URL: https://wandb.ai/mateogiraz27-ort/mountain_car/sweeps/vw754ly1


In [17]:
sweep_id = "5rac7rwz"
wandb.agent(sweep_id, function=sweep_stochastic, count=100, entity=entity, project=project)

[34m[1mwandb[0m: Agent Starting Run: 0eajihq8 with config:
[34m[1mwandb[0m: 	action_bins: 82
[34m[1mwandb[0m: 	alpha: 0.7809047492371026
[34m[1mwandb[0m: 	epsilon: 0.10895606074683065
[34m[1mwandb[0m: 	gamma: 0.5603103932565742
[34m[1mwandb[0m: 	vel_bins: 85
[34m[1mwandb[0m: 	x_bins: 61


Training Progress: 100%|██████████| 500/500 [00:52<00:00,  9.53episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -991.76


Training Progress: 100%|██████████| 500/500 [00:45<00:00, 10.90episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -963.54


Training Progress: 100%|██████████| 500/500 [00:39<00:00, 12.77episode/s, Episode Reward=-428] 


Nuevo mejor modelo guardado con test_value: -945.16


Training Progress: 100%|██████████| 500/500 [00:34<00:00, 14.52episode/s, Episode Reward=-375] 


Nuevo mejor modelo guardado con test_value: -921.36


Training Progress: 100%|██████████| 500/500 [00:31<00:00, 15.64episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:27<00:00, 18.16episode/s, Episode Reward=-386] 


Nuevo mejor modelo guardado con test_value: -920.78


Training Progress: 100%|██████████| 500/500 [00:25<00:00, 19.32episode/s, Episode Reward=-392] 
Training Progress: 100%|██████████| 500/500 [00:25<00:00, 19.61episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -904.54


Training Progress: 100%|██████████| 500/500 [00:21<00:00, 22.83episode/s, Episode Reward=-535] 
Training Progress: 100%|██████████| 500/500 [00:21<00:00, 23.33episode/s, Episode Reward=-368] 


Nuevo mejor modelo guardado con test_value: -900.42


Training Progress: 100%|██████████| 500/500 [00:20<00:00, 24.08episode/s, Episode Reward=-578] 


Nuevo mejor modelo guardado con test_value: -834.72


Training Progress: 100%|██████████| 500/500 [00:16<00:00, 30.94episode/s, Episode Reward=-505]


Nuevo mejor modelo guardado con test_value: -831.38


VBox(children=(Label(value='0.318 MB of 3.346 MB uploaded\r'), FloatProgress(value=0.09507056605333178, max=1.…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▂▃▄▄▄▃▅▄▅██
train_value,▁▁▃▄▅▆▆▆▇▇▇█

0,1
alpha,0.06508
epsilon,0.0
t,11.0
test_value,-831.38
train_value,-338.888


[34m[1mwandb[0m: Agent Starting Run: ui1fwpls with config:
[34m[1mwandb[0m: 	action_bins: 40
[34m[1mwandb[0m: 	alpha: 0.8429086368837633
[34m[1mwandb[0m: 	epsilon: 0.523047475073592
[34m[1mwandb[0m: 	gamma: 0.9351552534456912
[34m[1mwandb[0m: 	vel_bins: 44
[34m[1mwandb[0m: 	x_bins: 81


Training Progress: 100%|██████████| 500/500 [00:25<00:00, 19.79episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -773.7


Training Progress: 100%|██████████| 500/500 [00:24<00:00, 20.80episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:22<00:00, 22.07episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:15<00:00, 32.18episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:08<00:00, 59.89episode/s, Episode Reward=-71]  
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 104.05episode/s, Episode Reward=-145]


Nuevo mejor modelo guardado con test_value: -735.88


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 122.52episode/s, Episode Reward=-55] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 147.25episode/s, Episode Reward=-71] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 166.33episode/s, Episode Reward=14]  
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 218.34episode/s, Episode Reward=18] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 223.00episode/s, Episode Reward=15] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 238.73episode/s, Episode Reward=17] 


VBox(children=(Label(value='0.880 MB of 1.135 MB uploaded\r'), FloatProgress(value=0.7755759687147609, max=1.0…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▆▁▃▂▄█▄▅▃▂▃▆
train_value,▁▂▂▅▆▇▇█████

0,1
alpha,0.07024
epsilon,0.0
t,11.0
test_value,-794.24
train_value,5.508


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: zv2icrzu with config:
[34m[1mwandb[0m: 	action_bins: 66
[34m[1mwandb[0m: 	alpha: 0.6908167599275631
[34m[1mwandb[0m: 	epsilon: 0.4518318066991161
[34m[1mwandb[0m: 	gamma: 0.7454786690524214
[34m[1mwandb[0m: 	vel_bins: 98
[34m[1mwandb[0m: 	x_bins: 45


Training Progress: 100%|██████████| 500/500 [00:37<00:00, 13.32episode/s, Episode Reward=-868] 


Nuevo mejor modelo guardado con test_value: -1000.0


Training Progress: 100%|██████████| 500/500 [00:34<00:00, 14.70episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -868.36


Training Progress: 100%|██████████| 500/500 [00:29<00:00, 17.20episode/s, Episode Reward=-841] 


Nuevo mejor modelo guardado con test_value: -756.4


Training Progress: 100%|██████████| 500/500 [00:24<00:00, 20.76episode/s, Episode Reward=-504] 


Nuevo mejor modelo guardado con test_value: -687.42


Training Progress: 100%|██████████| 500/500 [00:20<00:00, 23.94episode/s, Episode Reward=-801] 


Nuevo mejor modelo guardado con test_value: -575.54


Training Progress: 100%|██████████| 500/500 [00:16<00:00, 30.03episode/s, Episode Reward=-386] 
Training Progress: 100%|██████████| 500/500 [00:16<00:00, 31.07episode/s, Episode Reward=-267] 
Training Progress: 100%|██████████| 500/500 [00:15<00:00, 32.24episode/s, Episode Reward=-284]
Training Progress: 100%|██████████| 500/500 [00:14<00:00, 34.22episode/s, Episode Reward=-182]
Training Progress: 100%|██████████| 500/500 [00:13<00:00, 36.86episode/s, Episode Reward=-212]
Training Progress: 100%|██████████| 500/500 [00:14<00:00, 35.66episode/s, Episode Reward=-225]
Training Progress: 100%|██████████| 500/500 [00:11<00:00, 43.53episode/s, Episode Reward=-346]


VBox(children=(Label(value='0.130 MB of 2.303 MB uploaded\r'), FloatProgress(value=0.05662415285754754, max=1.…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▃▅▆█▇▅▄▆▅▅▆
train_value,▁▁▃▅▆▇▇▇▇███

0,1
alpha,0.05757
epsilon,0.0
t,11.0
test_value,-724.6
train_value,-234.95


[34m[1mwandb[0m: Agent Starting Run: 826wasoy with config:
[34m[1mwandb[0m: 	action_bins: 64
[34m[1mwandb[0m: 	alpha: 0.7879209491849253
[34m[1mwandb[0m: 	epsilon: 0.7045485012724528
[34m[1mwandb[0m: 	gamma: 0.5694260261529092
[34m[1mwandb[0m: 	vel_bins: 63
[34m[1mwandb[0m: 	x_bins: 12


Training Progress: 100%|██████████| 500/500 [00:43<00:00, 11.59episode/s, Episode Reward=-741] 


Nuevo mejor modelo guardado con test_value: -542.7


Training Progress: 100%|██████████| 500/500 [00:39<00:00, 12.51episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:38<00:00, 13.15episode/s, Episode Reward=-752] 
Training Progress: 100%|██████████| 500/500 [00:35<00:00, 13.97episode/s, Episode Reward=-724] 
Training Progress: 100%|██████████| 500/500 [00:25<00:00, 19.95episode/s, Episode Reward=-272] 
Training Progress: 100%|██████████| 500/500 [00:20<00:00, 24.09episode/s, Episode Reward=-228] 
Training Progress: 100%|██████████| 500/500 [00:14<00:00, 34.76episode/s, Episode Reward=-112]
Training Progress: 100%|██████████| 500/500 [00:12<00:00, 41.47episode/s, Episode Reward=-191]
Training Progress: 100%|██████████| 500/500 [00:14<00:00, 35.20episode/s, Episode Reward=-83]  
Training Progress: 100%|██████████| 500/500 [00:11<00:00, 42.17episode/s, Episode Reward=-75]  
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 71.53episode/s, Episode Reward=-12]  
Training Progress: 100%|██████████| 500/50

VBox(children=(Label(value='0.396 MB of 0.415 MB uploaded\r'), FloatProgress(value=0.9535325038137072, max=1.0…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,█▅▇▄▄▃▂▁▁▁▂▂
train_value,▁▁▂▃▅▆▇▇▇▇██

0,1
alpha,0.06566
epsilon,0.0
t,11.0
test_value,-919.32
train_value,-60.864


[34m[1mwandb[0m: Agent Starting Run: w6x7c6gv with config:
[34m[1mwandb[0m: 	action_bins: 97
[34m[1mwandb[0m: 	alpha: 0.5250727435600488
[34m[1mwandb[0m: 	epsilon: 0.5213671375318365
[34m[1mwandb[0m: 	gamma: 0.6040809782729827
[34m[1mwandb[0m: 	vel_bins: 89
[34m[1mwandb[0m: 	x_bins: 86


Training Progress: 100%|██████████| 500/500 [00:50<00:00,  9.95episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -1000.0


Training Progress: 100%|██████████| 500/500 [00:44<00:00, 11.16episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:46<00:00, 10.77episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -983.54


Training Progress: 100%|██████████| 500/500 [00:44<00:00, 11.20episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:45<00:00, 11.05episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:43<00:00, 11.58episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:41<00:00, 11.93episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:41<00:00, 12.13episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:39<00:00, 12.58episode/s, Episode Reward=-822] 
Training Progress: 100%|██████████| 500/500 [00:42<00:00, 11.77episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:37<00:00, 13.48episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:33<00:00, 14.88episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -967.7


VBox(children=(Label(value='1.005 MB of 5.805 MB uploaded\r'), FloatProgress(value=0.1731799658447109, max=1.0…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▁▅▃▄▄▄▄▁▁▂█
train_value,▁▁▁▁▂▂▄▄▅▅▆█

0,1
alpha,0.04376
epsilon,0.0
t,11.0
test_value,-967.7
train_value,-680.454


[34m[1mwandb[0m: Agent Starting Run: t670f70k with config:
[34m[1mwandb[0m: 	action_bins: 27
[34m[1mwandb[0m: 	alpha: 0.6201045371487013
[34m[1mwandb[0m: 	epsilon: 0.6339200234950851
[34m[1mwandb[0m: 	gamma: 0.5135313048189525
[34m[1mwandb[0m: 	vel_bins: 83
[34m[1mwandb[0m: 	x_bins: 60


Training Progress: 100%|██████████| 500/500 [00:19<00:00, 25.59episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -869.46


Training Progress: 100%|██████████| 500/500 [00:18<00:00, 27.67episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -348.2


Training Progress: 100%|██████████| 500/500 [00:13<00:00, 35.94episode/s, Episode Reward=-587] 


Nuevo mejor modelo guardado con test_value: -283.58


Training Progress: 100%|██████████| 500/500 [00:10<00:00, 48.89episode/s, Episode Reward=-412] 
Training Progress: 100%|██████████| 500/500 [00:08<00:00, 59.53episode/s, Episode Reward=-146]
Training Progress: 100%|██████████| 500/500 [00:07<00:00, 64.59episode/s, Episode Reward=-288]
Training Progress: 100%|██████████| 500/500 [00:07<00:00, 66.16episode/s, Episode Reward=-274]
Training Progress: 100%|██████████| 500/500 [00:07<00:00, 67.73episode/s, Episode Reward=-152] 
Training Progress: 100%|██████████| 500/500 [00:07<00:00, 65.24episode/s, Episode Reward=-136]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 74.69episode/s, Episode Reward=-257] 
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 77.76episode/s, Episode Reward=-208] 
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 82.95episode/s, Episode Reward=-243] 


VBox(children=(Label(value='1.005 MB of 1.064 MB uploaded\r'), FloatProgress(value=0.9445421628126403, max=1.0…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▇██▇▇▅▃▄▃▄▅
train_value,▁▂▄▆▇▇█▇▇███

0,1
alpha,0.05168
epsilon,0.0
t,11.0
test_value,-573.04
train_value,-211.716


[34m[1mwandb[0m: Agent Starting Run: 8rge9sqh with config:
[34m[1mwandb[0m: 	action_bins: 30
[34m[1mwandb[0m: 	alpha: 0.6980839823592143
[34m[1mwandb[0m: 	epsilon: 0.28843997668359067
[34m[1mwandb[0m: 	gamma: 0.6582281852897766
[34m[1mwandb[0m: 	vel_bins: 71
[34m[1mwandb[0m: 	x_bins: 36


Training Progress: 100%|██████████| 500/500 [00:20<00:00, 23.98episode/s, Episode Reward=-407] 


Nuevo mejor modelo guardado con test_value: -509.72


Training Progress: 100%|██████████| 500/500 [00:12<00:00, 40.63episode/s, Episode Reward=-445] 


Nuevo mejor modelo guardado con test_value: -408.34


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 58.79episode/s, Episode Reward=-230]
Training Progress: 100%|██████████| 500/500 [00:07<00:00, 67.62episode/s, Episode Reward=-242]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 86.46episode/s, Episode Reward=-136] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 93.60episode/s, Episode Reward=-161] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 106.45episode/s, Episode Reward=-82] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 116.85episode/s, Episode Reward=-28] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 119.59episode/s, Episode Reward=-95] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 131.28episode/s, Episode Reward=-173]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 131.90episode/s, Episode Reward=-50] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 153.15episode/s, Episode Reward=-56] 


VBox(children=(Label(value='0.618 MB of 0.618 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▆█▇▇▆▅▅▄▄▁▄▆
train_value,▁▄▆▆▇▇▇█████

0,1
alpha,0.05817
epsilon,0.0
t,11.0
test_value,-519.54
train_value,-49.912


[34m[1mwandb[0m: Agent Starting Run: 8uaxadvi with config:
[34m[1mwandb[0m: 	action_bins: 28
[34m[1mwandb[0m: 	alpha: 0.6561640607866815
[34m[1mwandb[0m: 	epsilon: 0.48476116989667695
[34m[1mwandb[0m: 	gamma: 0.6332457200635534
[34m[1mwandb[0m: 	vel_bins: 72
[34m[1mwandb[0m: 	x_bins: 53


Training Progress: 100%|██████████| 500/500 [00:18<00:00, 26.42episode/s, Episode Reward=-700] 


Nuevo mejor modelo guardado con test_value: -633.42


Training Progress: 100%|██████████| 500/500 [00:14<00:00, 34.89episode/s, Episode Reward=-351] 


Nuevo mejor modelo guardado con test_value: -340.64


Training Progress: 100%|██████████| 500/500 [00:09<00:00, 50.13episode/s, Episode Reward=-326] 


Nuevo mejor modelo guardado con test_value: -313.12


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 63.99episode/s, Episode Reward=-137]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 71.63episode/s, Episode Reward=-221] 
Training Progress: 100%|██████████| 500/500 [00:09<00:00, 54.81episode/s, Episode Reward=-282] 
Training Progress: 100%|██████████| 500/500 [00:08<00:00, 62.38episode/s, Episode Reward=-198]
Training Progress: 100%|██████████| 500/500 [00:08<00:00, 60.56episode/s, Episode Reward=-211]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 85.69episode/s, Episode Reward=-124] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 96.64episode/s, Episode Reward=-106] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 97.54episode/s, Episode Reward=-220] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 104.72episode/s, Episode Reward=-120]


VBox(children=(Label(value='0.851 MB of 0.851 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▇██▇▆▄▂▃▃▄▄
train_value,▁▃▅▆▇▇▇█████

0,1
alpha,0.05468
epsilon,0.0
t,11.0
test_value,-486.3
train_value,-128.638


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: f9jg1rpr with config:
[34m[1mwandb[0m: 	action_bins: 25
[34m[1mwandb[0m: 	alpha: 0.5607994007717783
[34m[1mwandb[0m: 	epsilon: 0.3994566775728017
[34m[1mwandb[0m: 	gamma: 0.7018632340611032
[34m[1mwandb[0m: 	vel_bins: 89
[34m[1mwandb[0m: 	x_bins: 42


Training Progress: 100%|██████████| 500/500 [00:19<00:00, 25.42episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -609.68


Training Progress: 100%|██████████| 500/500 [00:13<00:00, 35.76episode/s, Episode Reward=-343] 


Nuevo mejor modelo guardado con test_value: -382.46


Training Progress: 100%|██████████| 500/500 [00:09<00:00, 53.18episode/s, Episode Reward=-326] 


Nuevo mejor modelo guardado con test_value: -373.52


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 65.80episode/s, Episode Reward=-260]


Nuevo mejor modelo guardado con test_value: -323.54


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 79.17episode/s, Episode Reward=-303]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 81.24episode/s, Episode Reward=-72]  
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 91.36episode/s, Episode Reward=-100] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 95.54episode/s, Episode Reward=-265] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 99.58episode/s, Episode Reward=-184] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 111.41episode/s, Episode Reward=-141]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 111.30episode/s, Episode Reward=-134]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 123.21episode/s, Episode Reward=-76] 


VBox(children=(Label(value='0.747 MB of 0.747 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▇▇█▇▇▅▄▄▆▇▇
train_value,▁▄▆▆▇▇▇█████

0,1
alpha,0.04673
epsilon,0.0
t,11.0
test_value,-376.74
train_value,-108.968


[34m[1mwandb[0m: Agent Starting Run: 4o6lc7wk with config:
[34m[1mwandb[0m: 	action_bins: 20
[34m[1mwandb[0m: 	alpha: 0.5446064502493405
[34m[1mwandb[0m: 	epsilon: 0.35492869340691735
[34m[1mwandb[0m: 	gamma: 0.8025053091338076
[34m[1mwandb[0m: 	vel_bins: 88
[34m[1mwandb[0m: 	x_bins: 15


Training Progress: 100%|██████████| 500/500 [00:16<00:00, 31.11episode/s, Episode Reward=-377] 


Nuevo mejor modelo guardado con test_value: -529.32


Training Progress: 100%|██████████| 500/500 [00:11<00:00, 44.02episode/s, Episode Reward=-153] 
Training Progress: 100%|██████████| 500/500 [00:11<00:00, 44.48episode/s, Episode Reward=-531] 
Training Progress: 100%|██████████| 500/500 [00:07<00:00, 63.01episode/s, Episode Reward=-140] 
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 76.64episode/s, Episode Reward=-220] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 87.03episode/s, Episode Reward=-136] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 104.13episode/s, Episode Reward=-41] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 126.61episode/s, Episode Reward=-135]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 154.35episode/s, Episode Reward=-46] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 148.85episode/s, Episode Reward=-55] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 140.19episode/s, Episode Reward=-54] 
Training Progress: 100%|██████████| 500/

VBox(children=(Label(value='0.226 MB of 0.226 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,█▅▃▆▃▄▁▁▃▂▁▃
train_value,▁▄▄▅▆▇▇▇████

0,1
alpha,0.04538
epsilon,0.0
t,11.0
test_value,-744.26
train_value,-50.802


[34m[1mwandb[0m: Agent Starting Run: ehhsus5c with config:
[34m[1mwandb[0m: 	action_bins: 35
[34m[1mwandb[0m: 	alpha: 0.5958720290709973
[34m[1mwandb[0m: 	epsilon: 0.475743102517919
[34m[1mwandb[0m: 	gamma: 0.7349122238635931
[34m[1mwandb[0m: 	vel_bins: 98
[34m[1mwandb[0m: 	x_bins: 62


Training Progress: 100%|██████████| 500/500 [00:21<00:00, 23.12episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -960.78


Training Progress: 100%|██████████| 500/500 [00:19<00:00, 25.19episode/s, Episode Reward=-380] 


Nuevo mejor modelo guardado con test_value: -677.96


Training Progress: 100%|██████████| 500/500 [00:16<00:00, 30.90episode/s, Episode Reward=-352] 


Nuevo mejor modelo guardado con test_value: -504.64


Training Progress: 100%|██████████| 500/500 [00:13<00:00, 38.39episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:10<00:00, 46.21episode/s, Episode Reward=-332] 
Training Progress: 100%|██████████| 500/500 [00:10<00:00, 49.19episode/s, Episode Reward=-591] 
Training Progress: 100%|██████████| 500/500 [00:09<00:00, 52.89episode/s, Episode Reward=-218]
Training Progress: 100%|██████████| 500/500 [00:09<00:00, 54.69episode/s, Episode Reward=-265]
Training Progress: 100%|██████████| 500/500 [00:08<00:00, 57.51episode/s, Episode Reward=-341]
Training Progress: 100%|██████████| 500/500 [00:08<00:00, 61.85episode/s, Episode Reward=-388]
Training Progress: 100%|██████████| 500/500 [00:08<00:00, 60.22episode/s, Episode Reward=-154]
Training Progress: 100%|██████████| 500/500 [00:07<00:00, 71.25episode/s, Episode Reward=-210]


VBox(children=(Label(value='0.771 MB of 1.675 MB uploaded\r'), FloatProgress(value=0.46037520723191694, max=1.…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▅██▇▇▆▇▇▆▄▇
train_value,▁▂▃▅▆▇▇▇▇███

0,1
alpha,0.04966
epsilon,0.0
t,11.0
test_value,-558.76
train_value,-239.206


[34m[1mwandb[0m: Agent Starting Run: 5cfxto8r with config:
[34m[1mwandb[0m: 	action_bins: 24
[34m[1mwandb[0m: 	alpha: 0.5261746430196825
[34m[1mwandb[0m: 	epsilon: 0.3329342368146485
[34m[1mwandb[0m: 	gamma: 0.6419766304095196
[34m[1mwandb[0m: 	vel_bins: 68
[34m[1mwandb[0m: 	x_bins: 58


Training Progress: 100%|██████████| 500/500 [00:18<00:00, 26.35episode/s, Episode Reward=-813] 


Nuevo mejor modelo guardado con test_value: -780.28


Training Progress: 100%|██████████| 500/500 [00:13<00:00, 37.65episode/s, Episode Reward=-387] 


Nuevo mejor modelo guardado con test_value: -547.28


Training Progress: 100%|██████████| 500/500 [00:09<00:00, 50.79episode/s, Episode Reward=-507] 


Nuevo mejor modelo guardado con test_value: -390.34


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 63.26episode/s, Episode Reward=-378]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 74.57episode/s, Episode Reward=-348] 
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 77.62episode/s, Episode Reward=-242] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 84.94episode/s, Episode Reward=-315] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 84.74episode/s, Episode Reward=-152] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 90.97episode/s, Episode Reward=-306] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 97.43episode/s, Episode Reward=-161] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 101.26episode/s, Episode Reward=-136]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 108.78episode/s, Episode Reward=-167]


VBox(children=(Label(value='0.754 MB of 0.754 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▅██▇█▇▇▇▇█▇
train_value,▁▄▆▆▇▇▇█████

0,1
alpha,0.04385
epsilon,0.0
t,11.0
test_value,-448.32
train_value,-140.686


[34m[1mwandb[0m: Agent Starting Run: 4y3mnlcx with config:
[34m[1mwandb[0m: 	action_bins: 22
[34m[1mwandb[0m: 	alpha: 0.5640263864416667
[34m[1mwandb[0m: 	epsilon: 0.4885818901352794
[34m[1mwandb[0m: 	gamma: 0.612615353883662
[34m[1mwandb[0m: 	vel_bins: 65
[34m[1mwandb[0m: 	x_bins: 51


Training Progress: 100%|██████████| 500/500 [00:17<00:00, 28.54episode/s, Episode Reward=-398] 


Nuevo mejor modelo guardado con test_value: -500.74


Training Progress: 100%|██████████| 500/500 [00:11<00:00, 42.66episode/s, Episode Reward=-235] 


Nuevo mejor modelo guardado con test_value: -307.06


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 60.93episode/s, Episode Reward=-235] 


Nuevo mejor modelo guardado con test_value: -258.56


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 74.70episode/s, Episode Reward=-306] 


Nuevo mejor modelo guardado con test_value: -249.24


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 89.04episode/s, Episode Reward=-232] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 90.77episode/s, Episode Reward=-316] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 96.57episode/s, Episode Reward=-227] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 104.44episode/s, Episode Reward=-235]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 108.72episode/s, Episode Reward=-96] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 110.03episode/s, Episode Reward=-86] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 116.07episode/s, Episode Reward=-57] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 127.83episode/s, Episode Reward=-62] 


VBox(children=(Label(value='0.224 MB of 0.585 MB uploaded\r'), FloatProgress(value=0.383323898996589, max=1.0)…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▆██▄▅▄▅▄▄▃▆
train_value,▁▄▆▆▇▇▇▇████

0,1
alpha,0.047
epsilon,0.0
t,11.0
test_value,-335.88
train_value,-94.212


[34m[1mwandb[0m: Agent Starting Run: dewdga9e with config:
[34m[1mwandb[0m: 	action_bins: 30
[34m[1mwandb[0m: 	alpha: 0.5566218626228528
[34m[1mwandb[0m: 	epsilon: 0.4559393164017323
[34m[1mwandb[0m: 	gamma: 0.6616403552662453
[34m[1mwandb[0m: 	vel_bins: 64
[34m[1mwandb[0m: 	x_bins: 45


Training Progress: 100%|██████████| 500/500 [00:21<00:00, 23.71episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -678.48


Training Progress: 100%|██████████| 500/500 [00:14<00:00, 33.47episode/s, Episode Reward=-468] 


Nuevo mejor modelo guardado con test_value: -332.0


Training Progress: 100%|██████████| 500/500 [00:10<00:00, 46.62episode/s, Episode Reward=-406] 


Nuevo mejor modelo guardado con test_value: -321.08


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 58.80episode/s, Episode Reward=-356]
Training Progress: 100%|██████████| 500/500 [00:07<00:00, 71.39episode/s, Episode Reward=-229]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 80.39episode/s, Episode Reward=-173] 
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 81.73episode/s, Episode Reward=-235] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 88.53episode/s, Episode Reward=-133] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 84.94episode/s, Episode Reward=-97]  
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 97.15episode/s, Episode Reward=-180] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 108.50episode/s, Episode Reward=-89] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 124.33episode/s, Episode Reward=-61] 


VBox(children=(Label(value='0.130 MB of 0.693 MB uploaded\r'), FloatProgress(value=0.18802597770783475, max=1.…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁██▇▇█▅▅▅▂▄▄
train_value,▁▃▅▆▇▇▇▇████

0,1
alpha,0.04639
epsilon,0.0
t,11.0
test_value,-534.6
train_value,-89.666


[34m[1mwandb[0m: Agent Starting Run: 4lmuhvqg with config:
[34m[1mwandb[0m: 	action_bins: 23
[34m[1mwandb[0m: 	alpha: 0.5774058161654966
[34m[1mwandb[0m: 	epsilon: 0.21936485519420929
[34m[1mwandb[0m: 	gamma: 0.5002294881235695
[34m[1mwandb[0m: 	vel_bins: 99
[34m[1mwandb[0m: 	x_bins: 56


Training Progress: 100%|██████████| 500/500 [00:18<00:00, 27.31episode/s, Episode Reward=-729] 


Nuevo mejor modelo guardado con test_value: -914.34


Training Progress: 100%|██████████| 500/500 [00:14<00:00, 33.44episode/s, Episode Reward=-810] 


Nuevo mejor modelo guardado con test_value: -659.42


Training Progress: 100%|██████████| 500/500 [00:11<00:00, 44.18episode/s, Episode Reward=-657] 


Nuevo mejor modelo guardado con test_value: -483.88


Training Progress: 100%|██████████| 500/500 [00:09<00:00, 52.18episode/s, Episode Reward=-510] 
Training Progress: 100%|██████████| 500/500 [00:08<00:00, 60.33episode/s, Episode Reward=-374] 
Training Progress: 100%|██████████| 500/500 [00:07<00:00, 62.58episode/s, Episode Reward=-652] 
Training Progress: 100%|██████████| 500/500 [00:07<00:00, 63.75episode/s, Episode Reward=-334]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 73.81episode/s, Episode Reward=-215]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 76.43episode/s, Episode Reward=-264] 
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 77.10episode/s, Episode Reward=-259] 
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 77.39episode/s, Episode Reward=-325] 


Nuevo mejor modelo guardado con test_value: -481.16


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 87.18episode/s, Episode Reward=-343] 


Nuevo mejor modelo guardado con test_value: -469.96


VBox(children=(Label(value='1.009 MB of 1.009 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▅█▆▆▆▆▇▇▇██
train_value,▁▃▅▆▇▇▇▇████

0,1
alpha,0.04812
epsilon,0.0
t,11.0
test_value,-469.96
train_value,-211.006


[34m[1mwandb[0m: Agent Starting Run: qdttgn6a with config:
[34m[1mwandb[0m: 	action_bins: 26
[34m[1mwandb[0m: 	alpha: 0.5104483661211227
[34m[1mwandb[0m: 	epsilon: 0.5014631813479172
[34m[1mwandb[0m: 	gamma: 0.5679873334399104
[34m[1mwandb[0m: 	vel_bins: 82
[34m[1mwandb[0m: 	x_bins: 60


Training Progress: 100%|██████████| 500/500 [00:18<00:00, 27.16episode/s, Episode Reward=-660] 


Nuevo mejor modelo guardado con test_value: -840.2


Training Progress: 100%|██████████| 500/500 [00:15<00:00, 31.70episode/s, Episode Reward=-650] 


Nuevo mejor modelo guardado con test_value: -429.44


Training Progress: 100%|██████████| 500/500 [00:12<00:00, 40.02episode/s, Episode Reward=-586] 


Nuevo mejor modelo guardado con test_value: -325.76


Training Progress: 100%|██████████| 500/500 [00:10<00:00, 47.70episode/s, Episode Reward=-461] 
Training Progress: 100%|██████████| 500/500 [00:08<00:00, 55.73episode/s, Episode Reward=-271]
Training Progress: 100%|██████████| 500/500 [00:08<00:00, 59.76episode/s, Episode Reward=-473]
Training Progress: 100%|██████████| 500/500 [00:07<00:00, 66.74episode/s, Episode Reward=-386]
Training Progress: 100%|██████████| 500/500 [00:07<00:00, 66.86episode/s, Episode Reward=-241]
Training Progress: 100%|██████████| 500/500 [00:07<00:00, 67.71episode/s, Episode Reward=-225] 
Training Progress: 100%|██████████| 500/500 [00:07<00:00, 70.58episode/s, Episode Reward=-152] 
Training Progress: 100%|██████████| 500/500 [00:07<00:00, 71.02episode/s, Episode Reward=-137]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 78.09episode/s, Episode Reward=-182] 


VBox(children=(Label(value='0.802 MB of 1.013 MB uploaded\r'), FloatProgress(value=0.7917963760141804, max=1.0…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▇█▇▆▅▆▄▄▄▃▃
train_value,▁▂▄▆▇▇▇█████

0,1
alpha,0.04254
epsilon,0.0
t,11.0
test_value,-664.2
train_value,-232.746


[34m[1mwandb[0m: Agent Starting Run: 0htuq6t3 with config:
[34m[1mwandb[0m: 	action_bins: 20
[34m[1mwandb[0m: 	alpha: 0.6216568602980768
[34m[1mwandb[0m: 	epsilon: 0.6410649587594561
[34m[1mwandb[0m: 	gamma: 0.6508084391559497
[34m[1mwandb[0m: 	vel_bins: 64
[34m[1mwandb[0m: 	x_bins: 38


Training Progress: 100%|██████████| 500/500 [00:15<00:00, 32.76episode/s, Episode Reward=-581] 


Nuevo mejor modelo guardado con test_value: -339.34


Training Progress: 100%|██████████| 500/500 [00:10<00:00, 47.35episode/s, Episode Reward=-627] 


Nuevo mejor modelo guardado con test_value: -240.18


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 65.73episode/s, Episode Reward=-436] 


Nuevo mejor modelo guardado con test_value: -233.2


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 96.44episode/s, Episode Reward=-144] 


Nuevo mejor modelo guardado con test_value: -219.76


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 116.15episode/s, Episode Reward=-123]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 124.35episode/s, Episode Reward=-176]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 141.04episode/s, Episode Reward=-182]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 135.62episode/s, Episode Reward=-73] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 153.61episode/s, Episode Reward=-52] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 166.41episode/s, Episode Reward=-17] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 179.35episode/s, Episode Reward=-13] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 204.10episode/s, Episode Reward=-50] 


VBox(children=(Label(value='0.395 MB of 0.395 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▆███▇▅▄▃▃▁▃▃
train_value,▁▄▅▆▇▇▇▇████

0,1
alpha,0.0518
epsilon,0.0
t,11.0
test_value,-520.38
train_value,-29.978


[34m[1mwandb[0m: Agent Starting Run: 4s5rmc7e with config:
[34m[1mwandb[0m: 	action_bins: 22
[34m[1mwandb[0m: 	alpha: 0.5723230194948227
[34m[1mwandb[0m: 	epsilon: 0.5071602036459191
[34m[1mwandb[0m: 	gamma: 0.6396136609340646
[34m[1mwandb[0m: 	vel_bins: 68
[34m[1mwandb[0m: 	x_bins: 52


Training Progress: 100%|██████████| 500/500 [00:16<00:00, 29.74episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -544.88


Training Progress: 100%|██████████| 500/500 [00:12<00:00, 41.65episode/s, Episode Reward=-822] 


Nuevo mejor modelo guardado con test_value: -293.32


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 60.36episode/s, Episode Reward=-297]


Nuevo mejor modelo guardado con test_value: -246.74


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 76.78episode/s, Episode Reward=-436]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 81.80episode/s, Episode Reward=-243] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 97.70episode/s, Episode Reward=-148] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 99.44episode/s, Episode Reward=-195] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 103.94episode/s, Episode Reward=-119]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 109.16episode/s, Episode Reward=-160]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 113.20episode/s, Episode Reward=-157]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 116.05episode/s, Episode Reward=-58] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 134.48episode/s, Episode Reward=-192]


VBox(children=(Label(value='0.622 MB of 0.622 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▇█▇▇▆▅▃▂▃▃▄
train_value,▁▄▆▇▇▇██████

0,1
alpha,0.04769
epsilon,0.0
t,11.0
test_value,-405.76
train_value,-106.038


[34m[1mwandb[0m: Agent Starting Run: 1fxyl3vf with config:
[34m[1mwandb[0m: 	action_bins: 23
[34m[1mwandb[0m: 	alpha: 0.6083347024618085
[34m[1mwandb[0m: 	epsilon: 0.1809000351556879
[34m[1mwandb[0m: 	gamma: 0.6064627397722409
[34m[1mwandb[0m: 	vel_bins: 83
[34m[1mwandb[0m: 	x_bins: 58


Training Progress: 100%|██████████| 500/500 [00:18<00:00, 27.62episode/s, Episode Reward=-720] 


Nuevo mejor modelo guardado con test_value: -770.7


Training Progress: 100%|██████████| 500/500 [00:12<00:00, 38.93episode/s, Episode Reward=-319] 


Nuevo mejor modelo guardado con test_value: -520.76


Training Progress: 100%|██████████| 500/500 [00:09<00:00, 51.89episode/s, Episode Reward=-310] 


Nuevo mejor modelo guardado con test_value: -416.62


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 62.97episode/s, Episode Reward=-156]
Training Progress: 100%|██████████| 500/500 [00:07<00:00, 70.27episode/s, Episode Reward=-192]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 77.97episode/s, Episode Reward=-154] 
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 78.82episode/s, Episode Reward=-260] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 87.92episode/s, Episode Reward=-165] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 91.45episode/s, Episode Reward=-288] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 94.36episode/s, Episode Reward=-222] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 99.12episode/s, Episode Reward=-140] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 102.20episode/s, Episode Reward=-239]


VBox(children=(Label(value='0.505 MB of 0.878 MB uploaded\r'), FloatProgress(value=0.5752560395809561, max=1.0…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▆█▇▇▆▆▇▇▇██
train_value,▁▄▆▇▇▇▇█████

0,1
alpha,0.05069
epsilon,0.0
t,11.0
test_value,-426.56
train_value,-154.626


[34m[1mwandb[0m: Agent Starting Run: 40lh78kk with config:
[34m[1mwandb[0m: 	action_bins: 29
[34m[1mwandb[0m: 	alpha: 0.5006907742205332
[34m[1mwandb[0m: 	epsilon: 0.5754687716694992
[34m[1mwandb[0m: 	gamma: 0.5406471217930151
[34m[1mwandb[0m: 	vel_bins: 40
[34m[1mwandb[0m: 	x_bins: 59


Training Progress: 100%|██████████| 500/500 [00:21<00:00, 23.44episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -523.04


Training Progress: 100%|██████████| 500/500 [00:15<00:00, 32.14episode/s, Episode Reward=-514] 


Nuevo mejor modelo guardado con test_value: -278.76


Training Progress: 100%|██████████| 500/500 [00:10<00:00, 46.71episode/s, Episode Reward=-347] 


Nuevo mejor modelo guardado con test_value: -276.54


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 60.07episode/s, Episode Reward=-320]
Training Progress: 100%|██████████| 500/500 [00:07<00:00, 70.34episode/s, Episode Reward=-204] 
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 77.18episode/s, Episode Reward=-155] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 84.25episode/s, Episode Reward=-184] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 87.52episode/s, Episode Reward=-114] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 88.42episode/s, Episode Reward=-194] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 96.59episode/s, Episode Reward=-89]  
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 102.32episode/s, Episode Reward=-143]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 111.18episode/s, Episode Reward=-130]


VBox(children=(Label(value='0.553 MB of 0.553 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▃███▆▆▅▁▃▃▄▄
train_value,▁▃▅▆▇▇▇█████

0,1
alpha,0.04172
epsilon,0.0
t,11.0
test_value,-509.4
train_value,-107.176


[34m[1mwandb[0m: Agent Starting Run: akwci1bd with config:
[34m[1mwandb[0m: 	action_bins: 21
[34m[1mwandb[0m: 	alpha: 0.603258377053195
[34m[1mwandb[0m: 	epsilon: 0.4124017825016274
[34m[1mwandb[0m: 	gamma: 0.5366577501545364
[34m[1mwandb[0m: 	vel_bins: 76
[34m[1mwandb[0m: 	x_bins: 59


Training Progress: 100%|██████████| 500/500 [00:16<00:00, 29.95episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -683.18


Training Progress: 100%|██████████| 500/500 [00:12<00:00, 39.70episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -391.18


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 55.76episode/s, Episode Reward=-457] 


Nuevo mejor modelo guardado con test_value: -266.02


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 68.78episode/s, Episode Reward=-138] 
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 80.18episode/s, Episode Reward=-193]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 81.15episode/s, Episode Reward=-138] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 90.02episode/s, Episode Reward=-226] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 94.70episode/s, Episode Reward=-294] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 97.45episode/s, Episode Reward=-131] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 100.16episode/s, Episode Reward=-125]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 101.82episode/s, Episode Reward=-161]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 102.78episode/s, Episode Reward=-215]


VBox(children=(Label(value='0.749 MB of 0.749 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▆█▇▆▆▆▄▄▃▅▄
train_value,▁▄▆▇▇▇▇█████

0,1
alpha,0.05027
epsilon,0.0
t,11.0
test_value,-505.24
train_value,-154.944


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 0x2c7yj0 with config:
[34m[1mwandb[0m: 	action_bins: 20
[34m[1mwandb[0m: 	alpha: 0.6311057784377424
[34m[1mwandb[0m: 	epsilon: 0.3592769536526085
[34m[1mwandb[0m: 	gamma: 0.5643982562927861
[34m[1mwandb[0m: 	vel_bins: 70
[34m[1mwandb[0m: 	x_bins: 47


Training Progress: 100%|██████████| 500/500 [00:15<00:00, 31.89episode/s, Episode Reward=-491] 


Nuevo mejor modelo guardado con test_value: -577.84


Training Progress: 100%|██████████| 500/500 [00:09<00:00, 53.05episode/s, Episode Reward=-563] 


Nuevo mejor modelo guardado con test_value: -440.5


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 70.75episode/s, Episode Reward=-445]


Nuevo mejor modelo guardado con test_value: -374.5


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 88.40episode/s, Episode Reward=-183] 


Nuevo mejor modelo guardado con test_value: -372.28


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 99.08episode/s, Episode Reward=-186] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 110.92episode/s, Episode Reward=-200]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 113.58episode/s, Episode Reward=-125]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 121.81episode/s, Episode Reward=-91] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 128.14episode/s, Episode Reward=-90] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 148.06episode/s, Episode Reward=-53] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 154.94episode/s, Episode Reward=-13] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 163.54episode/s, Episode Reward=-20] 


VBox(children=(Label(value='0.130 MB of 0.529 MB uploaded\r'), FloatProgress(value=0.24660337118792694, max=1.…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▆██▆▄▇▃▂▃▃▆
train_value,▁▄▆▇▇▇▇▇████

0,1
alpha,0.05259
epsilon,0.0
t,11.0
test_value,-439.66
train_value,-58.904


[34m[1mwandb[0m: Agent Starting Run: fijtr76d with config:
[34m[1mwandb[0m: 	action_bins: 28
[34m[1mwandb[0m: 	alpha: 0.5937287479133835
[34m[1mwandb[0m: 	epsilon: 0.3397527678027926
[34m[1mwandb[0m: 	gamma: 0.5827513080972713
[34m[1mwandb[0m: 	vel_bins: 100
[34m[1mwandb[0m: 	x_bins: 33


Training Progress: 100%|██████████| 500/500 [00:20<00:00, 24.52episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -755.56


Training Progress: 100%|██████████| 500/500 [00:14<00:00, 33.93episode/s, Episode Reward=-464] 


Nuevo mejor modelo guardado con test_value: -420.52


Training Progress: 100%|██████████| 500/500 [00:12<00:00, 40.87episode/s, Episode Reward=-341] 


Nuevo mejor modelo guardado con test_value: -369.52


Training Progress: 100%|██████████| 500/500 [00:09<00:00, 53.40episode/s, Episode Reward=-331]
Training Progress: 100%|██████████| 500/500 [00:08<00:00, 61.99episode/s, Episode Reward=-155]
Training Progress: 100%|██████████| 500/500 [00:07<00:00, 66.56episode/s, Episode Reward=-244]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 72.70episode/s, Episode Reward=-253] 
Training Progress: 100%|██████████| 500/500 [00:07<00:00, 69.37episode/s, Episode Reward=-145] 
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 83.01episode/s, Episode Reward=-136] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 95.70episode/s, Episode Reward=-142] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 98.93episode/s, Episode Reward=-142] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 107.46episode/s, Episode Reward=-175]


VBox(children=(Label(value='0.742 MB of 0.742 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▇█▇▅▆▅▆▅▄▅▇
train_value,▁▄▅▆▇▇▇▇████

0,1
alpha,0.04948
epsilon,0.0
t,11.0
test_value,-399.82
train_value,-127.478


[34m[1mwandb[0m: Agent Starting Run: kfg0n6x6 with config:
[34m[1mwandb[0m: 	action_bins: 34
[34m[1mwandb[0m: 	alpha: 0.5077274352416064
[34m[1mwandb[0m: 	epsilon: 0.1488996591662014
[34m[1mwandb[0m: 	gamma: 0.5031688400675587
[34m[1mwandb[0m: 	vel_bins: 61
[34m[1mwandb[0m: 	x_bins: 19


Training Progress: 100%|██████████| 500/500 [00:27<00:00, 18.45episode/s, Episode Reward=-514] 


Nuevo mejor modelo guardado con test_value: -860.44


Training Progress: 100%|██████████| 500/500 [00:12<00:00, 40.11episode/s, Episode Reward=-443] 


Nuevo mejor modelo guardado con test_value: -724.12


Training Progress: 100%|██████████| 500/500 [00:09<00:00, 50.91episode/s, Episode Reward=-224] 


Nuevo mejor modelo guardado con test_value: -672.48


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 72.20episode/s, Episode Reward=-463] 
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 71.60episode/s, Episode Reward=-53]  
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 86.96episode/s, Episode Reward=-185] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 90.75episode/s, Episode Reward=-135] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 102.08episode/s, Episode Reward=-131]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 103.93episode/s, Episode Reward=-84] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 132.79episode/s, Episode Reward=-36] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 129.77episode/s, Episode Reward=-80] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 126.94episode/s, Episode Reward=-39] 


VBox(children=(Label(value='0.193 MB of 0.330 MB uploaded\r'), FloatProgress(value=0.58403919710644, max=1.0))…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▆█▄▃▆▃▃▅▅▆▇
train_value,▁▅▆▇▇▇▇▇▇███

0,1
alpha,0.04231
epsilon,0.0
t,11.0
test_value,-699.78
train_value,-31.838


[34m[1mwandb[0m: Agent Starting Run: 3ikleso4 with config:
[34m[1mwandb[0m: 	action_bins: 35
[34m[1mwandb[0m: 	alpha: 0.5918377481749295
[34m[1mwandb[0m: 	epsilon: 0.1593211333550226
[34m[1mwandb[0m: 	gamma: 0.7357464553027332
[34m[1mwandb[0m: 	vel_bins: 95
[34m[1mwandb[0m: 	x_bins: 47


Training Progress: 100%|██████████| 500/500 [00:25<00:00, 19.63episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -958.24


Training Progress: 100%|██████████| 500/500 [00:19<00:00, 26.11episode/s, Episode Reward=-572] 


Nuevo mejor modelo guardado con test_value: -760.6


Training Progress: 100%|██████████| 500/500 [00:14<00:00, 35.31episode/s, Episode Reward=-649] 


Nuevo mejor modelo guardado con test_value: -714.56


Training Progress: 100%|██████████| 500/500 [00:11<00:00, 44.15episode/s, Episode Reward=-312] 


Nuevo mejor modelo guardado con test_value: -708.82


Training Progress: 100%|██████████| 500/500 [00:09<00:00, 52.77episode/s, Episode Reward=-284]


Nuevo mejor modelo guardado con test_value: -687.36


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 57.27episode/s, Episode Reward=-215]


Nuevo mejor modelo guardado con test_value: -584.8


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 63.28episode/s, Episode Reward=-221]
Training Progress: 100%|██████████| 500/500 [00:07<00:00, 65.34episode/s, Episode Reward=-263]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 74.01episode/s, Episode Reward=-216] 


Nuevo mejor modelo guardado con test_value: -565.82


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 76.08episode/s, Episode Reward=-322] 


Nuevo mejor modelo guardado con test_value: -564.54


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 80.79episode/s, Episode Reward=-143] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 86.93episode/s, Episode Reward=-175] 


Nuevo mejor modelo guardado con test_value: -466.24


VBox(children=(Label(value='0.709 MB of 1.240 MB uploaded\r'), FloatProgress(value=0.5716788044903922, max=1.0…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▄▄▅▅▆▄▆▇▇▆█
train_value,▁▃▅▆▇▇▇▇████

0,1
alpha,0.04932
epsilon,0.0
t,11.0
test_value,-466.24
train_value,-162.636


[34m[1mwandb[0m: Agent Starting Run: 39bb32xv with config:
[34m[1mwandb[0m: 	action_bins: 26
[34m[1mwandb[0m: 	alpha: 0.5881497762170911
[34m[1mwandb[0m: 	epsilon: 0.18328111768916433
[34m[1mwandb[0m: 	gamma: 0.5889151057060643
[34m[1mwandb[0m: 	vel_bins: 31
[34m[1mwandb[0m: 	x_bins: 91


Training Progress: 100%|██████████| 500/500 [00:19<00:00, 25.05episode/s, Episode Reward=-746] 


Nuevo mejor modelo guardado con test_value: -828.22


Training Progress: 100%|██████████| 500/500 [00:13<00:00, 38.37episode/s, Episode Reward=-346] 


Nuevo mejor modelo guardado con test_value: -721.52


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 56.61episode/s, Episode Reward=-267]


Nuevo mejor modelo guardado con test_value: -515.22


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 66.11episode/s, Episode Reward=-214]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 76.78episode/s, Episode Reward=-282]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 82.99episode/s, Episode Reward=-263] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 89.72episode/s, Episode Reward=-163] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 100.84episode/s, Episode Reward=-133]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 103.69episode/s, Episode Reward=-157]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 125.30episode/s, Episode Reward=-88] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 124.68episode/s, Episode Reward=-92] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 119.91episode/s, Episode Reward=-129]


Nuevo mejor modelo guardado con test_value: -425.74


VBox(children=(Label(value='0.318 MB of 0.593 MB uploaded\r'), FloatProgress(value=0.5363020796457328, max=1.0…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▃▆▄▅▃▅▃▄▄▆█
train_value,▁▄▆▆▇▇▇█████

0,1
alpha,0.04901
epsilon,0.0
t,11.0
test_value,-425.74
train_value,-104.592


[34m[1mwandb[0m: Agent Starting Run: li92bnuv with config:
[34m[1mwandb[0m: 	action_bins: 20
[34m[1mwandb[0m: 	alpha: 0.508280561415642
[34m[1mwandb[0m: 	epsilon: 0.3864324859636561
[34m[1mwandb[0m: 	gamma: 0.5734817966344122
[34m[1mwandb[0m: 	vel_bins: 21
[34m[1mwandb[0m: 	x_bins: 84


Training Progress: 100%|██████████| 500/500 [00:17<00:00, 28.77episode/s, Episode Reward=-581] 


Nuevo mejor modelo guardado con test_value: -808.82


Training Progress: 100%|██████████| 500/500 [00:09<00:00, 54.04episode/s, Episode Reward=-260] 


Nuevo mejor modelo guardado con test_value: -626.16


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 71.45episode/s, Episode Reward=-339]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 90.58episode/s, Episode Reward=-272] 


Nuevo mejor modelo guardado con test_value: -528.58


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 103.90episode/s, Episode Reward=-162]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 121.41episode/s, Episode Reward=-143]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 143.19episode/s, Episode Reward=-127]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 147.59episode/s, Episode Reward=-81] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 159.98episode/s, Episode Reward=-91] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 164.24episode/s, Episode Reward=-37]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 143.90episode/s, Episode Reward=-41] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 208.82episode/s, Episode Reward=-55]


Nuevo mejor modelo guardado con test_value: -493.36


VBox(children=(Label(value='0.294 MB of 0.294 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▅▄▇▅▆▃▆▆▆▆█
train_value,▁▅▆▆▇▇██████

0,1
alpha,0.04236
epsilon,0.0
t,11.0
test_value,-493.36
train_value,-36.806


[34m[1mwandb[0m: Agent Starting Run: 92bdxa5u with config:
[34m[1mwandb[0m: 	action_bins: 27
[34m[1mwandb[0m: 	alpha: 0.7105575588850305
[34m[1mwandb[0m: 	epsilon: 0.13144351550165503
[34m[1mwandb[0m: 	gamma: 0.649504979841347
[34m[1mwandb[0m: 	vel_bins: 28
[34m[1mwandb[0m: 	x_bins: 80


Training Progress: 100%|██████████| 500/500 [00:19<00:00, 25.35episode/s, Episode Reward=-364] 


Nuevo mejor modelo guardado con test_value: -839.7


Training Progress: 100%|██████████| 500/500 [00:10<00:00, 47.98episode/s, Episode Reward=-317] 


Nuevo mejor modelo guardado con test_value: -752.32


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 68.28episode/s, Episode Reward=-198]


Nuevo mejor modelo guardado con test_value: -589.22


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 77.76episode/s, Episode Reward=-214] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 92.41episode/s, Episode Reward=-115] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 104.56episode/s, Episode Reward=-94] 


Nuevo mejor modelo guardado con test_value: -547.88


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 122.10episode/s, Episode Reward=-127]


Nuevo mejor modelo guardado con test_value: -519.54


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 136.40episode/s, Episode Reward=-47] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 141.07episode/s, Episode Reward=-75] 


Nuevo mejor modelo guardado con test_value: -487.0


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 153.96episode/s, Episode Reward=-17] 


Nuevo mejor modelo guardado con test_value: -422.92


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 141.32episode/s, Episode Reward=-87] 


Nuevo mejor modelo guardado con test_value: -401.02


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 141.93episode/s, Episode Reward=-17] 


Nuevo mejor modelo guardado con test_value: -342.94


VBox(children=(Label(value='0.493 MB of 0.493 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▂▅▃▄▅▆▄▆▇▇█
train_value,▁▅▆▇▇▇▇█████

0,1
alpha,0.05921
epsilon,0.0
t,11.0
test_value,-342.94
train_value,-64.858


[34m[1mwandb[0m: Agent Starting Run: zfw4u35g with config:
[34m[1mwandb[0m: 	action_bins: 22
[34m[1mwandb[0m: 	alpha: 0.7469892446184677
[34m[1mwandb[0m: 	epsilon: 0.1580191755149709
[34m[1mwandb[0m: 	gamma: 0.5884266496279863
[34m[1mwandb[0m: 	vel_bins: 11
[34m[1mwandb[0m: 	x_bins: 99


Training Progress: 100%|██████████| 500/500 [00:21<00:00, 23.73episode/s, Episode Reward=-361] 


Nuevo mejor modelo guardado con test_value: -869.3


Training Progress: 100%|██████████| 500/500 [00:14<00:00, 34.48episode/s, Episode Reward=-235] 
Training Progress: 100%|██████████| 500/500 [00:13<00:00, 38.10episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:09<00:00, 50.27episode/s, Episode Reward=-348] 


Nuevo mejor modelo guardado con test_value: -792.02


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 63.13episode/s, Episode Reward=-165] 


Nuevo mejor modelo guardado con test_value: -790.28


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 78.61episode/s, Episode Reward=-132] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 109.96episode/s, Episode Reward=-110]


Nuevo mejor modelo guardado con test_value: -705.92


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 119.83episode/s, Episode Reward=-113]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 103.87episode/s, Episode Reward=-67] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 117.35episode/s, Episode Reward=-139]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 151.02episode/s, Episode Reward=-39] 


Nuevo mejor modelo guardado con test_value: -588.04


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 165.95episode/s, Episode Reward=-43] 


VBox(children=(Label(value='0.210 MB of 0.210 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▂▂▁▄▄▃▆▅▄▄██
train_value,▁▄▄▅▆▇▇▇▇▇██

0,1
alpha,0.06225
epsilon,0.0
t,11.0
test_value,-603.74
train_value,-51.95


[34m[1mwandb[0m: Agent Starting Run: ixghxygr with config:
[34m[1mwandb[0m: 	action_bins: 21
[34m[1mwandb[0m: 	alpha: 0.6004852703323239
[34m[1mwandb[0m: 	epsilon: 0.1289264800251747
[34m[1mwandb[0m: 	gamma: 0.823237616404062
[34m[1mwandb[0m: 	vel_bins: 31
[34m[1mwandb[0m: 	x_bins: 64


Training Progress: 100%|██████████| 500/500 [00:15<00:00, 31.57episode/s, Episode Reward=-217] 


Nuevo mejor modelo guardado con test_value: -654.62


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 74.47episode/s, Episode Reward=-106]


Nuevo mejor modelo guardado con test_value: -504.68


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 96.90episode/s, Episode Reward=-55]  


Nuevo mejor modelo guardado con test_value: -448.72


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 112.07episode/s, Episode Reward=-60] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 137.26episode/s, Episode Reward=-144]


Nuevo mejor modelo guardado con test_value: -442.78


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 155.18episode/s, Episode Reward=-19] 


Nuevo mejor modelo guardado con test_value: -405.38


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 189.46episode/s, Episode Reward=-47] 


Nuevo mejor modelo guardado con test_value: -385.12


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 173.82episode/s, Episode Reward=-36] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 227.72episode/s, Episode Reward=12]  


Nuevo mejor modelo guardado con test_value: -280.3


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 183.04episode/s, Episode Reward=13] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 204.19episode/s, Episode Reward=-47] 


Nuevo mejor modelo guardado con test_value: -259.56


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 172.51episode/s, Episode Reward=-53] 


Nuevo mejor modelo guardado con test_value: -233.42


VBox(children=(Label(value='0.271 MB of 0.342 MB uploaded\r'), FloatProgress(value=0.7933085252980591, max=1.0…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▃▄▃▅▅▅▅▇▇██
train_value,▁▅▆▇▇▇██████

0,1
alpha,0.05004
epsilon,0.0
t,11.0
test_value,-233.42
train_value,-35.656


[34m[1mwandb[0m: Agent Starting Run: jiaxms71 with config:
[34m[1mwandb[0m: 	action_bins: 20
[34m[1mwandb[0m: 	alpha: 0.6649190286285679
[34m[1mwandb[0m: 	epsilon: 0.20095303356718805
[34m[1mwandb[0m: 	gamma: 0.6760263612747804
[34m[1mwandb[0m: 	vel_bins: 37
[34m[1mwandb[0m: 	x_bins: 78


Training Progress: 100%|██████████| 500/500 [00:14<00:00, 33.82episode/s, Episode Reward=-384] 


Nuevo mejor modelo guardado con test_value: -628.2


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 57.99episode/s, Episode Reward=-300]


Nuevo mejor modelo guardado con test_value: -565.04


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 82.24episode/s, Episode Reward=-216] 


Nuevo mejor modelo guardado con test_value: -514.12


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 103.23episode/s, Episode Reward=-201]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 115.22episode/s, Episode Reward=-141]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 127.93episode/s, Episode Reward=-139]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 138.91episode/s, Episode Reward=-50] 


Nuevo mejor modelo guardado con test_value: -453.7


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 165.76episode/s, Episode Reward=-92] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 184.59episode/s, Episode Reward=-79] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 179.25episode/s, Episode Reward=-12] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 190.48episode/s, Episode Reward=-16] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 234.31episode/s, Episode Reward=-12]


VBox(children=(Label(value='0.467 MB of 0.467 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▄▆▅▁▃█▄▄▆▅█
train_value,▁▅▆▆▇▇▇▇████

0,1
alpha,0.05541
epsilon,0.0
t,11.0
test_value,-455.8
train_value,-13.07


[34m[1mwandb[0m: Agent Starting Run: gcyikq32 with config:
[34m[1mwandb[0m: 	action_bins: 38
[34m[1mwandb[0m: 	alpha: 0.6039094778943412
[34m[1mwandb[0m: 	epsilon: 0.17521841708966948
[34m[1mwandb[0m: 	gamma: 0.8198283745265165
[34m[1mwandb[0m: 	vel_bins: 31
[34m[1mwandb[0m: 	x_bins: 54


Training Progress: 100%|██████████| 500/500 [00:25<00:00, 19.98episode/s, Episode Reward=-642] 


Nuevo mejor modelo guardado con test_value: -795.4


Training Progress: 100%|██████████| 500/500 [00:13<00:00, 38.21episode/s, Episode Reward=-587]


Nuevo mejor modelo guardado con test_value: -733.9


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 58.99episode/s, Episode Reward=-221]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 77.93episode/s, Episode Reward=-68] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 94.74episode/s, Episode Reward=-75]  


Nuevo mejor modelo guardado con test_value: -623.82


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 112.08episode/s, Episode Reward=-27] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 130.80episode/s, Episode Reward=-18] 


Nuevo mejor modelo guardado con test_value: -589.02


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 149.56episode/s, Episode Reward=-51] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 166.49episode/s, Episode Reward=-104]


Nuevo mejor modelo guardado con test_value: -516.44


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 168.77episode/s, Episode Reward=-57] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 147.26episode/s, Episode Reward=-56]
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 182.64episode/s, Episode Reward=12] 


Nuevo mejor modelo guardado con test_value: -469.44


VBox(children=(Label(value='0.519 MB of 0.519 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▂▁▁▅▄▆▅▇▆▇█
train_value,▁▅▆▇▇▇██████

0,1
alpha,0.05033
epsilon,0.0
t,11.0
test_value,-469.44
train_value,-20.324


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 6alpfrl3 with config:
[34m[1mwandb[0m: 	action_bins: 20
[34m[1mwandb[0m: 	alpha: 0.5333071503310332
[34m[1mwandb[0m: 	epsilon: 0.12257225784686848
[34m[1mwandb[0m: 	gamma: 0.7427979600215429
[34m[1mwandb[0m: 	vel_bins: 24
[34m[1mwandb[0m: 	x_bins: 65


Training Progress: 100%|██████████| 500/500 [00:15<00:00, 31.78episode/s, Episode Reward=-541] 


Nuevo mejor modelo guardado con test_value: -830.34


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 69.34episode/s, Episode Reward=-294]


Nuevo mejor modelo guardado con test_value: -618.92


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 101.54episode/s, Episode Reward=-143]


Nuevo mejor modelo guardado con test_value: -575.0


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 115.88episode/s, Episode Reward=-151]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 136.66episode/s, Episode Reward=-93] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 166.70episode/s, Episode Reward=-54] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 162.43episode/s, Episode Reward=-153]
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 169.94episode/s, Episode Reward=-42] 


Nuevo mejor modelo guardado con test_value: -518.56


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 139.31episode/s, Episode Reward=-70] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 215.47episode/s, Episode Reward=-8]  


Nuevo mejor modelo guardado con test_value: -498.02


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 181.74episode/s, Episode Reward=-9]  
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 179.51episode/s, Episode Reward=-49] 


Nuevo mejor modelo guardado con test_value: -491.22


VBox(children=(Label(value='0.260 MB of 0.260 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▅▆▅▆▆▄▇▅█▇█
train_value,▁▅▆▇▇█▇█▇███

0,1
alpha,0.04444
epsilon,0.0
t,11.0
test_value,-491.22
train_value,-32.346


[34m[1mwandb[0m: Agent Starting Run: bunuobub with config:
[34m[1mwandb[0m: 	action_bins: 27
[34m[1mwandb[0m: 	alpha: 0.5556785069411135
[34m[1mwandb[0m: 	epsilon: 0.15901035302719208
[34m[1mwandb[0m: 	gamma: 0.9467155280038952
[34m[1mwandb[0m: 	vel_bins: 37
[34m[1mwandb[0m: 	x_bins: 82


Training Progress: 100%|██████████| 500/500 [00:19<00:00, 25.38episode/s, Episode Reward=-363] 


Nuevo mejor modelo guardado con test_value: -854.14


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 61.21episode/s, Episode Reward=-340]


Nuevo mejor modelo guardado con test_value: -759.54


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 80.35episode/s, Episode Reward=-165] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 125.42episode/s, Episode Reward=-80] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 141.25episode/s, Episode Reward=-78] 


Nuevo mejor modelo guardado con test_value: -699.62


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 141.32episode/s, Episode Reward=-140]


Nuevo mejor modelo guardado con test_value: -681.68


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 110.01episode/s, Episode Reward=-72] 


Nuevo mejor modelo guardado con test_value: -599.16


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 114.33episode/s, Episode Reward=-57] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 138.11episode/s, Episode Reward=-54] 


Nuevo mejor modelo guardado con test_value: -581.26


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 147.03episode/s, Episode Reward=-62]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 163.57episode/s, Episode Reward=-85] 


Nuevo mejor modelo guardado con test_value: -559.12


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 154.18episode/s, Episode Reward=-66] 


Nuevo mejor modelo guardado con test_value: -526.7


VBox(children=(Label(value='0.659 MB of 0.659 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▃▂▂▄▅▆▆▇▆▇█
train_value,▁▆▇█████████

0,1
alpha,0.04631
epsilon,0.0
t,11.0
test_value,-526.7
train_value,-69.762


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: p8qpk2ap with config:
[34m[1mwandb[0m: 	action_bins: 30
[34m[1mwandb[0m: 	alpha: 0.7562795967292599
[34m[1mwandb[0m: 	epsilon: 0.1115722121562035
[34m[1mwandb[0m: 	gamma: 0.8056790195657948
[34m[1mwandb[0m: 	vel_bins: 10
[34m[1mwandb[0m: 	x_bins: 61


Training Progress: 100%|██████████| 500/500 [00:47<00:00, 10.61episode/s, Episode Reward=-150] 


Nuevo mejor modelo guardado con test_value: -886.7


Training Progress: 100%|██████████| 500/500 [00:32<00:00, 15.19episode/s, Episode Reward=-524] 


Nuevo mejor modelo guardado con test_value: -740.66


Training Progress: 100%|██████████| 500/500 [00:23<00:00, 21.33episode/s, Episode Reward=-150] 
Training Progress: 100%|██████████| 500/500 [00:28<00:00, 17.64episode/s, Episode Reward=-739] 
Training Progress: 100%|██████████| 500/500 [00:19<00:00, 25.84episode/s, Episode Reward=-104] 
Training Progress: 100%|██████████| 500/500 [00:19<00:00, 25.55episode/s, Episode Reward=-198] 
Training Progress: 100%|██████████| 500/500 [00:18<00:00, 27.75episode/s, Episode Reward=-94]  
Training Progress: 100%|██████████| 500/500 [00:16<00:00, 29.55episode/s, Episode Reward=-128] 
Training Progress: 100%|██████████| 500/500 [00:13<00:00, 36.41episode/s, Episode Reward=-112] 
Training Progress: 100%|██████████| 500/500 [00:07<00:00, 64.98episode/s, Episode Reward=-50]  


Nuevo mejor modelo guardado con test_value: -687.7


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 131.91episode/s, Episode Reward=-56] 


Nuevo mejor modelo guardado con test_value: -668.48


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 120.39episode/s, Episode Reward=-45]  


Nuevo mejor modelo guardado con test_value: -651.74


VBox(children=(Label(value='0.165 MB of 0.165 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▅▂▅▄▅▄▄▅▇██
train_value,▁▄▅▅▆▆▇▆▇▇██

0,1
alpha,0.06302
epsilon,0.0
t,11.0
test_value,-651.74
train_value,-71.15


[34m[1mwandb[0m: Agent Starting Run: g030kdqy with config:
[34m[1mwandb[0m: 	action_bins: 27
[34m[1mwandb[0m: 	alpha: 0.522423362854791
[34m[1mwandb[0m: 	epsilon: 0.10251079091019336
[34m[1mwandb[0m: 	gamma: 0.8031733933354167
[34m[1mwandb[0m: 	vel_bins: 43
[34m[1mwandb[0m: 	x_bins: 76


Training Progress: 100%|██████████| 500/500 [00:21<00:00, 23.48episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -845.4


Training Progress: 100%|██████████| 500/500 [00:12<00:00, 39.81episode/s, Episode Reward=-338] 


Nuevo mejor modelo guardado con test_value: -696.96


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 56.38episode/s, Episode Reward=-401]


Nuevo mejor modelo guardado con test_value: -579.0


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 67.94episode/s, Episode Reward=-176]


Nuevo mejor modelo guardado con test_value: -513.4


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 83.82episode/s, Episode Reward=-146] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 85.04episode/s, Episode Reward=-227]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 90.76episode/s, Episode Reward=-209] 


Nuevo mejor modelo guardado con test_value: -498.3


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 105.79episode/s, Episode Reward=-63] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 109.39episode/s, Episode Reward=-103]


Nuevo mejor modelo guardado con test_value: -446.84


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 110.22episode/s, Episode Reward=-135]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 121.26episode/s, Episode Reward=-149]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 126.92episode/s, Episode Reward=-102]


VBox(children=(Label(value='0.707 MB of 0.707 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▄▆▇▆▆▇▇█▇▇█
train_value,▁▅▆▇▇▇▇█████

0,1
alpha,0.04354
epsilon,0.0
t,11.0
test_value,-474.86
train_value,-90.032


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: cr6jdzql with config:
[34m[1mwandb[0m: 	action_bins: 21
[34m[1mwandb[0m: 	alpha: 0.7063325106452809
[34m[1mwandb[0m: 	epsilon: 0.12969962109594813
[34m[1mwandb[0m: 	gamma: 0.951455848017962
[34m[1mwandb[0m: 	vel_bins: 29
[34m[1mwandb[0m: 	x_bins: 59


Training Progress: 100%|██████████| 500/500 [00:16<00:00, 29.99episode/s, Episode Reward=-568] 


Nuevo mejor modelo guardado con test_value: -933.92


Training Progress: 100%|██████████| 500/500 [00:12<00:00, 41.43episode/s, Episode Reward=-324] 
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 76.29episode/s, Episode Reward=-168] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 91.60episode/s, Episode Reward=-119] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 111.16episode/s, Episode Reward=-76]  


Nuevo mejor modelo guardado con test_value: -899.0


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 133.30episode/s, Episode Reward=-68]  


Nuevo mejor modelo guardado con test_value: -879.4


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 162.54episode/s, Episode Reward=5]   
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 157.55episode/s, Episode Reward=10]  
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 212.75episode/s, Episode Reward=15]  
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 234.02episode/s, Episode Reward=8]  


Nuevo mejor modelo guardado con test_value: -830.64


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 235.18episode/s, Episode Reward=15] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 214.44episode/s, Episode Reward=14] 


Nuevo mejor modelo guardado con test_value: -791.38


VBox(children=(Label(value='0.146 MB of 0.297 MB uploaded\r'), FloatProgress(value=0.49202870373643465, max=1.…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▂▁▁▂▄▅▃▄▄▆▅█
train_value,▁▄▆▆▇▇▇█████

0,1
alpha,0.05886
epsilon,0.0
t,11.0
test_value,-791.38
train_value,-3.762


[34m[1mwandb[0m: Agent Starting Run: f3r8vs54 with config:
[34m[1mwandb[0m: 	action_bins: 23
[34m[1mwandb[0m: 	alpha: 0.588824345615824
[34m[1mwandb[0m: 	epsilon: 0.25573620100167116
[34m[1mwandb[0m: 	gamma: 0.6553740256668237
[34m[1mwandb[0m: 	vel_bins: 97
[34m[1mwandb[0m: 	x_bins: 51


Training Progress: 100%|██████████| 500/500 [00:18<00:00, 27.72episode/s, Episode Reward=-881] 


Nuevo mejor modelo guardado con test_value: -881.02


Training Progress: 100%|██████████| 500/500 [00:12<00:00, 38.57episode/s, Episode Reward=-483] 


Nuevo mejor modelo guardado con test_value: -570.14


Training Progress: 100%|██████████| 500/500 [00:09<00:00, 52.30episode/s, Episode Reward=-629] 


Nuevo mejor modelo guardado con test_value: -492.22


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 64.53episode/s, Episode Reward=-346]


Nuevo mejor modelo guardado con test_value: -446.36


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 72.82episode/s, Episode Reward=-332]


Nuevo mejor modelo guardado con test_value: -417.42


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 77.39episode/s, Episode Reward=-330] 


Nuevo mejor modelo guardado con test_value: -389.22


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 87.80episode/s, Episode Reward=-198] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 90.03episode/s, Episode Reward=-213] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 94.64episode/s, Episode Reward=-253] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 94.90episode/s, Episode Reward=-305] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 96.63episode/s, Episode Reward=-187] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 103.70episode/s, Episode Reward=-250]


VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▅▇▇██▇▆▆▇▇▇
train_value,▁▄▆▇▇▇██████

0,1
alpha,0.04907
epsilon,0.0
t,11.0
test_value,-468.3
train_value,-167.12


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: uqboyn5k with config:
[34m[1mwandb[0m: 	action_bins: 41
[34m[1mwandb[0m: 	alpha: 0.6410018479348003
[34m[1mwandb[0m: 	epsilon: 0.11624248692809644
[34m[1mwandb[0m: 	gamma: 0.6925480336351444
[34m[1mwandb[0m: 	vel_bins: 38
[34m[1mwandb[0m: 	x_bins: 97


Training Progress: 100%|██████████| 500/500 [00:29<00:00, 16.98episode/s, Episode Reward=-605] 


Nuevo mejor modelo guardado con test_value: -977.16


Training Progress: 100%|██████████| 500/500 [00:20<00:00, 24.11episode/s, Episode Reward=-479] 


Nuevo mejor modelo guardado con test_value: -767.26


Training Progress: 100%|██████████| 500/500 [00:15<00:00, 32.66episode/s, Episode Reward=-225] 


Nuevo mejor modelo guardado con test_value: -683.42


Training Progress: 100%|██████████| 500/500 [00:12<00:00, 39.04episode/s, Episode Reward=-393] 
Training Progress: 100%|██████████| 500/500 [00:10<00:00, 47.08episode/s, Episode Reward=-294]
Training Progress: 100%|██████████| 500/500 [00:10<00:00, 49.75episode/s, Episode Reward=-271]
Training Progress: 100%|██████████| 500/500 [00:09<00:00, 53.21episode/s, Episode Reward=-373]
Training Progress: 100%|██████████| 500/500 [00:08<00:00, 55.71episode/s, Episode Reward=-245]
Training Progress: 100%|██████████| 500/500 [00:08<00:00, 59.64episode/s, Episode Reward=-148]
Training Progress: 100%|██████████| 500/500 [00:07<00:00, 66.17episode/s, Episode Reward=-224] 
Training Progress: 100%|██████████| 500/500 [00:07<00:00, 66.60episode/s, Episode Reward=-215]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 73.50episode/s, Episode Reward=-177] 


Nuevo mejor modelo guardado con test_value: -676.42


VBox(children=(Label(value='0.162 MB of 1.205 MB uploaded\r'), FloatProgress(value=0.13414857173620287, max=1.…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▆█▇▆▆▇▅▅▅▆█
train_value,▁▄▅▆▇▇▇▇▇███

0,1
alpha,0.05342
epsilon,0.0
t,11.0
test_value,-676.42
train_value,-184.446


[34m[1mwandb[0m: Agent Starting Run: 9jaj07gn with config:
[34m[1mwandb[0m: 	action_bins: 25
[34m[1mwandb[0m: 	alpha: 0.630080495468364
[34m[1mwandb[0m: 	epsilon: 0.3446728523639944
[34m[1mwandb[0m: 	gamma: 0.5586005220764443
[34m[1mwandb[0m: 	vel_bins: 33
[34m[1mwandb[0m: 	x_bins: 51


Training Progress: 100%|██████████| 500/500 [00:19<00:00, 25.66episode/s, Episode Reward=-575] 


Nuevo mejor modelo guardado con test_value: -563.48


Training Progress: 100%|██████████| 500/500 [00:10<00:00, 49.76episode/s, Episode Reward=-439] 


Nuevo mejor modelo guardado con test_value: -399.34


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 68.41episode/s, Episode Reward=-162]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 88.88episode/s, Episode Reward=-63]  


Nuevo mejor modelo guardado con test_value: -356.92


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 98.34episode/s, Episode Reward=-129] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 111.87episode/s, Episode Reward=-211]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 125.64episode/s, Episode Reward=-88] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 139.93episode/s, Episode Reward=-94] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 151.32episode/s, Episode Reward=-86] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 132.42episode/s, Episode Reward=-80] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 153.15episode/s, Episode Reward=-45] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 179.72episode/s, Episode Reward=-18]


VBox(children=(Label(value='0.346 MB of 0.346 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▂▇▆█▇▅▁▃▅▅▇█
train_value,▁▅▆▇▇▇▇█████

0,1
alpha,0.05251
epsilon,0.0
t,11.0
test_value,-370.32
train_value,-37.446


[34m[1mwandb[0m: Agent Starting Run: ybyvlecs with config:
[34m[1mwandb[0m: 	action_bins: 28
[34m[1mwandb[0m: 	alpha: 0.6499418122404514
[34m[1mwandb[0m: 	epsilon: 0.12463949145583976
[34m[1mwandb[0m: 	gamma: 0.7781468547978617
[34m[1mwandb[0m: 	vel_bins: 36
[34m[1mwandb[0m: 	x_bins: 75


Training Progress: 100%|██████████| 500/500 [00:19<00:00, 25.11episode/s, Episode Reward=-433] 


Nuevo mejor modelo guardado con test_value: -794.84


Training Progress: 100%|██████████| 500/500 [00:10<00:00, 45.80episode/s, Episode Reward=-390] 


Nuevo mejor modelo guardado con test_value: -590.48


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 67.52episode/s, Episode Reward=-336]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 77.16episode/s, Episode Reward=-154]


Nuevo mejor modelo guardado con test_value: -512.32


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 84.38episode/s, Episode Reward=-223] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 101.33episode/s, Episode Reward=-198]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 111.79episode/s, Episode Reward=-85] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 121.17episode/s, Episode Reward=-179]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 124.85episode/s, Episode Reward=-21] 


Nuevo mejor modelo guardado con test_value: -459.62


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 138.05episode/s, Episode Reward=-69] 


Nuevo mejor modelo guardado con test_value: -432.34


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 146.89episode/s, Episode Reward=-55] 


Nuevo mejor modelo guardado con test_value: -408.38


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 162.55episode/s, Episode Reward=-93] 


Nuevo mejor modelo guardado con test_value: -342.32


VBox(children=(Label(value='0.162 MB of 0.609 MB uploaded\r'), FloatProgress(value=0.26537525719963384, max=1.…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▄▃▅▄▅▄▅▆▇▇█
train_value,▁▅▆▆▇▇▇▇████

0,1
alpha,0.05416
epsilon,0.0
t,11.0
test_value,-342.32
train_value,-47.918


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: x9yt64gm with config:
[34m[1mwandb[0m: 	action_bins: 24
[34m[1mwandb[0m: 	alpha: 0.5958807213570544
[34m[1mwandb[0m: 	epsilon: 0.1691716498631921
[34m[1mwandb[0m: 	gamma: 0.7295346708644701
[34m[1mwandb[0m: 	vel_bins: 38
[34m[1mwandb[0m: 	x_bins: 58


Training Progress: 100%|██████████| 500/500 [00:17<00:00, 28.45episode/s, Episode Reward=-360] 


Nuevo mejor modelo guardado con test_value: -560.94


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 57.82episode/s, Episode Reward=-182]


Nuevo mejor modelo guardado con test_value: -445.42


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 81.52episode/s, Episode Reward=-152]


Nuevo mejor modelo guardado con test_value: -441.82


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 98.89episode/s, Episode Reward=-213] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 117.08episode/s, Episode Reward=-55] 


Nuevo mejor modelo guardado con test_value: -377.86


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 137.05episode/s, Episode Reward=-90] 


Nuevo mejor modelo guardado con test_value: -361.16


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 155.81episode/s, Episode Reward=-56] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 165.74episode/s, Episode Reward=-49] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 177.44episode/s, Episode Reward=-49] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 176.75episode/s, Episode Reward=-91]
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 219.63episode/s, Episode Reward=-45] 


Nuevo mejor modelo guardado con test_value: -309.28


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 219.02episode/s, Episode Reward=-10]


Nuevo mejor modelo guardado con test_value: -297.18


VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▄▄▃▆▆▅▅▄▆██
train_value,▁▅▆▇▇▇██████

0,1
alpha,0.04966
epsilon,0.0
t,11.0
test_value,-297.18
train_value,-6.2


[34m[1mwandb[0m: Agent Starting Run: wnpzrcj2 with config:
[34m[1mwandb[0m: 	action_bins: 20
[34m[1mwandb[0m: 	alpha: 0.7694541723655108
[34m[1mwandb[0m: 	epsilon: 0.10396768785162484
[34m[1mwandb[0m: 	gamma: 0.7212324099728049
[34m[1mwandb[0m: 	vel_bins: 57
[34m[1mwandb[0m: 	x_bins: 54


Training Progress: 100%|██████████| 500/500 [00:14<00:00, 35.31episode/s, Episode Reward=-295] 


Nuevo mejor modelo guardado con test_value: -742.48


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 72.19episode/s, Episode Reward=-430]


Nuevo mejor modelo guardado con test_value: -593.2


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 96.70episode/s, Episode Reward=-230] 


Nuevo mejor modelo guardado con test_value: -556.96


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 112.59episode/s, Episode Reward=-103]


Nuevo mejor modelo guardado con test_value: -510.44


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 133.94episode/s, Episode Reward=-82] 


Nuevo mejor modelo guardado con test_value: -460.2


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 156.09episode/s, Episode Reward=-139]


Nuevo mejor modelo guardado con test_value: -446.78


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 172.82episode/s, Episode Reward=-23] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 191.22episode/s, Episode Reward=-55] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 210.99episode/s, Episode Reward=-10] 


Nuevo mejor modelo guardado con test_value: -431.72


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 227.20episode/s, Episode Reward=22]  
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 256.62episode/s, Episode Reward=23] 


Nuevo mejor modelo guardado con test_value: -380.1


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 220.83episode/s, Episode Reward=-10]


Nuevo mejor modelo guardado con test_value: -365.26


VBox(children=(Label(value='0.495 MB of 0.495 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▄▄▅▆▆▆▅▇▇██
train_value,▁▅▆▇▇▇▇█████

0,1
alpha,0.06412
epsilon,0.0
t,11.0
test_value,-365.26
train_value,-7.936


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: jyny7azs with config:
[34m[1mwandb[0m: 	action_bins: 27
[34m[1mwandb[0m: 	alpha: 0.66068116369139
[34m[1mwandb[0m: 	epsilon: 0.19798177928561497
[34m[1mwandb[0m: 	gamma: 0.5215023042439473
[34m[1mwandb[0m: 	vel_bins: 41
[34m[1mwandb[0m: 	x_bins: 66


Training Progress: 100%|██████████| 500/500 [00:20<00:00, 24.80episode/s, Episode Reward=-406] 


Nuevo mejor modelo guardado con test_value: -651.44


Training Progress: 100%|██████████| 500/500 [00:11<00:00, 41.88episode/s, Episode Reward=-667] 
Training Progress: 100%|██████████| 500/500 [00:08<00:00, 57.92episode/s, Episode Reward=-300]


Nuevo mejor modelo guardado con test_value: -588.04


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 67.87episode/s, Episode Reward=-254]


Nuevo mejor modelo guardado con test_value: -499.76


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 80.59episode/s, Episode Reward=-134] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 89.38episode/s, Episode Reward=-84]  
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 94.56episode/s, Episode Reward=-96]  
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 109.18episode/s, Episode Reward=-103]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 106.91episode/s, Episode Reward=-118]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 122.94episode/s, Episode Reward=-243]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 128.60episode/s, Episode Reward=-61] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 139.82episode/s, Episode Reward=-59] 


Nuevo mejor modelo guardado con test_value: -472.1


VBox(children=(Label(value='0.588 MB of 0.588 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▂▂▄▇▇▂▁▁▁▄▄█
train_value,▁▄▆▆▇▇▇█████

0,1
alpha,0.05506
epsilon,0.0
t,11.0
test_value,-472.1
train_value,-83.992


[34m[1mwandb[0m: Agent Starting Run: 9f5y7557 with config:
[34m[1mwandb[0m: 	action_bins: 25
[34m[1mwandb[0m: 	alpha: 0.63331215615757
[34m[1mwandb[0m: 	epsilon: 0.14885899037322972
[34m[1mwandb[0m: 	gamma: 0.7557620526409652
[34m[1mwandb[0m: 	vel_bins: 39
[34m[1mwandb[0m: 	x_bins: 52


Training Progress: 100%|██████████| 500/500 [00:18<00:00, 26.61episode/s, Episode Reward=-337] 


Nuevo mejor modelo guardado con test_value: -715.56


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 58.03episode/s, Episode Reward=-129] 


Nuevo mejor modelo guardado con test_value: -522.12


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 78.95episode/s, Episode Reward=-225]


Nuevo mejor modelo guardado con test_value: -440.6


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 89.52episode/s, Episode Reward=-94]  


Nuevo mejor modelo guardado con test_value: -428.06


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 111.46episode/s, Episode Reward=-175]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 122.82episode/s, Episode Reward=-158]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 149.80episode/s, Episode Reward=-62] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 167.43episode/s, Episode Reward=-15] 


Nuevo mejor modelo guardado con test_value: -377.84


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 171.34episode/s, Episode Reward=-17] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 203.96episode/s, Episode Reward=21] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 181.70episode/s, Episode Reward=19] 


Nuevo mejor modelo guardado con test_value: -305.5


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 213.70episode/s, Episode Reward=-8] 


VBox(children=(Label(value='0.413 MB of 0.413 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▄▆▆▅▆▆▇▇▇██
train_value,▁▅▆▇▇▇▇█████

0,1
alpha,0.05278
epsilon,0.0
t,11.0
test_value,-312.66
train_value,-23.648


[34m[1mwandb[0m: Agent Starting Run: gh3eqiy1 with config:
[34m[1mwandb[0m: 	action_bins: 28
[34m[1mwandb[0m: 	alpha: 0.7297184718074943
[34m[1mwandb[0m: 	epsilon: 0.13930534583348697
[34m[1mwandb[0m: 	gamma: 0.6017202840541527
[34m[1mwandb[0m: 	vel_bins: 22
[34m[1mwandb[0m: 	x_bins: 60


Training Progress: 100%|██████████| 500/500 [00:18<00:00, 26.37episode/s, Episode Reward=-265] 


Nuevo mejor modelo guardado con test_value: -870.62


Training Progress: 100%|██████████| 500/500 [00:09<00:00, 52.38episode/s, Episode Reward=-302]


Nuevo mejor modelo guardado con test_value: -677.74


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 74.65episode/s, Episode Reward=-166]


Nuevo mejor modelo guardado con test_value: -611.3


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 98.32episode/s, Episode Reward=-125] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 103.61episode/s, Episode Reward=-137]


Nuevo mejor modelo guardado con test_value: -513.54


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 127.27episode/s, Episode Reward=-55] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 142.85episode/s, Episode Reward=-18] 


Nuevo mejor modelo guardado con test_value: -509.3


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 149.06episode/s, Episode Reward=-130]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 140.32episode/s, Episode Reward=-48] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 180.98episode/s, Episode Reward=-69] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 157.13episode/s, Episode Reward=-33] 


Nuevo mejor modelo guardado con test_value: -470.48


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 193.78episode/s, Episode Reward=21]  


Nuevo mejor modelo guardado con test_value: -383.98


VBox(children=(Label(value='0.177 MB of 0.308 MB uploaded\r'), FloatProgress(value=0.5753623412581788, max=1.0…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▄▅▄▆▅▆▅▅▅▇█
train_value,▁▅▆▇▇▇▇█▇███

0,1
alpha,0.06081
epsilon,0.0
t,11.0
test_value,-383.98
train_value,-20.268


[34m[1mwandb[0m: Agent Starting Run: gendmp97 with config:
[34m[1mwandb[0m: 	action_bins: 20
[34m[1mwandb[0m: 	alpha: 0.6832794683025527
[34m[1mwandb[0m: 	epsilon: 0.6342427076882168
[34m[1mwandb[0m: 	gamma: 0.5167066835592592
[34m[1mwandb[0m: 	vel_bins: 24
[34m[1mwandb[0m: 	x_bins: 46


Training Progress: 100%|██████████| 500/500 [00:18<00:00, 27.51episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -540.96


Training Progress: 100%|██████████| 500/500 [00:15<00:00, 32.68episode/s, Episode Reward=-608] 


Nuevo mejor modelo guardado con test_value: -440.06


Training Progress: 100%|██████████| 500/500 [00:11<00:00, 42.35episode/s, Episode Reward=-146] 


Nuevo mejor modelo guardado con test_value: -438.82


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 61.15episode/s, Episode Reward=-136] 
Training Progress: 100%|██████████| 500/500 [00:07<00:00, 68.57episode/s, Episode Reward=-184]


Nuevo mejor modelo guardado con test_value: -426.6


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 87.87episode/s, Episode Reward=-88]  
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 122.24episode/s, Episode Reward=-117]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 134.50episode/s, Episode Reward=-47] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 147.60episode/s, Episode Reward=-40] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 197.60episode/s, Episode Reward=-36]
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 225.36episode/s, Episode Reward=-44] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 195.44episode/s, Episode Reward=-36]


VBox(children=(Label(value='0.187 MB of 0.187 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▅██▇█▆▁▃▃▃▅█
train_value,▁▂▄▅▆▇▇▇████

0,1
alpha,0.05694
epsilon,0.0
t,11.0
test_value,-447.66
train_value,-34.57


[34m[1mwandb[0m: Agent Starting Run: fu8c1h1v with config:
[34m[1mwandb[0m: 	action_bins: 22
[34m[1mwandb[0m: 	alpha: 0.7772290356820079
[34m[1mwandb[0m: 	epsilon: 0.43506648590570607
[34m[1mwandb[0m: 	gamma: 0.598826824325674
[34m[1mwandb[0m: 	vel_bins: 18
[34m[1mwandb[0m: 	x_bins: 49


Training Progress: 100%|██████████| 500/500 [00:21<00:00, 23.21episode/s, Episode Reward=-478] 


Nuevo mejor modelo guardado con test_value: -677.6


Training Progress: 100%|██████████| 500/500 [00:17<00:00, 28.77episode/s, Episode Reward=-331] 
Training Progress: 100%|██████████| 500/500 [00:13<00:00, 36.55episode/s, Episode Reward=-430] 


Nuevo mejor modelo guardado con test_value: -547.12


Training Progress: 100%|██████████| 500/500 [00:10<00:00, 47.19episode/s, Episode Reward=-394] 
Training Progress: 100%|██████████| 500/500 [00:08<00:00, 60.16episode/s, Episode Reward=-131] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 88.93episode/s, Episode Reward=-171] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 100.74episode/s, Episode Reward=-137]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 117.14episode/s, Episode Reward=-56] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 148.60episode/s, Episode Reward=-48] 


Nuevo mejor modelo guardado con test_value: -519.24


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 162.91episode/s, Episode Reward=-44] 


Nuevo mejor modelo guardado con test_value: -492.14


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 139.76episode/s, Episode Reward=-45] 


Nuevo mejor modelo guardado con test_value: -370.72


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 193.41episode/s, Episode Reward=-34] 


Nuevo mejor modelo guardado con test_value: -340.78


VBox(children=(Label(value='0.162 MB of 0.168 MB uploaded\r'), FloatProgress(value=0.9639650908263923, max=1.0…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▂▂▅▁▄▁▄▃▅▆██
train_value,▁▃▄▅▆▇▇▇████

0,1
alpha,0.06477
epsilon,0.0
t,11.0
test_value,-340.78
train_value,-26.948


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: xteg1rvc with config:
[34m[1mwandb[0m: 	action_bins: 26
[34m[1mwandb[0m: 	alpha: 0.6717135882514333
[34m[1mwandb[0m: 	epsilon: 0.4363061910208321
[34m[1mwandb[0m: 	gamma: 0.6131309011295213
[34m[1mwandb[0m: 	vel_bins: 20
[34m[1mwandb[0m: 	x_bins: 59


Training Progress: 100%|██████████| 500/500 [00:20<00:00, 24.48episode/s, Episode Reward=-403] 


Nuevo mejor modelo guardado con test_value: -694.56


Training Progress: 100%|██████████| 500/500 [00:12<00:00, 39.40episode/s, Episode Reward=-206] 


Nuevo mejor modelo guardado con test_value: -443.38


Training Progress: 100%|██████████| 500/500 [00:09<00:00, 51.54episode/s, Episode Reward=-341] 
Training Progress: 100%|██████████| 500/500 [00:08<00:00, 60.19episode/s, Episode Reward=-201]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 87.97episode/s, Episode Reward=-130] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 96.75episode/s, Episode Reward=-176] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 112.90episode/s, Episode Reward=-56] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 128.87episode/s, Episode Reward=-59] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 137.82episode/s, Episode Reward=-106]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 134.07episode/s, Episode Reward=-43] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 166.88episode/s, Episode Reward=-50] 


Nuevo mejor modelo guardado con test_value: -396.58


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 192.04episode/s, Episode Reward=-61]


Nuevo mejor modelo guardado con test_value: -364.34


VBox(children=(Label(value='0.146 MB of 0.258 MB uploaded\r'), FloatProgress(value=0.5651432242628326, max=1.0…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▂▇▅▆▆▄▄▂▁▆▇█
train_value,▁▄▅▆▇▇▇▇████

0,1
alpha,0.05598
epsilon,0.0
t,11.0
test_value,-364.34
train_value,-27.362


[34m[1mwandb[0m: Agent Starting Run: 1khfj79k with config:
[34m[1mwandb[0m: 	action_bins: 20
[34m[1mwandb[0m: 	alpha: 0.6719726707865263
[34m[1mwandb[0m: 	epsilon: 0.25724038055192544
[34m[1mwandb[0m: 	gamma: 0.6203108869604994
[34m[1mwandb[0m: 	vel_bins: 15
[34m[1mwandb[0m: 	x_bins: 65


Training Progress: 100%|██████████| 500/500 [00:17<00:00, 29.15episode/s, Episode Reward=-328] 


Nuevo mejor modelo guardado con test_value: -786.24


Training Progress: 100%|██████████| 500/500 [00:10<00:00, 48.68episode/s, Episode Reward=-368] 


Nuevo mejor modelo guardado con test_value: -702.0


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 56.08episode/s, Episode Reward=-296]
Training Progress: 100%|██████████| 500/500 [00:07<00:00, 68.59episode/s, Episode Reward=-239] 


Nuevo mejor modelo guardado con test_value: -586.14


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 89.23episode/s, Episode Reward=-120] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 101.00episode/s, Episode Reward=-150]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 127.23episode/s, Episode Reward=-92] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 146.04episode/s, Episode Reward=-58] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 173.73episode/s, Episode Reward=-43] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 181.02episode/s, Episode Reward=-4]  
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 176.34episode/s, Episode Reward=-36]
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 180.63episode/s, Episode Reward=-47] 


VBox(children=(Label(value='0.169 MB of 0.169 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▃▅▃█▂▄▄▁▇█▇█
train_value,▁▄▅▆▆▇▇█████

0,1
alpha,0.056
epsilon,0.0
t,11.0
test_value,-590.7
train_value,-36.706


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: pk1oczyp with config:
[34m[1mwandb[0m: 	action_bins: 28
[34m[1mwandb[0m: 	alpha: 0.7569781453698743
[34m[1mwandb[0m: 	epsilon: 0.37983886169139713
[34m[1mwandb[0m: 	gamma: 0.6166130538165109
[34m[1mwandb[0m: 	vel_bins: 35
[34m[1mwandb[0m: 	x_bins: 64


Training Progress: 100%|██████████| 500/500 [00:18<00:00, 26.50episode/s, Episode Reward=-756] 


Nuevo mejor modelo guardado con test_value: -510.48


Training Progress: 100%|██████████| 500/500 [00:10<00:00, 46.59episode/s, Episode Reward=-331] 


Nuevo mejor modelo guardado con test_value: -402.84


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 66.12episode/s, Episode Reward=-381]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 79.93episode/s, Episode Reward=-158] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 95.14episode/s, Episode Reward=-246] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 107.36episode/s, Episode Reward=-98] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 117.03episode/s, Episode Reward=-77] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 122.41episode/s, Episode Reward=-51] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 134.60episode/s, Episode Reward=-30] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 146.90episode/s, Episode Reward=-51] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 148.91episode/s, Episode Reward=-51] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 151.79episode/s, Episode Reward=-90] 


Nuevo mejor modelo guardado con test_value: -365.9


VBox(children=(Label(value='0.302 MB of 0.508 MB uploaded\r'), FloatProgress(value=0.5943890519917834, max=1.0…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▄▇▇▅▆▄▄▃▁▄▅█
train_value,▁▅▆▇▇▇██████

0,1
alpha,0.06308
epsilon,0.0
t,11.0
test_value,-365.9
train_value,-64.334


[34m[1mwandb[0m: Agent Starting Run: goc8sgbw with config:
[34m[1mwandb[0m: 	action_bins: 21
[34m[1mwandb[0m: 	alpha: 0.9506568449867644
[34m[1mwandb[0m: 	epsilon: 0.16298146842460684
[34m[1mwandb[0m: 	gamma: 0.5841976298554536
[34m[1mwandb[0m: 	vel_bins: 19
[34m[1mwandb[0m: 	x_bins: 54


Training Progress: 100%|██████████| 500/500 [00:17<00:00, 29.19episode/s, Episode Reward=-218] 


Nuevo mejor modelo guardado con test_value: -748.02


Training Progress: 100%|██████████| 500/500 [00:09<00:00, 52.56episode/s, Episode Reward=-165] 


Nuevo mejor modelo guardado con test_value: -512.32


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 62.43episode/s, Episode Reward=-324] 
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 77.64episode/s, Episode Reward=-221] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 94.85episode/s, Episode Reward=-133] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 119.34episode/s, Episode Reward=-129]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 140.66episode/s, Episode Reward=-53] 


Nuevo mejor modelo guardado con test_value: -483.04


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 174.23episode/s, Episode Reward=-45] 


Nuevo mejor modelo guardado con test_value: -471.72


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 148.70episode/s, Episode Reward=-57] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 186.73episode/s, Episode Reward=-40] 


Nuevo mejor modelo guardado con test_value: -419.54


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 188.50episode/s, Episode Reward=-42]


Nuevo mejor modelo guardado con test_value: -364.92


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 177.57episode/s, Episode Reward=-36]


VBox(children=(Label(value='0.185 MB of 0.185 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▅▃▁▃▃▆▆▅▇██
train_value,▁▅▅▆▇▇▇█████

0,1
alpha,0.07922
epsilon,0.0
t,11.0
test_value,-374.32
train_value,-39.08


[34m[1mwandb[0m: Agent Starting Run: pkbcx1qd with config:
[34m[1mwandb[0m: 	action_bins: 38
[34m[1mwandb[0m: 	alpha: 0.8945546294800264
[34m[1mwandb[0m: 	epsilon: 0.2818242810787873
[34m[1mwandb[0m: 	gamma: 0.5528103387429448
[34m[1mwandb[0m: 	vel_bins: 22
[34m[1mwandb[0m: 	x_bins: 38


Training Progress: 100%|██████████| 500/500 [00:26<00:00, 18.61episode/s, Episode Reward=-236] 


Nuevo mejor modelo guardado con test_value: -700.4


Training Progress: 100%|██████████| 500/500 [00:16<00:00, 30.30episode/s, Episode Reward=-428] 
Training Progress: 100%|██████████| 500/500 [00:13<00:00, 36.24episode/s, Episode Reward=-435] 
Training Progress: 100%|██████████| 500/500 [00:10<00:00, 47.62episode/s, Episode Reward=-241] 
Training Progress: 100%|██████████| 500/500 [00:08<00:00, 60.04episode/s, Episode Reward=-110]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 72.55episode/s, Episode Reward=-151] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 104.01episode/s, Episode Reward=-50] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 110.59episode/s, Episode Reward=-30] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 113.32episode/s, Episode Reward=-28] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 124.34episode/s, Episode Reward=-25] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 169.00episode/s, Episode Reward=14]  


Nuevo mejor modelo guardado con test_value: -676.5


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 144.02episode/s, Episode Reward=-44] 


Nuevo mejor modelo guardado con test_value: -606.52


VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▆▅▃▄▅▁▂▃▅▅▆█
train_value,▁▄▅▆▆▇▇▇████

0,1
alpha,0.07455
epsilon,0.0
t,11.0
test_value,-606.52
train_value,-43.69


[34m[1mwandb[0m: Agent Starting Run: r31yicy1 with config:
[34m[1mwandb[0m: 	action_bins: 20
[34m[1mwandb[0m: 	alpha: 0.6281621260957266
[34m[1mwandb[0m: 	epsilon: 0.2131884523473575
[34m[1mwandb[0m: 	gamma: 0.7759442351927364
[34m[1mwandb[0m: 	vel_bins: 58
[34m[1mwandb[0m: 	x_bins: 71


Training Progress: 100%|██████████| 500/500 [00:15<00:00, 32.17episode/s, Episode Reward=-699] 


Nuevo mejor modelo guardado con test_value: -705.66


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 57.12episode/s, Episode Reward=-298] 


Nuevo mejor modelo guardado con test_value: -539.08


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 76.79episode/s, Episode Reward=-304]


Nuevo mejor modelo guardado con test_value: -521.42


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 99.28episode/s, Episode Reward=-221] 


Nuevo mejor modelo guardado con test_value: -521.28


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 105.28episode/s, Episode Reward=-104]


Nuevo mejor modelo guardado con test_value: -463.88


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 134.47episode/s, Episode Reward=-68] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 109.01episode/s, Episode Reward=-135]


Nuevo mejor modelo guardado con test_value: -451.8


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 139.14episode/s, Episode Reward=-55] 


Nuevo mejor modelo guardado con test_value: -429.96


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 171.83episode/s, Episode Reward=-61] 


Nuevo mejor modelo guardado con test_value: -391.34


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 189.87episode/s, Episode Reward=-56] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 204.28episode/s, Episode Reward=13]  


Nuevo mejor modelo guardado con test_value: -335.46


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 208.30episode/s, Episode Reward=-62] 


VBox(children=(Label(value='0.657 MB of 0.657 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▄▄▄▆▅▆▆▇▆██
train_value,▁▅▆▇▇▇▇█████

0,1
alpha,0.05235
epsilon,0.0
t,11.0
test_value,-357.74
train_value,-35.744


[34m[1mwandb[0m: Agent Starting Run: n6gpk5cs with config:
[34m[1mwandb[0m: 	action_bins: 29
[34m[1mwandb[0m: 	alpha: 0.8796791055410734
[34m[1mwandb[0m: 	epsilon: 0.12837526796566376
[34m[1mwandb[0m: 	gamma: 0.5995371330019161
[34m[1mwandb[0m: 	vel_bins: 46
[34m[1mwandb[0m: 	x_bins: 72


Training Progress: 100%|██████████| 500/500 [00:20<00:00, 24.84episode/s, Episode Reward=-506] 


Nuevo mejor modelo guardado con test_value: -755.78


Training Progress: 100%|██████████| 500/500 [00:10<00:00, 46.13episode/s, Episode Reward=-148] 


Nuevo mejor modelo guardado con test_value: -645.36


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 60.90episode/s, Episode Reward=-377]


Nuevo mejor modelo guardado con test_value: -551.76


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 75.80episode/s, Episode Reward=-253] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 83.48episode/s, Episode Reward=-221] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 95.43episode/s, Episode Reward=-179] 


Nuevo mejor modelo guardado con test_value: -537.74


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 106.60episode/s, Episode Reward=-135]


Nuevo mejor modelo guardado con test_value: -519.2


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 108.94episode/s, Episode Reward=-170]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 119.48episode/s, Episode Reward=-133]


Nuevo mejor modelo guardado con test_value: -515.72


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 125.85episode/s, Episode Reward=-91] 


Nuevo mejor modelo guardado con test_value: -461.54


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 143.07episode/s, Episode Reward=-36] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 150.05episode/s, Episode Reward=-120]


Nuevo mejor modelo guardado con test_value: -371.22


VBox(children=(Label(value='0.768 MB of 0.768 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▃▅▄▄▅▅▅▅▆▆█
train_value,▁▅▆▇▇▇▇▇████

0,1
alpha,0.07331
epsilon,0.0
t,11.0
test_value,-371.22
train_value,-74.844


[34m[1mwandb[0m: Agent Starting Run: 66fx3fkj with config:
[34m[1mwandb[0m: 	action_bins: 23
[34m[1mwandb[0m: 	alpha: 0.56383390158755
[34m[1mwandb[0m: 	epsilon: 0.20003859894415588
[34m[1mwandb[0m: 	gamma: 0.7641603521754166
[34m[1mwandb[0m: 	vel_bins: 46
[34m[1mwandb[0m: 	x_bins: 57


Training Progress: 100%|██████████| 500/500 [00:17<00:00, 29.03episode/s, Episode Reward=-888] 


Nuevo mejor modelo guardado con test_value: -617.24


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 58.40episode/s, Episode Reward=-286] 


Nuevo mejor modelo guardado con test_value: -515.3


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 78.78episode/s, Episode Reward=-191] 


Nuevo mejor modelo guardado con test_value: -486.52


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 95.22episode/s, Episode Reward=-145] 


Nuevo mejor modelo guardado con test_value: -441.02


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 109.09episode/s, Episode Reward=-138]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 94.97episode/s, Episode Reward=-45]  


Nuevo mejor modelo guardado con test_value: -367.9


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 146.63episode/s, Episode Reward=-59] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 157.68episode/s, Episode Reward=-48] 


Nuevo mejor modelo guardado con test_value: -327.86


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 186.49episode/s, Episode Reward=-88] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 191.41episode/s, Episode Reward=17] 


Nuevo mejor modelo guardado con test_value: -325.92


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 227.89episode/s, Episode Reward=-10]


Nuevo mejor modelo guardado con test_value: -267.08


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 207.18episode/s, Episode Reward=-15]


Nuevo mejor modelo guardado con test_value: -238.48


VBox(children=(Label(value='0.146 MB of 0.487 MB uploaded\r'), FloatProgress(value=0.3001873784829979, max=1.0…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▃▃▄▄▆▅▆▆▆▇█
train_value,▁▅▆▆▇▇▇█████

0,1
alpha,0.04699
epsilon,0.0
t,11.0
test_value,-238.48
train_value,-19.736


[34m[1mwandb[0m: Agent Starting Run: y4hgjksf with config:
[34m[1mwandb[0m: 	action_bins: 21
[34m[1mwandb[0m: 	alpha: 0.831756821111793
[34m[1mwandb[0m: 	epsilon: 0.19627336831709075
[34m[1mwandb[0m: 	gamma: 0.6265712151205087
[34m[1mwandb[0m: 	vel_bins: 43
[34m[1mwandb[0m: 	x_bins: 66


Training Progress: 100%|██████████| 500/500 [00:14<00:00, 33.75episode/s, Episode Reward=-235] 


Nuevo mejor modelo guardado con test_value: -624.18


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 63.05episode/s, Episode Reward=-130] 


Nuevo mejor modelo guardado con test_value: -479.22


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 84.55episode/s, Episode Reward=-163] 


Nuevo mejor modelo guardado con test_value: -444.68


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 99.39episode/s, Episode Reward=-163] 


Nuevo mejor modelo guardado con test_value: -426.46


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 117.64episode/s, Episode Reward=-119]


Nuevo mejor modelo guardado con test_value: -401.42


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 135.35episode/s, Episode Reward=-139]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 144.52episode/s, Episode Reward=-118]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 162.43episode/s, Episode Reward=-63] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 186.49episode/s, Episode Reward=-97] 


Nuevo mejor modelo guardado con test_value: -356.78


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 196.87episode/s, Episode Reward=20]  
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 217.80episode/s, Episode Reward=-7]  


Nuevo mejor modelo guardado con test_value: -316.38


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 246.37episode/s, Episode Reward=24] 


Nuevo mejor modelo guardado con test_value: -246.88


VBox(children=(Label(value='0.481 MB of 0.481 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▄▄▅▅▅▄▅▆▆▇█
train_value,▁▅▆▆▇▇▇▇████

0,1
alpha,0.06931
epsilon,0.0
t,11.0
test_value,-246.88
train_value,-7.412


[34m[1mwandb[0m: Agent Starting Run: 9b5ka02x with config:
[34m[1mwandb[0m: 	action_bins: 26
[34m[1mwandb[0m: 	alpha: 0.9611513956347562
[34m[1mwandb[0m: 	epsilon: 0.21113218655125632
[34m[1mwandb[0m: 	gamma: 0.6601701134834584
[34m[1mwandb[0m: 	vel_bins: 44
[34m[1mwandb[0m: 	x_bins: 60


Training Progress: 100%|██████████| 500/500 [00:17<00:00, 28.82episode/s, Episode Reward=-384] 


Nuevo mejor modelo guardado con test_value: -555.32


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 60.63episode/s, Episode Reward=-403]


Nuevo mejor modelo guardado con test_value: -472.32


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 82.10episode/s, Episode Reward=-151] 


Nuevo mejor modelo guardado con test_value: -427.16


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 101.13episode/s, Episode Reward=-137]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 112.34episode/s, Episode Reward=-131]


Nuevo mejor modelo guardado con test_value: -406.44


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 121.53episode/s, Episode Reward=-66] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 147.28episode/s, Episode Reward=-95] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 163.20episode/s, Episode Reward=-86] 


Nuevo mejor modelo guardado con test_value: -394.4


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 166.99episode/s, Episode Reward=-65] 


Nuevo mejor modelo guardado con test_value: -388.0


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 207.35episode/s, Episode Reward=-49] 


Nuevo mejor modelo guardado con test_value: -349.96


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 191.97episode/s, Episode Reward=-4] 


Nuevo mejor modelo guardado con test_value: -312.04


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 226.35episode/s, Episode Reward=26] 


Nuevo mejor modelo guardado con test_value: -284.54


VBox(children=(Label(value='0.381 MB of 0.553 MB uploaded\r'), FloatProgress(value=0.6878579738871892, max=1.0…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▃▄▄▅▄▃▅▅▆▇█
train_value,▁▅▆▇▇▇▇█████

0,1
alpha,0.0801
epsilon,0.0
t,11.0
test_value,-284.54
train_value,-12.696


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: jm3jsr2s with config:
[34m[1mwandb[0m: 	action_bins: 27
[34m[1mwandb[0m: 	alpha: 0.9257224774635868
[34m[1mwandb[0m: 	epsilon: 0.10823270041053688
[34m[1mwandb[0m: 	gamma: 0.7634349345517435
[34m[1mwandb[0m: 	vel_bins: 61
[34m[1mwandb[0m: 	x_bins: 71


Training Progress: 100%|██████████| 500/500 [00:18<00:00, 26.65episode/s, Episode Reward=-603] 


Nuevo mejor modelo guardado con test_value: -863.8


Training Progress: 100%|██████████| 500/500 [00:10<00:00, 49.79episode/s, Episode Reward=-263] 


Nuevo mejor modelo guardado con test_value: -636.84


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 73.71episode/s, Episode Reward=-225]


Nuevo mejor modelo guardado con test_value: -571.24


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 90.90episode/s, Episode Reward=-122] 


Nuevo mejor modelo guardado con test_value: -491.82


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 100.73episode/s, Episode Reward=-150]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 107.94episode/s, Episode Reward=-92] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 123.67episode/s, Episode Reward=-90] 


Nuevo mejor modelo guardado con test_value: -363.56


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 135.55episode/s, Episode Reward=-98] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 145.17episode/s, Episode Reward=-55] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 160.14episode/s, Episode Reward=-54] 


Nuevo mejor modelo guardado con test_value: -343.74


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 181.00episode/s, Episode Reward=-61] 


Nuevo mejor modelo guardado con test_value: -329.12


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 183.06episode/s, Episode Reward=-58] 


Nuevo mejor modelo guardado con test_value: -302.98


VBox(children=(Label(value='0.929 MB of 0.929 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▄▅▆▆▅▇▇▇▇██
train_value,▁▅▆▇▇▇▇█████

0,1
alpha,0.07714
epsilon,0.0
t,11.0
test_value,-302.98
train_value,-38.686


[34m[1mwandb[0m: Agent Starting Run: o2tz5kq6 with config:
[34m[1mwandb[0m: 	action_bins: 37
[34m[1mwandb[0m: 	alpha: 0.9851003805757106
[34m[1mwandb[0m: 	epsilon: 0.22461603679445497
[34m[1mwandb[0m: 	gamma: 0.6678420162110537
[34m[1mwandb[0m: 	vel_bins: 61
[34m[1mwandb[0m: 	x_bins: 61


Training Progress: 100%|██████████| 500/500 [00:24<00:00, 20.16episode/s, Episode Reward=-624] 


Nuevo mejor modelo guardado con test_value: -842.32


Training Progress: 100%|██████████| 500/500 [00:13<00:00, 37.80episode/s, Episode Reward=-218] 


Nuevo mejor modelo guardado con test_value: -623.28


Training Progress: 100%|██████████| 500/500 [00:09<00:00, 50.78episode/s, Episode Reward=-140]


Nuevo mejor modelo guardado con test_value: -591.94


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 62.17episode/s, Episode Reward=-190]
Training Progress: 100%|██████████| 500/500 [00:07<00:00, 70.13episode/s, Episode Reward=-282]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 79.38episode/s, Episode Reward=-176] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 83.78episode/s, Episode Reward=-330] 


Nuevo mejor modelo guardado con test_value: -576.48


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 91.02episode/s, Episode Reward=-298] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 94.11episode/s, Episode Reward=-66]  
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 98.93episode/s, Episode Reward=-126] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 104.89episode/s, Episode Reward=-132]


Nuevo mejor modelo guardado con test_value: -530.08


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 117.31episode/s, Episode Reward=-88] 


Nuevo mejor modelo guardado con test_value: -466.66


VBox(children=(Label(value='0.380 MB of 1.094 MB uploaded\r'), FloatProgress(value=0.3477266803981371, max=1.0…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▅▆▅▅▄▆▅▅▆▇█
train_value,▁▅▆▇▇▇▇█████

0,1
alpha,0.08209
epsilon,0.0
t,11.0
test_value,-466.66
train_value,-105.322


[34m[1mwandb[0m: Agent Starting Run: s2qo8vkq with config:
[34m[1mwandb[0m: 	action_bins: 21
[34m[1mwandb[0m: 	alpha: 0.958795050621398
[34m[1mwandb[0m: 	epsilon: 0.29904222885708653
[34m[1mwandb[0m: 	gamma: 0.6043801785902658
[34m[1mwandb[0m: 	vel_bins: 87
[34m[1mwandb[0m: 	x_bins: 76


Training Progress: 100%|██████████| 500/500 [00:15<00:00, 31.46episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -732.26


Training Progress: 100%|██████████| 500/500 [00:10<00:00, 49.58episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -406.1


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 64.54episode/s, Episode Reward=-386] 


Nuevo mejor modelo guardado con test_value: -349.76


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 86.27episode/s, Episode Reward=-175] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 96.88episode/s, Episode Reward=-248] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 106.24episode/s, Episode Reward=-238]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 111.71episode/s, Episode Reward=-62] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 111.32episode/s, Episode Reward=-297]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 114.06episode/s, Episode Reward=-211]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 119.43episode/s, Episode Reward=-212]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 131.22episode/s, Episode Reward=-171]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 134.29episode/s, Episode Reward=-202]


VBox(children=(Label(value='0.552 MB of 1.094 MB uploaded\r'), FloatProgress(value=0.5045035468099622, max=1.0…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▇███▇█▇▇▇▆▇
train_value,▁▅▆▇▇▇██████

0,1
alpha,0.0799
epsilon,0.0
t,11.0
test_value,-385.48
train_value,-122.528


[34m[1mwandb[0m: Agent Starting Run: sawl2a56 with config:
[34m[1mwandb[0m: 	action_bins: 20
[34m[1mwandb[0m: 	alpha: 0.9258417936615582
[34m[1mwandb[0m: 	epsilon: 0.11297848282236904
[34m[1mwandb[0m: 	gamma: 0.632810747915294
[34m[1mwandb[0m: 	vel_bins: 61
[34m[1mwandb[0m: 	x_bins: 82


Training Progress: 100%|██████████| 500/500 [00:15<00:00, 32.69episode/s, Episode Reward=-284] 


Nuevo mejor modelo guardado con test_value: -723.68


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 63.00episode/s, Episode Reward=-224] 


Nuevo mejor modelo guardado con test_value: -590.64


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 87.50episode/s, Episode Reward=-280] 


Nuevo mejor modelo guardado con test_value: -556.58


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 101.70episode/s, Episode Reward=-146]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 115.61episode/s, Episode Reward=-139]


Nuevo mejor modelo guardado con test_value: -499.94


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 123.95episode/s, Episode Reward=-213]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 131.54episode/s, Episode Reward=-134]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 137.33episode/s, Episode Reward=-140]


Nuevo mejor modelo guardado con test_value: -470.8


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 141.62episode/s, Episode Reward=-86] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 169.84episode/s, Episode Reward=-137]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 162.60episode/s, Episode Reward=-243]
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 190.26episode/s, Episode Reward=-17] 


Nuevo mejor modelo guardado con test_value: -451.64


VBox(children=(Label(value='0.412 MB of 0.794 MB uploaded\r'), FloatProgress(value=0.5184748366299592, max=1.0…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▄▅▄▇▆▄█▆▅▇█
train_value,▁▅▆▇▇▇▇█████

0,1
alpha,0.07715
epsilon,0.0
t,11.0
test_value,-451.64
train_value,-69.118


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: fpk8rye9 with config:
[34m[1mwandb[0m: 	action_bins: 25
[34m[1mwandb[0m: 	alpha: 0.8546892201748173
[34m[1mwandb[0m: 	epsilon: 0.2595278964367916
[34m[1mwandb[0m: 	gamma: 0.6280514029233557
[34m[1mwandb[0m: 	vel_bins: 26
[34m[1mwandb[0m: 	x_bins: 79


Training Progress: 100%|██████████| 500/500 [00:18<00:00, 27.45episode/s, Episode Reward=-322] 


Nuevo mejor modelo guardado con test_value: -691.22


Training Progress: 100%|██████████| 500/500 [00:09<00:00, 51.40episode/s, Episode Reward=-348] 


Nuevo mejor modelo guardado con test_value: -647.92


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 69.90episode/s, Episode Reward=-332]


Nuevo mejor modelo guardado con test_value: -548.3


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 88.01episode/s, Episode Reward=-304] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 109.97episode/s, Episode Reward=-124]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 123.15episode/s, Episode Reward=-59] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 132.64episode/s, Episode Reward=-97] 


Nuevo mejor modelo guardado con test_value: -548.06


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 151.44episode/s, Episode Reward=-43] 


Nuevo mejor modelo guardado con test_value: -478.42


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 156.12episode/s, Episode Reward=-50] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 196.41episode/s, Episode Reward=-12] 


Nuevo mejor modelo guardado con test_value: -430.04


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 194.68episode/s, Episode Reward=-11] 


Nuevo mejor modelo guardado con test_value: -320.28


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 201.89episode/s, Episode Reward=-61] 


Nuevo mejor modelo guardado con test_value: -307.28


VBox(children=(Label(value='0.421 MB of 0.421 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▂▄▃▃▃▄▅▄▆██
train_value,▁▅▆▆▇▇▇█████

0,1
alpha,0.07122
epsilon,0.0
t,11.0
test_value,-307.28
train_value,-37.972


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: npy0u8hy with config:
[34m[1mwandb[0m: 	action_bins: 23
[34m[1mwandb[0m: 	alpha: 0.9852312495429156
[34m[1mwandb[0m: 	epsilon: 0.3421507422798944
[34m[1mwandb[0m: 	gamma: 0.6118763874266767
[34m[1mwandb[0m: 	vel_bins: 27
[34m[1mwandb[0m: 	x_bins: 98


Training Progress: 100%|██████████| 500/500 [00:17<00:00, 29.12episode/s, Episode Reward=-530] 


Nuevo mejor modelo guardado con test_value: -662.0


Training Progress: 100%|██████████| 500/500 [00:09<00:00, 52.02episode/s, Episode Reward=-609] 


Nuevo mejor modelo guardado con test_value: -505.38


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 67.67episode/s, Episode Reward=-339]


Nuevo mejor modelo guardado con test_value: -462.98


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 87.59episode/s, Episode Reward=-177] 


Nuevo mejor modelo guardado con test_value: -441.34


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 101.16episode/s, Episode Reward=-140]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 130.89episode/s, Episode Reward=-121]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 141.97episode/s, Episode Reward=-106]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 147.15episode/s, Episode Reward=-117]


Nuevo mejor modelo guardado con test_value: -417.44


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 159.33episode/s, Episode Reward=-36] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 183.48episode/s, Episode Reward=-41]


Nuevo mejor modelo guardado con test_value: -416.78


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 214.53episode/s, Episode Reward=-43]


Nuevo mejor modelo guardado con test_value: -368.62


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 252.78episode/s, Episode Reward=-2] 


Nuevo mejor modelo guardado con test_value: -300.76


VBox(children=(Label(value='0.495 MB of 0.495 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▄▅▅▃▄▄▆▅▆▇█
train_value,▁▄▅▆▇▇▇▇▇███

0,1
alpha,0.0821
epsilon,0.0
t,11.0
test_value,-300.76
train_value,-1.176


[34m[1mwandb[0m: Agent Starting Run: e0589xl5 with config:
[34m[1mwandb[0m: 	action_bins: 25
[34m[1mwandb[0m: 	alpha: 0.9563640196766752
[34m[1mwandb[0m: 	epsilon: 0.5168572540670725
[34m[1mwandb[0m: 	gamma: 0.6100720483945419
[34m[1mwandb[0m: 	vel_bins: 19
[34m[1mwandb[0m: 	x_bins: 92


Training Progress: 100%|██████████| 500/500 [00:22<00:00, 21.86episode/s, Episode Reward=-675] 


Nuevo mejor modelo guardado con test_value: -721.72


Training Progress: 100%|██████████| 500/500 [00:19<00:00, 25.67episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -648.94


Training Progress: 100%|██████████| 500/500 [00:16<00:00, 29.55episode/s, Episode Reward=-528] 


Nuevo mejor modelo guardado con test_value: -599.14


Training Progress: 100%|██████████| 500/500 [00:12<00:00, 41.26episode/s, Episode Reward=-383] 


Nuevo mejor modelo guardado con test_value: -525.82


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 62.89episode/s, Episode Reward=-687] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 85.59episode/s, Episode Reward=-60]  
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 105.71episode/s, Episode Reward=-94] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 130.33episode/s, Episode Reward=-226]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 129.49episode/s, Episode Reward=-49] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 187.11episode/s, Episode Reward=-54] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 169.69episode/s, Episode Reward=-53]


Nuevo mejor modelo guardado con test_value: -436.8


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 191.82episode/s, Episode Reward=-53]


Nuevo mejor modelo guardado con test_value: -385.06


VBox(children=(Label(value='0.363 MB of 0.363 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▃▄▅▃▂▄▂▄▅▇█
train_value,▁▂▃▅▆▇▇█████

0,1
alpha,0.0797
epsilon,0.0
t,11.0
test_value,-385.06
train_value,-39.284


[34m[1mwandb[0m: Agent Starting Run: zfrvtwyq with config:
[34m[1mwandb[0m: 	action_bins: 24
[34m[1mwandb[0m: 	alpha: 0.9239733100132664
[34m[1mwandb[0m: 	epsilon: 0.24155078456572837
[34m[1mwandb[0m: 	gamma: 0.5840491860883744
[34m[1mwandb[0m: 	vel_bins: 34
[34m[1mwandb[0m: 	x_bins: 82


Training Progress: 100%|██████████| 500/500 [00:16<00:00, 31.11episode/s, Episode Reward=-500] 


Nuevo mejor modelo guardado con test_value: -651.76


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 56.65episode/s, Episode Reward=-284]


Nuevo mejor modelo guardado con test_value: -528.24


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 76.09episode/s, Episode Reward=-302] 


Nuevo mejor modelo guardado con test_value: -439.22


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 94.09episode/s, Episode Reward=-141] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 115.09episode/s, Episode Reward=-219]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 122.97episode/s, Episode Reward=-57] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 127.27episode/s, Episode Reward=-126]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 107.27episode/s, Episode Reward=-38] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 114.71episode/s, Episode Reward=-77] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 153.73episode/s, Episode Reward=-44] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 193.31episode/s, Episode Reward=12] 


Nuevo mejor modelo guardado con test_value: -395.26


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 221.23episode/s, Episode Reward=-2] 


Nuevo mejor modelo guardado con test_value: -340.9


VBox(children=(Label(value='0.541 MB of 0.541 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▄▆▄▅▃▅▃▆▅▇█
train_value,▁▅▆▆▇▇▇▇████

0,1
alpha,0.077
epsilon,0.0
t,11.0
test_value,-340.9
train_value,-14.16


[34m[1mwandb[0m: Agent Starting Run: z9j6mer5 with config:
[34m[1mwandb[0m: 	action_bins: 31
[34m[1mwandb[0m: 	alpha: 0.9893779373396484
[34m[1mwandb[0m: 	epsilon: 0.23801646950899205
[34m[1mwandb[0m: 	gamma: 0.649058922754693
[34m[1mwandb[0m: 	vel_bins: 23
[34m[1mwandb[0m: 	x_bins: 89


Training Progress: 100%|██████████| 500/500 [00:21<00:00, 23.58episode/s, Episode Reward=-388] 


Nuevo mejor modelo guardado con test_value: -707.16


Training Progress: 100%|██████████| 500/500 [00:11<00:00, 42.78episode/s, Episode Reward=-269] 


Nuevo mejor modelo guardado con test_value: -684.18


Training Progress: 100%|██████████| 500/500 [00:09<00:00, 54.30episode/s, Episode Reward=-371]


Nuevo mejor modelo guardado con test_value: -597.62


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 71.24episode/s, Episode Reward=-126] 


Nuevo mejor modelo guardado con test_value: -577.66


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 92.24episode/s, Episode Reward=-278] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 108.72episode/s, Episode Reward=-132]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 115.29episode/s, Episode Reward=-85] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 127.23episode/s, Episode Reward=-55] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 148.10episode/s, Episode Reward=-118]


Nuevo mejor modelo guardado con test_value: -549.9


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 137.44episode/s, Episode Reward=-59] 


Nuevo mejor modelo guardado con test_value: -465.26


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 140.40episode/s, Episode Reward=-55] 


Nuevo mejor modelo guardado con test_value: -449.1


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 167.91episode/s, Episode Reward=-45] 


Nuevo mejor modelo guardado con test_value: -399.44


VBox(children=(Label(value='0.287 MB of 0.520 MB uploaded\r'), FloatProgress(value=0.5517832962080873, max=1.0…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▂▃▄▃▂▄▄▅▇▇█
train_value,▁▅▆▆▇▇▇█████

0,1
alpha,0.08245
epsilon,0.0
t,11.0
test_value,-399.44
train_value,-46.914


[34m[1mwandb[0m: Agent Starting Run: k0dpyg4g with config:
[34m[1mwandb[0m: 	action_bins: 24
[34m[1mwandb[0m: 	alpha: 0.94008003198579
[34m[1mwandb[0m: 	epsilon: 0.1021367280738593
[34m[1mwandb[0m: 	gamma: 0.7370042374676822
[34m[1mwandb[0m: 	vel_bins: 31
[34m[1mwandb[0m: 	x_bins: 71


Training Progress: 100%|██████████| 500/500 [00:15<00:00, 31.28episode/s, Episode Reward=-520] 


Nuevo mejor modelo guardado con test_value: -826.04


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 68.44episode/s, Episode Reward=-286]


Nuevo mejor modelo guardado con test_value: -721.5


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 92.33episode/s, Episode Reward=-210] 


Nuevo mejor modelo guardado con test_value: -601.68


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 117.74episode/s, Episode Reward=-49] 


Nuevo mejor modelo guardado con test_value: -558.4


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 120.21episode/s, Episode Reward=-134]


Nuevo mejor modelo guardado con test_value: -521.46


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 115.13episode/s, Episode Reward=-53] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 175.27episode/s, Episode Reward=-60] 


Nuevo mejor modelo guardado con test_value: -393.3


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 177.94episode/s, Episode Reward=-18] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 194.30episode/s, Episode Reward=-49]


Nuevo mejor modelo guardado con test_value: -379.04


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 223.94episode/s, Episode Reward=-52]


Nuevo mejor modelo guardado con test_value: -344.14


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 226.61episode/s, Episode Reward=4]  


Nuevo mejor modelo guardado con test_value: -286.0


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 204.77episode/s, Episode Reward=-42]


Nuevo mejor modelo guardado con test_value: -253.8


VBox(children=(Label(value='0.431 MB of 0.431 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▂▄▄▅▅▆▆▆▇██
train_value,▁▅▆▇▇▇██████

0,1
alpha,0.07834
epsilon,0.0
t,11.0
test_value,-253.8
train_value,-27.39


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 83jog0qx with config:
[34m[1mwandb[0m: 	action_bins: 21
[34m[1mwandb[0m: 	alpha: 0.9833746920497702
[34m[1mwandb[0m: 	epsilon: 0.1517499398982298
[34m[1mwandb[0m: 	gamma: 0.7643074159213292
[34m[1mwandb[0m: 	vel_bins: 43
[34m[1mwandb[0m: 	x_bins: 68


Training Progress: 100%|██████████| 500/500 [00:14<00:00, 34.30episode/s, Episode Reward=-202] 


Nuevo mejor modelo guardado con test_value: -612.3


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 79.92episode/s, Episode Reward=-148]


Nuevo mejor modelo guardado con test_value: -483.48


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 106.63episode/s, Episode Reward=-99] 


Nuevo mejor modelo guardado con test_value: -391.88


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 125.06episode/s, Episode Reward=-88] 


Nuevo mejor modelo guardado con test_value: -362.86


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 148.47episode/s, Episode Reward=-75] 


Nuevo mejor modelo guardado con test_value: -349.76


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 173.33episode/s, Episode Reward=-87] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 162.62episode/s, Episode Reward=-9]  


Nuevo mejor modelo guardado con test_value: -330.26


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 225.76episode/s, Episode Reward=-42]


Nuevo mejor modelo guardado con test_value: -313.94


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 222.94episode/s, Episode Reward=-44]


Nuevo mejor modelo guardado con test_value: -272.02


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 250.65episode/s, Episode Reward=-48]


Nuevo mejor modelo guardado con test_value: -241.52


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 238.44episode/s, Episode Reward=22] 


Nuevo mejor modelo guardado con test_value: -198.94


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 260.17episode/s, Episode Reward=3]  


Nuevo mejor modelo guardado con test_value: -191.58


VBox(children=(Label(value='0.495 MB of 0.495 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▃▅▅▅▅▆▆▇▇██
train_value,▁▅▆▇▇▇██████

0,1
alpha,0.08195
epsilon,0.0
t,11.0
test_value,-191.58
train_value,-3.43


[34m[1mwandb[0m: Agent Starting Run: kot4v7hv with config:
[34m[1mwandb[0m: 	action_bins: 24
[34m[1mwandb[0m: 	alpha: 0.9262866650713866
[34m[1mwandb[0m: 	epsilon: 0.14565597612891584
[34m[1mwandb[0m: 	gamma: 0.6668780616981389
[34m[1mwandb[0m: 	vel_bins: 42
[34m[1mwandb[0m: 	x_bins: 55


Training Progress: 100%|██████████| 500/500 [00:15<00:00, 32.37episode/s, Episode Reward=-354] 


Nuevo mejor modelo guardado con test_value: -568.86


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 67.26episode/s, Episode Reward=-256]


Nuevo mejor modelo guardado con test_value: -517.72


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 92.56episode/s, Episode Reward=-188] 


Nuevo mejor modelo guardado con test_value: -491.46


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 106.32episode/s, Episode Reward=-70] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 131.98episode/s, Episode Reward=-31] 


Nuevo mejor modelo guardado con test_value: -488.72


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 132.89episode/s, Episode Reward=-74] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 160.62episode/s, Episode Reward=-55] 


Nuevo mejor modelo guardado con test_value: -455.16


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 189.34episode/s, Episode Reward=-6]  


Nuevo mejor modelo guardado con test_value: -385.9


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 205.34episode/s, Episode Reward=-40] 


Nuevo mejor modelo guardado con test_value: -340.66


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 241.60episode/s, Episode Reward=27] 


Nuevo mejor modelo guardado con test_value: -321.74


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 248.01episode/s, Episode Reward=27] 


Nuevo mejor modelo guardado con test_value: -256.44


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 207.99episode/s, Episode Reward=-10]


VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▂▃▃▃▃▄▅▆▇██
train_value,▁▅▆▇▇▇▇█████

0,1
alpha,0.07719
epsilon,0.0
t,11.0
test_value,-258.64
train_value,-26.466


[34m[1mwandb[0m: Agent Starting Run: j5spbbc4 with config:
[34m[1mwandb[0m: 	action_bins: 27
[34m[1mwandb[0m: 	alpha: 0.9843862689348062
[34m[1mwandb[0m: 	epsilon: 0.7153034713066594
[34m[1mwandb[0m: 	gamma: 0.6140397370079576
[34m[1mwandb[0m: 	vel_bins: 53
[34m[1mwandb[0m: 	x_bins: 82


Training Progress: 100%|██████████| 500/500 [00:17<00:00, 29.12episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -811.26


Training Progress: 100%|██████████| 500/500 [00:14<00:00, 33.48episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -257.94


Training Progress: 100%|██████████| 500/500 [00:10<00:00, 46.70episode/s, Episode Reward=-273] 


Nuevo mejor modelo guardado con test_value: -236.7


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 65.16episode/s, Episode Reward=-464]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 84.70episode/s, Episode Reward=-273] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 97.86episode/s, Episode Reward=-236] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 106.66episode/s, Episode Reward=-218]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 108.29episode/s, Episode Reward=-110]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 118.86episode/s, Episode Reward=-144]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 123.89episode/s, Episode Reward=-216]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 129.82episode/s, Episode Reward=-134]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 141.66episode/s, Episode Reward=-47] 


VBox(children=(Label(value='0.458 MB of 0.932 MB uploaded\r'), FloatProgress(value=0.4918590873091398, max=1.0…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁███▇▇▆▅▄▅▅▆
train_value,▁▂▄▆▇▇▇█████

0,1
alpha,0.08203
epsilon,0.0
t,11.0
test_value,-382.4
train_value,-94.646


[34m[1mwandb[0m: Agent Starting Run: cnjm53je with config:
[34m[1mwandb[0m: 	action_bins: 24
[34m[1mwandb[0m: 	alpha: 0.9265255094315336
[34m[1mwandb[0m: 	epsilon: 0.34396068723610995
[34m[1mwandb[0m: 	gamma: 0.7220160599941642
[34m[1mwandb[0m: 	vel_bins: 52
[34m[1mwandb[0m: 	x_bins: 94


Training Progress: 100%|██████████| 500/500 [00:17<00:00, 29.12episode/s, Episode Reward=-751] 


Nuevo mejor modelo guardado con test_value: -725.36


Training Progress: 100%|██████████| 500/500 [00:10<00:00, 49.26episode/s, Episode Reward=-294] 


Nuevo mejor modelo guardado con test_value: -429.62


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 69.69episode/s, Episode Reward=-295]


Nuevo mejor modelo guardado con test_value: -364.02


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 85.69episode/s, Episode Reward=-177] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 104.55episode/s, Episode Reward=-101]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 112.82episode/s, Episode Reward=-129]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 114.14episode/s, Episode Reward=-120]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 126.19episode/s, Episode Reward=-130]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 134.59episode/s, Episode Reward=-174]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 121.46episode/s, Episode Reward=-134]


Nuevo mejor modelo guardado con test_value: -355.88


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 115.70episode/s, Episode Reward=-143]


Nuevo mejor modelo guardado con test_value: -313.5


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 169.64episode/s, Episode Reward=-53] 


Nuevo mejor modelo guardado con test_value: -300.14


VBox(children=(Label(value='0.931 MB of 0.931 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▆▇▆▇▆▇▅▇▇██
train_value,▁▅▆▇▇▇██████

0,1
alpha,0.07721
epsilon,0.0
t,11.0
test_value,-300.14
train_value,-73.59


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: e4y1a9jj with config:
[34m[1mwandb[0m: 	action_bins: 21
[34m[1mwandb[0m: 	alpha: 0.8729189598744268
[34m[1mwandb[0m: 	epsilon: 0.5971217542822369
[34m[1mwandb[0m: 	gamma: 0.685963461278088
[34m[1mwandb[0m: 	vel_bins: 70
[34m[1mwandb[0m: 	x_bins: 97


Training Progress: 100%|██████████| 500/500 [00:15<00:00, 32.49episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -833.8


Training Progress: 100%|██████████| 500/500 [00:12<00:00, 40.19episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -303.7


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 56.07episode/s, Episode Reward=-278] 


Nuevo mejor modelo guardado con test_value: -255.98


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 79.70episode/s, Episode Reward=-552] 


Nuevo mejor modelo guardado con test_value: -255.14


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 89.59episode/s, Episode Reward=-277] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 102.46episode/s, Episode Reward=-137]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 107.03episode/s, Episode Reward=-213]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 114.18episode/s, Episode Reward=-198]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 114.23episode/s, Episode Reward=-212]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 112.49episode/s, Episode Reward=-155]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 114.86episode/s, Episode Reward=-218]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 114.23episode/s, Episode Reward=-97] 


VBox(children=(Label(value='1.005 MB of 1.124 MB uploaded\r'), FloatProgress(value=0.894651840938563, max=1.0)…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▇██▇▇▇▆▆▅▆▇
train_value,▁▃▅▆▇▇██████

0,1
alpha,0.07274
epsilon,0.0
t,11.0
test_value,-364.84
train_value,-127.414


[34m[1mwandb[0m: Agent Starting Run: fo6offqs with config:
[34m[1mwandb[0m: 	action_bins: 22
[34m[1mwandb[0m: 	alpha: 0.8398660862349248
[34m[1mwandb[0m: 	epsilon: 0.2980447774134176
[34m[1mwandb[0m: 	gamma: 0.6972592359305821
[34m[1mwandb[0m: 	vel_bins: 51
[34m[1mwandb[0m: 	x_bins: 83


Training Progress: 100%|██████████| 500/500 [00:16<00:00, 30.19episode/s, Episode Reward=-444] 


Nuevo mejor modelo guardado con test_value: -658.06


Training Progress: 100%|██████████| 500/500 [00:09<00:00, 55.02episode/s, Episode Reward=-286] 


Nuevo mejor modelo guardado con test_value: -400.8


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 73.17episode/s, Episode Reward=-356]


Nuevo mejor modelo guardado con test_value: -364.34


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 89.39episode/s, Episode Reward=-187] 


Nuevo mejor modelo guardado con test_value: -340.4


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 104.38episode/s, Episode Reward=-235]


Nuevo mejor modelo guardado con test_value: -301.78


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 115.09episode/s, Episode Reward=-89] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 131.88episode/s, Episode Reward=-83] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 141.01episode/s, Episode Reward=-102]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 159.23episode/s, Episode Reward=-56] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 167.90episode/s, Episode Reward=-88] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 170.01episode/s, Episode Reward=-88] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 191.33episode/s, Episode Reward=-15] 


Nuevo mejor modelo guardado con test_value: -287.16


VBox(children=(Label(value='0.742 MB of 0.742 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▆▇▇█▇▇▆▇▆▇█
train_value,▁▅▆▆▇▇▇▇████

0,1
alpha,0.06999
epsilon,0.0
t,11.0
test_value,-287.16
train_value,-41.734


[34m[1mwandb[0m: Agent Starting Run: 33t57v81 with config:
[34m[1mwandb[0m: 	action_bins: 25
[34m[1mwandb[0m: 	alpha: 0.984236527104827
[34m[1mwandb[0m: 	epsilon: 0.18659977618954413
[34m[1mwandb[0m: 	gamma: 0.8246239529847597
[34m[1mwandb[0m: 	vel_bins: 48
[34m[1mwandb[0m: 	x_bins: 70


Training Progress: 100%|██████████| 500/500 [00:18<00:00, 27.50episode/s, Episode Reward=-738] 


Nuevo mejor modelo guardado con test_value: -762.94


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 58.21episode/s, Episode Reward=-343]


Nuevo mejor modelo guardado con test_value: -617.86


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 89.52episode/s, Episode Reward=-137] 


Nuevo mejor modelo guardado con test_value: -459.56


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 112.65episode/s, Episode Reward=-97] 


Nuevo mejor modelo guardado con test_value: -395.82


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 140.58episode/s, Episode Reward=-22] 


Nuevo mejor modelo guardado con test_value: -388.72


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 153.17episode/s, Episode Reward=-48] 


Nuevo mejor modelo guardado con test_value: -341.42


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 153.77episode/s, Episode Reward=-19] 


Nuevo mejor modelo guardado con test_value: -311.5


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 193.57episode/s, Episode Reward=-17] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 217.14episode/s, Episode Reward=-13]


Nuevo mejor modelo guardado con test_value: -306.24


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 209.34episode/s, Episode Reward=-14]


Nuevo mejor modelo guardado con test_value: -249.26


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 241.74episode/s, Episode Reward=-22]
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 255.23episode/s, Episode Reward=-34]


Nuevo mejor modelo guardado con test_value: -217.72


VBox(children=(Label(value='0.672 MB of 0.672 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▃▅▆▆▆▇▇▇███
train_value,▁▅▆▇▇▇██████

0,1
alpha,0.08202
epsilon,0.0
t,11.0
test_value,-217.72
train_value,0.496


[34m[1mwandb[0m: Agent Starting Run: ufsuu4jm with config:
[34m[1mwandb[0m: 	action_bins: 23
[34m[1mwandb[0m: 	alpha: 0.9675207678662524
[34m[1mwandb[0m: 	epsilon: 0.5142327957907499
[34m[1mwandb[0m: 	gamma: 0.7616652821449676
[34m[1mwandb[0m: 	vel_bins: 87
[34m[1mwandb[0m: 	x_bins: 89


Training Progress: 100%|██████████| 500/500 [00:16<00:00, 30.87episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -910.1


Training Progress: 100%|██████████| 500/500 [00:13<00:00, 37.79episode/s, Episode Reward=-616] 


Nuevo mejor modelo guardado con test_value: -387.68


Training Progress: 100%|██████████| 500/500 [00:09<00:00, 55.08episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -328.8


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 73.15episode/s, Episode Reward=-558]


Nuevo mejor modelo guardado con test_value: -255.98


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 92.34episode/s, Episode Reward=-203] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 104.13episode/s, Episode Reward=-339]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 111.18episode/s, Episode Reward=-192]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 114.01episode/s, Episode Reward=-250]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 112.53episode/s, Episode Reward=-185]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 127.20episode/s, Episode Reward=-142]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 130.19episode/s, Episode Reward=-125]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 146.69episode/s, Episode Reward=-122]


VBox(children=(Label(value='0.740 MB of 1.399 MB uploaded\r'), FloatProgress(value=0.528820023480356, max=1.0)…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▇▇██▇▇▇▇▇▇▇
train_value,▁▃▅▆▇▇▇█████

0,1
alpha,0.08063
epsilon,0.0
t,11.0
test_value,-381.28
train_value,-111.374


[34m[1mwandb[0m: Agent Starting Run: 6jf3ls4k with config:
[34m[1mwandb[0m: 	action_bins: 20
[34m[1mwandb[0m: 	alpha: 0.8557192610541047
[34m[1mwandb[0m: 	epsilon: 0.3777799539950304
[34m[1mwandb[0m: 	gamma: 0.6421858786303195
[34m[1mwandb[0m: 	vel_bins: 33
[34m[1mwandb[0m: 	x_bins: 72


Training Progress: 100%|██████████| 500/500 [00:13<00:00, 35.82episode/s, Episode Reward=-556] 


Nuevo mejor modelo guardado con test_value: -592.94


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 65.17episode/s, Episode Reward=-178]


Nuevo mejor modelo guardado con test_value: -496.4


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 88.71episode/s, Episode Reward=-270] 


Nuevo mejor modelo guardado con test_value: -400.62


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 103.12episode/s, Episode Reward=-130]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 114.49episode/s, Episode Reward=-96] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 108.51episode/s, Episode Reward=-147]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 165.30episode/s, Episode Reward=-128]
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 179.95episode/s, Episode Reward=-75] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 214.48episode/s, Episode Reward=-67] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 209.59episode/s, Episode Reward=-57] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 230.39episode/s, Episode Reward=10] 
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 250.29episode/s, Episode Reward=21] 


Nuevo mejor modelo guardado con test_value: -376.72


VBox(children=(Label(value='0.387 MB of 0.387 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▄▇▅▆▄▆▂▂▄▇█
train_value,▁▅▆▆▇▇▇▇████

0,1
alpha,0.07131
epsilon,0.0
t,11.0
test_value,-376.72
train_value,-5.802


[34m[1mwandb[0m: Agent Starting Run: rkaimmyx with config:
[34m[1mwandb[0m: 	action_bins: 20
[34m[1mwandb[0m: 	alpha: 0.9221719550707494
[34m[1mwandb[0m: 	epsilon: 0.21544675506087227
[34m[1mwandb[0m: 	gamma: 0.7894085810783231
[34m[1mwandb[0m: 	vel_bins: 54
[34m[1mwandb[0m: 	x_bins: 82


Training Progress: 100%|██████████| 500/500 [00:14<00:00, 34.26episode/s, Episode Reward=-557] 


Nuevo mejor modelo guardado con test_value: -698.58


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 70.05episode/s, Episode Reward=-139]


Nuevo mejor modelo guardado con test_value: -570.6


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 89.85episode/s, Episode Reward=-146] 


Nuevo mejor modelo guardado con test_value: -487.14


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 120.65episode/s, Episode Reward=-134]


Nuevo mejor modelo guardado con test_value: -461.72


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 141.71episode/s, Episode Reward=-64] 


Nuevo mejor modelo guardado con test_value: -429.96


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 161.11episode/s, Episode Reward=-63] 


Nuevo mejor modelo guardado con test_value: -398.18


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 193.45episode/s, Episode Reward=-54] 


Nuevo mejor modelo guardado con test_value: -396.44


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 211.01episode/s, Episode Reward=-62] 


Nuevo mejor modelo guardado con test_value: -339.66


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 228.22episode/s, Episode Reward=-15] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 213.71episode/s, Episode Reward=-10] 


Nuevo mejor modelo guardado con test_value: -290.46


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 247.40episode/s, Episode Reward=24] 


Nuevo mejor modelo guardado con test_value: -270.48


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 251.84episode/s, Episode Reward=-15]


Nuevo mejor modelo guardado con test_value: -262.92


VBox(children=(Label(value='0.706 MB of 0.706 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▃▄▅▅▆▆▇▇███
train_value,▁▅▆▇▇▇▇█████

0,1
alpha,0.07685
epsilon,0.0
t,11.0
test_value,-262.92
train_value,-4.97


[34m[1mwandb[0m: Agent Starting Run: 0818pt73 with config:
[34m[1mwandb[0m: 	action_bins: 23
[34m[1mwandb[0m: 	alpha: 0.9424449063197003
[34m[1mwandb[0m: 	epsilon: 0.1849293172428916
[34m[1mwandb[0m: 	gamma: 0.8708676513846711
[34m[1mwandb[0m: 	vel_bins: 62
[34m[1mwandb[0m: 	x_bins: 59


Training Progress: 100%|██████████| 500/500 [00:16<00:00, 29.92episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -885.0


Training Progress: 100%|██████████| 500/500 [00:11<00:00, 43.62episode/s, Episode Reward=-232] 


Nuevo mejor modelo guardado con test_value: -843.7


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 72.84episode/s, Episode Reward=-173] 


Nuevo mejor modelo guardado con test_value: -746.56


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 98.72episode/s, Episode Reward=-153] 


Nuevo mejor modelo guardado con test_value: -712.34


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 123.54episode/s, Episode Reward=-101]


Nuevo mejor modelo guardado con test_value: -510.3


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 141.81episode/s, Episode Reward=-131]


Nuevo mejor modelo guardado con test_value: -472.42


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 179.18episode/s, Episode Reward=-28] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 187.98episode/s, Episode Reward=-106]
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 210.49episode/s, Episode Reward=-18] 


Nuevo mejor modelo guardado con test_value: -454.0


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 181.36episode/s, Episode Reward=-84] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 222.68episode/s, Episode Reward=-30]


Nuevo mejor modelo guardado con test_value: -352.24


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 240.42episode/s, Episode Reward=-14]


Nuevo mejor modelo guardado con test_value: -342.64


VBox(children=(Label(value='0.662 MB of 0.672 MB uploaded\r'), FloatProgress(value=0.9846426914449405, max=1.0…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▂▃▃▆▆▅▅▇▆██
train_value,▁▄▆▇▇▇██████

0,1
alpha,0.07854
epsilon,0.0
t,11.0
test_value,-342.64
train_value,-13.016


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 9eq3f7mr with config:
[34m[1mwandb[0m: 	action_bins: 20
[34m[1mwandb[0m: 	alpha: 0.9879919824957568
[34m[1mwandb[0m: 	epsilon: 0.17021348328365377
[34m[1mwandb[0m: 	gamma: 0.7235691816669276
[34m[1mwandb[0m: 	vel_bins: 46
[34m[1mwandb[0m: 	x_bins: 39


Training Progress: 100%|██████████| 500/500 [00:12<00:00, 39.81episode/s, Episode Reward=-217] 


Nuevo mejor modelo guardado con test_value: -613.0


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 86.78episode/s, Episode Reward=-95]  


Nuevo mejor modelo guardado con test_value: -597.7


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 111.65episode/s, Episode Reward=-147]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 124.62episode/s, Episode Reward=-145]


Nuevo mejor modelo guardado con test_value: -579.14


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 152.92episode/s, Episode Reward=-91] 


Nuevo mejor modelo guardado con test_value: -570.9


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 161.74episode/s, Episode Reward=-47] 


Nuevo mejor modelo guardado con test_value: -527.02


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 187.45episode/s, Episode Reward=-57] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 203.71episode/s, Episode Reward=-38] 


Nuevo mejor modelo guardado con test_value: -507.72


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 152.29episode/s, Episode Reward=-46] 


Nuevo mejor modelo guardado con test_value: -424.3


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 144.98episode/s, Episode Reward=-50]


Nuevo mejor modelo guardado con test_value: -367.78


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 192.48episode/s, Episode Reward=24] 


Nuevo mejor modelo guardado con test_value: -337.22


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 270.19episode/s, Episode Reward=24] 


Nuevo mejor modelo guardado con test_value: -312.34


VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▁▁▂▂▃▂▃▅▇▇█
train_value,▁▅▆▇▇▇▇█▇███

0,1
alpha,0.08233
epsilon,0.0
t,11.0
test_value,-312.34
train_value,-2.686


[34m[1mwandb[0m: Agent Starting Run: xkdrjen7 with config:
[34m[1mwandb[0m: 	action_bins: 42
[34m[1mwandb[0m: 	alpha: 0.9628925920316328
[34m[1mwandb[0m: 	epsilon: 0.8011059702162843
[34m[1mwandb[0m: 	gamma: 0.5963877282708689
[34m[1mwandb[0m: 	vel_bins: 80
[34m[1mwandb[0m: 	x_bins: 99


Training Progress: 100%|██████████| 500/500 [00:19<00:00, 25.06episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -987.6


Training Progress: 100%|██████████| 500/500 [00:18<00:00, 26.85episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -939.62


Training Progress: 100%|██████████| 500/500 [00:18<00:00, 26.97episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -735.0


Training Progress: 100%|██████████| 500/500 [00:16<00:00, 30.25episode/s, Episode Reward=-486] 


Nuevo mejor modelo guardado con test_value: -438.62


Training Progress: 100%|██████████| 500/500 [00:12<00:00, 40.11episode/s, Episode Reward=-573] 
Training Progress: 100%|██████████| 500/500 [00:10<00:00, 49.15episode/s, Episode Reward=-494] 
Training Progress: 100%|██████████| 500/500 [00:09<00:00, 50.71episode/s, Episode Reward=-438] 
Training Progress: 100%|██████████| 500/500 [00:10<00:00, 48.65episode/s, Episode Reward=-227] 
Training Progress: 100%|██████████| 500/500 [00:10<00:00, 48.60episode/s, Episode Reward=-292] 
Training Progress: 100%|██████████| 500/500 [00:09<00:00, 50.62episode/s, Episode Reward=-326] 
Training Progress: 100%|██████████| 500/500 [00:09<00:00, 55.12episode/s, Episode Reward=-460]
Training Progress: 100%|██████████| 500/500 [00:08<00:00, 60.43episode/s, Episode Reward=-176]


VBox(children=(Label(value='0.396 MB of 2.605 MB uploaded\r'), FloatProgress(value=0.15200791663661561, max=1.…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▂▄█▇▅▅▃▃▃▃▄
train_value,▁▁▁▃▆▇▇▇▇▇██

0,1
alpha,0.08024
epsilon,0.0
t,11.0
test_value,-788.08
train_value,-300.624


[34m[1mwandb[0m: Agent Starting Run: 9cgdn2e4 with config:
[34m[1mwandb[0m: 	action_bins: 21
[34m[1mwandb[0m: 	alpha: 0.8613974900050991
[34m[1mwandb[0m: 	epsilon: 0.46576350625506147
[34m[1mwandb[0m: 	gamma: 0.5816944563015665
[34m[1mwandb[0m: 	vel_bins: 59
[34m[1mwandb[0m: 	x_bins: 96


Training Progress: 100%|██████████| 500/500 [00:16<00:00, 30.79episode/s, Episode Reward=-555] 


Nuevo mejor modelo guardado con test_value: -633.08


Training Progress: 100%|██████████| 500/500 [00:11<00:00, 43.81episode/s, Episode Reward=-232] 


Nuevo mejor modelo guardado con test_value: -366.9


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 63.40episode/s, Episode Reward=-761] 


Nuevo mejor modelo guardado con test_value: -285.48


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 80.57episode/s, Episode Reward=-367] 
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 76.95episode/s, Episode Reward=-165] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 87.39episode/s, Episode Reward=-99]  
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 109.16episode/s, Episode Reward=-72] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 111.50episode/s, Episode Reward=-172]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 117.45episode/s, Episode Reward=-173]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 120.97episode/s, Episode Reward=-302]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 126.52episode/s, Episode Reward=-126]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 136.57episode/s, Episode Reward=-129]


VBox(children=(Label(value='0.708 MB of 0.941 MB uploaded\r'), FloatProgress(value=0.7526312589920768, max=1.0…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▆██▇▆▆▅▄▅▄▅
train_value,▁▄▆▇▇▇▇█████

0,1
alpha,0.07178
epsilon,0.0
t,11.0
test_value,-412.82
train_value,-114.586


[34m[1mwandb[0m: Agent Starting Run: 50mlxbd5 with config:
[34m[1mwandb[0m: 	action_bins: 22
[34m[1mwandb[0m: 	alpha: 0.9554497877540866
[34m[1mwandb[0m: 	epsilon: 0.1937173680031605
[34m[1mwandb[0m: 	gamma: 0.8053236911010511
[34m[1mwandb[0m: 	vel_bins: 89
[34m[1mwandb[0m: 	x_bins: 62


Training Progress: 100%|██████████| 500/500 [00:16<00:00, 30.94episode/s, Episode Reward=-420] 


Nuevo mejor modelo guardado con test_value: -637.82


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 59.53episode/s, Episode Reward=-360] 


Nuevo mejor modelo guardado con test_value: -465.26


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 80.83episode/s, Episode Reward=-135] 


Nuevo mejor modelo guardado con test_value: -435.04


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 95.81episode/s, Episode Reward=-291] 


Nuevo mejor modelo guardado con test_value: -418.92


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 115.79episode/s, Episode Reward=-132]


Nuevo mejor modelo guardado con test_value: -376.18


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 124.53episode/s, Episode Reward=-100]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 139.58episode/s, Episode Reward=-107]


Nuevo mejor modelo guardado con test_value: -346.06


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 143.37episode/s, Episode Reward=-168]


Nuevo mejor modelo guardado con test_value: -335.42


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 159.64episode/s, Episode Reward=-176]


Nuevo mejor modelo guardado con test_value: -301.12


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 179.56episode/s, Episode Reward=-47] 


Nuevo mejor modelo guardado con test_value: -289.2


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 181.52episode/s, Episode Reward=-92] 


Nuevo mejor modelo guardado con test_value: -279.84


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 204.88episode/s, Episode Reward=-8]  


Nuevo mejor modelo guardado con test_value: -272.0


VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▄▅▅▆▆▇▇▇███
train_value,▁▅▆▇▇▇▇█████

0,1
alpha,0.07962
epsilon,0.0
t,11.0
test_value,-272.0
train_value,-38.97


[34m[1mwandb[0m: Agent Starting Run: 5woj1d1m with config:
[34m[1mwandb[0m: 	action_bins: 47
[34m[1mwandb[0m: 	alpha: 0.94564740536768
[34m[1mwandb[0m: 	epsilon: 0.19486224702607047
[34m[1mwandb[0m: 	gamma: 0.908741647729078
[34m[1mwandb[0m: 	vel_bins: 100
[34m[1mwandb[0m: 	x_bins: 60


Training Progress: 100%|██████████| 500/500 [00:29<00:00, 17.21episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -978.92


Training Progress: 100%|██████████| 500/500 [00:22<00:00, 21.91episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -884.82


Training Progress: 100%|██████████| 500/500 [00:21<00:00, 23.58episode/s, Episode Reward=-677] 
Training Progress: 100%|██████████| 500/500 [00:17<00:00, 28.95episode/s, Episode Reward=-223] 
Training Progress: 100%|██████████| 500/500 [00:12<00:00, 41.52episode/s, Episode Reward=-428] 
Training Progress: 100%|██████████| 500/500 [00:08<00:00, 59.89episode/s, Episode Reward=-204] 
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 80.14episode/s, Episode Reward=-188] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 95.28episode/s, Episode Reward=-151] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 102.73episode/s, Episode Reward=-126]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 105.69episode/s, Episode Reward=-121]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 111.13episode/s, Episode Reward=-120]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 113.64episode/s, Episode Reward=-113]


VBox(children=(Label(value='0.755 MB of 2.218 MB uploaded\r'), FloatProgress(value=0.34043478802890675, max=1.…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▂█▆▅▁▁▁▂▄▂▂▃
train_value,▁▂▃▄▆▇▇█████

0,1
alpha,0.0788
epsilon,0.0
t,11.0
test_value,-974.76
train_value,-76.05


[34m[1mwandb[0m: Agent Starting Run: 468pykw3 with config:
[34m[1mwandb[0m: 	action_bins: 20
[34m[1mwandb[0m: 	alpha: 0.9659055361038194
[34m[1mwandb[0m: 	epsilon: 0.4425877656638315
[34m[1mwandb[0m: 	gamma: 0.7945621080292988
[34m[1mwandb[0m: 	vel_bins: 47
[34m[1mwandb[0m: 	x_bins: 72


Training Progress: 100%|██████████| 500/500 [00:14<00:00, 33.76episode/s, Episode Reward=-680] 


Nuevo mejor modelo guardado con test_value: -700.0


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 61.43episode/s, Episode Reward=-239] 


Nuevo mejor modelo guardado con test_value: -398.98


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 92.86episode/s, Episode Reward=-187] 


Nuevo mejor modelo guardado con test_value: -333.34


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 116.47episode/s, Episode Reward=-221]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 144.20episode/s, Episode Reward=-70] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 150.05episode/s, Episode Reward=-65] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 159.88episode/s, Episode Reward=-59] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 192.46episode/s, Episode Reward=-88] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 188.93episode/s, Episode Reward=-55] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 234.89episode/s, Episode Reward=-41] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 237.18episode/s, Episode Reward=-6] 


Nuevo mejor modelo guardado con test_value: -310.56


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 264.38episode/s, Episode Reward=-17]


Nuevo mejor modelo guardado con test_value: -273.06


VBox(children=(Label(value='0.146 MB of 0.543 MB uploaded\r'), FloatProgress(value=0.2687749602829783, max=1.0…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▆▇▇▇▆▆▆▆▇▇█
train_value,▁▅▆▇▇▇▇█████

0,1
alpha,0.08049
epsilon,0.0
t,11.0
test_value,-273.06
train_value,-3.818


[34m[1mwandb[0m: Agent Starting Run: htn416n8 with config:
[34m[1mwandb[0m: 	action_bins: 20
[34m[1mwandb[0m: 	alpha: 0.9825415017630192
[34m[1mwandb[0m: 	epsilon: 0.6774516465999201
[34m[1mwandb[0m: 	gamma: 0.5899748629769371
[34m[1mwandb[0m: 	vel_bins: 28
[34m[1mwandb[0m: 	x_bins: 63


Training Progress: 100%|██████████| 500/500 [00:16<00:00, 29.54episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -589.3


Training Progress: 100%|██████████| 500/500 [00:15<00:00, 32.25episode/s, Episode Reward=-725] 
Training Progress: 100%|██████████| 500/500 [00:13<00:00, 37.43episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -533.16


Training Progress: 100%|██████████| 500/500 [00:09<00:00, 54.54episode/s, Episode Reward=-255] 
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 82.67episode/s, Episode Reward=-204]


Nuevo mejor modelo guardado con test_value: -510.68


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 115.37episode/s, Episode Reward=-153]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 129.02episode/s, Episode Reward=-63] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 166.47episode/s, Episode Reward=-67] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 181.18episode/s, Episode Reward=-49] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 193.71episode/s, Episode Reward=-23] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 183.91episode/s, Episode Reward=-32] 


Nuevo mejor modelo guardado con test_value: -461.38


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 193.90episode/s, Episode Reward=-31]


Nuevo mejor modelo guardado con test_value: -409.58


VBox(children=(Label(value='0.292 MB of 0.292 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▃▁▄▃▅▂▃▁▁▃▆█
train_value,▁▂▃▅▆▇▇█████

0,1
alpha,0.08188
epsilon,0.0
t,11.0
test_value,-409.58
train_value,-36.486


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: q0jxgre3 with config:
[34m[1mwandb[0m: 	action_bins: 21
[34m[1mwandb[0m: 	alpha: 0.9169855644198984
[34m[1mwandb[0m: 	epsilon: 0.3440556219432024
[34m[1mwandb[0m: 	gamma: 0.6703988971894862
[34m[1mwandb[0m: 	vel_bins: 70
[34m[1mwandb[0m: 	x_bins: 64


Training Progress: 100%|██████████| 500/500 [00:15<00:00, 31.30episode/s, Episode Reward=-415] 


Nuevo mejor modelo guardado con test_value: -560.0


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 56.28episode/s, Episode Reward=-357] 


Nuevo mejor modelo guardado con test_value: -364.56


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 74.64episode/s, Episode Reward=-216]


Nuevo mejor modelo guardado con test_value: -342.4


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 96.99episode/s, Episode Reward=-199] 


Nuevo mejor modelo guardado con test_value: -281.4


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 113.92episode/s, Episode Reward=-216]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 122.28episode/s, Episode Reward=-175]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 131.37episode/s, Episode Reward=-128]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 127.83episode/s, Episode Reward=-127]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 147.72episode/s, Episode Reward=-99] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 151.97episode/s, Episode Reward=-81] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 178.96episode/s, Episode Reward=-62] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 181.61episode/s, Episode Reward=-56] 


VBox(children=(Label(value='0.748 MB of 0.748 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▆▆█▆▆▅▆▆▇▇█
train_value,▁▅▆▇▇▇▇█████

0,1
alpha,0.07642
epsilon,0.0
t,11.0
test_value,-283.88
train_value,-50.998


[34m[1mwandb[0m: Agent Starting Run: liixiqfl with config:
[34m[1mwandb[0m: 	action_bins: 27
[34m[1mwandb[0m: 	alpha: 0.98589145244359
[34m[1mwandb[0m: 	epsilon: 0.27400375944803645
[34m[1mwandb[0m: 	gamma: 0.7662595809602959
[34m[1mwandb[0m: 	vel_bins: 38
[34m[1mwandb[0m: 	x_bins: 64


Training Progress: 100%|██████████| 500/500 [00:17<00:00, 28.43episode/s, Episode Reward=-266] 


Nuevo mejor modelo guardado con test_value: -640.42


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 57.73episode/s, Episode Reward=-308]


Nuevo mejor modelo guardado con test_value: -613.96


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 76.68episode/s, Episode Reward=-164]


Nuevo mejor modelo guardado con test_value: -513.72


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 93.80episode/s, Episode Reward=-261] 


Nuevo mejor modelo guardado con test_value: -440.64


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 108.65episode/s, Episode Reward=-103]


Nuevo mejor modelo guardado con test_value: -439.02


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 114.92episode/s, Episode Reward=-144]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 105.89episode/s, Episode Reward=-134]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 147.00episode/s, Episode Reward=-44] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 161.63episode/s, Episode Reward=-55] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 204.83episode/s, Episode Reward=-49] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 206.81episode/s, Episode Reward=-6] 


Nuevo mejor modelo guardado con test_value: -399.74


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 222.56episode/s, Episode Reward=-7] 


Nuevo mejor modelo guardado con test_value: -302.68


VBox(children=(Label(value='0.531 MB of 0.531 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▂▄▅▅▄▃▅▅▅▆█
train_value,▁▅▆▆▇▇▇▇████

0,1
alpha,0.08216
epsilon,0.0
t,11.0
test_value,-302.68
train_value,-4.404


[34m[1mwandb[0m: Agent Starting Run: 4vzkfe0i with config:
[34m[1mwandb[0m: 	action_bins: 97
[34m[1mwandb[0m: 	alpha: 0.5517580605814869
[34m[1mwandb[0m: 	epsilon: 0.4570040300053674
[34m[1mwandb[0m: 	gamma: 0.9876525751995356
[34m[1mwandb[0m: 	vel_bins: 10
[34m[1mwandb[0m: 	x_bins: 18


Training Progress: 100%|██████████| 500/500 [04:15<00:00,  1.95episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -994.34


Training Progress: 100%|██████████| 500/500 [04:28<00:00,  1.86episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -991.12


Training Progress: 100%|██████████| 500/500 [02:53<00:00,  2.88episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -988.76


Training Progress: 100%|██████████| 500/500 [02:16<00:00,  3.66episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [01:37<00:00,  5.13episode/s, Episode Reward=-58]  


Nuevo mejor modelo guardado con test_value: -945.52


Training Progress: 100%|██████████| 500/500 [00:42<00:00, 11.73episode/s, Episode Reward=-166]
Training Progress: 100%|██████████| 500/500 [00:32<00:00, 15.21episode/s, Episode Reward=-46]  
Training Progress: 100%|██████████| 500/500 [00:17<00:00, 28.27episode/s, Episode Reward=-38] 
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 71.65episode/s, Episode Reward=-73] 
Training Progress: 100%|██████████| 500/500 [00:09<00:00, 55.10episode/s, Episode Reward=-64]  
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 86.40episode/s, Episode Reward=19]  
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 128.53episode/s, Episode Reward=25] 


VBox(children=(Label(value='0.164 MB of 0.164 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▂▂▂▂█▃▄▄▁▂▄▄
train_value,▁▁▂▃▄▆▆▇█▇██

0,1
alpha,0.04598
epsilon,0.0
t,11.0
test_value,-976.88
train_value,16.506


[34m[1mwandb[0m: Agent Starting Run: btbe95wu with config:
[34m[1mwandb[0m: 	action_bins: 22
[34m[1mwandb[0m: 	alpha: 0.9775377934817232
[34m[1mwandb[0m: 	epsilon: 0.19937190204632535
[34m[1mwandb[0m: 	gamma: 0.7019044509286022
[34m[1mwandb[0m: 	vel_bins: 84
[34m[1mwandb[0m: 	x_bins: 42


Training Progress: 100%|██████████| 500/500 [00:15<00:00, 33.12episode/s, Episode Reward=-384] 


Nuevo mejor modelo guardado con test_value: -562.56


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 66.53episode/s, Episode Reward=-251] 


Nuevo mejor modelo guardado con test_value: -367.54


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 98.43episode/s, Episode Reward=-146] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 115.77episode/s, Episode Reward=-137]


Nuevo mejor modelo guardado con test_value: -357.3


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 131.37episode/s, Episode Reward=-103]


Nuevo mejor modelo guardado con test_value: -349.9


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 157.87episode/s, Episode Reward=-176]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 165.28episode/s, Episode Reward=-60] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 160.33episode/s, Episode Reward=-22] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 142.18episode/s, Episode Reward=-48] 


Nuevo mejor modelo guardado con test_value: -327.62


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 163.60episode/s, Episode Reward=7]  


Nuevo mejor modelo guardado con test_value: -280.12


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 236.02episode/s, Episode Reward=-20]


Nuevo mejor modelo guardado con test_value: -277.58


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 234.33episode/s, Episode Reward=-41]


Nuevo mejor modelo guardado con test_value: -216.1


VBox(children=(Label(value='0.622 MB of 0.622 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▅▅▅▅▅▅▅▆▇▇█
train_value,▁▅▆▇▇▇██████

0,1
alpha,0.08146
epsilon,0.0
t,11.0
test_value,-216.1
train_value,-17.338


[34m[1mwandb[0m: Agent Starting Run: oun4db39 with config:
[34m[1mwandb[0m: 	action_bins: 27
[34m[1mwandb[0m: 	alpha: 0.988723446997306
[34m[1mwandb[0m: 	epsilon: 0.33735289911748617
[34m[1mwandb[0m: 	gamma: 0.6893401110833018
[34m[1mwandb[0m: 	vel_bins: 81
[34m[1mwandb[0m: 	x_bins: 31


Training Progress: 100%|██████████| 500/500 [00:17<00:00, 27.80episode/s, Episode Reward=-241] 


Nuevo mejor modelo guardado con test_value: -484.48


Training Progress: 100%|██████████| 500/500 [00:10<00:00, 49.24episode/s, Episode Reward=-220] 


Nuevo mejor modelo guardado con test_value: -484.28


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 65.42episode/s, Episode Reward=-279] 


Nuevo mejor modelo guardado con test_value: -372.82


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 86.25episode/s, Episode Reward=-319] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 99.35episode/s, Episode Reward=-124] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 115.80episode/s, Episode Reward=-94] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 132.30episode/s, Episode Reward=-83] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 141.77episode/s, Episode Reward=-85] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 141.60episode/s, Episode Reward=-85] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 173.28episode/s, Episode Reward=-17] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 159.37episode/s, Episode Reward=-13] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 215.88episode/s, Episode Reward=-24]


Nuevo mejor modelo guardado con test_value: -358.3


VBox(children=(Label(value='0.549 MB of 0.549 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▅▅█▅▅▆▃▁▁▃▇█
train_value,▁▄▅▆▇▇▇▇▇███

0,1
alpha,0.08239
epsilon,0.0
t,11.0
test_value,-358.3
train_value,-13.592


[34m[1mwandb[0m: Agent Starting Run: rd32qjq1 with config:
[34m[1mwandb[0m: 	action_bins: 25
[34m[1mwandb[0m: 	alpha: 0.7634333675318397
[34m[1mwandb[0m: 	epsilon: 0.8480159836629113
[34m[1mwandb[0m: 	gamma: 0.5357577403095728
[34m[1mwandb[0m: 	vel_bins: 17
[34m[1mwandb[0m: 	x_bins: 89


Training Progress: 100%|██████████| 500/500 [00:27<00:00, 18.10episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -754.3


Training Progress: 100%|██████████| 500/500 [00:31<00:00, 15.64episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -730.52


Training Progress: 100%|██████████| 500/500 [00:25<00:00, 19.81episode/s, Episode Reward=-586] 


Nuevo mejor modelo guardado con test_value: -649.9


Training Progress: 100%|██████████| 500/500 [00:18<00:00, 26.96episode/s, Episode Reward=-753] 
Training Progress: 100%|██████████| 500/500 [00:12<00:00, 39.03episode/s, Episode Reward=-389] 
Training Progress: 100%|██████████| 500/500 [00:08<00:00, 60.91episode/s, Episode Reward=-287]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 83.95episode/s, Episode Reward=-198] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 102.33episode/s, Episode Reward=-70] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 118.42episode/s, Episode Reward=-89] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 113.03episode/s, Episode Reward=-119]


Nuevo mejor modelo guardado con test_value: -645.14


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 154.51episode/s, Episode Reward=-61] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 149.14episode/s, Episode Reward=-84] 


Nuevo mejor modelo guardado con test_value: -433.94


VBox(children=(Label(value='0.318 MB of 0.318 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▂▃▃▃▃▃▃▂▃▃█
train_value,▁▁▁▃▅▆▇█████

0,1
alpha,0.06362
epsilon,0.0
t,11.0
test_value,-433.94
train_value,-67.57


[34m[1mwandb[0m: Agent Starting Run: hj0hdqtg with config:
[34m[1mwandb[0m: 	action_bins: 23
[34m[1mwandb[0m: 	alpha: 0.9860554525990316
[34m[1mwandb[0m: 	epsilon: 0.12124843666402044
[34m[1mwandb[0m: 	gamma: 0.7381738418774886
[34m[1mwandb[0m: 	vel_bins: 76
[34m[1mwandb[0m: 	x_bins: 38


Training Progress: 100%|██████████| 500/500 [00:15<00:00, 31.72episode/s, Episode Reward=-618] 


Nuevo mejor modelo guardado con test_value: -550.4


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 75.30episode/s, Episode Reward=-179]


Nuevo mejor modelo guardado con test_value: -518.36


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 97.22episode/s, Episode Reward=-145] 


Nuevo mejor modelo guardado con test_value: -447.4


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 116.48episode/s, Episode Reward=-135]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 145.08episode/s, Episode Reward=-50] 


Nuevo mejor modelo guardado con test_value: -437.06


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 153.20episode/s, Episode Reward=-84] 


Nuevo mejor modelo guardado con test_value: -378.58


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 158.07episode/s, Episode Reward=-23] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 175.64episode/s, Episode Reward=-16] 


Nuevo mejor modelo guardado con test_value: -343.56


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 190.80episode/s, Episode Reward=-12] 


Nuevo mejor modelo guardado con test_value: -317.74


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 207.76episode/s, Episode Reward=-15] 


Nuevo mejor modelo guardado con test_value: -298.06


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 234.19episode/s, Episode Reward=-2] 


Nuevo mejor modelo guardado con test_value: -224.72


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 261.08episode/s, Episode Reward=-9] 


VBox(children=(Label(value='0.224 MB of 0.536 MB uploaded\r'), FloatProgress(value=0.41872482373275377, max=1.…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▂▃▂▃▅▄▅▆▆██
train_value,▁▅▆▇▇▇▇▇████

0,1
alpha,0.08217
epsilon,0.0
t,11.0
test_value,-226.78
train_value,-0.91


[34m[1mwandb[0m: Agent Starting Run: 7rjfj9kk with config:
[34m[1mwandb[0m: 	action_bins: 23
[34m[1mwandb[0m: 	alpha: 0.8881985081387486
[34m[1mwandb[0m: 	epsilon: 0.11995497092640547
[34m[1mwandb[0m: 	gamma: 0.6721120678086179
[34m[1mwandb[0m: 	vel_bins: 70
[34m[1mwandb[0m: 	x_bins: 17


Training Progress: 100%|██████████| 500/500 [00:14<00:00, 34.58episode/s, Episode Reward=-162] 


Nuevo mejor modelo guardado con test_value: -612.74


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 72.25episode/s, Episode Reward=-174] 
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 75.02episode/s, Episode Reward=-253] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 102.44episode/s, Episode Reward=-74] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 106.12episode/s, Episode Reward=-78] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 130.33episode/s, Episode Reward=-55] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 154.11episode/s, Episode Reward=-61] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 150.33episode/s, Episode Reward=-81] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 172.46episode/s, Episode Reward=-9]  


Nuevo mejor modelo guardado con test_value: -571.64


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 164.59episode/s, Episode Reward=-52] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 164.42episode/s, Episode Reward=17] 


Nuevo mejor modelo guardado con test_value: -535.34


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 190.38episode/s, Episode Reward=-41]


Nuevo mejor modelo guardado con test_value: -497.12


VBox(children=(Label(value='0.233 MB of 0.233 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▅▅▄▄▁▃▄▄▆▅▇█
train_value,▁▅▅▆▇▇██████

0,1
alpha,0.07402
epsilon,0.0
t,11.0
test_value,-497.12
train_value,-32.298


[34m[1mwandb[0m: Agent Starting Run: nipnxbrw with config:
[34m[1mwandb[0m: 	action_bins: 94
[34m[1mwandb[0m: 	alpha: 0.9819271632473108
[34m[1mwandb[0m: 	epsilon: 0.8881188511770913
[34m[1mwandb[0m: 	gamma: 0.9317838734893684
[34m[1mwandb[0m: 	vel_bins: 16
[34m[1mwandb[0m: 	x_bins: 91


Training Progress: 100%|██████████| 500/500 [00:54<00:00,  9.17episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -988.9


Training Progress: 100%|██████████| 500/500 [00:56<00:00,  8.84episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -958.88


Training Progress: 100%|██████████| 500/500 [00:57<00:00,  8.76episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -957.86


Training Progress: 100%|██████████| 500/500 [00:54<00:00,  9.11episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -957.36


Training Progress: 100%|██████████| 500/500 [00:51<00:00,  9.71episode/s, Episode Reward=-429] 
Training Progress: 100%|██████████| 500/500 [00:27<00:00, 18.22episode/s, Episode Reward=-370] 


Nuevo mejor modelo guardado con test_value: -947.34


Training Progress: 100%|██████████| 500/500 [00:18<00:00, 27.70episode/s, Episode Reward=-141]
Training Progress: 100%|██████████| 500/500 [00:16<00:00, 29.60episode/s, Episode Reward=-138]
Training Progress: 100%|██████████| 500/500 [00:13<00:00, 36.13episode/s, Episode Reward=-65] 
Training Progress: 100%|██████████| 500/500 [00:11<00:00, 44.75episode/s, Episode Reward=-134]
Training Progress: 100%|██████████| 500/500 [00:09<00:00, 54.52episode/s, Episode Reward=-61] 
Training Progress: 100%|██████████| 500/500 [00:07<00:00, 66.57episode/s, Episode Reward=-41] 


Nuevo mejor modelo guardado con test_value: -946.8


VBox(children=(Label(value='0.755 MB of 1.132 MB uploaded\r'), FloatProgress(value=0.66762438948331, max=1.0))…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▆▆▆▅█▆▂▄▄▄█
train_value,▁▁▁▁▂▅▇▇▇▇██

0,1
alpha,0.08183
epsilon,0.0
t,11.0
test_value,-946.8
train_value,-88.414


[34m[1mwandb[0m: Agent Starting Run: xyp8eqfz with config:
[34m[1mwandb[0m: 	action_bins: 21
[34m[1mwandb[0m: 	alpha: 0.9472113732441874
[34m[1mwandb[0m: 	epsilon: 0.47855085545643905
[34m[1mwandb[0m: 	gamma: 0.8097952501347535
[34m[1mwandb[0m: 	vel_bins: 69
[34m[1mwandb[0m: 	x_bins: 65


Training Progress: 100%|██████████| 500/500 [00:15<00:00, 31.53episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -624.02


Training Progress: 100%|██████████| 500/500 [00:10<00:00, 48.84episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -379.14


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 76.18episode/s, Episode Reward=-399]


Nuevo mejor modelo guardado con test_value: -320.44


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 96.84episode/s, Episode Reward=-241] 


Nuevo mejor modelo guardado con test_value: -254.7


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 130.18episode/s, Episode Reward=-137]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 138.25episode/s, Episode Reward=-112]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 166.52episode/s, Episode Reward=-79] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 164.23episode/s, Episode Reward=-99] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 186.72episode/s, Episode Reward=-31] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 189.81episode/s, Episode Reward=-22] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 221.73episode/s, Episode Reward=-12] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 241.00episode/s, Episode Reward=13] 


VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▆▇██▇▇▇▇▆▆▇
train_value,▁▄▆▆▇▇▇█████

0,1
alpha,0.07893
epsilon,0.0
t,11.0
test_value,-293.14
train_value,-12.718


[34m[1mwandb[0m: Agent Starting Run: yz8uazs8 with config:
[34m[1mwandb[0m: 	action_bins: 22
[34m[1mwandb[0m: 	alpha: 0.9530265049041384
[34m[1mwandb[0m: 	epsilon: 0.26044899297005825
[34m[1mwandb[0m: 	gamma: 0.7787405882156802
[34m[1mwandb[0m: 	vel_bins: 73
[34m[1mwandb[0m: 	x_bins: 39


Training Progress: 100%|██████████| 500/500 [00:15<00:00, 33.02episode/s, Episode Reward=-459] 


Nuevo mejor modelo guardado con test_value: -557.52


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 66.49episode/s, Episode Reward=-274] 


Nuevo mejor modelo guardado con test_value: -445.46


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 102.61episode/s, Episode Reward=-193]


Nuevo mejor modelo guardado con test_value: -373.28


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 114.97episode/s, Episode Reward=-107]


Nuevo mejor modelo guardado con test_value: -372.72


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 133.32episode/s, Episode Reward=-70] 


Nuevo mejor modelo guardado con test_value: -349.88


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 127.58episode/s, Episode Reward=-90] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 117.11episode/s, Episode Reward=-16] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 124.85episode/s, Episode Reward=-17] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 174.81episode/s, Episode Reward=-7]  
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 207.91episode/s, Episode Reward=-19]
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 178.32episode/s, Episode Reward=-14]


Nuevo mejor modelo guardado con test_value: -283.56


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 234.80episode/s, Episode Reward=-15]


Nuevo mejor modelo guardado con test_value: -243.98


VBox(children=(Label(value='0.302 MB of 0.506 MB uploaded\r'), FloatProgress(value=0.5981466214138426, max=1.0…

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▄▅▅▆▅▂▄▅▅▇█
train_value,▁▅▆▇▇▇▇█████

0,1
alpha,0.07942
epsilon,0.0
t,11.0
test_value,-243.98
train_value,-13.7


[34m[1mwandb[0m: Agent Starting Run: byrohj4p with config:
[34m[1mwandb[0m: 	action_bins: 21
[34m[1mwandb[0m: 	alpha: 0.9437996520742048
[34m[1mwandb[0m: 	epsilon: 0.694778371671365
[34m[1mwandb[0m: 	gamma: 0.7565096009199133
[34m[1mwandb[0m: 	vel_bins: 96
[34m[1mwandb[0m: 	x_bins: 35


Training Progress: 100%|██████████| 500/500 [00:16<00:00, 30.90episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -411.62


Training Progress: 100%|██████████| 500/500 [00:14<00:00, 33.88episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:14<00:00, 34.97episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -345.04


Training Progress: 100%|██████████| 500/500 [00:10<00:00, 47.68episode/s, Episode Reward=-589] 


Nuevo mejor modelo guardado con test_value: -315.2


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 74.95episode/s, Episode Reward=-226] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 100.05episode/s, Episode Reward=-195]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 106.62episode/s, Episode Reward=-115]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 114.78episode/s, Episode Reward=-100]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 132.09episode/s, Episode Reward=-206]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 141.47episode/s, Episode Reward=-66] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 146.23episode/s, Episode Reward=-44] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 155.53episode/s, Episode Reward=-74] 


VBox(children=(Label(value='0.568 MB of 0.568 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▅▄▇█▅▄▁▃▄▂▅█
train_value,▁▂▂▄▆▇▇▇████

0,1
alpha,0.07865
epsilon,0.0
t,11.0
test_value,-321.6
train_value,-73.746


[34m[1mwandb[0m: Agent Starting Run: 46wu4zns with config:
[34m[1mwandb[0m: 	action_bins: 24
[34m[1mwandb[0m: 	alpha: 0.9712991434679892
[34m[1mwandb[0m: 	epsilon: 0.16069336715477203
[34m[1mwandb[0m: 	gamma: 0.7570291097379525
[34m[1mwandb[0m: 	vel_bins: 45
[34m[1mwandb[0m: 	x_bins: 77


Training Progress: 100%|██████████| 500/500 [00:16<00:00, 29.73episode/s, Episode Reward=-415] 


Nuevo mejor modelo guardado con test_value: -653.66


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 62.33episode/s, Episode Reward=-329]


Nuevo mejor modelo guardado con test_value: -560.22


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 69.60episode/s, Episode Reward=-133]


Nuevo mejor modelo guardado con test_value: -496.6


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 103.54episode/s, Episode Reward=-163]


Nuevo mejor modelo guardado con test_value: -432.6


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 120.29episode/s, Episode Reward=-225]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 134.16episode/s, Episode Reward=-135]


Nuevo mejor modelo guardado con test_value: -414.48


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 142.85episode/s, Episode Reward=-75] 


Nuevo mejor modelo guardado con test_value: -400.04


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 134.95episode/s, Episode Reward=-42] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 181.74episode/s, Episode Reward=-61] 


Nuevo mejor modelo guardado con test_value: -355.26


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 183.88episode/s, Episode Reward=-46] 


Nuevo mejor modelo guardado con test_value: -299.72


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 204.28episode/s, Episode Reward=-15]


Nuevo mejor modelo guardado con test_value: -289.22


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 237.01episode/s, Episode Reward=13] 


Nuevo mejor modelo guardado con test_value: -252.84


VBox(children=(Label(value='0.666 MB of 0.666 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▃▄▅▅▅▅▅▆▇▇█
train_value,▁▅▆▇▇▇▇▇████

0,1
alpha,0.08094
epsilon,0.0
t,11.0
test_value,-252.84
train_value,-8.788


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: tsdd7i7k with config:
[34m[1mwandb[0m: 	action_bins: 20
[34m[1mwandb[0m: 	alpha: 0.9391278410882882
[34m[1mwandb[0m: 	epsilon: 0.7298175211054873
[34m[1mwandb[0m: 	gamma: 0.6857836259018524
[34m[1mwandb[0m: 	vel_bins: 52
[34m[1mwandb[0m: 	x_bins: 43


Training Progress: 100%|██████████| 500/500 [00:15<00:00, 31.84episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -408.32


Training Progress: 100%|██████████| 500/500 [00:14<00:00, 35.23episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -298.44


Training Progress: 100%|██████████| 500/500 [00:09<00:00, 50.29episode/s, Episode Reward=-459] 
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 73.38episode/s, Episode Reward=-271] 


Nuevo mejor modelo guardado con test_value: -262.22


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 108.11episode/s, Episode Reward=-206]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 123.91episode/s, Episode Reward=-102]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 140.98episode/s, Episode Reward=-53] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 159.77episode/s, Episode Reward=-43] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 187.37episode/s, Episode Reward=-52] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 158.30episode/s, Episode Reward=-85] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 170.72episode/s, Episode Reward=-75] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 191.97episode/s, Episode Reward=-42] 


VBox(children=(Label(value='0.364 MB of 0.364 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,█▇▇▆▅▅▄▄▃▂▂▁
epsilon,██▇▆▅▃▂▁▁▁▁▁
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▅▇▇█▆▅▄▁▁▃▂▄
train_value,▁▂▄▆▇▇██████

0,1
alpha,0.07826
epsilon,0.0
t,11.0
test_value,-429.2
train_value,-46.098


In [16]:
import pickle
from continuous_mountain_car_env_extended import ContinuousMountainCarEnvExtended
from car_model import Car
from mountain_car_agent import MountainCarAgent

# Cargar el agente desde el archivo
with open('mountain_car_stochastic_agent_v2.pkl', 'rb') as f:
    mountain_car_stochastic_agent = pickle.load(f)

with open('mountain_car_agent.pkl', 'rb') as f:
    loaded_agent = pickle.load(f)
    mountain_car_agent = loaded_agent['agent']

# Probamos los agentes
num_test_episodes = 100
average_test_reward = mountain_car_agent.test(num_test_episodes)
average_test_reward_stochastic = mountain_car_stochastic_agent.test(num_test_episodes)
print(f"Average test reward over {num_test_episodes} episodes for stochastic agent: {average_test_reward_stochastic}")
print(f"Average test reward over {num_test_episodes} episodes for agent: {average_test_reward}")

Average test reward over 100 episodes for stochastic agent: -204.39
Average test reward over 100 episodes for agent: -3.17


In [1]:
import wandb
import pickle
from car_model import Car
from stochastic_mountain_car_agent import StochasticMountainCarAgent
from continuous_mountain_car_env_extended import ContinuousMountainCarEnvExtended

def sweep_stochastic_v2():
    wandb.init()
    config = wandb.config

    vel_bins = 58
    x_bins = 58

    env = ContinuousMountainCarEnvExtended(render_mode='rgb_array')
    model = Car(env, x_bins, vel_bins, config.action_bins)

    log_sample_size = int(np.ceil(np.log(config.action_bins)))
    agent = StochasticMountainCarAgent(model, config.alpha, config.gamma, log_sample_size)

    # epsilon base y alpha base para el decay
    base_epsilon = config.epsilon
    base_alpha = config.alpha

    best_test_value = -np.inf
    best_model_path = "mountain_car_stochastic_agent_v2.pkl"


    for t in range(20):
        # alpha decay
        alpha = base_alpha * (1 - t / 20)
        agent.alpha = alpha

        base_epsilon = base_epsilon * (1 - t / 20) 

        train_value = agent.train(500, base_epsilon)
        test_value = agent.test(50)
        
        wandb.log({
            "train_value": train_value, 
            "test_value": test_value, 
            "epsilon": base_epsilon, 
            "alpha": alpha, 
            "t": t
        })

        if test_value > best_test_value:
            best_test_value = test_value
            with open(best_model_path, "wb") as f:
                pickle.dump(agent, f)
            print(f"Nuevo mejor modelo guardado con test_value: {best_test_value}")

    artifact = wandb.Artifact("best_model", type="model")
    artifact.add_file(best_model_path)
    wandb.log_artifact(artifact)

sweep_stochastic_confi = {
    'name': 'bayesian-sweep-stochastic-q-learning-alpha-decay-v2',
    'method': 'bayes',
    'metric': {
        'name': 'test_value',
        'goal': 'maximize'
    },
    'parameters': {
        'alpha': {
            'distribution': 'uniform',
            'min': 0.83176,
            'max': 0.98606
        },
        'epsilon': {
            'distribution': 'uniform',
            'min': 0.5,
            'max': 0.9
        },
        'gamma': {
            'distribution': 'uniform',
            'min': 0.5,
            'max': 0.99
        },
        'action_bins': {
            'distribution': 'int_uniform',
            'min': 21,
            'max': 25
        },
    }
}

entity = "mateogiraz27-ort"
project = "mountain_car"
sweep_id = wandb.sweep(sweep_stochastic_confi, entity=entity, project=project)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Create sweep with ID: xmht62ar
Sweep URL: https://wandb.ai/mateogiraz27-ort/mountain_car/sweeps/xmht62ar


In [3]:
sweep_id = "xmht62ar"
wandb.agent(sweep_id, function=sweep_stochastic_v2, count=50, entity=entity, project=project)

[34m[1mwandb[0m: Agent Starting Run: uiep085m with config:
[34m[1mwandb[0m: 	action_bins: 23
[34m[1mwandb[0m: 	alpha: 0.8615490507982176
[34m[1mwandb[0m: 	epsilon: 0.5230815619024021
[34m[1mwandb[0m: 	gamma: 0.6498593380017677
[34m[1mwandb[0m: Currently logged in as: [33mmateogiraz27[0m ([33mmateogiraz27-ort[0m). Use [1m`wandb login --relogin`[0m to force relogin


Training Progress: 100%|██████████| 500/500 [00:16<00:00, 30.00episode/s, Episode Reward=-564] 


Nuevo mejor modelo guardado con test_value: -375.58


Training Progress: 100%|██████████| 500/500 [00:10<00:00, 48.47episode/s, Episode Reward=-370] 


Nuevo mejor modelo guardado con test_value: -297.46


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 69.06episode/s, Episode Reward=-234]


Nuevo mejor modelo guardado con test_value: -250.16


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 86.15episode/s, Episode Reward=-176] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 87.11episode/s, Episode Reward=-132] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 111.77episode/s, Episode Reward=-142]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 119.93episode/s, Episode Reward=-113]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 131.91episode/s, Episode Reward=-64] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 138.48episode/s, Episode Reward=-65] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 114.49episode/s, Episode Reward=-68] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 142.40episode/s, Episode Reward=-86] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 195.98episode/s, Episode Reward=-47] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 176.32episode/s, Episode Reward=13]  
Training Progress: 100%|██████████| 500/

Nuevo mejor modelo guardado con test_value: -230.94


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 197.38episode/s, Episode Reward=27] 


Nuevo mejor modelo guardado con test_value: -220.82


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 221.90episode/s, Episode Reward=25] 


Nuevo mejor modelo guardado con test_value: -214.26


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 225.76episode/s, Episode Reward=29] 


Nuevo mejor modelo guardado con test_value: -174.96


VBox(children=(Label(value='0.620 MB of 0.620 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,██▇▇▇▆▆▅▅▅▄▄▄▃▃▂▂▂▁▁
epsilon,██▇▆▅▄▃▂▂▁▁▁▁▁▁▁▁▁▁▁
t,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_value,▃▅▆▅▆▅▅▃▁▁▃▂▄▄▅▅▆▇▇█
train_value,▁▄▆▆▇▇▇▇▇▇██████████

0,1
alpha,0.04308
epsilon,0.0
t,19.0
test_value,-174.96
train_value,4.962


[34m[1mwandb[0m: Agent Starting Run: 6mzvxv2q with config:
[34m[1mwandb[0m: 	action_bins: 25
[34m[1mwandb[0m: 	alpha: 0.9432282860102849
[34m[1mwandb[0m: 	epsilon: 0.886450234665426
[34m[1mwandb[0m: 	gamma: 0.8418301539104027


Training Progress: 100%|██████████| 500/500 [00:18<00:00, 26.67episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -803.72


Training Progress: 100%|██████████| 500/500 [00:17<00:00, 29.14episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -491.14


Training Progress: 100%|██████████| 500/500 [00:18<00:00, 26.77episode/s, Episode Reward=-658] 
Training Progress: 100%|██████████| 500/500 [00:17<00:00, 28.59episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:13<00:00, 38.14episode/s, Episode Reward=-193] 


Nuevo mejor modelo guardado con test_value: -374.72


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 63.38episode/s, Episode Reward=-214] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 86.82episode/s, Episode Reward=-202] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 101.25episode/s, Episode Reward=-149]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 111.67episode/s, Episode Reward=-146]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 112.43episode/s, Episode Reward=-59] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 125.66episode/s, Episode Reward=-183]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 127.38episode/s, Episode Reward=-71] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 127.80episode/s, Episode Reward=-61] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 137.94episode/s, Episode Reward=-54] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 170.03episode/s, Episode Reward=-132]
Training Progress: 100%|██████████| 500/

Nuevo mejor modelo guardado con test_value: -366.86


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 285.38episode/s, Episode Reward=21] 


Nuevo mejor modelo guardado con test_value: -334.56


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 280.28episode/s, Episode Reward=26] 


Nuevo mejor modelo guardado con test_value: -307.18


VBox(children=(Label(value='0.674 MB of 0.674 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,██▇▇▇▆▆▅▅▅▄▄▄▃▃▂▂▂▁▁
epsilon,██▇▆▅▄▃▂▂▁▁▁▁▁▁▁▁▁▁▁
t,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_value,▁▅▃▄▇▅▇▅▅▅▅▅▄▆▅▆▇▇██
train_value,▁▁▂▂▄▆▆▇▇▇▇▇▇▇██████

0,1
alpha,0.04716
epsilon,0.0
t,19.0
test_value,-307.18
train_value,16.83


[34m[1mwandb[0m: Agent Starting Run: 0e8gbl4g with config:
[34m[1mwandb[0m: 	action_bins: 25
[34m[1mwandb[0m: 	alpha: 0.9273367895717196
[34m[1mwandb[0m: 	epsilon: 0.8797653309042702
[34m[1mwandb[0m: 	gamma: 0.9184641530361164


Training Progress: 100%|██████████| 500/500 [00:18<00:00, 27.55episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -634.5


Training Progress: 100%|██████████| 500/500 [00:16<00:00, 30.45episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:18<00:00, 27.28episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:19<00:00, 25.71episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:19<00:00, 25.08episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:16<00:00, 30.27episode/s, Episode Reward=-423] 
Training Progress: 100%|██████████| 500/500 [00:09<00:00, 50.14episode/s, Episode Reward=-353] 
Training Progress: 100%|██████████| 500/500 [00:07<00:00, 69.91episode/s, Episode Reward=-225] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 97.42episode/s, Episode Reward=-136] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 120.66episode/s, Episode Reward=-147]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 124.65episode/s, Episode Reward=-64] 
Training Progress: 100%|██████████| 500/

Nuevo mejor modelo guardado con test_value: -580.98


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 222.17episode/s, Episode Reward=18] 


Nuevo mejor modelo guardado con test_value: -498.8


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 234.68episode/s, Episode Reward=16] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 230.36episode/s, Episode Reward=17] 


Nuevo mejor modelo guardado con test_value: -495.02


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 254.43episode/s, Episode Reward=19] 


Nuevo mejor modelo guardado con test_value: -492.6


VBox(children=(Label(value='0.673 MB of 0.673 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,██▇▇▇▆▆▅▅▅▄▄▄▃▃▂▂▂▁▁
epsilon,██▇▆▅▄▃▂▂▁▁▁▁▁▁▁▁▁▁▁
t,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_value,▆▁▂▂▃▄▄▅▂▃▅▄▅▆▆▇█▇██
train_value,▁▁▁▁▂▃▅▆▇▇▇▇▇███████

0,1
alpha,0.04637
epsilon,0.0
t,19.0
test_value,-492.6
train_value,12.748


[34m[1mwandb[0m: Agent Starting Run: 8g7624ht with config:
[34m[1mwandb[0m: 	action_bins: 25
[34m[1mwandb[0m: 	alpha: 0.8558178479992982
[34m[1mwandb[0m: 	epsilon: 0.7174019332731605
[34m[1mwandb[0m: 	gamma: 0.8654273453365877


Training Progress: 100%|██████████| 500/500 [00:18<00:00, 26.96episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -536.68


Training Progress: 100%|██████████| 500/500 [00:16<00:00, 29.57episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:18<00:00, 26.96episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:15<00:00, 33.09episode/s, Episode Reward=-602] 


Nuevo mejor modelo guardado con test_value: -522.18


Training Progress: 100%|██████████| 500/500 [00:09<00:00, 53.04episode/s, Episode Reward=-725] 


Nuevo mejor modelo guardado con test_value: -446.76


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 72.35episode/s, Episode Reward=-555] 


Nuevo mejor modelo guardado con test_value: -441.86


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 97.79episode/s, Episode Reward=-151] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 104.56episode/s, Episode Reward=-86] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 116.13episode/s, Episode Reward=-196]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 120.38episode/s, Episode Reward=-66] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 139.91episode/s, Episode Reward=-73] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 134.65episode/s, Episode Reward=-50] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 138.89episode/s, Episode Reward=-59] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 170.60episode/s, Episode Reward=-62] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 180.45episode/s, Episode Reward=12]  
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 179.31episode/s, Episode Reward=15] 
Training Progress: 100%|██████████| 500/5

Nuevo mejor modelo guardado con test_value: -396.84


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 249.45episode/s, Episode Reward=23] 


Nuevo mejor modelo guardado con test_value: -349.4


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 255.67episode/s, Episode Reward=27] 


Nuevo mejor modelo guardado con test_value: -307.42


VBox(children=(Label(value='0.381 MB of 0.674 MB uploaded\r'), FloatProgress(value=0.5661642276238589, max=1.0…

0,1
alpha,██▇▇▇▆▆▅▅▅▄▄▄▃▃▂▂▂▁▁
epsilon,██▇▆▅▄▃▂▂▁▁▁▁▁▁▁▁▁▁▁
t,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_value,▅▁▂▅▆▆▆▃▄▃▂▄▅▄▄▅▆▇▇█
train_value,▁▂▂▃▅▆▇▇▇▇▇▇▇███████

0,1
alpha,0.04279
epsilon,0.0
t,19.0
test_value,-307.42
train_value,11.08


[34m[1mwandb[0m: Agent Starting Run: kug18w0h with config:
[34m[1mwandb[0m: 	action_bins: 23
[34m[1mwandb[0m: 	alpha: 0.847599110673173
[34m[1mwandb[0m: 	epsilon: 0.7590122717876924
[34m[1mwandb[0m: 	gamma: 0.6749533997094211


Training Progress: 100%|██████████| 500/500 [00:16<00:00, 30.74episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -471.54


Training Progress: 100%|██████████| 500/500 [00:14<00:00, 35.56episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -215.72


Training Progress: 100%|██████████| 500/500 [00:10<00:00, 48.79episode/s, Episode Reward=-481] 


Nuevo mejor modelo guardado con test_value: -209.24


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 71.04episode/s, Episode Reward=-174] 


Nuevo mejor modelo guardado con test_value: -184.02


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 95.53episode/s, Episode Reward=-213] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 112.46episode/s, Episode Reward=-191]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 119.81episode/s, Episode Reward=-101]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 127.60episode/s, Episode Reward=-99] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 137.40episode/s, Episode Reward=-71] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 129.70episode/s, Episode Reward=-138]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 152.54episode/s, Episode Reward=-26] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 165.38episode/s, Episode Reward=-21] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 176.63episode/s, Episode Reward=-13] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 188.14episode/s, Episode Reward=-47] 
Training Progress: 100%|██████████| 500/

VBox(children=(Label(value='0.620 MB of 0.620 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,██▇▇▇▆▆▅▅▅▄▄▄▃▃▂▂▂▁▁
epsilon,██▇▆▅▄▃▂▂▁▁▁▁▁▁▁▁▁▁▁
t,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_value,▁▇▇██▆▅▄▃▂▃▃▄▅▄▅▅▆▇▇
train_value,▁▂▄▆▇▇▇▇▇▇██████████

0,1
alpha,0.04238
epsilon,0.0
t,19.0
test_value,-222.92
train_value,-2.07


[34m[1mwandb[0m: Agent Starting Run: u4q4vkpq with config:
[34m[1mwandb[0m: 	action_bins: 24
[34m[1mwandb[0m: 	alpha: 0.8646841683995545
[34m[1mwandb[0m: 	epsilon: 0.5344507970363882
[34m[1mwandb[0m: 	gamma: 0.6654122869440569


Training Progress: 100%|██████████| 500/500 [00:17<00:00, 29.31episode/s, Episode Reward=-658] 


Nuevo mejor modelo guardado con test_value: -459.84


Training Progress: 100%|██████████| 500/500 [00:11<00:00, 45.32episode/s, Episode Reward=-641] 


Nuevo mejor modelo guardado con test_value: -298.6


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 66.24episode/s, Episode Reward=-181]


Nuevo mejor modelo guardado con test_value: -260.34


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 77.79episode/s, Episode Reward=-237] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 100.06episode/s, Episode Reward=-143]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 110.66episode/s, Episode Reward=-59] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 124.60episode/s, Episode Reward=-66] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 136.02episode/s, Episode Reward=-104]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 132.06episode/s, Episode Reward=-87] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 151.37episode/s, Episode Reward=-53] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 160.85episode/s, Episode Reward=-51] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 169.86episode/s, Episode Reward=-57] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 205.73episode/s, Episode Reward=-43]
Training Progress: 100%|██████████| 500/5

Nuevo mejor modelo guardado con test_value: -248.14


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 244.71episode/s, Episode Reward=-20]


Nuevo mejor modelo guardado con test_value: -221.52


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 256.71episode/s, Episode Reward=23] 


Nuevo mejor modelo guardado con test_value: -200.36


VBox(children=(Label(value='0.647 MB of 0.647 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
alpha,██▇▇▇▆▆▅▅▅▄▄▄▃▃▂▂▂▁▁
epsilon,██▇▆▅▄▃▂▂▁▁▁▁▁▁▁▁▁▁▁
t,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_value,▁▅▆▆▆▅▄▃▃▄▁▂▂▅▃▄▅▇▇█
train_value,▁▄▅▆▇▇▇▇▇▇██████████

0,1
alpha,0.04323
epsilon,0.0
t,19.0
test_value,-200.36
train_value,6.188


[34m[1mwandb[0m: Agent Starting Run: huxp7ql8 with config:
[34m[1mwandb[0m: 	action_bins: 24
[34m[1mwandb[0m: 	alpha: 0.873503339285943
[34m[1mwandb[0m: 	epsilon: 0.5462054891744315
[34m[1mwandb[0m: 	gamma: 0.6288998353059121


Training Progress: 100%|██████████| 500/500 [00:16<00:00, 29.68episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -494.64


Training Progress: 100%|██████████| 500/500 [00:11<00:00, 43.22episode/s, Episode Reward=-621] 


Nuevo mejor modelo guardado con test_value: -325.44


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 62.51episode/s, Episode Reward=-454]


Nuevo mejor modelo guardado con test_value: -260.92


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 69.04episode/s, Episode Reward=-227]


Nuevo mejor modelo guardado con test_value: -253.42


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 90.04episode/s, Episode Reward=-103] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 94.91episode/s, Episode Reward=-94]  
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 107.90episode/s, Episode Reward=-57] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 134.70episode/s, Episode Reward=-92] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 137.98episode/s, Episode Reward=-8]  
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 125.00episode/s, Episode Reward=-104]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 140.87episode/s, Episode Reward=-105]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 125.98episode/s, Episode Reward=-49] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 172.35episode/s, Episode Reward=-50] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 170.39episode/s, Episode Reward=-3]  
Training Progress: 100%|██████████| 500/

Nuevo mejor modelo guardado con test_value: -232.72


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 225.92episode/s, Episode Reward=25] 


Nuevo mejor modelo guardado con test_value: -210.56


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 181.04episode/s, Episode Reward=23] 


Nuevo mejor modelo guardado con test_value: -195.98


VBox(children=(Label(value='0.491 MB of 0.647 MB uploaded\r'), FloatProgress(value=0.7584686114882144, max=1.0…

0,1
alpha,██▇▇▇▆▆▅▅▅▄▄▄▃▃▂▂▂▁▁
epsilon,██▇▆▅▄▃▂▂▁▁▁▁▁▁▁▁▁▁▁
t,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_value,▁▅▆▇▆▆▅▅▂▂▄▄▅▄▅▆▆▇██
train_value,▁▄▅▆▆▇▇▇▇▇█▇████████

0,1
alpha,0.04368
epsilon,0.0
t,19.0
test_value,-195.98
train_value,4.026


[34m[1mwandb[0m: Agent Starting Run: hkvezkf6 with config:
[34m[1mwandb[0m: 	action_bins: 24
[34m[1mwandb[0m: 	alpha: 0.8599959515220124
[34m[1mwandb[0m: 	epsilon: 0.5528012017792985
[34m[1mwandb[0m: 	gamma: 0.5894177975434567


Training Progress: 100%|██████████| 500/500 [00:18<00:00, 27.71episode/s, Episode Reward=-1000]


Nuevo mejor modelo guardado con test_value: -359.72


Training Progress: 100%|██████████| 500/500 [00:11<00:00, 43.90episode/s, Episode Reward=-318] 


Nuevo mejor modelo guardado con test_value: -271.5


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 59.21episode/s, Episode Reward=-138]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 82.78episode/s, Episode Reward=-354] 


Nuevo mejor modelo guardado con test_value: -261.02


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 95.84episode/s, Episode Reward=-103] 
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 94.82episode/s, Episode Reward=-187] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 109.26episode/s, Episode Reward=-139]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 117.70episode/s, Episode Reward=-88] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 141.75episode/s, Episode Reward=-132]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 147.39episode/s, Episode Reward=-54] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 153.26episode/s, Episode Reward=-71] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 161.87episode/s, Episode Reward=-65] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 138.74episode/s, Episode Reward=-18] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 210.46episode/s, Episode Reward=-49] 
Training Progress: 100%|██████████| 500/

Nuevo mejor modelo guardado con test_value: -250.34


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 204.09episode/s, Episode Reward=21] 


Nuevo mejor modelo guardado con test_value: -222.96


VBox(children=(Label(value='0.225 MB of 0.647 MB uploaded\r'), FloatProgress(value=0.3478181389314778, max=1.0…

0,1
alpha,██▇▇▇▆▆▅▅▅▄▄▄▃▃▂▂▂▁▁
epsilon,██▇▆▅▄▃▂▂▁▁▁▁▁▁▁▁▁▁▁
t,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_value,▄▇▇▇▇▅▅▃▄▁▂▁▂▅▄▆▆▆▇█
train_value,▁▄▅▆▇▇▇▇▇▇██████████

0,1
alpha,0.043
epsilon,0.0
t,19.0
test_value,-222.96
train_value,-3.638


[34m[1mwandb[0m: Agent Starting Run: x0mn9sl7 with config:
[34m[1mwandb[0m: 	action_bins: 24
[34m[1mwandb[0m: 	alpha: 0.8887776899822801
[34m[1mwandb[0m: 	epsilon: 0.534575587042083
[34m[1mwandb[0m: 	gamma: 0.6520503737417209


Training Progress: 100%|██████████| 500/500 [00:19<00:00, 25.31episode/s, Episode Reward=-861] 


Nuevo mejor modelo guardado con test_value: -397.38


Training Progress: 100%|██████████| 500/500 [00:11<00:00, 44.40episode/s, Episode Reward=-486] 


Nuevo mejor modelo guardado con test_value: -301.26


Training Progress: 100%|██████████| 500/500 [00:08<00:00, 59.67episode/s, Episode Reward=-250] 


Nuevo mejor modelo guardado con test_value: -286.58


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 80.02episode/s, Episode Reward=-434] 


Nuevo mejor modelo guardado con test_value: -264.16


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 99.58episode/s, Episode Reward=-163] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 107.10episode/s, Episode Reward=-69] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 107.81episode/s, Episode Reward=-145]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 113.68episode/s, Episode Reward=-128]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 119.66episode/s, Episode Reward=-97] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 117.39episode/s, Episode Reward=-53] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 157.50episode/s, Episode Reward=-40] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 167.02episode/s, Episode Reward=-12] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 198.95episode/s, Episode Reward=-40]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 145.21episode/s, Episode Reward=20] 
Training Progress: 100%|██████████| 500/50

Nuevo mejor modelo guardado con test_value: -250.14


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 177.03episode/s, Episode Reward=-11]


Nuevo mejor modelo guardado con test_value: -247.96


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 184.06episode/s, Episode Reward=-47]


Nuevo mejor modelo guardado con test_value: -223.08


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 181.14episode/s, Episode Reward=-15]


Nuevo mejor modelo guardado con test_value: -205.3


VBox(children=(Label(value='0.147 MB of 0.647 MB uploaded\r'), FloatProgress(value=0.22722187749182352, max=1.…

0,1
alpha,██▇▇▇▆▆▅▅▅▄▄▄▃▃▂▂▂▁▁
epsilon,██▇▆▅▄▃▂▂▁▁▁▁▁▁▁▁▁▁▁
t,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_value,▂▅▆▆▅▆▃▃▁▁▃▃▄▄▅▆▇▇▇█
train_value,▁▄▆▆▇▇▇▇▇▇██████████

0,1
alpha,0.04444
epsilon,0.0
t,19.0
test_value,-205.3
train_value,-17.758


[34m[1mwandb[0m: Agent Starting Run: 3ajitrn6 with config:
[34m[1mwandb[0m: 	action_bins: 23
[34m[1mwandb[0m: 	alpha: 0.8656082267061916
[34m[1mwandb[0m: 	epsilon: 0.5836129923061331
[34m[1mwandb[0m: 	gamma: 0.7213216151126555


Training Progress:  65%|██████▍   | 324/500 [00:12<00:06, 27.98episode/s, Episode Reward=-477] [34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.
Training Progress:  65%|██████▌   | 326/500 [00:12<00:10, 16.30episode/s, Episode Reward=-769] 

Training Progress:  96%|█████████▌| 479/500 [00:19<00:00, 24.93episode/s, Episode Reward=-847] Exception in thread Exception in threading.excepthook:
Exception ignored in thread started by: <bound method Thread._bootstrap of <Thread(Thread-33 (_run_job), stopped 6264041472)>>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/threading.py", line 1032, in _bootstrap
    self._bootstrap_inner()
  File "/opt/anaconda3/lib/python3.12/threading.py", line 1077, in _bootstrap_inner
    self._invoke_excepthook(self)
  File "/opt/anaconda3/lib/python3.12/threading.py", line 1391, in invoke_excepthook
    local_print("Exception in threading.excepthook:",
  File "/Users/mateogiraz/Library/Caches/pypoetry/virtualenvs/ai-project-yycSt0xa-py3.12/lib/python3.12/site-packages/ipykernel/iostream.py", line 604, in flush
    self.pub_thread.schedule(self._flush)
  File "/Users/mateogiraz/Library/Caches/pypoetry/virtualenvs/ai-project-yycSt0xa-py3.12/lib/python3.12/site-packages/ipyk