Esta notebook contiene bloques de código útiles para realizar Q-learning en el entorno "Continuous Mountain Car"

In [22]:
from tqdm import tqdm
import numpy as np
import wandb
import gym
from car_model import Car
from mountain_car_agent import MountainCarAgent

In [23]:
from continuous_mountain_car_env_extended import ContinuousMountainCarEnvExtended

# Cambiar render_mode a rgb_array para entrenar/testear
env = ContinuousMountainCarEnvExtended(render_mode='rgb_array')

In [24]:
x_bins = 20  # Number of bins for position
vel_bins = 20  # Number of bins for velocity
action_bins = 5  # Number of discrete actions to sample from
model = Car(env, x_bins, vel_bins, action_bins)

In [25]:
alpha = 0.1  # Learning rate
gamma = 0.99  # Discount factor
agent = MountainCarAgent(model, alpha, gamma)

In [26]:
# Train the agent
num_training_episodes = 1000
epsilon = 0.2
average_training_rewards = agent.train(num_training_episodes, epsilon)
print(f"Average training reward over {num_training_episodes} episodes: {average_training_rewards}")

Training Progress:   0%|          | 0/1000 [00:00<?, ?episode/s]

Training Progress: 100%|██████████| 1000/1000 [00:03<00:00, 250.65episode/s, Episode Reward=-45]

Average training reward over 1000 episodes: -161.604





In [27]:
# Evaluate the agent
num_evaluation_episodes = 100
average_evaluation_rewards = agent.test(num_evaluation_episodes)
print(f"Average evaluation reward over {num_evaluation_episodes} episodes: {average_evaluation_rewards}")

Average evaluation reward over 100 episodes: -33.56


Obtener el estado a partir de la observación

In [28]:
wandb.init(project="mountain_car",
           config={
               'x_bins': x_bins,
               'vel_bins': vel_bins,
               'action_bins': action_bins,
               'alpha': alpha,
               'gamma': gamma,
               'epsilon': epsilon,
           })
    
epsilon_initial = epsilon
for t in range(10):
    train_value = agent.train(100, epsilon_initial)
    eval_value = agent.test(30)
    wandb.log({'train_value': train_value, 'eval_value': eval_value, "t": t})
    epsilon_initial *= 0.9  # Decay epsilon over iterations

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmateogiraz27[0m ([33mmateogiraz27-ort[0m). Use [1m`wandb login --relogin`[0m to force relogin


Training Progress: 100%|██████████| 100/100 [00:00<00:00, 266.57episode/s, Episode Reward=-152]
Training Progress: 100%|██████████| 100/100 [00:00<00:00, 230.79episode/s, Episode Reward=-52] 
Training Progress: 100%|██████████| 100/100 [00:00<00:00, 309.43episode/s, Episode Reward=-84]
Training Progress: 100%|██████████| 100/100 [00:00<00:00, 385.02episode/s, Episode Reward=-63]
Training Progress: 100%|██████████| 100/100 [00:00<00:00, 224.38episode/s, Episode Reward=-184]
Training Progress: 100%|██████████| 100/100 [00:00<00:00, 332.26episode/s, Episode Reward=-46] 
Training Progress: 100%|██████████| 100/100 [00:00<00:00, 238.91episode/s, Episode Reward=-82]
Training Progress: 100%|██████████| 100/100 [00:00<00:00, 344.05episode/s, Episode Reward=-46]
Training Progress: 100%|██████████| 100/100 [00:00<00:00, 310.74episode/s, Episode Reward=-46]
Training Progress: 100%|██████████| 100/100 [00:00<00:00, 307.73episode/s, Episode Reward=-40]


In [29]:
import wandb
from car_model import Car
from mountain_car_agent import MountainCarAgent
from continuous_mountain_car_env_extended import ContinuousMountainCarEnvExtended

def sweep():
    wandb.init()
    config = wandb.config

    env = ContinuousMountainCarEnvExtended(render_mode='rgb_array')
    model = Car(env, config.x_bins, config.vel_bins, config.action_bins)
    agent = MountainCarAgent(model, config.alpha, config.gamma)

    base_epsilon = config.epsilon

    for t in range(12):
        train_value = agent.train(500, base_epsilon)
        test_value = agent.test(50)
        wandb.log({"train_value": train_value, "test_value": test_value, "t" : t})
        base_epsilon *= (10-t)/10

sweep_config = {
        'name': 'bayesian-sweep-epsilon-decay',
        'method': 'bayes',
        'metric': {
            'name': 'test_value',
            'goal': 'maximize'
        },
        'parameters': {
            'alpha': {
                'distribution': 'uniform',
                'min': 0.5,
                'max': 0.99
            },
            'epsilon': {
                'distribution': 'uniform',
                'min': 0.5,
                'max': 0.99
            },
            'gamma': {
                'distribution': 'uniform',
                'min': 0.5,
                'max': 0.99
            },
            'action_bins': {
                'distribution': 'int_uniform',
                'max': 100,
                'min': 10
            },
            'vel_bins': {
                'distribution': 'int_uniform',
                'max': 100,
                'min': 10
            },
            'x_bins': {
                'distribution': 'int_uniform',
                'max': 100,
                'min': 10
            }
        }
    }
entity = "mateogiraz27-ort"
project = "mountain_car"
sweep_id = wandb.sweep(sweep_config, entity=entity, project=project)

Create sweep with ID: kvt4c6r7
Sweep URL: https://wandb.ai/mateogiraz27-ort/mountain_car/sweeps/kvt4c6r7


In [30]:
sweep_id = "mgvkiiaf"
wandb.agent(sweep_id, function=sweep, count=100, entity=entity, project=project)

[34m[1mwandb[0m: Agent Starting Run: mtuh3h6m with config:
[34m[1mwandb[0m: 	action_bins: 10
[34m[1mwandb[0m: 	alpha: 0.5349624470038623
[34m[1mwandb[0m: 	epsilon: 0.5111950360798202
[34m[1mwandb[0m: 	gamma: 0.7586095428869182
[34m[1mwandb[0m: 	vel_bins: 47
[34m[1mwandb[0m: 	x_bins: 84


[1;34mwandb[0m: 🚀 View run [33mglamorous-butterfly-512[0m at: [34mhttps://wandb.ai/mateogiraz27-ort/mountain_car/runs/mzk13lz1[0m
[1;34mwandb[0m: Find logs at: [1;35mwandb/run-20241213_010839-mzk13lz1/logs[0m


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 89.63episode/s, Episode Reward=-757]  
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 160.53episode/s, Episode Reward=-256]
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 203.98episode/s, Episode Reward=-234]
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 252.45episode/s, Episode Reward=-148]
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 274.79episode/s, Episode Reward=-67] 
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 323.89episode/s, Episode Reward=-50] 
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 325.27episode/s, Episode Reward=-10] 
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 311.60episode/s, Episode Reward=-7]  
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 324.06episode/s, Episode Reward=-43] 
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 402.48episode/s, Episode Reward=-64] 
Training Progress: 100%|██████████| 500

0,1
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▄▄▇▄███▁▃███
train_value,▁▅▆▇▇███████

0,1
t,11.0
test_value,-33.06
train_value,-48.896


[34m[1mwandb[0m: Agent Starting Run: fga0p2i5 with config:
[34m[1mwandb[0m: 	action_bins: 17
[34m[1mwandb[0m: 	alpha: 0.6853138323766247
[34m[1mwandb[0m: 	epsilon: 0.5109640314754463
[34m[1mwandb[0m: 	gamma: 0.7399405569303439
[34m[1mwandb[0m: 	vel_bins: 75
[34m[1mwandb[0m: 	x_bins: 87


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 80.61episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 104.52episode/s, Episode Reward=-428] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 141.80episode/s, Episode Reward=-525] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 161.34episode/s, Episode Reward=-289]
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 200.12episode/s, Episode Reward=-172]
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 220.01episode/s, Episode Reward=-149]
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 244.46episode/s, Episode Reward=-156]
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 256.95episode/s, Episode Reward=-103]
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 278.53episode/s, Episode Reward=-99] 
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 284.96episode/s, Episode Reward=-89] 
Training Progress: 100%|██████████| 50

0,1
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▇▇▇▆████▇██
train_value,▁▄▅▆▇▇██████

0,1
t,11.0
test_value,-84.64
train_value,-87.74


[34m[1mwandb[0m: Agent Starting Run: gtucxebv with config:
[34m[1mwandb[0m: 	action_bins: 13
[34m[1mwandb[0m: 	alpha: 0.5288530799749362
[34m[1mwandb[0m: 	epsilon: 0.9754818270101902
[34m[1mwandb[0m: 	gamma: 0.5567815836172685
[34m[1mwandb[0m: 	vel_bins: 87
[34m[1mwandb[0m: 	x_bins: 26


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 81.37episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 84.62episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 78.87episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 117.30episode/s, Episode Reward=-501] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 156.76episode/s, Episode Reward=-181]
Training Progress:  66%|██████▌   | 328/500 [00:01<00:00, 221.51episode/s, Episode Reward=-233][34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.
Training Progress:  73%|███████▎  | 366/500 [00:01<00:00, 203.13episode/s, Episode Reward=-128]

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x1081f8080>> (for post_run_cell), with arguments args (<ExecutionResult object at 10ffd8710, execution_count=30 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 10ffd8e30, raw_cell="sweep_id = "mgvkiiaf"
wandb.agent(sweep_id, functi.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/Users/mateogiraz/Documents/MountainCarContinuous%20%281%29/car-continuous/continuous_mountain_car.ipynb#X13sZmlsZQ%3D%3D> result=None>,),kwargs {}:


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 213.21episode/s, Episode Reward=-99] 
Training Progress:   6%|▌         | 31/500 [00:00<00:02, 221.39episode/s, Episode Reward=-80] 

BrokenPipeError: [Errno 32] Broken pipe

In [None]:
import wandb
from car_model import Car
from mountain_car_agent import MountainCarAgent
from continuous_mountain_car_env_extended import ContinuousMountainCarEnvExtended

def sweep_v2():
    wandb.init()
    config = wandb.config

    env = ContinuousMountainCarEnvExtended(render_mode='rgb_array')
    model = Car(env, config.x_bins, config.vel_bins, config.action_bins)
    agent = MountainCarAgent(model, config.alpha, config.gamma)

    base_epsilon = config.epsilon

    for t in range(20):
        train_value = agent.train(500, base_epsilon)
        test_value = agent.test(50)
        wandb.log({"train_value": train_value, "test_value": test_value, "t" : t})
        base_epsilon *= (10-t)/10

sweep_config = {
    'name': 'bayesian-sweep-epsilon-decay-v2',
    'method': 'bayes',
    'metric': {
        'name': 'test_value',
        'goal': 'maximize'
    },
    'parameters': {
        'alpha': {
            'distribution': 'uniform',
            'min': 0.508,
            'max': 0.99
        },
        'epsilon': {
            'distribution': 'uniform',
            'min': 0.503,
            'max': 0.989
        },
        'gamma': {
            'distribution': 'uniform',
            'min': 0.729,
            'max': 0.974
        },
        'action_bins': {
            'distribution': 'int_uniform',
            'min': 10,
            'max': 100
        },
        'vel_bins': {
            'distribution': 'int_uniform',
            'min': 10,
            'max': 25
        },
        'x_bins': {
            'distribution': 'int_uniform',
            'min': 10,
            'max': 24
        }
    }
}
entity = "mateogiraz27-ort"
project = "mountain_car"
sweep_id = wandb.sweep(sweep_config, entity=entity, project=project)

In [6]:
sweep_id = "l6wrf4n4"
wandb.agent(sweep_id, function=sweep_v2, count=100, entity=entity, project=project)

[34m[1mwandb[0m: Agent Starting Run: ohf1cb0f with config:
[34m[1mwandb[0m: 	action_bins: 20
[34m[1mwandb[0m: 	alpha: 0.6718927213010032
[34m[1mwandb[0m: 	epsilon: 0.830565126667738
[34m[1mwandb[0m: 	gamma: 0.8871260539991896
[34m[1mwandb[0m: 	vel_bins: 12
[34m[1mwandb[0m: 	x_bins: 14


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 84.60episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 76.44episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 78.84episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 84.54episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 80.64episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 87.43episode/s, Episode Reward=-1000] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 104.92episode/s, Episode Reward=-656] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 124.53episode/s, Episode Reward=-273] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 122.63episode/s, Episode Reward=-49]  
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 161.87episode/s, Episode Reward=-45]  
Training Progress: 100%|██████████|

0,1
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▂▁▁▁▁▆▆▁▁███
train_value,▁▁▁▁▁▂▄▅▆▇██

0,1
t,11.0
test_value,-29.8
train_value,-27.066


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 6zd5n5cx with config:
[34m[1mwandb[0m: 	action_bins: 26
[34m[1mwandb[0m: 	alpha: 0.5199549354506932
[34m[1mwandb[0m: 	epsilon: 0.9273838229092624
[34m[1mwandb[0m: 	gamma: 0.8667085989818117
[34m[1mwandb[0m: 	vel_bins: 18
[34m[1mwandb[0m: 	x_bins: 14


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 89.57episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 88.77episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 88.99episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 86.73episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 88.47episode/s, Episode Reward=-726] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 105.49episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 136.16episode/s, Episode Reward=-253] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 215.16episode/s, Episode Reward=-86]  
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 262.38episode/s, Episode Reward=-66]  
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 321.54episode/s, Episode Reward=-50] 
Training Progress: 100%|██████████| 

0,1
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▁▁▁▁▁▂▁▇███
train_value,▁▁▁▁▂▃▅▇▇███

0,1
t,11.0
test_value,-5.32
train_value,-89.592


[34m[1mwandb[0m: Agent Starting Run: sgqzcmpw with config:
[34m[1mwandb[0m: 	action_bins: 52
[34m[1mwandb[0m: 	alpha: 0.7828419662676925
[34m[1mwandb[0m: 	epsilon: 0.5486364810025504
[34m[1mwandb[0m: 	gamma: 0.9432563783004267
[34m[1mwandb[0m: 	vel_bins: 19
[34m[1mwandb[0m: 	x_bins: 18


Training Progress: 100%|██████████| 500/500 [00:07<00:00, 68.71episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:07<00:00, 64.66episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 78.06episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 72.08episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 77.89episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 90.11episode/s, Episode Reward=-462]  
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 128.59episode/s, Episode Reward=-164] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 167.59episode/s, Episode Reward=-174] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 227.89episode/s, Episode Reward=-75]  
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 305.17episode/s, Episode Reward=-70]  
Training Progress: 100%|██████████|

0,1
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▂▁▁▁▁▇▁▁▇███
train_value,▁▁▁▁▁▃▅▆▇███

0,1
t,11.0
test_value,-14.0
train_value,-11.198


[34m[1mwandb[0m: Agent Starting Run: dvxuzxnm with config:
[34m[1mwandb[0m: 	action_bins: 60
[34m[1mwandb[0m: 	alpha: 0.7911898107470617
[34m[1mwandb[0m: 	epsilon: 0.9564179604629998
[34m[1mwandb[0m: 	gamma: 0.8979606865250944
[34m[1mwandb[0m: 	vel_bins: 20
[34m[1mwandb[0m: 	x_bins: 23


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 85.96episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 86.66episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 88.82episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 81.32episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 78.34episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 82.64episode/s, Episode Reward=-348] 
Training Progress: 100%|██████████| 500/500 [00:04<00:00, 105.11episode/s, Episode Reward=-659] 
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 158.90episode/s, Episode Reward=-477] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 228.45episode/s, Episode Reward=-149] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 217.32episode/s, Episode Reward=-161] 
Training Progress: 100%|██████████| 

0,1
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▁▁▁▁▆▃▆▇█▇█
train_value,▁▁▁▁▁▂▃▆▇▇██

0,1
t,11.0
test_value,5.22
train_value,-91.486


[34m[1mwandb[0m: Agent Starting Run: wz5hony2 with config:
[34m[1mwandb[0m: 	action_bins: 24
[34m[1mwandb[0m: 	alpha: 0.6289323207980988
[34m[1mwandb[0m: 	epsilon: 0.899880060606207
[34m[1mwandb[0m: 	gamma: 0.9216564254309386
[34m[1mwandb[0m: 	vel_bins: 21
[34m[1mwandb[0m: 	x_bins: 21


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 83.70episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 77.26episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 83.43episode/s, Episode Reward=-1000] 
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 76.36episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 83.93episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 79.66episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 84.92episode/s, Episode Reward=-1000] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 173.03episode/s, Episode Reward=-560] 
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 304.74episode/s, Episode Reward=-214]
Training Progress: 100%|██████████| 500/500 [00:01<00:00, 322.18episode/s, Episode Reward=-66] 
Training Progress: 100%|██████████| 5

0,1
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▁▁▁▁▁▁▆▁▇██
train_value,▁▁▁▁▁▂▅▆▇▇██

0,1
t,11.0
test_value,9.04
train_value,5.914


[34m[1mwandb[0m: Agent Starting Run: ubkhvkl3 with config:
[34m[1mwandb[0m: 	action_bins: 16
[34m[1mwandb[0m: 	alpha: 0.6309399333069997
[34m[1mwandb[0m: 	epsilon: 0.9227180355654708
[34m[1mwandb[0m: 	gamma: 0.9327721802686408
[34m[1mwandb[0m: 	vel_bins: 22
[34m[1mwandb[0m: 	x_bins: 24


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 87.72episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:06<00:00, 80.66episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 86.17episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 83.67episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 85.91episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:05<00:00, 85.58episode/s, Episode Reward=-1000]
Training Progress: 100%|██████████| 500/500 [00:03<00:00, 137.69episode/s, Episode Reward=-160] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 209.73episode/s, Episode Reward=-130] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 221.55episode/s, Episode Reward=-223] 
Training Progress: 100%|██████████| 500/500 [00:02<00:00, 167.60episode/s, Episode Reward=-278]
Training Progress: 100%|██████████| 5

0,1
t,▁▂▂▃▄▄▅▅▆▇▇█
test_value,▁▁▁▁▁▁▇▇▅▇██
train_value,▁▁▁▁▁▂▅▇▇███

0,1
t,11.0
test_value,-28.08
train_value,-9.772


[34m[1mwandb[0m: Agent Starting Run: 96gj5h68 with config:
[34m[1mwandb[0m: 	action_bins: 27
[34m[1mwandb[0m: 	alpha: 0.709195235908292
[34m[1mwandb[0m: 	epsilon: 0.6145769002543273
[34m[1mwandb[0m: 	gamma: 0.7891815067094485
[34m[1mwandb[0m: 	vel_bins: 12
[34m[1mwandb[0m: 	x_bins: 14


Training Progress:  94%|█████████▍| 472/500 [00:06<00:00, 82.49episode/s, Episode Reward=-1000][34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.
Training Progress:  95%|█████████▍| 473/500 [00:06<00:00, 82.49episode/s, Episode Reward=-1000]

Training Progress: 100%|██████████| 500/500 [00:06<00:00, 76.39episode/s, Episode Reward=-1000]
Training Progress:  24%|██▎       | 118/500 [00:02<00:06, 56.59episode/s, Episode Reward=-1000]Exception in thread Exception in threading.excepthook:
Exception ignored in thread started by: <bound method Thread._bootstrap of <Thread(Thread-631 (_run_job), stopped 6272348160)>>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/threading.py", line 1032, in _bootstrap
    self._bootstrap_inner()
  File "/opt/anaconda3/lib/python3.12/threading.py", line 1077, in _bootstrap_inner
    self._invoke_excepthook(self)
  File "/opt/anaconda3/lib/python3.12/threading.py", line 1391, in invoke_excepthook
    local_print("Exception in threading.excepthook:",
  File "/Users/mateogiraz/Library/Caches/pypoetry/virtualenvs/ai-project-yycSt0xa-py3.12/lib/python3.12/site-packages/ipykernel/iostream.py", line 604, in flush
    self.pub_thread.schedule(self._flush)
  File "/Users/mateogiraz

In [21]:
# ejecutar el agente con los mejores parametros

alpha = 0.96276
epsilon = 0.75616
gamma = 0.88364
action_bins = 83
x_bins = 10
vel_bins = 13

env = ContinuousMountainCarEnvExtended(render_mode='rgb_array')
model = Car(env, x_bins, vel_bins, action_bins)
agent = MountainCarAgent(model, alpha, gamma)

for t in range(20):
    train_value = agent.train(500, epsilon)
    test_value = agent.test(50)
    epsilon *= (10-t)/10
    print(f"Train value: {train_value}, Test value: {test_value}, Epsilon: {epsilon}")

Training Progress: 100%|██████████| 500/500 [00:06<00:00, 76.44episode/s, Episode Reward=-1000]


Train value: -999.748, Test value: -1000.0, Epsilon: 0.75616


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 78.21episode/s, Episode Reward=-1000]


Train value: -1000.0, Test value: -1000.0, Epsilon: 0.680544


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 79.50episode/s, Episode Reward=-1000]


Train value: -998.956, Test value: -1000.0, Epsilon: 0.5444352


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 80.58episode/s, Episode Reward=-1000]


Train value: -998.43, Test value: -1000.0, Epsilon: 0.38110464


Training Progress: 100%|██████████| 500/500 [00:06<00:00, 78.69episode/s, Episode Reward=-1000]


Train value: -986.656, Test value: -1000.0, Epsilon: 0.22866278399999998


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 85.90episode/s, Episode Reward=-1000]


Train value: -945.766, Test value: -1000.0, Epsilon: 0.11433139199999999


Training Progress: 100%|██████████| 500/500 [00:05<00:00, 90.31episode/s, Episode Reward=-524]  


Train value: -814.97, Test value: -1000.0, Epsilon: 0.0457325568


Training Progress: 100%|██████████| 500/500 [00:04<00:00, 112.32episode/s, Episode Reward=-1000]


Train value: -512.564, Test value: -100.98, Epsilon: 0.013719767039999999


Training Progress: 100%|██████████| 500/500 [00:03<00:00, 160.87episode/s, Episode Reward=-316] 


Train value: -362.738, Test value: -13.42, Epsilon: 0.002743953408


Training Progress: 100%|██████████| 500/500 [00:02<00:00, 248.43episode/s, Episode Reward=-442] 


Train value: -177.526, Test value: -176.8, Epsilon: 0.0002743953408


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 364.14episode/s, Episode Reward=-7]  


Train value: -51.12, Test value: -3.64, Epsilon: 0.0


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 365.54episode/s, Episode Reward=-5] 


Train value: -3.296, Test value: -4.46, Epsilon: -0.0


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 443.29episode/s, Episode Reward=0]  


Train value: -3.54, Test value: -3.22, Epsilon: 0.0


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 456.52episode/s, Episode Reward=-7] 


Train value: -3.29, Test value: -2.64, Epsilon: -0.0


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 433.20episode/s, Episode Reward=-8] 


Train value: -3.29, Test value: -3.22, Epsilon: 0.0


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 437.08episode/s, Episode Reward=-6] 


Train value: -3.224, Test value: -3.22, Epsilon: -0.0


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 311.42episode/s, Episode Reward=0]  


Train value: -3.234, Test value: -3.6, Epsilon: 0.0


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 405.51episode/s, Episode Reward=-10]


Train value: -3.02, Test value: -3.22, Epsilon: -0.0


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 435.90episode/s, Episode Reward=-5] 


Train value: -3.198, Test value: -4.16, Epsilon: 0.0


Training Progress: 100%|██████████| 500/500 [00:01<00:00, 441.37episode/s, Episode Reward=2]  


Train value: -2.868, Test value: -3.24, Epsilon: -0.0


In [20]:
# Guardar el modelo entrenado
import pickle
with open('mountain_car_agent.pkl', 'wb') as f:
    pickle.dump({
        'agent': agent,
        'parameters': {
            'alpha': alpha,
            'epsilon': epsilon,
            'gamma': gamma,
            'action_bins': action_bins,
            'x_bins': x_bins,
            'vel_bins': vel_bins
        }
    }, f)

In [None]:
with open('mountain_car_agent.pkl', 'rb') as f:
    loaded_agent = pickle.load(f)

    test_value = loaded_agent['agent'].test(500)
    print(f"Test value: {test_value}")
