Esta notebook contiene bloques de código útiles para realizar Q-learning en el entorno "Continuous Mountain Car".

In [6]:
from tqdm import tqdm
import numpy as np
import wandb
import gym
from car_model import Car
from mountain_car_agent import MountainCarAgent

In [7]:
from continuous_mountain_car_env_extended import ContinuousMountainCarEnvExtended

# Set render_mode to 'rgb_array' when training/testing.
env = ContinuousMountainCarEnvExtended(render_mode='rgb_array')

In [8]:
# Discretization sizes handed to Car; they stay as notebook-level names
# because the W&B config cell further down records them.
x_bins = 20  # Number of bins for position
vel_bins = 20  # Number of bins for velocity
action_bins = 5  # Number of discrete actions to sample from
# Car is built from the environment plus the three bin counts above.
model = Car(env, x_bins, vel_bins, action_bins)

In [9]:
# Hyperparameters passed to MountainCarAgent; the W&B config cell further
# down records them as well.
alpha = 0.1  # Learning rate
gamma = 0.99  # Discount factor
# The agent is constructed from the Car model and the two hyperparameters.
agent = MountainCarAgent(model, alpha, gamma)

In [10]:
# Train the agent
num_training_episodes = 1000
# Second argument of agent.train; presumably the exploration rate of an
# epsilon-greedy policy — confirm in MountainCarAgent.train.
epsilon = 0.2
# The value returned by train() is what gets reported below as the average
# training reward.
average_training_rewards = agent.train(num_training_episodes, epsilon)
print(f"Average training reward over {num_training_episodes} episodes: {average_training_rewards}")

Training Progress: 100%|██████████| 1000/1000 [00:03<00:00, 256.01episode/s, Episode Reward=-50]

Average training reward over 1000 episodes: -156.562





In [11]:
# Evaluate the agent
num_evaluation_episodes = 100
# test() takes only the episode count (no epsilon argument); its return
# value is reported below as the average evaluation reward.
average_evaluation_rewards = agent.test(num_evaluation_episodes)
print(f"Average evaluation reward over {num_evaluation_episodes} episodes: {average_evaluation_rewards}")

Average evaluation reward over 100 episodes: -17.84


Obtener el estado a partir de la observación

In [13]:
# Log a short train/evaluate schedule to Weights & Biases.
# `agent` is the instance already trained in the cells above, so these rounds
# continue from its current state rather than starting from scratch.
NUM_ROUNDS = 10                # train/evaluate rounds logged to the run
TRAIN_EPISODES_PER_ROUND = 100
EVAL_EPISODES_PER_ROUND = 30
EPSILON_DECAY = 0.9            # multiplicative epsilon decay applied after each round

# Starting epsilon for the schedule; the loop decays a separate variable so
# this name keeps meaning what it says.
epsilon_initial = epsilon

# Using the run as a context manager closes it when the block exits, including
# when training raises, so a failed cell does not leave a dangling run behind
# for the next wandb.init() in this kernel.
with wandb.init(project="mountain_car",
                config={
                    'x_bins': x_bins,
                    'vel_bins': vel_bins,
                    'action_bins': action_bins,
                    'alpha': alpha,
                    'gamma': gamma,
                    'epsilon': epsilon,
                }) as run:
    current_epsilon = epsilon_initial
    for t in range(NUM_ROUNDS):
        train_value = agent.train(TRAIN_EPISODES_PER_ROUND, current_epsilon)
        eval_value = agent.test(EVAL_EPISODES_PER_ROUND)
        # 'epsilon' is logged alongside the rewards so each point on the
        # curves can be read against the epsilon that produced it.
        run.log({'trainValue': train_value, 'evalValue': eval_value, "t": t,
                 'epsilon': current_epsilon})
        current_epsilon *= EPSILON_DECAY  # Decay epsilon over iterations

0,1
evalValue,▇▆▁▆▅███▂▇
t,▁▂▃▃▄▅▆▆▇█
trainValue,▁▅▁▃▃█████

0,1
evalValue,-43.66667
t,9.0
trainValue,-41.29


Training Progress: 100%|██████████| 100/100 [00:00<00:00, 323.24episode/s, Episode Reward=-18]
Training Progress: 100%|██████████| 100/100 [00:00<00:00, 376.62episode/s, Episode Reward=-46]
Training Progress: 100%|██████████| 100/100 [00:00<00:00, 362.24episode/s, Episode Reward=-45]
Training Progress: 100%|██████████| 100/100 [00:00<00:00, 371.80episode/s, Episode Reward=-55]
Training Progress: 100%|██████████| 100/100 [00:00<00:00, 366.42episode/s, Episode Reward=-43]
Training Progress: 100%|██████████| 100/100 [00:00<00:00, 369.30episode/s, Episode Reward=-44]
Training Progress: 100%|██████████| 100/100 [00:00<00:00, 373.83episode/s, Episode Reward=-34]
Training Progress: 100%|██████████| 100/100 [00:00<00:00, 365.40episode/s, Episode Reward=-42]
Training Progress: 100%|██████████| 100/100 [00:00<00:00, 368.09episode/s, Episode Reward=-35]
Training Progress: 100%|██████████| 100/100 [00:00<00:00, 374.09episode/s, Episode Reward=-39]


In [None]:
import wandb
from car_model import Car
from mountain_car_agent import MountainCarAgent
from tqdm import tqdm
from continuous_mountain_car_env_extended import ContinuousMountainCarEnvExtended

def sweep():
    """Run one W&B sweep trial.

    Builds a fresh environment, Car model and MountainCarAgent from the
    hyperparameters in the run's config, then performs 10 rounds of
    250 training episodes followed by 100 test episodes. Each round logs
    ``trainValue``, ``playValue`` and the round index ``t``; ``playValue`` is
    the metric named in ``sweep_config``.

    Takes no arguments and returns nothing; it is meant to be passed as the
    ``function`` argument of ``wandb.agent``.
    """
    env = ContinuousMountainCarEnvExtended(render_mode='rgb_array')

    # Scope the run with a context manager so it is closed even when a trial
    # raises part-way through.
    with wandb.init() as run:
        config = run.config

        model = Car(env, config.x_bins, config.vel_bins, config.action_bins)
        agent = MountainCarAgent(model, config.alpha, config.gamma)

        # Decay a local copy instead of writing to config.epsilon: that key is
        # the hyperparameter chosen by the sweep, and wandb ignores in-place
        # updates to sweep-controlled keys (it only emits a warning), so the
        # previous `config.epsilon *= 0.9` never actually reduced epsilon.
        epsilon = config.epsilon
        for t in range(10):
            trainValue = agent.train(250, epsilon)
            playValue = agent.test(100)
            run.log({"trainValue": trainValue, "playValue": playValue, "t": t})
            epsilon *= 0.9

def _uniform(low, high):
    """Return the spec of a float hyperparameter drawn uniformly between low and high."""
    return {'distribution': 'uniform', 'min': low, 'max': high}


def _int_uniform(low, high):
    """Return the spec of an integer hyperparameter drawn uniformly between low and high."""
    return {'distribution': 'int_uniform', 'min': low, 'max': high}


# Sweep definition: Bayesian search that maximizes the logged 'playValue'
# metric over the learning hyperparameters and the discretization sizes.
sweep_config = {
    'name': 'sweep-parameter-importance',
    'method': 'bayes',
    'metric': {'name': 'playValue', 'goal': 'maximize'},
    'parameters': {
        'alpha': _uniform(0.5, 0.99),
        'epsilon': _uniform(0.6, 0.99),
        'gamma': _uniform(0.5, 0.99),
        'action_bins': _int_uniform(3, 20),
        'vel_bins': _int_uniform(10, 100),
        'x_bins': _int_uniform(10, 100),
    },
}
# W&B entity (account/team) and project used for the sweep.
entity = "mateogiraz27-ort"
project = "mountain_car"
# Uncomment to register a new sweep from sweep_config. As written, this cell
# does not send sweep_config to W&B; it attaches to the sweep ID below.
#sweep_id = wandb.sweep(sweep_config, entity=entity, project=project)

# Hard-coded ID of an existing sweep (presumably produced by an earlier run of
# the commented-out line above — confirm before reusing in another account).
sweep_id = "eyuf2hhn"
# Start an agent in this kernel that calls sweep() for up to 100 runs.
wandb.agent(sweep_id, function=sweep, count=100, entity=entity, project=project)


[34m[1mwandb[0m: Agent Starting Run: bcpophdk with config:
[34m[1mwandb[0m: 	action_bins: 19
[34m[1mwandb[0m: 	alpha: 0.9128818928657194
[34m[1mwandb[0m: 	epsilon: 0.7157859208816209
[34m[1mwandb[0m: 	gamma: 0.9164145668338208
[34m[1mwandb[0m: 	vel_bins: 67
[34m[1mwandb[0m: 	x_bins: 87
Training Progress: 100%|██████████| 250/250 [00:01<00:00, 128.12episode/s, Episode Reward=-87]  
Exception in thread Thread-54 (_run_job):
Traceback (most recent call last):
  File "/Users/mateogiraz/Library/Caches/pypoetry/virtualenvs/ai-project-yycSt0xa-py3.12/lib/python3.12/site-packages/wandb/agents/pyagent.py", line 306, in _run_job
    self._function()
  File "/var/folders/63/3wsl80595ml9n1q7mqrh7r0h0000gn/T/ipykernel_74574/256021629.py", line 16, in sweep
AttributeError: 'MountainCarAgent' object has no attribute 'play'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/threading.py", l