Esta notebook contiene bloques de código útiles para realizar Q-learning en el entorno "Pendulum"

In [7]:
import numpy as np
from pendulum_env_extended import PendulumEnvExtended
import random 
from tqdm import tqdm

In [2]:
# Build the environment used by the cells below.
# NOTE(review): 'rgb_array' presumably follows the Gymnasium render_mode convention
# (render() returns frames instead of opening a window) — confirm in PendulumEnvExtended.
env = PendulumEnvExtended(render_mode='rgb_array')

In [4]:
# Disabled alternative: same environment with render_mode='human'
# (presumably an on-screen window — confirm). Uncomment to use it instead.
##env = PendulumEnvExtended(render_mode='human')

Discretización de los estados

In [5]:
# Bin edges used to discretize each component of the observation:
# 10 edges over [-1, 1] for x and for y, 100 edges over [-8, 8] for the velocity.
x_space = np.linspace(start=-1, stop=1, num=10)
y_space = np.linspace(start=-1, stop=1, num=10)
vel_space = np.linspace(start=-8, stop=8, num=100)
vel_space

array([-8.        , -7.83838384, -7.67676768, -7.51515152, -7.35353535,
       -7.19191919, -7.03030303, -6.86868687, -6.70707071, -6.54545455,
       -6.38383838, -6.22222222, -6.06060606, -5.8989899 , -5.73737374,
       -5.57575758, -5.41414141, -5.25252525, -5.09090909, -4.92929293,
       -4.76767677, -4.60606061, -4.44444444, -4.28282828, -4.12121212,
       -3.95959596, -3.7979798 , -3.63636364, -3.47474747, -3.31313131,
       -3.15151515, -2.98989899, -2.82828283, -2.66666667, -2.50505051,
       -2.34343434, -2.18181818, -2.02020202, -1.85858586, -1.6969697 ,
       -1.53535354, -1.37373737, -1.21212121, -1.05050505, -0.88888889,
       -0.72727273, -0.56565657, -0.4040404 , -0.24242424, -0.08080808,
        0.08080808,  0.24242424,  0.4040404 ,  0.56565657,  0.72727273,
        0.88888889,  1.05050505,  1.21212121,  1.37373737,  1.53535354,
        1.6969697 ,  1.85858586,  2.02020202,  2.18181818,  2.34343434,
        2.50505051,  2.66666667,  2.82828283,  2.98989899,  3.15

Obtener el estado a partir de la observación

In [14]:
def get_state(obs):
    """Map a continuous observation to a tuple of discrete bin indices.

    Args:
        obs: Iterable of exactly three values, unpacked as ``(x, y, vel)``.

    Returns:
        Tuple ``(x_bin, y_bin, vel_bin)`` holding the ``np.digitize`` index of
        each component within ``x_space``, ``y_space`` and ``vel_space``.
    """
    x, y, vel = obs
    return (
        np.digitize(x, x_space),
        np.digitize(y, y_space),
        np.digitize(vel, vel_space),
    )

In [15]:
# Sanity check: discretize a hand-written observation (x=-0.4, y=0.2, vel=0.3)
# and display the resulting bin indices.
state = get_state(np.array([-0.4, 0.2, 0.3]))
state

(3, 6, 52)

Discretización de las acciones

In [16]:
# Discrete action set: 15 evenly spaced values covering [-2, 2], kept as a list
# so positions can be looked up with actions.index(...).
actions = list(np.linspace(start=-2, stop=2, num=15))
actions

[-2.0,
 -1.7142857142857144,
 -1.4285714285714286,
 -1.1428571428571428,
 -0.8571428571428572,
 -0.5714285714285716,
 -0.2857142857142858,
 0.0,
 0.2857142857142856,
 0.5714285714285712,
 0.8571428571428568,
 1.1428571428571428,
 1.4285714285714284,
 1.714285714285714,
 2.0]

In [17]:
def get_sample_action():
    """Return one element of ``actions`` picked uniformly at random."""
    sampled = random.choice(actions)
    return sampled

Inicialización de la tabla Q

In [8]:
# Zero-initialized Q table: one axis per discretized state component plus one
# axis for the actions. np.digitize can return any index from 0 to len(bins)
# inclusive, hence the extra slot (+1) on each state axis.
Q = np.zeros(
    shape=(
        len(x_space) + 1,
        len(y_space) + 1,
        len(vel_space) + 1,
        len(actions),
    )
)
Q

array([[[[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
    

Obtención de la acción a partir de la tabla Q

In [18]:
def optimal_policy(state, Q):
    """Return the greedy action for ``state`` according to ``Q``.

    Args:
        state: Index (e.g. the tuple returned by ``get_state``) used to select
            ``Q[state]``.
        Q: Array of action values; the selected slice is matched positionally
            against ``actions``.

    Returns:
        The entry of ``actions`` at the position of the largest value in
        ``Q[state]`` (``np.argmax`` returns the first position on ties).
    """
    best_idx = np.argmax(Q[state])
    return actions[best_idx]

Epsilon-Greedy Policy

In [19]:
def epsilon_greedy_policy(state, Q, epsilon=0.1, verbose=True):
    """Choose an action with an epsilon-greedy rule.

    Args:
        state: Discrete state passed to ``optimal_policy`` when exploiting.
        Q: Q table passed to ``optimal_policy`` when exploiting.
        epsilon: Probability of exploring (drawing a random action).
        verbose: When True (the default, matching the previous behavior) print
            'explore' or 'exploit' for every decision. Pass False inside
            training loops to avoid flooding the output.

    Returns:
        A random action from ``get_sample_action()`` with probability
        ``epsilon``, otherwise the greedy action from ``optimal_policy``.
    """
    # Single Bernoulli draw: 1 means explore, 0 means exploit.
    explore = np.random.binomial(1, epsilon)
    if explore:
        action = get_sample_action()
        label = 'explore'
    else:
        action = optimal_policy(state, Q)
        label = 'exploit'

    if verbose:
        print(label)

    return action

Ejemplo de episodio 

In [11]:
# Run a single episode with an epsilon-greedy policy (epsilon = 0.5) and report
# the accumulated reward. Q is not updated here; see the TODO below.
obs, _ = env.reset()
print(obs)
done = False
total_reward = 0
state = get_state(obs)
while not done:

    # Action chosen by the policy
    action = epsilon_greedy_policy(state, Q, 0.5)

    # Index of that action along the action axis of Q
    action_idx = actions.index(action)

    # The environment receives the action wrapped in an array
    real_action = np.array([action])

    # step() returns separate `terminated` and `truncated` flags. The episode
    # must stop on either one; looking only at `terminated` can loop forever
    # when the episode ends through a time limit (reported as `truncated`).
    obs, reward, terminated, truncated, _ = env.step(real_action)
    done = terminated or truncated

    next_state = get_state(obs)

    # TODO: use action_idx (with state, reward and next_state) to update Q
    # here, before `state` is overwritten.
    state = next_state
    total_reward += reward
    print('->', state, action, reward, obs, done)
    env.render()
print('total_reward', total_reward)


[-0.23956344  0.9708807   0.7144003 ]
explore
-> (4, 9, 60) 0.8571428571428568 -3.3376980412542867 [-0.3150153  0.9490866  1.5711323] False
exploit
-> (3, 9, 62) -2.0 -3.827744473287768 [-0.40741357  0.9132438   1.9829472 ] False
exploit
-> (3, 9, 65) -2.0 -4.3589656522339215 [-0.5124317  0.858728   2.36788  ] False
explore
-> (2, 8, 69) 0.8571428571428568 -5.0085018456207875 [-0.6404154   0.76802874  3.1404974 ] False
explore
-> (2, 8, 74) 0.8571428571428568 -6.121016885053888 [-0.7753654  0.6315128  3.8450904] False
explore
-> (1, 7, 77) 0.2857142857142856 -7.520759407454249 [-0.8936315   0.44880146  4.3615823 ] False
exploit
-> (1, 6, 77) -2.0 -9.068220255303832 [-0.97001237  0.24305545  4.3981833 ] False
exploit
-> (1, 5, 76) -2.0 -10.325670159947144 [-0.9995042   0.03148558  4.2804747 ] False
exploit
-> (1, 4, 75) -2.0 -11.508980183009992 [-0.9858016  -0.16791442  4.004089  ] False
explore
-> (1, 3, 75) 1.4285714285714284 -10.443323163688984 [-0.9311159 -0.3647233  4.092439 ] Fals

In [2]:
from Pend_Model import Pend_Model
from pendulum_agent import PendAgent
from pendulum_env_extended import PendulumEnvExtended
import random 
from tqdm import tqdm

# Train an agent built from the project modules and print the value returned by train().
env = PendulumEnvExtended(render_mode='rgb_array')

# NOTE(review): positional arguments appear to be (env, x_bins, y_bins, vel_bins, actions)
# and (model, gamma, alpha), judging by the wandb config used later in this notebook —
# confirm against Pend_Model / PendAgent.
model = Pend_Model(env, 10, 10, 100, 15)
agent = PendAgent(model, 0.9, 0.5)
# 500 training iterations (the progress bar counts to 500); the second argument is
# presumably the exploration rate epsilon — confirm.
rewards = agent.train(500, 0.1)
print (rewards)

100%|██████████| 500/500 [11:39<00:00,  1.40s/it]

-4729.194176999615





In [3]:
from Pend_Model import Pend_Model
from pendulum_agent import PendAgent
from pendulum_env_extended import PendulumEnvExtended
import random 
from tqdm import tqdm

# Repeat of the previous experiment with identical settings, starting from a fresh
# environment, model and agent.
env = PendulumEnvExtended(render_mode='rgb_array')

# NOTE(review): arguments appear to be (env, x_bins, y_bins, vel_bins, actions) and
# (model, gamma, alpha) — confirm against Pend_Model / PendAgent.
model = Pend_Model(env, 10, 10, 100, 15)
agent = PendAgent(model, 0.9, 0.5)
# 500 training iterations; second argument presumably epsilon — confirm.
rewards = agent.train(500, 0.1)
print (rewards)

100%|██████████| 500/500 [12:11<00:00,  1.46s/it]

-4683.8321478587595





In [4]:
from Pend_Model import Pend_Model
from pendulum_agent import PendAgent
from pendulum_env_extended import PendulumEnvExtended
import random 
from tqdm import tqdm

# Longer experiment: same model/agent settings, but 2500 training iterations and a
# second train() argument of 0.85 (presumably epsilon — confirm).
env = PendulumEnvExtended(render_mode='rgb_array')

# NOTE(review): arguments appear to be (env, x_bins, y_bins, vel_bins, actions) and
# (model, gamma, alpha) — confirm against Pend_Model / PendAgent.
model = Pend_Model(env, 10, 10, 100, 15)
agent = PendAgent(model, 0.9, 0.5)
rewards = agent.train(2500, 0.85)
print (rewards)

100%|██████████| 2500/2500 [59:50<00:00,  1.44s/it]

-4645.235989252851





In [1]:
from Pend_Model import Pend_Model
from pendulum_agent import PendAgent
from pendulum_env_extended import PendulumEnvExtended
import random 
import wandb
import numpy as np
from tqdm import tqdm

# Single source of truth for this run's hyperparameters: the same dict builds the
# model/agent and is logged to wandb, so the logged config cannot drift from the
# values actually used.
run_config = {
    'x_bins': 10,
    'y_bins': 10,
    'vel_bins': 100,
    'actions': 15,
    'gamma': 0.9,
    'alpha': 0.5,
    'epsilon_initial': 0.99,
}

env = PendulumEnvExtended(render_mode='rgb_array')

pendModel = Pend_Model(
    env,
    run_config['x_bins'],
    run_config['y_bins'],
    run_config['vel_bins'],
    run_config['actions'],
)
agent = PendAgent(pendModel, run_config['gamma'], run_config['alpha'])

wandb.init(project="pendulum", config=run_config)

# Ten rounds of 250 training iterations, each followed by 100 evaluation plays.
epsilon_initial = run_config['epsilon_initial']
for t in range(10):
    trainValue = agent.train(250, epsilon_initial)
    playValue = agent.play(100)
    wandb.log({'trainValue': trainValue, 'playValue': playValue, "t": t})
    # Compounding decay: multiply by 1.0, 0.9, 0.8, ... on successive rounds.
    epsilon_initial = epsilon_initial * (10-t)/10

# Close the run explicitly so it is marked as finished when the cell completes.
wandb.finish()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjoaquinrodriguezcaussi[0m ([33mintart[0m). Use [1m`wandb login --relogin`[0m to force relogin


100%|██████████| 250/250 [05:26<00:00,  1.31s/it]
100%|██████████| 250/250 [05:25<00:00,  1.30s/it]
100%|██████████| 250/250 [05:19<00:00,  1.28s/it]
100%|██████████| 250/250 [05:20<00:00,  1.28s/it]
100%|██████████| 250/250 [05:18<00:00,  1.27s/it]
100%|██████████| 250/250 [05:19<00:00,  1.28s/it]
100%|██████████| 250/250 [05:19<00:00,  1.28s/it]
100%|██████████| 250/250 [05:24<00:00,  1.30s/it]
100%|██████████| 250/250 [05:32<00:00,  1.33s/it]
100%|██████████| 250/250 [05:35<00:00,  1.34s/it]


In [1]:
from Pend_Model import Pend_Model
from pendulum_agent import PendAgent
from pendulum_env_extended import PendulumEnvExtended
import random 
import wandb
import numpy as np
from tqdm import tqdm

# Single source of truth for this run's hyperparameters: the same dict builds the
# model/agent and is logged to wandb.
# NOTE(review): this cell previously logged alpha=0.85 while constructing the agent
# with 0.89. The value that was actually passed to the agent (0.89) is kept here so
# training is unchanged and the logged config now matches it — confirm which value
# was intended.
run_config = {
    'x_bins': 12,
    'y_bins': 12,
    'vel_bins': 100,
    'actions': 15,
    'gamma': 0.9,
    'alpha': 0.89,
    'epsilon_initial': 0.99,
}

env = PendulumEnvExtended(render_mode='rgb_array')

pendModel = Pend_Model(
    env,
    run_config['x_bins'],
    run_config['y_bins'],
    run_config['vel_bins'],
    run_config['actions'],
)
agent = PendAgent(pendModel, run_config['gamma'], run_config['alpha'])

wandb.init(project="pendulum", config=run_config)

# Ten rounds of 500 training iterations, each followed by 250 evaluation plays.
epsilon_initial = run_config['epsilon_initial']
for t in range(10):
    trainValue = agent.train(500, epsilon_initial)
    playValue = agent.play(250)
    wandb.log({'trainValue': trainValue, 'playValue': playValue, "t": t})
    # Compounding decay: multiply by 1.0, 0.9, 0.8, ... on successive rounds.
    epsilon_initial = epsilon_initial * (10-t)/10

# Close the run explicitly so it is marked as finished when the cell completes.
wandb.finish()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjoaquinrodriguezcaussi[0m ([33mintart[0m). Use [1m`wandb login --relogin`[0m to force relogin


100%|██████████| 500/500 [11:47<00:00,  1.41s/it]
100%|██████████| 500/500 [11:38<00:00,  1.40s/it]
100%|██████████| 500/500 [11:28<00:00,  1.38s/it]
100%|██████████| 500/500 [11:18<00:00,  1.36s/it]
100%|██████████| 500/500 [11:22<00:00,  1.36s/it]
100%|██████████| 500/500 [11:28<00:00,  1.38s/it]
100%|██████████| 500/500 [11:29<00:00,  1.38s/it]
100%|██████████| 500/500 [11:28<00:00,  1.38s/it]
100%|██████████| 500/500 [11:28<00:00,  1.38s/it]
100%|██████████| 500/500 [11:25<00:00,  1.37s/it]


In [1]:
from Pend_Model import Pend_Model
from pendulum_agent import PendAgent
from pendulum_env_extended import PendulumEnvExtended
import random 
import wandb
import numpy as np
from tqdm import tqdm

def train():
    """Run one sweep trial and log its metrics to wandb.

    Reads the hyperparameters from ``wandb.config`` (``x_bins``, ``y_bins``,
    ``vel_bins``, ``actions``, ``gamma``, ``alpha``, ``epsilon``), builds a
    fresh environment, model and agent, then performs ten rounds of
    ``agent.train(500, epsilon)`` followed by ``agent.play(30)``, logging
    ``trainValue``, ``playValue`` and the round index ``t`` after each round.
    """
    env = PendulumEnvExtended(render_mode='rgb_array')

    wandb.init()
    cfg = wandb.config

    model = Pend_Model(env, cfg.x_bins, cfg.y_bins, cfg.vel_bins, cfg.actions)
    agent = PendAgent(model, cfg.gamma, cfg.alpha)

    epsilon = cfg.epsilon
    for round_idx in range(10):
        # Dict entries are evaluated in order: train first, then play.
        metrics = {
            'trainValue': agent.train(500, epsilon),
            'playValue': agent.play(30),
            "t": round_idx,
        }
        wandb.log(metrics)
        # Compounding decay: multiply by 1.0, 0.9, 0.8, ... on successive rounds.
        epsilon = epsilon * (10 - round_idx) / 10

# Sweep definition: Bayesian search maximizing the logged 'trainValue' metric.
# alpha, epsilon and gamma are sampled uniformly; y_bins and actions are sampled
# as integers; x_bins and vel_bins are held fixed.
sweep_config = {
    'name': 'Pendulum-sweep-2',
    'method': 'bayes',
    'metric': dict(name='trainValue', goal='maximize'),
    'parameters': {
        'alpha': dict(distribution='uniform', min=0.7, max=0.99),
        'epsilon': dict(distribution='uniform', min=0.8, max=0.99),
        'gamma': dict(distribution='uniform', min=0.85, max=0.99),
        'x_bins': dict(value=10),
        'y_bins': dict(distribution='int_uniform', min=10, max=20),
        'vel_bins': dict(value=100),
        'actions': dict(distribution='int_uniform', min=15, max=20),
    },
}

# Identify the wandb entity/project and the sweep this agent attaches to.
entity = 'intart'
project = 'pendulum'
# Hard-coded ID of an existing sweep. While this is used, sweep_config above is
# not sent to wandb from this cell.
sweep_id = 'ocyv1xnp'
# To create a new sweep from sweep_config instead, uncomment:
#sweep_id = wandb.sweep(sweep_config, project="pendulum")

# Start a sweep agent that calls train() for each trial, limited by count=30.
wandb.agent(sweep_id=sweep_id,entity=entity, project=project, function=train, count=30)




Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Create sweep with ID: ep1kope4
Sweep URL: https://wandb.ai/intart/pendulum/sweeps/ep1kope4


[34m[1mwandb[0m: Agent Starting Run: 34z9sx6m with config:
[34m[1mwandb[0m: 	actions: 25
[34m[1mwandb[0m: 	alpha: 0.816573274318327
[34m[1mwandb[0m: 	epsilon: 0.9286608555216512
[34m[1mwandb[0m: 	gamma: 0.8785177220502983
[34m[1mwandb[0m: 	vel_bins: 100
[34m[1mwandb[0m: 	x_bins: 10
[34m[1mwandb[0m: 	y_bins: 12
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjoaquinrodriguezcaussi[0m ([33mintart[0m). Use [1m`wandb login --relogin`[0m to force relogin


  4%|▎         | 18/500 [00:25<11:38,  1.45s/it][34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


  4%|▍         | 19/500 [00:27<11:31,  1.44s/it]